In [None]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Load a single EPUB and show some translations

In [None]:
from pathlib import Path

from transformers.pipelines import pipeline

from interleave_epub.epub.epub import EPub
from interleave_epub.nlp.cached_pipe import TranslationPipelineCache
from interleave_epub.nlp.local_spacy_model import spacy_load_local_model


### Load NLP models

In [None]:
# Directory handed to spacy_load_local_model for every model below.
spacy_cache = Path("~/.cache/spacy_my_models").expanduser()

# One loaded model per language tag; the models are never force-downloaded.
nlp = {
    lang_tag: spacy_load_local_model(model_name, spacy_cache, force_download=False)
    for lang_tag, model_name in (
        ("en", "en_core_web_md"),
        ("fr", "fr_core_news_md"),
    )
}


In [None]:
# JSON files passed to the translation caches, one per translation direction.
hug_cache = Path("~/.cache/hug_my_trad").expanduser()
hug_trad_en = hug_cache.joinpath("translated_en-fr.json")
hug_trad_fr = hug_cache.joinpath("translated_fr-en.json")

# No live Hugging Face pipeline is built here; the disabled alternative is:
# pipe_fren = pipeline("translation", model=f"Helsinki-NLP/opus-mt-fr-en")
pipe_fren = None

# One TranslationPipelineCache per direction tag, each built from
# (pipeline or None, cache file, direction tag).
pipe = {
    direction: TranslationPipelineCache(live_pipe, cache_file, direction)
    for direction, live_pipe, cache_file in (
        ("en-fr", None, hug_trad_en),
        ("fr-en", pipe_fren, hug_trad_fr),
    )
}

# Smoke test: the cache object is called directly on a sentence.
pipe["en-fr"]("Let's try this cool way to create a callable class.")


### Load the EPUB

In [None]:
# Location of a sample epub (in French) under the user's home directory.
epub_folder_path = Path("~").expanduser().joinpath("snippet", "datasets", "ebook")
an_epub_path = epub_folder_path.joinpath(
    "Gaston_Leroux_-_Le_Mystere_de_la_chambre_jaune.epub"
)
# Bare last expression so the notebook displays the resolved path.
an_epub_path


In [None]:
# Load the sample book and translate its sentences.
# Positional arguments: the epub path, "Chambre Jaune", then "fr" and "en"
# (presumably a title and the source/target language tags -- confirm against
# EPub), followed by the nlp and pipe dictionaries built above.
an_epub = EPub(
    an_epub_path,
    "Chambre Jaune",
    "fr",
    "en",
    nlp,
    pipe,
)


In [None]:
# show a sample paragraph:
# take the chapter at index 1, then the paragraph at index 0 within it
a_chap = an_epub.chapters[1]
a_par = a_chap.paragraphs[0]
# bare last expression so the notebook displays the paragraph
a_par


In [None]:
# number of paragraphs in the sample chapter
len(a_chap.paragraphs)


In [None]:
# number of entries stored under the "orig" key of the chapter's sents_text
# (presumably the original-language sentences -- confirm against the chapter class)
len(a_chap.sents_text["orig"])


In [None]:
# Print every "orig" sentence of the chapter, prefixed by the two indices
# that enumerate_sents yields with it.
for par_sent_ids, sent in a_chap.enumerate_sents("orig"):
    i_p, i_s = par_sent_ids
    print(f"{i_p} {i_s} {sent}")


## Immediately derail everything to pick better sentences

In [None]:
# check whether the paragraph's doc carries "SENT_START" annotations
# (par_doc is presumably a spaCy Doc -- confirm)
a_par.par_doc.has_annotation("SENT_START")


In [None]:
# work on a copy of the paragraph's doc; the retokenization experiments below
# operate on a_doc rather than on a_par.par_doc itself
a_doc = a_par.par_doc.copy()


In [None]:
# list the sentences of the copied doc, one per line, prefixed by their index
for i_s, sents in enumerate(a_doc.sents):
    print(f"{i_s} {sents}")


In [None]:
# References on merging tokens with the retokenizer:
# https://spacy.io/api/doc#retokenize
# https://stackoverflow.com/questions/58294798/spacy-doc-merge-to-using-retokenizer
# https://stackoverflow.com/questions/65083559/how-to-write-code-to-merge-punctuations-and-phrases-using-spacy

# Writing the flag directly does not work, e.g.
#     if token.is_sent_start: token.is_sent_start = False
# fails with:
#     Refusing to write to token.sent_start if its document is parsed,
#     because this may cause inconsistent state.

# Dump tokens 15..24 of the doc: absolute index, text padded to 17 characters,
# then the is_sent_start / is_sent_end flags.
start_it = 15
for it, token in enumerate(a_doc[start_it:25]):
    print(
        f"{it+start_it:3d} "
        f"{token.text:17s}{token.is_sent_start} "
        f"{token.is_sent_end}"
    )


In [None]:
# Display tokens 21 through 23 (inclusive) of the doc.
a_doc[21:24]


In [None]:
# Merge tokens 21 and 22 into a single token; the merge is applied when the
# retokenize() context exits.
with a_doc.retokenize() as retokenizer:
    retokenizer.merge(a_doc[21:23])


In [None]:
# check if the sentences were actually merged:
# display the (is_sent_start, is_sent_end) flags of the token now at index 21
a_doc[21].is_sent_start, a_doc[21].is_sent_end


In [None]:
# Do it again: merge whatever tokens now sit at indices 21 and 22
# (after a previous merge, index 21 is already a merged token).
with a_doc.retokenize() as retokenizer:
    retokenizer.merge(a_doc[21:23])


In [None]:
# check if the sentences were actually merged:
# display the (is_sent_start, is_sent_end) flags of the token now at index 21
a_doc[21].is_sent_start, a_doc[21].is_sent_end


In [None]:
# Display the tokens now at indices 21 through 23 (inclusive).
a_doc[21:24]


In [None]:
# list the doc's sentences, one per line with their index, to compare with
# the listing made before the retokenizer merges
for i_s, sents in enumerate(a_doc.sents):
    print(f"{i_s} {sents}")


In [None]:
# Dump tokens 15..24 of the doc: absolute index, text padded to 17 characters,
# then the is_sent_start / is_sent_end flags.
start_it = 15
for it, token in enumerate(a_doc[start_it:25]):
    print(
        f"{it+start_it:3d} "
        f"{token.text:17s}{token.is_sent_start} "
        f"{token.is_sent_end}"
    )
