In [None]:
# Reload imported modules automatically before each cell is executed, so edits
# to the local .py modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Test and use the library

In [None]:
from pathlib import Path

from transformers import pipeline
import spacy

from cached_pipe import PipelineCache
from epub import (
    EPub,
    Chapter,
)
from utils import (
    get_ebook_folder,
)


### Load NLP objects

In [None]:
# Language tags handled by this notebook.
lts = ["en", "fr"]
# Translation directions as (tag, other tag) tuples: each tag is paired with
# the entry at the mirrored position of the list.
lts_pair = list(zip(lts, reversed(lts)))
lts, lts_pair


In [None]:
# One loaded spaCy pipeline per language tag; the keys match the entries of `lts`.
nlp = {
    lt: spacy.load(model_name)
    for lt, model_name in (
        ("en", "en_core_web_md"),
        ("fr", "fr_core_news_md"),
    )
}


In [None]:
# TODO: export the Hugging Face offline environment variables (e.g. HF_HUB_OFFLINE=1)
# so that the translation models can be loaded without an internet connection.


In [None]:
# One Hugging Face translation pipeline per direction in `lts_pair`.
# Keys look like "en_fr"; the model id is built from the same two tags.
pipe = {
    f"{lt}_{lt_other}": pipeline(
        task="translation",
        model=f"Helsinki-NLP/opus-mt-{lt}-{lt_other}",
    )
    for (lt, lt_other) in lts_pair
}


## Load cached translator pipeline

In [None]:
# Map each direction key (e.g. "en_fr") to the `.json` file path that is later
# handed to that direction's PipelineCache. The paths are relative, so they
# resolve against the current working directory.
cache_file_path = {
    f"{lt}_{lt_other}": Path(f"translated_{lt}_{lt_other}.json")
    for lt, lt_other in lts_pair
}


In [None]:
# Wrap every raw translation pipeline in a PipelineCache, giving it the
# direction's cache file path plus the two language tags.
# The walrus binds `lt_pair` while the key is evaluated; dict comprehensions
# evaluate the key before the value (Python 3.8+), which is what lets the
# PipelineCache arguments reuse it.
# Side effect: `lt_pair` remains defined in the notebook namespace afterwards.
pipe_cache = {
    (lt_pair := f"{lt}_{lt_other}"): PipelineCache(
        pipe[lt_pair], cache_file_path[lt_pair], lt, lt_other
    )
    for lt, lt_other in lts_pair
}


In [None]:
# Smoke test: the PipelineCache stored under "en_fr" is called directly, like a
# function, on one English sentence; the cell displays whatever it returns.
pipe_cache["en_fr"]("Let's try this cool way to create a callable class.")


## Load epubs

In [None]:
# Locate the French and English epub files inside the ebook folder,
# keyed by language tag.
ebook_folder = get_ebook_folder()
epub_path = {
    lt: ebook_folder / file_name
    for lt, file_name in (
        ("fr", "Gaston_Leroux_-_Le_Mystere_de_la_chambre_jaune.epub"),
        ("en", "mystery_yellow_room.epub"),
    )
}
print(epub_path)


In [None]:
# Build one EPub per language tag from that language's file, the spaCy
# pipelines, the translation pipelines and the (tag, other tag) pair.
# NOTE(review): EPub receives the raw `pipe` dict, not `pipe_cache` — confirm
# which of the two it is meant to use.
epub = {
    lt: EPub(epub_path[lt], nlp, pipe, lt, lt_other)
    for lt, lt_other in lts_pair
}


### Translate a sentence and check for similarity

In [None]:
# Pick one French sentence to experiment with: the first sentence of the first
# paragraph of chapters[1] in the French book, then display its text.
sent_fr = epub["fr"].chapters[1].paragraphs[0].sentences[0]
sent_fr.text


In [None]:
# Translate the French sentence with the raw "fr_en" pipeline and display the result.
# NOTE(review): this bypasses pipe_cache["fr_en"] — confirm that is intended.
sent_fr_to_en = pipe["fr_en"](sent_fr.text)
sent_fr_to_en


In [None]:
# Run the translated text through the English spaCy pipeline so it can be
# compared with a sentence from the English book.
# The pipeline output is indexed here as a list of dicts with a
# "translation_text" entry; only the first item is used.
doc_fr_to_en = nlp["en"](sent_fr_to_en[0]["translation_text"])
print(type(doc_fr_to_en))
doc_fr_to_en


In [None]:
# Candidate English counterpart: the first sentence of the third paragraph of
# chapters[0] in the English book.
# NOTE(review): the chapter/paragraph indices are hand-picked; presumably this
# is where the English edition lines up with the French sentence — confirm.
sent_en = epub["en"].chapters[0].paragraphs[2].sentences[0]
print(type(sent_en))
sent_en


In [None]:
# Similarity score between the machine-translated sentence and the sentence
# taken from the English book.
# NOTE(review): assumes sent_en is an object spaCy's similarity() accepts
# (e.g. a Doc or Span) — confirm against the epub module.
doc_fr_to_en.similarity(sent_en)


### Iterate over sentences

In [None]:
def iter_sent(chap: "Chapter", start_chap: int = 0, end_chap: int = 0):
    """Yield the sentences of a chapter together with their position.

    Despite their names, ``start_chap`` and ``end_chap`` are *paragraph*
    indices inside ``chap``: they are applied as the slice
    ``chap.paragraphs[start_chap:end_chap]``.

    Args:
        chap: Object whose ``paragraphs`` are walked; every paragraph must
            expose a ``sentences`` iterable.
        start_chap: Index of the first paragraph to visit.
        end_chap: Index one past the last paragraph to visit. ``0`` (the
            default) means "up to the end of the chapter". With list-style
            slicing, a value larger than the number of paragraphs simply
            stops at the last paragraph.

    Yields:
        ``((paragraph_index, sentence_index), sentence)`` tuples.
        ``paragraph_index`` is ``start_chap`` plus the offset inside the
        slice, i.e. the position in ``chap.paragraphs`` for a non-negative
        ``start_chap``; ``sentence_index`` is the position of the sentence
        inside its paragraph.
    """
    # 0 is the "no upper bound" sentinel; an open-ended slice avoids having to
    # compute a length-based bound.
    stop = end_chap if end_chap != 0 else None
    selected_paragraphs = chap.paragraphs[start_chap:stop]
    # Count from start_chap so the reported index refers to the whole chapter,
    # not to the slice.
    for i_p, par in enumerate(selected_paragraphs, start=start_chap):
        for i_s, sent in enumerate(par.sentences):
            yield (i_p, i_s), sent


In [None]:
# Print every sentence from up to the first 105 paragraphs (indices 0-104) of
# chapters[0] in the English book, each prefixed with its
# (paragraph index, sentence index) key.
for k, sent in iter_sent(epub["en"].chapters[0], 0, 105):
    print(k, sent.text)
