In [None]:
%load_ext autoreload
%autoreload 2

# Test and use the library

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
from transformers import pipeline
from sentence_transformers import (
    SentenceTransformer,
    util,
)
import spacy

from cached_pipe import PipelineCache
from epub import (
    EPub,
    Chapter,
)
from utils import (
    get_ebook_folder,
    enumerate_sent,
)


## Load NLP objects

In [None]:
# Language tags handled by this notebook, plus every ordered
# (source, target) pair obtained by matching the list with its reverse.
lts = ["en", "fr"]
lts_pair = [(src, dst) for src, dst in zip(lts, reversed(lts))]
lts, lts_pair


In [None]:
# One spaCy pipeline per language tag, loaded in the order en, fr.
nlp = {
    lt: spacy.load(model_name)
    for lt, model_name in (
        ("en", "en_core_web_md"),
        ("fr", "fr_core_news_md"),
    )
}


In [None]:
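# TODO: export the Hugging Face offline-mode environment variables
# (e.g. `HF_HUB_OFFLINE=1` / `TRANSFORMERS_OFFLINE=1`) so that loading the
# models below does not need an internet connection.
# https://huggingface.co/docs/transformers/installation#offline-mode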


In [None]:
# Raw Hugging Face translation pipelines, keyed "<source>_<target>",
# one per ordered language pair.
pipe = dict(
    (
        f"{src}_{dst}",
        pipeline("translation", model=f"Helsinki-NLP/opus-mt-{src}-{dst}"),
    )
    for src, dst in lts_pair
)


## Load cached translator pipeline

In [None]:
# JSON file handed to each translation cache, keyed "<source>_<target>"
# like the `pipe` dictionary.
cache_file_path = {
    pair_key: Path(f"translated_{pair_key}.json")
    for pair_key in (f"{src}_{dst}" for src, dst in lts_pair)
}


In [None]:
# Wrap every raw pipeline in a PipelineCache, passing it the matching cache
# file and the (source, target) language tags.
#
# The key is bound with a one-element inner `for` instead of a walrus inside
# the key expression: an assignment expression in a comprehension binds its
# target in the enclosing scope, which left a stray `lt_pair` global in the
# notebook after this cell ran.
pipe_cache = {
    lt_pair: PipelineCache(pipe[lt_pair], cache_file_path[lt_pair], lt, lt_other)
    for lt, lt_other in lts_pair
    for lt_pair in [f"{lt}_{lt_other}"]
}


In [None]:
# Smoke test of the cached en -> fr translator: the PipelineCache instance is
# called directly, like a function, and the cell displays whatever it returns.
pipe_cache["en_fr"]("Let's try this cool way to create a callable class.")


## Load epubs

In [None]:
# Locate the two editions of the book inside the ebook folder.
ebook_folder = get_ebook_folder()
epub_path = dict(
    fr=ebook_folder / "Gaston_Leroux_-_Le_Mystere_de_la_chambre_jaune.epub",
    en=ebook_folder / "mystery_yellow_room.epub",
)
print(epub_path)


In [None]:
# Build one EPub per language. Each one receives the file of its own language,
# the shared `nlp` and `pipe_cache` dictionaries, its own language tag and the
# tag of the other language.
# `lts_pair` already holds the (lt, lt_other) pairs, so it is reused here
# instead of re-deriving them with a second `zip(lts, lts[::-1])`.
epub = {
    lt: EPub(epub_path[lt], nlp, pipe_cache, lt, lt_other)
    for lt, lt_other in lts_pair
}


### Translate a sentence and check for similarity with spaCy

In [None]:
# Pick the first sentence of the first paragraph of the first French chapter
# (`sents_orig` presumably holds the untranslated sentences - TODO confirm)
# and display its text.
sent_fr = epub["fr"].chapters[0].paragraphs[0].sents_orig[0]
sent_fr.text


In [None]:
# Translate the French sentence with the raw fr -> en pipeline (`pipe`, not
# the `pipe_cache` wrapper) and display the raw pipeline output.
sent_fr_to_en = pipe["fr_en"](sent_fr.text)
sent_fr_to_en


In [None]:
# Run the English spaCy pipeline on the translated text (first pipeline
# result, "translation_text" field) so it can be compared with sentences of
# the English edition.
doc_fr_to_en = nlp["en"](sent_fr_to_en[0]["translation_text"])
print(type(doc_fr_to_en))
doc_fr_to_en


In [None]:
# Candidate English counterpart: chapter 0, paragraph 2, sentence 0.
# NOTE(review): the paragraph index differs from the French one (0) and looks
# hand-picked - presumably to skip front matter; confirm against the book.
sent_en = epub["en"].chapters[0].paragraphs[2].sents_orig[0]
print(type(sent_en))
sent_en


In [None]:
# spaCy similarity between the translated French sentence and the candidate
# English sentence.
doc_fr_to_en.similarity(sent_en)


In [None]:
# A different sentence (index 2) of the same English paragraph, scored against
# the same translated French sentence for comparison with the previous cell.
sent_en2 = epub["en"].chapters[0].paragraphs[2].sents_orig[2]
print(sent_en2)

doc_fr_to_en.similarity(sent_en2)


### Iterate over sentences

In [None]:
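# NOTE: earlier inline draft, kept for reference only. The notebook uses the
# `enumerate_sent` imported from `utils` in the imports cell.
# def enumerate_sent(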
#     chap: Chapter,
#     start_par: int = 0,
#     end_par: int = 0,
#     which_sent="orig",
# ):
#     """"""
#     if end_par == 0:
#         end_par = len(chap.paragraphs) + 1
#     for i_p, par in enumerate(chap.paragraphs[start_par:end_par]):
#         for i_s, sent in enumerate(par.sents_orig):
#             if which_sent == "orig":
#                 yield (i_p + start_par, i_s), sent
#             elif which_sent == "tran":
#                 yield (i_p + start_par, i_s), par.sents_tran[i_s]


In [None]:
# Index of the chapter that is compared in both editions.
ch_delta = 2

# Text of every sentence of the English chapter, in reading order.
# `enumerate_sent` yields (index, sentence) pairs; the index is not needed.
sent_text_en = [
    sent.text
    for _, sent in enumerate_sent(epub["en"].chapters[ch_delta], which_sent="orig")
]

# Text of every sentence of the French chapter, taken from the "tran"
# sentences (presumably the fr -> en translations - TODO confirm in `utils`).
sent_text_fr_tran = [
    sent.text
    for _, sent in enumerate_sent(epub["fr"].chapters[ch_delta], which_sent="tran")
]

# Peek at one sentence of each list.
print(sent_text_en[4])
print(sent_text_fr_tran[2])


## Sentence encoder used for similarity

In [None]:
# Sentence-embedding model used by the similarity cells that follow.
# Alternative model that was tried before:
# sentence_transformer = SentenceTransformer("paraphrase-MiniLM-L6-v2")
sentence_transformer = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Toy sentences used to try out the encoder.
sentences = [
    "This framework generates embeddings for each input sentence",
    "Sentences are passed as a list of string.",
    "The quick brown fox jumps over the lazy dog.",
]

# encode() takes the whole list of strings and returns their embeddings.
embeddings = sentence_transformer.encode(sentences)

# Uncomment to print every sentence next to its embedding:
# for sentence, embedding in zip(sentences, embeddings):
#     print("Sentence:", sentence)
#     print("Embedding:", embedding)
#     print("")


In [None]:
# Two short paraphrases used to sanity-check the similarity score.
sentences = ["I'm happy", "I'm full of happiness"]

# Encode each sentence on its own: one encode() call per sentence, in order.
embedding_1, embedding_2 = (
    sentence_transformer.encode(text, convert_to_tensor=True)
    for text in sentences
)

# Cosine similarity between the two embeddings.
util.pytorch_cos_sim(embedding_1, embedding_2)


In [None]:
# Embed the two English sentences and the translated French sentence,
# in that order, one encode() call each.
embedding_1, embedding_2, embedding_fr = (
    sentence_transformer.encode(text, convert_to_tensor=True)
    for text in (sent_en.text, sent_en2.text, doc_fr_to_en.text)
)

# Cosine similarity of each English sentence with the translated French one.
fr1, fr2 = (
    util.pytorch_cos_sim(embedding_en, embedding_fr)
    for embedding_en in (embedding_1, embedding_2)
)
fr1, fr2


### Convert sentences to embeddings and compute the similarity

In [None]:
# Number of sentences collected for each edition of the chapter.
sent_num_en, sent_num_fr = len(sent_text_en), len(sent_text_fr_tran)

# Embed the English sentences; the print shows the tensor shape next to the
# sentence count, followed by the shape of a single embedding.
enc_en = sentence_transformer.encode(sent_text_en, convert_to_tensor=True)
print("en", enc_en.shape, sent_num_en, enc_en[0].shape)

# Same for the translated French sentences.
enc_fr_tran = sentence_transformer.encode(sent_text_fr_tran, convert_to_tensor=True)
print("fr", enc_fr_tran.shape, sent_num_fr, enc_fr_tran[0].shape)


In [None]:
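# Reference version, kept for comparison: fills the similarity matrix one
# (en, fr_tran) pair at a time. The next cell builds the same matrix with a
# single `pytorch_cos_sim` call.
# sim = np.zeros((sent_num_en, sent_num_fr))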
# for i in range(sent_num_en):
#     for ii in range(sent_num_fr):
#         sim[i][ii] = util.pytorch_cos_sim(enc_en[i], enc_fr_tran[ii])
#         # sim[i][ii] = util.pytorch_cos_sim(enc_en[i], enc_en[ii])
# plt.imshow(sim)


In [None]:
# Cosine similarity of every English sentence (rows) with every translated
# French sentence (columns), moved to the CPU as a NumPy array for plotting.
sim_torch = util.pytorch_cos_sim(enc_en, enc_fr_tran)
sim = sim_torch.detach().cpu().numpy()

# Draw on an explicit figure/axes pair rather than on pyplot's implicit
# "current" figure, so the heat map cannot land on a leftover figure.
fig, ax = plt.subplots()
ax.imshow(sim)
ax.set_title("Similarity *en* vs *fr_translated*")
ax.set_ylabel("en")
ax.set_xlabel("fr_tran")
fig.savefig(f"Similarity_en_vs_fr_translated_{ch_delta}.pdf")
plt.show()


In [None]:
# Requested half-width of the window of translated French sentences that is
# shown around the position where each English sentence is expected.
win_len = 20

# The two chapters do not contain the same number of sentences, so the
# English index is rescaled by this ratio to get the expected French index.
ratio = sent_num_en / sent_num_fr
print(f"{ratio=}")

# A full window needs 2 * win_len + 1 translated sentences. For a shorter
# chapter the half-width is reduced so that the window still fits; otherwise
# the slice below would start at a negative index, come out too short and
# fail to fit into its row of `sim_center`.
half_win = min(win_len, (sent_num_fr - 1) // 2)

sim_center = np.zeros((sent_num_en, 2 * half_win + 1))
for i in range(sent_num_en):
    # Expected French index of English sentence i, clamped so that the whole
    # window [center - half_win, center + half_win] stays inside the matrix.
    center = min(max(int(i / ratio), half_win), sent_num_fr - half_win - 1)
    # Row i of the similarity matrix, chopped to the window around `center`.
    sim_center[i] = sim[i, center - half_win : center + half_win + 1]

fig, ax = plt.subplots()
ax.imshow(sim_center, aspect="auto")
ax.set_title("Similarity en vs fr_tran, shifted")
ax.set_ylabel("en")
ax.set_xlabel("fr_tran shifted")

# Label the x axis with the shift relative to the expected position: column
# `half_win` is shift 0. Labels are the multiples of `step` that fit in the
# window, and each position is derived from its own label, so both lists
# always have the same length (set_xticklabels requires that).
step = 3
max_lab = (half_win // step) * step
xticks_lab = list(range(-max_lab, max_lab + 1, step))
xticks_pos = [half_win + lab for lab in xticks_lab]

ax.set_xticks(xticks_pos)
ax.set_xticklabels(xticks_lab)

fig.savefig(f"Similarity_en_vs_fr_tran_shifted_{ch_delta}.pdf")
plt.show()


In [None]:
# Scratch check of the x-tick layout used by the shifted similarity plot.
# Labels are the multiples of `step` within [-win_len, win_len]; every
# position is its label shifted by `win_len` (label 0 sits at column win_len).
# Deriving the positions from the labels keeps the two lists the same length
# even when `win_len` is a multiple of `step`.
step = 3
max_lab = (win_len // step) * step
lab = list(range(-max_lab, max_lab + 1, step))
print(lab)
min_lab = lab[0]
print(min_lab)
min_shift = win_len + min_lab
print(min_shift)
pos = [win_len + tick for tick in lab]
print(pos)


## Use similarity to pair up sentences

In [None]:
# Shapes of one row, one column and the whole similarity matrix. Rows follow
# `enc_en` and columns follow `enc_fr_tran`, the argument order of the
# cosine-similarity call that built `sim`.
sim[0].shape, sim[:, 0].shape, sim.shape
