In [None]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell runs, so edits to the project package are picked up live.
%load_ext autoreload
%autoreload 2

# Create and use the app

1. Create the app
1. Pick the languages
1. Load the NLP models
1. Load the books


In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
from transformers.pipelines import pipeline

from interleave_epub.epub.epub import EPub
from interleave_epub.interleave.constants import (
    hug_model_name_tmpl,
    hug_trad_cache_fol,
    sent_model_names,
    spa_model_cache_fol,
    spa_model_names,
)
from interleave_epub.interleave.interactive import InterleaverInteractive
from interleave_epub.nlp.cached_pipe import TranslationPipelineCache
from interleave_epub.nlp.local_spacy_model import spacy_load_local_model


## Constants


### Epub paths


In [None]:
# Folder holding the ebooks, located under the current user's home directory.
epub_folder_path = Path("~", "snippet", "datasets", "ebook").expanduser()

# One epub file per language tag.
epub_paths = {
    "fr": epub_folder_path.joinpath(
        "Gaston_Leroux_-_Le_Mystere_de_la_chambre_jaune.epub"
    ),
    "en": epub_folder_path.joinpath("mystery_yellow_room.epub"),
}

### Language tags


In [None]:
# Language tags of the two books handled in this notebook (same keys as epub_paths).
lts = ["fr", "en"]


## App


In [None]:
# Create the app object; the cells below configure and query it through `ii`.
ii = InterleaverInteractive()


In [None]:
# Assign each language tag to its side of the pair ("src" / "dst").
for lang_tag, role in (("fr", "src"), ("en", "dst")):
    ii.set_lang_tag(lang_tag, role)


In [None]:
# Load the NLP models — the third step of the workflow listed at the top of
# the notebook, run once the language tags have been set.
ii.load_nlp()


In [None]:
# Register one book per side: (language tag, side, display name).
for lang_tag, role, book_name in (
    ("fr", "src", "Chambre Jaune"),
    ("en", "dst", "Yellow Room"),
):
    ii.add_book(epub_paths[lang_tag], role, book_name)


In [None]:
# Drill into the "src" book: take its first chapter, then that chapter's
# first paragraph.
an_epub = ii.epubs["src"]
a_chap = an_epub.chapters[0]
a_par = a_chap.paragraphs[0]
# Bare last expression so the notebook displays the paragraph.
a_par


In [None]:
# Run the app's automatic alignment step; the cells below inspect its results.
ii.align_auto()


In [None]:
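# Disabled leftover call, kept for reference:
# ii.aligner.compute_similarity()
# NOTE(review): the live cells use `ii.aligners[...]`, not `ii.aligner` — update or delete.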


In [None]:
# Shortcut to the aligner stored under the key `ii.ch_id_pair_str`
# (presumably the current chapter pair — TODO confirm); used by all cells below.
al = ii.aligners[ii.ch_id_pair_str]


In [None]:
fig, ax = plt.subplots()
# Axis names assume al.sim is indexed [src, dst], so its transpose has dst on
# the rows (y) and src on the columns (x) — TODO confirm against the aligner.
ax.set(
    xlabel="src sentence id",
    ylabel="dst sentence id",
    title="Sentence similarity",
)
# origin="lower" puts row 0 at the bottom. This draws the same picture as
# flipping the rows by hand (`al.sim.T[::-1, :]`), but the y tick labels now
# match the real row ids instead of being reversed.
ax.imshow(al.sim.T, origin="lower")


In [None]:
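# Disabled leftover call, kept for reference:
# ii.aligner.align_auto(min_sent_len=5)
# NOTE(review): the live cells use `ii.aligners[...]`, not `ii.aligner` — update or delete.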


In [None]:
fig, ax = plt.subplots()

# Endpoints shared by the two straight reference lines drawn below.
src_span = [0, al.sent_num_src]
dst_span = [0, al.sent_num_dst]

# "Good" matches, drawn as very small dots.
ax.scatter(al.all_good_ids_src, al.all_good_ids_dst_max, s=0.1)

# Thin corner-to-corner diagonal.
ax.plot(src_span, dst_span, linewidth=0.3)

# The aligner's fit function evaluated at both ends of the src range.
fit_y = al.fit_func(src_span)
ax.plot(src_span, fit_y)

# The all_ids_src / all_ids_dst_max series as a thicker line.
ax.plot(al.all_ids_src, al.all_ids_dst_max, linewidth=0.9)


In [None]:
fig, ax = plt.subplots(figsize=(10, 16))

# Plot the "fancy" matches (all_ids_src vs all_ids_dst_max) as crosses.
ax.plot(
    al.all_ids_src,
    al.all_ids_dst_max,
    marker="x",
    color="r",
    linestyle="",
    alpha=0.9,
)

# Collect the greedy matches that disagree with the fancy ones.
# all_ids_dst_max is looked up by the src sentence id itself (the same way the
# status cell does with al.curr_id_src), not by the position inside the "good"
# lists: if the good lists skip any sentence, position and id drift apart and
# a positional lookup compares unrelated sentences.
# NOTE(review): assumes all_good_ids_src holds src sentence ids that are valid
# indices into all_ids_dst_max — confirm against the aligner.
bad_ids_src = []
bad_ids_dst = []
for id_src, id_dst_greedy in zip(al.all_good_ids_src, al.all_good_ids_dst_max):
    if al.all_ids_dst_max[id_src] != id_dst_greedy:
        bad_ids_src.append(id_src)
        bad_ids_dst.append(id_dst_greedy)
ax.scatter(
    bad_ids_src,
    bad_ids_dst,
    marker="o",
    color="r",
    alpha=0.5,
)

# Axis names follow the overlays above: src ids on x, dst ids on y.
ax.set(xlabel="src sentence id", ylabel="dst sentence id")

# Similarity matrix as the background; kept as the last call so the image
# extents define the final axis limits, as in the original cell.
ax.imshow(al.sim.T)


In [None]:
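# Disabled leftover call, kept for reference:
# ii.aligner.compute_ooo_ids()
# NOTE(review): the live cells use `ii.aligners[...]`, not `ii.aligner` — update or delete.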


In [None]:
# Display the aligner's current state as a tuple: the current src id, the
# `curr_id_dst_interpolate` value, and the all_ids_dst_max entry at that src id.
(
    al.curr_id_src,
    al.curr_id_dst_interpolate,
    al.all_ids_dst_max[al.curr_id_src],
)


Almost done:

1. Manually set the dst id
1. Add curr src id to the fixed src ids
1. Recompute ooo ids, skipping fixed
1. Find the first src id to fix
1. Get the best guess for dst
1. Repeat


In [None]:
# Step 1 of the list above: manually set the dst id.
# 5 is a hand-picked value (presumably the right dst sentence for the current
# src sentence of this book pair — adjust when the books change).
id_dst_correct = 5
ii.pick_dst_sent(id_dst_correct)

In [None]:
# Show the same state tuple again, now after the manual pick: current src id,
# the `curr_id_dst_interpolate` value, and the all_ids_dst_max entry at that src id.
(
    al.curr_id_src,
    al.curr_id_dst_interpolate,
    al.all_ids_dst_max[al.curr_id_src],
)
