In [None]:
import regex as re
import unicodedata
from collections import defaultdict
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from  pprint import pprint
from wordcloud import WordCloud
from collections import Counter
import matplotlib.pyplot as plt
import genanki
from deep_translator import GoogleTranslator
import stanza
import os
import deepl
import hashlib, html
from typing import Dict, List, Tuple
import time, re, unicodedata as ud
from dotenv import load_dotenv
load_dotenv()


In [None]:
# The DeepL API key is read from the process environment (load_dotenv() has
# already been called in the import cell); the client is created once and reused.
DEEPL_AUTH_KEY = os.environ.get('DEEPL_AUTH_KEY')
translator = deepl.Translator(DEEPL_AUTH_KEY)

In [None]:
# Swedish Stanza pipeline: tokenisation, POS tagging and lemmatisation.
nlp = stanza.Pipeline("sv", processors="tokenize,pos,lemma")

nltk.download("stopwords")
# Read the word list through the package path instead of the imported
# `stopwords` name.  This assignment rebinds `stopwords` to a plain list, so
# going through the name itself would make a second run of this cell fail with
# AttributeError ('list' object has no attribute 'words').
stopwords = nltk.corpus.stopwords.words("swedish")

In [None]:
# 1) Known ambiguous terms: each surface form maps to a context hint for the
#    translator and an optional gloss that overrides the translated term.
AMBIG = {
    "underkänd":  dict(context="School grading; means 'failed (an exam)'.", override="failed"),
    "underkända": dict(context="School grading; means 'failed (an exam)'.", override="failed"),
    "underkänt":  dict(context="School grading; means 'failed (an exam)'.", override="failed"),
    # add more false friends here...
}

# 2) Tiny domain detector (optional): a sentence containing any keyword of a
#    domain gets that domain's context string.
DOMAINS = dict(
    school=dict(
        keywords={"prov", "betyg", "lärare", "skola", "elever", "kurs", "tentamen"},
        context="School / grading context.",
    ),
    medical=dict(
        keywords={"sjukhus", "läkare", "behandling", "symptom", "diagnos"},
        context="Medical context.",
    ),
    finance=dict(
        keywords={"bolag", "aktier", "börsen", "fakturor", "intäkter", "kostnader"},
        context="Business / finance context.",
    ),
)

In [None]:
# Hand-maintained list of Swedish words to drop before lemmatisation, grouped by
# word class.  It is concatenated with the NLTK stop words and the name list.
# NOTE(review): some entries are repeated (e.g. "er", "också", "utan", "om",
# "både" and most of the last row); harmless for membership tests.
# NOTE(review): the multi-word entries ("för att", "efter att", "så att") cannot
# match when the list is compared against single whitespace-separated tokens.
SWEDISH_DELETE_WORDS = [
    # greetings / interjections
    "hej","hejsan","hallå","tjena","tjenare","tjenixen","tja","goddag","godmorgon","godkväll","mors",
    "aha","oj","åh","hmm","mm","mmm","eh","öh","öhm","äh","asså","ba",
    # yes / no / acknowledgements
    "ja","japp","jo","visst","absolut","okej","ok","okey","nej","icke",
    # fillers / discourse markers
    "liksom","typ","alltså","ju","väl","likaså","likväl","så","då","bara","redan","också","dessutom","kanske","nog",
    # pronouns / determiners
    "jag","du","han","hon","den","det","vi","ni","de","mig","dig","honom","henne","oss","er","dem","man",
    "min","mitt","mina","din","ditt","dina","sin","sitt","sina","vår","vårt","våra","er","ert","era",
    "denna","detta","dessa","någon","något","några","ingen","inget","inga","vilken","vilket","vilka","som",
    # common verbs/aux/modals
    "är","var","vara","blir","blev","bli","ha","har","hade","gör","gjorde","göra",
    "kan","kunde","ska","skall","skulle","vill","ville","måste","bör","brukar","får","fick",
    # adverbs / particles
    "inte","aldrig","alltid","ofta","ibland","sällan","här","där","hit","dit","hem","borta","nu","sen","snart","igen","än",
    "mycket","lite","mer","mest","mindre","minst","kvar","både","antingen","heller","också",
    # prepositions
    "i","på","till","från","för","med","utan","över","under","mellan","genom","mot","bland","hos",
    "före","efter","kring","runt","enligt","trots","vid","omkring","om","åt","av","per","cirka","ca",
    # conjunctions / subjunctions
    "och","men","eller","samt","utan","att","för att","eftersom","därför","medan","när","innan","efter att","om",
    "fast","så att","såväl","både","dock","ty","varför",

    # later additions (several repeat words already listed above)
    'ska', 'nej', 'hej','bra', 'ja','vill','lite', 'jaha', 'wow'
]


In [None]:
# Proper names to discard together with the stop words (lower-case, because
# they are compared against lower-cased tokens).
names = [
    'eddie',
    'martin',
    'william',
    'lisa',
    'eddies',
    'patrik',
    'bianca',
    'katja',
]

In [None]:
# Everything that should be filtered out of the token stream: the NLTK stop
# words, the hand-maintained list and the proper names, in that order.
to_delete = [*stopwords, *SWEDISH_DELETE_WORDS, *names]

In [None]:
def read_file(file_name: str) -> str:
    """Return the whole content of a UTF-8 text file as one string.

    The file is decoded with ``utf-8-sig`` so that a leading byte-order mark
    is dropped.  With plain UTF-8 the mark would stay glued to the first line
    and that line (an SRT index such as "1") would no longer pass
    ``str.isdigit()`` in the cleaning step.  Files without a mark decode
    exactly as before.
    """
    with open(file_name, 'r', encoding='utf-8-sig') as handle:
        return handle.read()

In [None]:
def clean_line(line, for_word):
    """Normalise one subtitle line or sentence.

    Steps: remove every hyphen and every literal '...', strip surrounding
    whitespace and drop one trailing comma.  When ``for_word`` is true, each
    Unicode punctuation character (general category P*) is replaced by a space
    and the text is lower-cased.  Finally all whitespace runs are collapsed to
    a single space.

    Args:
        line: text to clean.
        for_word: True to also strip punctuation and lower-case (word view).

    Returns:
        The cleaned string (possibly empty).
    """
    line = line.replace('-', '').replace('...', '').strip()

    if line.endswith(','):
        line = line[:-1]

    if for_word:
        # Punctuation is detected with unicodedata rather than the regex class
        # \p{P}: that class only exists in the third-party `regex` module, and
        # the notebook's later `import time, re, ...` rebinds `re` to the
        # standard-library module, which raises "bad escape \p".
        line = ''.join(
            ' ' if unicodedata.category(ch).startswith('P') else ch
            for ch in line
        ).lower()

    # Collapse every whitespace run.  A single replace('  ', ' ') pass leaves
    # double spaces behind (e.g. after ", - "), which breaks later phrase
    # matching against the cleaned sentence.
    return ' '.join(line.split())

In [None]:
def clean_data(file, for_word = False):
    """Split raw subtitle text on newlines and periods and clean each piece.

    Pieces of at most one character and pieces that start with two digits
    (timestamp rows) are dropped; the rest goes through ``clean_line`` and has
    its whitespace collapsed.

    NOTE: a later cell defines ``clean_data`` again; once that cell has run,
    that definition replaces this one.

    Args:
        file: full text of the subtitle file.
        for_word: forwarded to ``clean_line``.

    Returns:
        List of cleaned text pieces in file order.
    """
    pieces = [
        piece.strip()
        for row in file.split('\n')
        for piece in row.split('.')
        if piece.strip()
    ]

    filtered = []
    for line in pieces:
        if len(line) <= 1:
            continue
        # Raw-string pattern anchored at the start of the piece; the original
        # statement here was the undefined name `continuez`, which raised
        # NameError on the first timestamp row.
        if re.match(r'\d{2}', line):
            continue
        line = clean_line(line, for_word)
        filtered.append(' '.join(line.split()))
    return filtered

In [None]:
# Sentence boundary: the whitespace after . ! ? or … (and any closing quotes or
# brackets), unless the period ends one of the listed abbreviations.
# The abbreviation guard is written as three fixed-width look-behinds, grouped
# by abbreviation length.  A single variable-width look-behind is accepted only
# by the third-party `regex` module, and the notebook's `import time, re, ...`
# rebinds `re` to the standard-library module, which rejects it.
_SENT_END_RE = re.compile(
    r"""
    (?<!\b(?:dr|mr|ms|nr|np|tj|al)\.)   # two-letter abbreviations
    (?<!\b(?:mrs|itp|kap|art)\.)        # three-letter abbreviations
    (?<!\bprof\.)                       # four-letter abbreviation
    (?<=[.!?…])                         # sentence terminator right before the gap
    ["')\]]*                            # optional closing quotes / brackets
    \s+                                 # whitespace that is split away
    """,
    re.IGNORECASE | re.VERBOSE
)

# SRT timestamp line: 00:00:12,345 --> 00:00:14,567
_SRT_TIME_RE = re.compile(
    r'^\d{2}:\d{2}:\d{2}[,\.]\d{3}\s*-->\s*\d{2}:\d{2}:\d{2}[,\.]\d{3}$'
)

def _split_sentences(text: str) -> list[str]:
    """Split text at sentence boundaries and drop empty pieces."""
    return [s.strip() for s in _SENT_END_RE.split(text) if s.strip()]

def clean_data(file: str, for_word: bool = False) -> list[str]:
    """Turn raw SRT text into a list of cleaned sentences.

    Blank lines, numeric index lines and timestamp lines are removed, the
    remaining lines are joined into one text, split into sentences and each
    sentence is cleaned with ``clean_line``.  Results of length <= 1 are
    dropped.

    Args:
        file: full text of the .srt file.
        for_word: forwarded to ``clean_line`` (True strips punctuation and
            lower-cases).

    Returns:
        Cleaned sentences in file order.
    """
    # 1) Strip SRT artifacts, collapse to a single text
    kept = []
    for raw in file.splitlines():
        line = raw.strip()
        if not line or line.isdigit() or _SRT_TIME_RE.match(line):
            continue
        kept.append(line)
    text = " ".join(kept)

    # 2) Split by sentences (not lines), 3) clean and filter
    filtered = []
    for sentence in _split_sentences(text):
        sentence = " ".join(clean_line(sentence, for_word).split())
        if len(sentence) > 1:
            filtered.append(sentence)
    return filtered


In [None]:
def convert_to_words(lines:list) -> list:
    """Flatten cleaned lines into one list of tokens.

    Each line is split on whitespace; tokens made up only of digits are
    dropped.  Splitting with ``str.split()`` (no argument) means repeated or
    leading spaces never produce empty-string tokens, which the previous
    ``split(' ')`` did.

    Args:
        lines: iterable of text lines.

    Returns:
        Tokens in reading order.
    """
    return [
        word
        for line in lines
        for word in line.split()
        if not word.isdigit()
    ]

In [None]:
def generate_ngram(words_tokenized, n, min_count):
    """Count the n-grams of a token list and keep the frequent ones.

    Args:
        words_tokenized: sequence of tokens.
        n: n-gram length.
        min_count: minimum number of occurrences for an n-gram to be kept.

    Returns:
        dict mapping n-gram tuple -> count, for counts >= min_count.
    """
    frequencies = Counter(ngrams(words_tokenized, n))
    kept = {}
    for gram, seen in frequencies.items():
        if seen >= min_count:
            kept[gram] = seen
    return kept

In [None]:
def generate_multiple_ngrams(words_tokenized, min_counts, min_n=3, max_n=9):
    """Collect frequent n-grams for increasing n until none are left.

    For n = min_n .. max_n the frequent n-grams are computed with
    ``generate_ngram``; the loop stops at the first n that yields nothing.
    Each n-gram table is computed once (previously it was built twice per n).

    Args:
        words_tokenized: sequence of tokens.
        min_counts: minimum occurrence count passed to ``generate_ngram``.
        min_n: smallest n-gram length (default 3, the former fixed value).
        max_n: largest n-gram length, inclusive (default 9, the former fixed
            value).

    Returns:
        (grams, gram_counts): ``grams[n]`` is the dict of frequent n-grams and
        ``gram_counts[n]`` the number of distinct n-grams in it.
    """
    grams = {}
    gram_counts = {}
    for n in range(min_n, max_n + 1):
        found = generate_ngram(words_tokenized, n, min_counts)
        if not found:
            break
        grams[n] = found
        gram_counts[n] = len(found)
    return grams, gram_counts

In [None]:
def get_art(word):
    """Return the indefinite article for the first noun found in ``word``.

    The text is run through the Stanza pipeline ``nlp``; the first token
    tagged NOUN decides the result: "en" for Gender=Com, "ett" for
    Gender=Neut, otherwise None.  None is also returned when no noun is found.
    """
    parsed = nlp(word)
    first_noun = next(
        (tok for sent in parsed.sentences for tok in sent.words if tok.upos == "NOUN"),
        None,
    )
    if first_noun is None:
        return None

    feats = first_noun.feats or ""   # e.g. "Definite=Ind|Gender=Neut|Number=Sing"
    if "Gender=Com" in feats:
        return "en"
    if "Gender=Neut" in feats:
        return "ett"
    return None

In [None]:
def coverage(lemma_count, picked_by_pos):
    """Fraction of counted tokens that belong to the picked lemmas.

    Args:
        lemma_count: {POS: {lemma: count}} for the whole text.
        picked_by_pos: {POS: {lemma: anything}} with the selected lemmas; only
            the keys are used.

    Returns:
        Sum of ``lemma_count[pos][lemma]`` over the picked (pos, lemma) pairs
        divided by the sum of all counts; 0.0 when there are no counts.
        Picked pairs that do not occur in ``lemma_count`` contribute 0.
    """
    total_tokens = sum(sum(per_pos.values()) for per_pos in lemma_count.values())
    if not total_tokens:
        return 0.0

    # The former `covered` set of (pos, lemma) pairs was built but never read.
    covered_tokens = sum(
        lemma_count.get(pos, {}).get(lemma, 0)
        for pos, lemmas in picked_by_pos.items()
        for lemma in lemmas
    )
    return covered_tokens / total_tokens

In [None]:
def select_top_quota(lemma_count, target_total=250, quotas=None):
    """
    Pick up to ``target_total`` lemmas, sharing the slots between parts of speech.

    lemma_count: dict like {'NOUN': {'barn':12, 'dag':4, ...}, 'VERB': {...}, ...}
    target_total: total number of lemmas wanted
    quotas: POS -> fraction of target_total, e.g. {'VERB':0.35,'NOUN':0.40};
            a POS missing from the mapping starts with 0 slots.
            If falsy, the slots are spread evenly over the POS present.

    Slots lost to integer truncation are handed out one at a time, round-robin,
    starting with the POS that has the most tokens.  A lemma is selected at most
    once even when it occurs under several POS.  If the per-POS stage yields
    fewer than target_total lemmas, the remainder is filled from the overall
    frequency ranking, filing each lemma under the POS where it is most frequent.

    Returns: (study_list, picked_by_pos)
      study_list    = lemma strings in selection order
      picked_by_pos = defaultdict like {'NOUN': {'barn':12, ...}, 'VERB': {...}}
    """
    counters = {pos: Counter(freqs) for pos, freqs in lemma_count.items()}
    pos_tags = list(counters)

    if not quotas:
        quotas = {pos: 1/len(pos_tags) for pos in pos_tags}

    # Fractions -> whole slots; the truncation shortfall goes to the largest buckets.
    slots = {pos: int(target_total * quotas.get(pos, 0)) for pos in pos_tags}
    by_volume = sorted(pos_tags, key=lambda p: sum(counters[p].values()), reverse=True)
    if by_volume:
        for extra in range(target_total - sum(slots.values())):
            slots[by_volume[extra % len(by_volume)]] += 1

    seen = set()
    picked_by_pos = defaultdict(dict)
    study_list = []

    def take(pos, lemma, freq):
        seen.add(lemma)
        picked_by_pos[pos][lemma] = freq
        study_list.append(lemma)

    # Stage 1: most frequent lemmas of each POS, up to its slot count.
    for pos, limit in slots.items():
        for lemma, freq in counters[pos].most_common():
            if len(picked_by_pos[pos]) >= limit:
                break
            if lemma not in seen:
                take(pos, lemma, freq)

    if len(study_list) >= target_total:
        return study_list, picked_by_pos

    # Stage 2: backfill from the ranking over all POS combined.
    totals = Counter()
    per_pos = defaultdict(dict)
    for pos, counter in counters.items():
        for lemma, freq in counter.items():
            totals[lemma] += freq
            per_pos[lemma][pos] = freq

    for lemma, _ in totals.most_common():
        if len(study_list) >= target_total:
            break
        if lemma in seen:
            continue
        spread = per_pos[lemma]
        best_pos = max(spread, key=spread.get)   # POS where this lemma is most frequent
        take(best_pos, lemma, spread[best_pos])

    return study_list, picked_by_pos


In [None]:
def get_cleaned_sentences(cleaned_file):
    """Map the word-level cleaned form of each sentence to the sentence itself.

    Keys come from ``clean_line(sentence, for_word=True)``; when two sentences
    clean to the same key, the later one is kept.
    """
    return {clean_line(sentence, for_word=True): sentence for sentence in cleaned_file}

In [None]:
def match_grams_with_sentences(grams, sentence_clean):
    """Find, for every n-gram, the cleaned sentences that contain it.

    Args:
        grams: {n: {ngram_tuple: count}}.
        sentence_clean: {cleaned sentence: original sentence}; only the keys
            are searched and returned.

    Returns:
        {ngram_tuple: [cleaned sentences containing the n-gram as a whole
        phrase, case-insensitively]}.
    """
    clean_sentences = list(sentence_clean)
    matches = {}
    for per_size in grams.values():
        for gram in per_size:
            phrase = ' '.join(gram)
            needle = re.compile(rf"\b{re.escape(phrase)}\b", flags=re.IGNORECASE)
            matches[gram] = [sent for sent in clean_sentences if needle.search(sent) is not None]
    return matches


In [None]:
def get_lemma(word):
    """Analyse ``word`` with the Stanza pipeline and describe its first token.

    Returns:
        ``[article, upos, lemma]`` for the first token of the analysis, where
        ``article`` is "en" (Gender=Com) or "ett" (Gender=Neut) for nouns and
        None otherwise; None when the analysis contains no tokens.
    """
    first = next(
        (tok for sent in nlp(word).sentences for tok in sent.words),
        None,
    )
    if first is None:
        return None

    article = None
    if first.upos == "NOUN":
        feats = first.feats or ""   # e.g. "Definite=Ind|Gender=Neut|Number=Sing"
        if "Gender=Com" in feats:
            article = "en"
        elif "Gender=Neut" in feats:
            article = "ett"
    return [article, first.upos, first.lemma]

In [None]:
def lemmatize_words(words):
    """Group surface forms by lemma.

    Each distinct word is analysed once with ``get_lemma`` (repeated
    occurrences would give the same entry, so re-running the tagger for them
    is wasted work).  Words for which ``get_lemma`` returns None are skipped
    instead of failing on tuple unpacking.

    Args:
        words: iterable of surface forms; duplicates are allowed.

    Returns:
        defaultdict mapping lemma -> {"Artikel": article or None, "POS": upos,
        "Forms": set of surface forms}.  Article and POS come from the first
        form seen for the lemma.
    """
    final_words = defaultdict(dict)
    # dict.fromkeys removes duplicates while keeping first-seen order.
    for w in dict.fromkeys(words):
        analysis = get_lemma(w)
        if analysis is None:
            continue
        art, pos, lem = analysis
        info = final_words.setdefault(lem, {"Artikel": art, "POS": pos, "Forms": set()})
        info["Forms"].add(w)
    return final_words

In [None]:
def get_lemma_count(words, lemmatized):
    """Count token occurrences per POS and lemma.

    Args:
        words: token list (surface forms, duplicates expected).
        lemmatized: {lemma: {"Artikel": ..., "POS": ..., "Forms": set}}.

    Returns:
        {POS: {lemma: number of tokens whose surface form belongs to that
        lemma}}.  Tokens that are not a known form are ignored.
    """
    # Reverse index: surface form -> (POS, lemma).  A form listed under several
    # lemmas ends up with the last lemma iterated.
    lookup = {}
    for lemma, info in lemmatized.items():
        for form in info["Forms"]:
            lookup[form] = (info["POS"], lemma)

    tallies = defaultdict(Counter)
    for token in words:
        hit = lookup.get(token)
        if hit is None:
            continue
        upos, lemma = hit
        tallies[upos][lemma] += 1
    return {pos: dict(counter) for pos, counter in tallies.items()}


In [None]:
def pick_shortest_by_lemma(
    final: Dict[str, dict],
    prefer_inflected: bool = True,      # prefer forms where word_form != lemma
    measure: str = "tokens"             # "tokens" or "chars"
) -> Dict[str, Tuple[str, str]]:
    """
    Choose one (word form, example sentence) pair per lemma.

    Args:
        final: {lemma: info}, where ``info['examples']`` maps each word form to
            a list of example sentences.  Lemmas whose info is empty or has no
            'examples' entry are skipped (they used to raise KeyError).
        prefer_inflected: when True, forms that differ from the lemma win over
            the base form regardless of sentence length.
        measure: "tokens" ranks sentences by word count, then character
            length; any other value ranks by character length only.

    Returns:
        dict mapping lemma -> (chosen_word_form, shortest_example_sentence).
        For each form its shortest example is taken; the winning form is the
        one with the lowest (rank, length) pair.
    """

    def key_for(s: str):
        # primary: token count, secondary: char length
        return (len(s.split()), len(s)) if measure == "tokens" else (len(s),)

    results = {}

    for lemma, forms in final.items():
        if not forms:
            continue

        # 'examples' only exists for lemmas with at least one matching sentence.
        examples = forms.get('examples') or {}

        candidates = []
        for form, sents in examples.items():
            if not sents:
                continue
            shortest_sent_for_form = min(sents, key=key_for)

            # Rank: 0 = inflected preferred, 1 = base (if prefer_inflected)
            rank = 0 if (prefer_inflected and form != lemma) else 1
            candidates.append((rank, key_for(shortest_sent_for_form), form, shortest_sent_for_form))

        if not candidates:
            continue

        # Choose minimal by (rank, length-key)
        _, _, best_form, best_sentence = min(candidates, key=lambda x: (x[0], x[1]))
        results[lemma] = (best_form, best_sentence)

    return results


In [None]:
def get_sentence_example(lematized, sentences):
    """Attach example sentences to every lemma entry, in place.

    Args:
        lematized: {lemma: {"Forms": set of surface forms, ...}}; entries gain
            an 'examples' key when at least one form is found.
        sentences: {cleaned sentence: original sentence}.  The cleaned text is
            searched (whole word, case-insensitive); the original is stored.

    Returns:
        The same ``lematized`` object, with ``['examples']`` set to
        {form: [original sentences]} on the lemmas that had matches.
    """
    forms_by_lemma = {lemma: list(info['Forms']) for lemma, info in lematized.items()}

    for lemma, forms in forms_by_lemma.items():
        examples = {}
        for form in set(forms):
            needle = re.compile(rf"\b{re.escape(form)}\b", flags=re.IGNORECASE)
            matching = [
                original
                for cleaned, original in sentences.items()
                if needle.search(cleaned)
            ]
            if matching:
                examples[form] = matching
        if examples:
            lematized[lemma]['examples'] = examples
    return lematized

In [None]:
_CACHE = {}
def _tokenize_sv(s: str) -> set[str]:
    s = ud.normalize("NFC", s.lower())
    return set(re.findall(r"[a-zåäöéüøß\-]+", s))

def guess_context_sv(sv_sentence: str) -> str | None:
    tokens = _tokenize_sv(sv_sentence)
    for dom in DOMAINS.values():  # first match wins
        if tokens & dom["keywords"]:
            return dom["context"]
    return None

def tag_first(s, target):
    """Wrap the first whole-word, case-insensitive occurrence of ``target`` in
    ``<term>…</term>``, keeping the casing found in the sentence.  The sentence
    is returned unchanged when the target does not occur."""
    hit = re.search(rf"\b{re.escape(target)}\b", s, flags=re.IGNORECASE)
    if hit is None:
        return s
    return s[:hit.start()] + "<term>" + hit.group(0) + "</term>" + s[hit.end():]

def extract_term(en_text: str) -> str:
    """Return the text between the first ``<term>`` and ``</term>`` markers.

    Raw tags are tried first, then their HTML-escaped spelling.  An empty
    string is returned when neither pair is present in the right order.
    """
    tag_pairs = (
        ("<term>", "</term>"),
        ("&lt;term&gt;", "&lt;/term&gt;"),
    )
    for opening, closing in tag_pairs:
        start = en_text.find(opening)
        end = en_text.find(closing)
        if 0 <= start < end:
            return en_text[start + len(opening):end]
    return ""

def translate_tagged(sv_sentence: str, target: str, translator) -> tuple[str, str]:
    """Translate a Swedish sentence and recover the translation of one word.

    The first occurrence of ``target`` is wrapped in ``<term>`` tags before
    the sentence is sent to DeepL with XML tag handling, so the translated
    term can be read back from the tagged result.

    Args:
        sv_sentence: Swedish sentence.
        target: word form to track (matched case-insensitively).
        translator: object exposing ``translate_text(text, **kwargs)`` whose
            result has a ``.text`` attribute.

    Returns:
        (english_sentence_with_term_tags, english_term).  The term is "" when
        no tagged span comes back.  Results are memoised in ``_CACHE`` per
        (sentence, target).

    Raises:
        Whatever ``translate_text`` raises; a rate-limit error
        (``deepl.TooManyRequestsException``) is retried once after 3 seconds.
    """
    key = (sv_sentence, target)
    if key in _CACHE:
        return _CACHE[key]

    # 1) Build an optional context: explicit hint first, keyword guess second
    amb = AMBIG.get(target.lower())
    ctx = amb["context"] if amb else None
    if ctx is None:
        ctx = guess_context_sv(sv_sentence)

    # 2) Tag first occurrence and call DeepL (one quick retry on 429)
    tagged = tag_first(sv_sentence, target)
    kwargs = dict(
        source_lang="SV", target_lang="EN-GB",
        tag_handling="xml", non_splitting_tags=["term"],
        preserve_formatting=True, outline_detection=False
    )
    if ctx:  # only pass when we have one
        kwargs["context"] = ctx

    try:
        res = translator.translate_text(tagged, **kwargs)
    except deepl.TooManyRequestsException:
        time.sleep(3)
        res = translator.translate_text(tagged, **kwargs)

    en_sentence = res.text
    word_eng = extract_term(en_sentence)

    # 3) Optional last-resort override for known false friends
    if amb and amb.get("override"):
        override = amb["override"]
        # A function replacement inserts the override literally.  Splicing it
        # into a template string would let a leading digit merge with the
        # group reference ("\1" + "2nd" -> "\12") and would interpret
        # backslashes in the override.
        en_sentence = re.sub(r"(<term>)(.*?)(</term>)",
                             lambda m: m.group(1) + override + m.group(3),
                             en_sentence, count=1, flags=re.DOTALL)
        word_eng = override

    _CACHE[key] = (en_sentence, word_eng)
    return _CACHE[key]



In [None]:
# --- utils ---
def _bold_term_tags(s: str) -> str:
    """Turn <term>…</term> markers (raw or HTML-escaped) into <b>…</b>."""
    swaps = (
        ("<term>", "<b>"),
        ("</term>", "</b>"),
        ("&lt;term&gt;", "<b>"),
        ("&lt;/term&gt;", "</b>"),
    )
    for marker, html_tag in swaps:
        s = s.replace(marker, html_tag)
    return s

def _highlight_once(sentence: str, target: str) -> str:
    """Bold the first whole-word, case-insensitive occurrence of ``target``,
    keeping the casing found in the sentence."""
    hit = re.search(rf"\b{re.escape(target)}\b", sentence, re.IGNORECASE)
    if hit is None:
        return sentence
    return sentence[:hit.start()] + f"<b>{hit.group(0)}</b>" + sentence[hit.end():]

# --- FRONT ---
def _front_text(rec: dict) -> str:
    """
    Build the front side of a card:
      1) the English sentence in italics, with the term in bold
      2) a line 'gloss  (pos)' (either part may be missing)
    The literal string 'None' is treated as a missing value.
    """
    study = rec.get('to_study', {}) or {}
    pos = (rec.get('POS') or '').lower()

    en_sent = study.get('Sentence_translated') or ''
    if en_sent == 'None':
        en_sent = ''
    en_sent = _bold_term_tags(en_sent)

    gloss = study.get('Word_translated') or ''
    if gloss == 'None':
        gloss = ''

    if gloss and pos:
        gloss_line = f"{gloss}  ({pos})"
    elif gloss:
        gloss_line = gloss
    elif pos:
        gloss_line = f"({pos})"
    else:
        gloss_line = ""

    front = ""
    if en_sent:
        front += f"<div style='font-style:italic'>{en_sent}</div>"
    if gloss_line:
        front += f"<div style='margin-top:6px'>{html.escape(gloss_line)}</div>"
    return front

# --- BACK & CARD BUILDER ---
def build_card(rec: dict) -> tuple[str, str]:
    """Return the (front, back) HTML of the card for one lemma record.

    The back shows the Swedish word (with an article badge for nouns), the
    Swedish sentence with the word in bold, the English sentence and the
    part of speech.
    """
    study = rec.get('to_study', {}) or {}
    pos = rec.get('POS', '') or ''
    art = rec.get('Artikel') or ''   # 'en' / 'ett' / ''
    word = study.get('Word', '') or ''
    sv = study.get('Sentence', '') or ''
    en = _bold_term_tags(study.get('Sentence_translated', '') or '')

    front = _front_text(rec)

    badge = ""
    if pos == 'NOUN' and art:
        badge = f"<span style='background:#eee;border-radius:6px;padding:2px 6px;margin-left:6px'>{art}</span>"

    back_rows = [
        f"<div style='font-size:1.35em;line-height:1.2'><b>{html.escape(word)}</b>{badge}</div>",
        f"<div style='margin-top:8px'>{_highlight_once(html.escape(sv), word)}</div>",
        f"<div style='margin-top:6px;font-style:italic'>{en}</div>",
        f"<div style='margin-top:6px;color:#777'>{pos.lower()}</div>",
    ]
    return front, "".join(back_rows)

# --- DECK ---
def _note_guid(word: str, sv: str) -> str:
    """Stable note id: SHA-1 hex digest of 'word||sentence'."""
    return hashlib.sha1(f"{word}||{sv}".encode('utf-8')).hexdigest()

def generate_deck(name: str, db: dict,
                  deck_id: int = 2059200110,
                  model_id: int = 1607392319) -> genanki.Deck:
    """Build an Anki deck with one note per usable record in ``db``.

    Args:
        name: deck name.
        db: {key: record}; records are processed in sorted key order.  A
            record is skipped when its front side is empty or it has no
            ``to_study['Word']``.
        deck_id: numeric id passed to ``genanki.Deck``.  Defaults to the
            value that used to be hard-coded; pass a different id per deck
            when generating several decks.
        model_id: numeric id passed to ``genanki.Model``; keep it stable once
            chosen.  Defaults to the value that used to be hard-coded.

    Returns:
        The populated ``genanki.Deck``.
    """
    model = genanki.Model(
        model_id,
        'EN→SV Minimal',
        fields=[{'name': 'Front'}, {'name': 'Back'}],
        templates=[{
            'name': 'Card 1',
            'qfmt': '{{Front}}',
            'afmt': '{{FrontSide}}<hr id="answer">{{Back}}',
        }],
        css="""
        .card { font-family: Inter, Arial; font-size: 18px; line-height: 1.4; }
        """
    )

    deck = genanki.Deck(deck_id, name)

    # Sort on the key only, so record dicts are never compared with each other.
    for _, rec in sorted(db.items(), key=lambda item: item[0]):
        front, back = build_card(rec)
        study = rec.get('to_study', {}) or {}
        word = study.get('Word', '')
        sv   = study.get('Sentence', '')
        if not front or not word:
            continue
        deck.add_note(genanki.Note(
            model=model,
            fields=[front, back],
            guid=_note_guid(word, sv),
        ))

    return deck

def save_deck(deck: genanki.Deck, filename: str):
    """Write ``deck`` to ``filename`` as an Anki package."""
    package = genanki.Package(deck)
    package.write_to_file(filename)


In [None]:
# Load the subtitle file and derive two views of it: `cleaned_file_words`
# (for_word=True) feeds the word statistics, `cleaned_file` (for_word=False)
# supplies the example sentences.  `sentence_clean` maps the word-level
# cleaned form of each sentence back to the sentence itself.
file = read_file('ep1.srt')
cleaned_file_words = clean_data(file, for_word= True)
cleaned_file = clean_data(file, for_word= False)
sentence_clean = get_cleaned_sentences(cleaned_file)

In [None]:
# All tokens of the cleaned text, in reading order.
words = convert_to_words(cleaned_file_words)

# Filter against a set: `to_delete` is a list of a few hundred entries, so
# testing every token against it directly is a linear scan per token.
to_delete_lookup = set(to_delete)
words_clean = [w for w in words if w not in to_delete_lookup]

# Lower-cased alphabetic tokens of the raw file; used for the n-gram counts.
words_tokenized = [word.lower() for word in word_tokenize(file) if word.isalpha()]

In [None]:
# Collect the n-grams (n = 3 and up) that occur at least 3 times, then look up
# the cleaned sentences that contain each of them.
grams, grams_counts = generate_multiple_ngrams(words_tokenized, 3)
di = match_grams_with_sentences(grams, sentence_clean)

In [None]:
# Group the filtered tokens by lemma (runs the Stanza pipeline via get_lemma).
lematized = lemmatize_words(words_clean)

In [None]:
# Per-POS lemma frequencies over all tokens (stop words included in `words`,
# but only forms known to `lematized` are counted).
lemma_count = get_lemma_count(words, lematized)
# get_sentence_example adds 'examples' to the entries of `lematized` in place
# and returns the same object, so `final` and `lematized` are one dict.
final = get_sentence_example(lematized, sentence_clean)
# picked: {lemma: (word form, example sentence)}
picked = pick_shortest_by_lemma(final, prefer_inflected=True, measure="tokens")

In [None]:
# Share of the study list given to each part of speech; POS tags that are not
# listed start with a zero quota inside select_top_quota.
quotas = {"VERB": 0.35, "NOUN": 0.40, "ADJ": 0.15, "ADV": 0.10}
study_list, picked_by_pos = select_top_quota(lemma_count, target_total=300, quotas=quotas)
# Fraction of all counted tokens that belong to the selected lemmas.
cov = coverage(lemma_count, picked_by_pos)
print(f"Estimated token coverage: {cov:.1%}")

In [None]:
# Translate the chosen example sentence of every lemma and store the English
# sentence and the English gloss next to it.
for word, content in final.items():
    # No earlier cell creates `to_study`, so build it here from the
    # (word form, sentence) pair chosen by pick_shortest_by_lemma.  Lemmas
    # without an example sentence are skipped; generate_deck ignores records
    # without a `to_study` word as well.
    if 'to_study' not in content:
        if word not in picked:
            continue
        chosen_form, chosen_sentence = picked[word]
        content['to_study'] = {'Word': chosen_form, 'Sentence': chosen_sentence}

    sv  = content['to_study']['Sentence']           # keep original casing
    tgt = content['to_study']['Word']               # original form; tagger is case-insensitive
    en_sentence, word_eng = translate_tagged(sv, tgt, translator)
    content['to_study']['Sentence_translated'] = en_sentence
    content['to_study']['Word_translated'] = word_eng
    time.sleep(0.7)  # pacing; adjust to your plan's limits

In [None]:
# Build the Anki deck from the translated lemma records.
deck = generate_deck('test_final_final',final)

In [None]:
# Write the deck to an .apkg file in the working directory.
save_deck(deck, "swedish_cards.apkg")