In [1]:
# Setup
!pip install --only-binary :all: pynini
!pip install wurlitzer
import pynini
%load_ext wurlitzer

Collecting pynini
  Downloading pynini-2.1.7-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.7 kB)
Downloading pynini-2.1.7-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (165.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m165.5/165.5 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynini
Successfully installed pynini-2.1.7


In [8]:
from __future__ import annotations

import json
import logging
import math
import re
import unicodedata
from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Tuple, Optional

import pynini

# Module-level logger; load_lexicon() uses it to warn when the lexicon file is missing.
logger = logging.getLogger(__name__)

# Epsilon transducer (empty input -> empty output). Not referenced in the visible
# part of this file; the comments in build_analyzer() warn against using it as a root.
EPS = pynini.cross("", "")
# Lowercase Turkish vowels; create_alternating_roots() tests stem-final letters against it.
VOWELS = "aeıioöuü"
# Lowercase letters of the Turkish alphabet (not referenced in the visible part of this file).
TR_LETTERS = "abcçdefgğhıijklmnoöprsştuüvyz"

# ----------------------------
# Normalization helpers (Turkish-aware lowercasing)
# ----------------------------
# The two capital I letters are remapped by hand before the generic lowercase pass,
# so that "I" ends up dotless and "İ" ends up as a plain dotted "i".
_TR_LOWER_MAP = str.maketrans("Iİ", "ıi")


def tr_lower(s: str) -> str:
    """Lowercase ``s`` with Turkish handling of the letter I.

    "I" becomes "ı" and "İ" becomes "i"; all remaining characters are lowered
    by ``str.lower``.
    """
    remapped = s.translate(_TR_LOWER_MAP)
    return remapped.lower()


def normalize_text(s: str) -> str:
    """Return the canonical form of ``s`` used for lexicon entries and lookups.

    The text is NFC-normalized, the curly apostrophe and the backtick are
    folded to a plain ASCII apostrophe, surrounding whitespace is removed, and
    the result is lowercased via ``tr_lower``.
    """
    text = unicodedata.normalize("NFC", s)
    for apostrophe_variant in ("’", "`"):
        text = text.replace(apostrophe_variant, "'")
    trimmed = text.strip()
    return tr_lower(trimmed)

# -----------------------------------------------------------------------------
# Lexicon Loading / Normalization
# -----------------------------------------------------------------------------
def load_lexicon(json_file: str = "turkish_lexicon.json") -> Dict[str, List[str]]:
    """
    Read the Turkish lexicon from a JSON file.

    Recommended layout of the file:
      {
        "nouns": [...],
        "verbs": [...],        # stems or infinitives
        "adjectives": [...],
        ...
      }

    Upper-case category keys are accepted as well:
      { "NOUN": [...], "VERB": [...], ... }

    The parsed JSON is returned unchanged. When ``json_file`` does not exist, a
    warning is logged and a small built-in lexicon is returned instead.
    """
    lexicon_path = Path(json_file)
    if lexicon_path.exists():
        return json.loads(lexicon_path.read_text(encoding="utf-8"))

    logger.warning("%s not found. Using fallback minimal lexicon.", json_file)

    # Deliberately a bit richer than the bare minimum so demo sentences still
    # analyze reasonably without the JSON file.
    fallback_nouns = [
        "ev", "kitap", "masa", "sabah", "ayna", "yüz", "gül", "çiçek",
        "bahçe", "yaz", "mevsim", "kelime", "okul", "çocuk", "çanta",
        "deniz", "çay", "gün", "plan", "zaman", "bulut", "hava", "cam",
        "söz", "karar", "emir", "kenar", "bütün", "dolu",
    ]
    fallback_verbs = [
        "gelmek", "gitmek", "okumak", "bakmak", "söylemek", "demek",
        "sanmak", "anlamak", "sevmek", "bahsetmek", "dikmek", "seçmek",
        "geçirmek", "yürümek", "düşünmek", "çıkmak", "yağmak",
        "bozmak", "yapmak", "karıştırmak", "kapanmak", "almak",
        "kastetmek", "gülümsemek",
    ]
    return {
        "nouns": fallback_nouns,
        "verbs": fallback_verbs,
        "adjectives": ["güzel", "iyi", "kırmızı", "romantik", "baskın", "ciddi", "kara"],
        "pronouns": ["ben", "sen", "o", "biz", "siz", "onlar", "herkes", "hangi", "ne"],
        "adverbs": ["çok", "az", "hâlâ", "bile", "aslında", "gerçekten", "sonra", "önce", "daha"],
        "conjunctions": ["ve", "ama", "yoksa", "da", "de"],
        "postpositions": ["gibi", "için", "yerine", "yüzünden"],
        "proper_nouns": [],
        "interjections": [],
    }


def normalize_lexicon(lex: Dict[str, List[str]]) -> Dict[str, List[str]]:
    """
    Canonicalize category names and entries of a raw lexicon.

    Upper-case tag-style keys ("NOUN", "VERB", ...) are renamed to their
    lower-case plural form ("nouns", "verbs", ...); other keys are kept as they
    are. Values that are not lists are ignored, as are entries that are not
    non-blank strings. Every kept entry goes through ``normalize_text`` and
    duplicates are dropped, keeping the first occurrence. A category that ends
    up with no entries does not appear in the result.
    """
    key_aliases = {
        "NOUN": "nouns",
        "VERB": "verbs",
        "ADJ": "adjectives",
        "ADV": "adverbs",
        "PRON": "pronouns",
        "CONJ": "conjunctions",
        "POSTP": "postpositions",
        "PROPN": "proper_nouns",
        "INTERJ": "interjections",
        "NUM": "numbers",
    }

    grouped: Dict[str, List[str]] = {}
    for raw_key, entries in lex.items():
        if not isinstance(entries, list):
            continue
        category = key_aliases.get(raw_key, raw_key)
        for entry in entries:
            if isinstance(entry, str) and entry.strip():
                grouped.setdefault(category, []).append(normalize_text(entry))

    # dict.fromkeys keeps insertion order, so the first occurrence of each word wins.
    return {
        category: list(dict.fromkeys(words))
        for category, words in grouped.items()
    }


# -----------------------------------------------------------------------------
# Root Helpers
# -----------------------------------------------------------------------------
def extract_verb_root(verb_infinitive: str) -> str:
    """
    Return the verb root for an infinitive by dropping a trailing -mak/-mek.

    The input is normalized first. Forms that do not end in -mak/-mek, or that
    consist of nothing but the ending itself, are returned (normalized) as is.
    """
    normalized = normalize_text(verb_infinitive)
    has_infinitive_ending = normalized[-3:] in ("mak", "mek")
    if has_infinitive_ending and len(normalized) > 3:
        return normalized[:-3]
    return normalized


def create_alternating_roots(words: List[str], tag: str) -> pynini.Fst:
    """
    Build the root transducer for ``words``, including softened stem variants.

    Every word is normalized and mapped to "<word>+<tag>". Words of at least
    two letters that end in p, ç, t or k additionally get a variant whose
    final consonant is voiced:

        p → b
        ç → c
        t → d
        k → ğ after a vowel, g otherwise

    The softened surface form still produces the original lemma in the
    analysis. Entries that normalize to the empty string are skipped. If
    nothing is left, an FST accepting no strings is returned.
    """
    fixed_softening = {"p": "b", "ç": "c", "t": "d"}

    arcs = []
    for entry in words:
        lemma = normalize_text(entry)
        if not lemma:
            continue

        analysis = f"{lemma}+{tag}"
        arcs.append(pynini.cross(lemma, analysis))

        if len(lemma) < 2:
            continue

        stem, final = lemma[:-1], lemma[-1]
        if final == "k":
            voiced = "ğ" if stem[-1] in VOWELS else "g"
        else:
            voiced = fixed_softening.get(final)

        if voiced is not None:
            arcs.append(pynini.cross(stem + voiced, analysis))

    if arcs:
        return pynini.union(*arcs)

    # Do NOT return epsilon here: an epsilon root inside a union would allow
    # suffix-only parses. Intersecting two disjoint acceptors yields the empty
    # language instead.
    return pynini.intersect(pynini.accep("a"), pynini.accep("b"))

def union_nonempty(fsts: List[pynini.Fst]) -> pynini.Fst:
    """
    Union only the non-empty FSTs in ``fsts``; avoids accidentally introducing
    epsilon-roots.

    An FST counts as empty when it has no start state. ``Fst.start()`` does not
    raise in that case; it returns ``pynini.NO_STATE_ID``, so the value has to
    be compared explicitly. This recognizes the placeholder empties built in
    this file (intersection of two disjoint acceptors, which is trimmed to zero
    states). An untrimmed FST that has a start state but no accepting path is
    not detected.

    Args:
        fsts: FSTs to combine.

    Returns:
        The union of all non-empty inputs, or an FST accepting no strings when
        every input is empty (or ``fsts`` itself is empty).
    """
    kept = [fst for fst in fsts if fst.start() != pynini.NO_STATE_ID]
    if not kept:
        return pynini.intersect(pynini.accep("a"), pynini.accep("b"))
    return pynini.union(*kept)
# -----------------------------------------------------------------------------
# Build Morphology FST
# -----------------------------------------------------------------------------
def build_analyzer(lexicon: Dict[str, List[str]]) -> pynini.Fst:
    """
    Build the full Turkish analyzer FST from lexicon + suffix grammars.

    The analyzer is the union of four token shapes. The first three may be
    followed by one punctuation mark:

      nominal : (noun | adjective | pronoun | proper noun) root
                + derivation? + plural?
                + (possessive + case? | case?) + ki? + plural?
                + (copula + person?)?
      verbal  : verb root + voice? + ability? + negation?
                + (tense + person | optative | conditional | necessitative
                   | imperative | converb)
      simple  : adverb / conjunction / interjection / postposition root,
                or a question particle
      punct   : a single punctuation mark on its own

    Design notes:
    - Root unions never contain epsilon (prevents suffix-only parses like
      'de' -> '+LOC'). Missing or empty categories contribute the empty
      language instead.
    - Fused FUT 1SG/1PL: -acağım/-eceğim, -acağız/-eceğiz (e.g., 'gideceğim').
    - Common converbs: -ınca/-yince, -ken, -ip, -madan (e.g., 'bakınca').
    - Vowel-final possessive shortcuts (e.g., yüz+ü for 3SG, ev+im for 1SG).
    - Suffix inventories are written as (tag, allomorphs) tables; no vowel
      harmony is enforced between stem and suffix, so the grammar overgenerates
      by design.

    Args:
        lexicon: Mapping from category name ("nouns", "verbs", "adjectives",
            "pronouns", "adverbs", "postpositions", "interjections",
            "conjunctions", "proper_nouns") to a list of words. Verbs may be
            infinitives or bare stems.

    Returns:
        An optimized transducer from surface forms to analysis strings such as
        "ev+NOUN+PL+LOC".
    """

    # -------------------------------------------------------------------------
    # Helpers
    # -------------------------------------------------------------------------
    def empty_fst() -> pynini.Fst:
        """Language with no strings (NOT epsilon); a safe "nothing here" placeholder."""
        return pynini.intersect(pynini.accep("a"), pynini.accep("b"))

    def tagged(tag: str, *surfaces: str) -> List[Tuple[str, str]]:
        """Pair every surface allomorph with one and the same analysis tag."""
        return [(surface, tag) for surface in surfaces]

    def suffixes(*groups: List[Tuple[str, str]], optional: bool = False) -> pynini.Fst:
        """Union of surface -> analysis arcs; ``optional`` adds an epsilon alternative."""
        arcs = [
            pynini.cross(surface, analysis)
            for group in groups
            for surface, analysis in group
        ]
        if optional:
            arcs.append(pynini.cross("", ""))
        return pynini.union(*arcs)

    def alternating_roots(category: str, tag: str) -> pynini.Fst:
        """Roots with consonant-softening variants for one lexicon category."""
        # create_alternating_roots already returns the empty language for [].
        return create_alternating_roots(lexicon.get(category) or [], tag)

    def plain_roots(category: str, tag: str) -> pynini.Fst:
        """Roots without stem alternation for one lexicon category.

        Entries are normalized and blank entries are skipped, exactly like
        create_alternating_roots does. Without the filter an empty entry would
        become an epsilon root and re-enable suffix-only parses.
        """
        arcs = []
        for entry in lexicon.get(category) or []:
            if not isinstance(entry, str):
                continue
            word = normalize_text(entry)
            if word:
                arcs.append(pynini.cross(word, f"{word}+{tag}"))
        return pynini.union(*arcs) if arcs else empty_fst()

    # -------------------------------------------------------------------------
    # Roots (IMPORTANT: never use epsilon here)
    # -------------------------------------------------------------------------
    noun_roots = alternating_roots("nouns", "NOUN")
    adj_roots = alternating_roots("adjectives", "ADJ")
    proper_noun_roots = alternating_roots("proper_nouns", "PROPN")
    verb_roots = create_alternating_roots(
        [extract_verb_root(w) for w in lexicon.get("verbs") or []], "VERB"
    )

    pronoun_roots = plain_roots("pronouns", "PRON")
    adverb_roots = plain_roots("adverbs", "ADV")
    postposition_roots = plain_roots("postpositions", "POSTP")
    interjection_roots = plain_roots("interjections", "INTERJ")
    conjunction_roots = plain_roots("conjunctions", "CONJ")

    # -------------------------------------------------------------------------
    # Question particles: mi/mı/mu/mü, bare or with a personal ending.
    # The analysis keeps the surface particle, e.g. "musun" -> "mu+QUES+2SG".
    # -------------------------------------------------------------------------
    question_particles = suffixes(
        [
            (form, f"m{v}+QUES{person_tag}")
            for v in "iıuü"
            for form, person_tag in (
                (f"m{v}", ""),
                (f"m{v}s{v}n", "+2SG"),
                (f"m{v}y{v}m", "+1SG"),
                (f"m{v}y{v}z", "+1PL"),
                (f"m{v}s{v}n{v}z", "+2PL"),
            )
        ]
    )

    # -------------------------------------------------------------------------
    # Derivational suffixes (the tag records the allomorph, e.g. "+DER.lık")
    # -------------------------------------------------------------------------
    derivational = suffixes(
        [
            (surface, f"+DER.{surface}")
            for surface in (
                "lık", "lik", "luk", "lük",
                "cı", "ci", "cu", "cü",
                "çı", "çi", "çu", "çü",
                "sız", "siz", "suz", "süz",
                "lı", "li", "lu", "lü",
            )
        ],
        optional=True,
    )

    # -------------------------------------------------------------------------
    # Nominal morphology
    # -------------------------------------------------------------------------
    nominal_roots = pynini.union(noun_roots, adj_roots, pronoun_roots, proper_noun_roots)
    nominal_derived = nominal_roots + derivational

    plural = suffixes(tagged("+PL", "lar", "ler"), optional=True)
    nominal_pl = nominal_derived + plural

    # Possessive (vowel-final shortcuts included so things like 'yüzü' are not
    # forced to ACC).
    possessive = suffixes(
        tagged("+POSS.1PL", "imiz", "ımız", "umuz", "ümüz"),
        tagged("+POSS.2PL", "iniz", "ınız", "unuz", "ünüz"),
        tagged("+POSS.3PL", "leri", "ları"),
        # consonant-final stems take the vowel-initial form, vowel-final the bare one
        tagged("+POSS.1SG", "im", "ım", "um", "üm", "m"),
        tagged("+POSS.2SG", "in", "ın", "un", "ün", "n"),
        # vowel-final stems take -sI, consonant-final a bare vowel
        tagged("+POSS.3SG", "si", "sı", "su", "sü", "i", "ı", "u", "ü"),
    )

    # Case endings after a possessive (includes the n-buffered variants).
    case_after_poss = suffixes(
        tagged("+ABL", "dan", "den", "tan", "ten", "ndan", "nden", "ntan", "nten"),
        tagged("+GEN", "nın", "nin", "nun", "nün"),
        tagged("+LOC", "da", "de", "ta", "te", "nda", "nde", "nta", "nte"),
        tagged("+DAT", "ya", "ye", "na", "ne"),
        tagged("+ACC", "yı", "yi", "yu", "yü", "nı", "ni", "nu", "nü"),
        tagged("+INS", "yla", "yle"),
        tagged("+EQU", "ca", "ce"),
        optional=True,
    )

    ki_suffix = suffixes(tagged("+KI", "ki", "kü"), optional=True)
    plural_after_ki = suffixes(tagged("+PL", "ler", "lar"), optional=True)

    possessive_path = nominal_pl + possessive + case_after_poss + ki_suffix + plural_after_ki

    # Case endings directly on the (optionally pluralized) stem.
    case_no_poss = suffixes(
        tagged("+GEN", "ların", "lerin"),
        tagged("+ABL", "dan", "den", "tan", "ten"),
        tagged("+GEN", "nın", "nin", "nun", "nün"),
        tagged("+LOC", "da", "de", "ta", "te"),
        tagged("+DAT", "ya", "ye", "a", "e"),
        tagged("+ACC", "yı", "yi", "yu", "yü", "ı", "i", "u", "ü"),
        tagged("+INS", "la", "le", "yla", "yle"),
        tagged("+EQU", "ca", "ce"),
        optional=True,
    )

    case_only_path = nominal_pl + case_no_poss + ki_suffix + plural_after_ki
    nominal_base = pynini.union(possessive_path, case_only_path).optimize()

    # Copula (y-buffered forms first, then the plain d-/t-/m-/s- forms)
    copula = suffixes(
        tagged("+COP.PAST", "ydi", "ydı", "ydu", "ydü"),
        tagged("+COP.EVID", "ymış", "ymiş", "ymuş", "ymüş"),
        tagged("+COP.COND", "yse", "ysa"),
        tagged("+COP.PAST", "di", "dı", "du", "dü", "ti", "tı", "tu", "tü"),
        tagged("+COP.EVID", "miş", "mış", "muş", "müş"),
        tagged("+COP.COND", "se", "sa"),
        tagged("+COP.PRES", "dir", "dır", "dur", "dür", "tir", "tır", "tur", "tür"),
    )

    person = suffixes(
        tagged("+1SG", "im", "ım", "um", "üm"),
        tagged("+2SG", "in", "ın", "un", "ün"),
        tagged("+1PL", "ız", "iz", "uz", "üz"),
        tagged("+2PL", "nız", "niz", "nuz", "nüz"),
        tagged("+3PL", "lar", "ler"),
        optional=True,
    )

    nominal_with_cop = copula + person
    nominal_complete = nominal_base + pynini.union(nominal_with_cop, pynini.cross("", "")).optimize()

    # -------------------------------------------------------------------------
    # Verb morphology
    # -------------------------------------------------------------------------
    voice = suffixes(
        tagged("+PASS", "ıl", "il", "ul", "ül"),
        tagged("+REFL", "ın", "in", "un", "ün", "lan", "len"),
        tagged("+RECIP", "ış", "iş", "uş", "üş"),
        tagged("+CAUS", "t", "d", "dır", "dir", "dur", "dür", "tır", "tir", "tur", "tür"),
        optional=True,
    )

    ability = suffixes(tagged("+ABIL", "ebil", "abil"), optional=True)
    negation = suffixes(tagged("+NEG", "ma", "me"), optional=True)

    # Fused FUT endings cover gideceğim, yapacağım, gideceğiz, etc., where the
    # final k of -acak/-ecek softens before the personal ending.
    fused_future = suffixes(
        tagged("+FUT+1SG", "acağım", "eceğim"),
        tagged("+FUT+1PL", "acağız", "eceğiz"),
    )

    indicative_tense = pynini.union(
        suffixes(tagged("+PRES.CONT", "iyor", "ıyor", "uyor", "üyor")),
        fused_future,
        suffixes(
            tagged("+FUT", "ecek", "acak"),
            tagged("+AOR", "ır", "ir", "ur", "ür", "ar", "er", "r"),
            tagged("+PAST", "dı", "di", "du", "dü", "tı", "ti", "tu", "tü"),
            tagged("+INFER", "mış", "miş", "muş", "müş"),
        ),
    )

    indicative_person = suffixes(
        tagged("+1SG", "um", "üm", "ım", "im", "m"),
        tagged("+2SG", "sun", "sün", "sın", "sin", "n"),
        tagged("+1PL", "uz", "üz", "ız", "iz", "k"),
        tagged("+2PL", "sunuz", "sünüz", "sınız", "siniz", "nız", "niz", "nuz", "nüz"),
        tagged("+3PL", "lar", "ler"),
        # third person singular has no surface ending but is still tagged
        tagged("+3SG", ""),
    )

    # Converbs (so bakınca/deyince/alınca don't go UNKNOWN).
    # NOTE(review): "meden" carries its own tag (+CVB.MEDEN) while every other
    # converb family shares one tag across allomorphs. Kept as is so existing
    # consumers of the analysis strings are not affected; confirm the intent.
    converbs = suffixes(
        tagged(
            "+CVB.INCA",
            "ınca", "ince", "unca", "ünce",
            "yınca", "yince", "yunca", "yünce",
        ),
        tagged("+CVB.KEN", "ken"),
        tagged("+CVB.IP", "ip", "ıp", "up", "üp"),
        tagged("+CVB.MADAN", "madan"),
        tagged("+CVB.MEDEN", "meden"),
    )

    optative_mood_person = suffixes(
        tagged("+OPT+1SG", "ayım", "eyim", "ayum", "eyüm"),
        tagged("+OPT+2SG", "asın", "esin", "asun", "esün"),
        tagged("+OPT+2SG+EMPH", "asana", "esene", "sana", "sene"),
        tagged("+OPT+2SG", "asan", "esen"),
        tagged("+OPT+3SG", "a", "e"),
        tagged("+OPT+1PL", "alım", "elim", "alum", "elüm"),
        tagged("+OPT+2PL", "asınız", "esiniz", "asunuz", "esünüz"),
        tagged("+OPT+3PL", "alar", "eler"),
    )

    conditional_mood_person = suffixes(
        tagged("+COND+1SG", "sam", "sem"),
        tagged("+COND+2SG", "san", "sen"),
        tagged("+COND+3SG", "sa", "se"),
        tagged("+COND+1PL", "sak", "sek"),
        tagged("+COND+2PL", "sanız", "seniz", "sanuz", "senüz"),
        tagged("+COND+3PL", "salar", "seler"),
    )

    necessitative_mood_person = suffixes(
        tagged("+NEC+1SG", "malıyım", "meliyim", "malıyum", "meliyüm"),
        tagged("+NEC+2SG", "malısın", "melisin", "malısun", "melisün"),
        tagged("+NEC+3SG", "malı", "meli"),
        tagged("+NEC+1PL", "malıyız", "meliyiz", "malıyuz", "meliyüz"),
        tagged("+NEC+2PL", "malısınız", "melisiniz", "malısunuz", "melisünüz"),
        tagged("+NEC+3PL", "malılar", "meliler"),
    )

    imperative_mood_person = suffixes(
        tagged("+IMP+3SG", "sin", "sın", "sun", "sün"),
        tagged("+IMP+2PL", "in", "ın", "un", "ün", "iniz", "ınız", "unuz", "ünüz"),
        tagged("+IMP+3PL", "sinler", "sınlar", "sunlar", "sünler"),
    )

    # The bare stem is the 2SG imperative.
    imperative_2sg_bare = pynini.cross("", "+IMP+2SG")

    verb_base = verb_roots + voice + ability + negation

    # Converbs are one more valid continuation of the verb base.
    verb_converb = verb_base + converbs

    verb_indicative = verb_base + indicative_tense + indicative_person
    verb_optative = verb_base + optative_mood_person
    verb_conditional = verb_base + conditional_mood_person
    verb_necessitative = verb_base + necessitative_mood_person
    verb_imperative = verb_base + pynini.union(imperative_mood_person, imperative_2sg_bare)

    verb_complete = pynini.union(
        verb_indicative,
        verb_optative,
        verb_conditional,
        verb_necessitative,
        verb_imperative,
        verb_converb,
    ).optimize()

    # -------------------------------------------------------------------------
    # Punctuation
    # -------------------------------------------------------------------------
    punctuation_marks = [
        (".", "period"),
        (",", "comma"),
        ("?", "question"),
        ("!", "exclamation"),
        (":", "colon"),
        (";", "semicolon"),
    ]

    # Optional mark attached AFTER a word: only the tag is emitted.
    punctuation_suffix = suffixes(
        [(mark, f"+PUNCT.{name}") for mark, name in punctuation_marks],
        optional=True,
    )

    # Punctuation as a standalone token (NO epsilon here): mark + tag.
    punctuation_only = suffixes(
        [(mark, f"{mark}+PUNCT.{name}") for mark, name in punctuation_marks]
    ).optimize()

    # Uninflected categories.
    simple_categories = pynini.union(
        adverb_roots,
        conjunction_roots,
        interjection_roots,
        question_particles,
        postposition_roots,
    ).optimize()

    nominal_fst = (nominal_complete + punctuation_suffix).optimize()
    verb_fst = (verb_complete + punctuation_suffix).optimize()
    simple_fst = (simple_categories + punctuation_suffix).optimize()

    return pynini.union(nominal_fst, verb_fst, simple_fst, punctuation_only).optimize()


# -----------------------------------------------------------------------------
# Analysis API
# -----------------------------------------------------------------------------
def analyze_word(word: str, analyzer: pynini.Fst) -> List[str]:
    """
    Analyze a word and return all possible analyses.

    Args:
        word: Surface token to analyze. It is escaped before being compiled
            into an acceptor, because "[", "]" and "\\" are metacharacters in
            pynini string compilation and would otherwise be misread or raise.
        analyzer: Transducer from surface forms to analysis strings.

    Returns:
        The distinct analyses in sorted order. When the analyzer accepts no
        reading of ``word`` the list holds the single entry
        "No analysis found for: <word>"; when composition itself fails it holds
        "Error: <message>". The function does not raise.
    """
    try:
        lattice = pynini.compose(pynini.escape(word), analyzer)

        analyses = set()
        # An empty composition has no start state; asking for its paths is
        # pointless, so this case is handled explicitly.
        if lattice.start() != pynini.NO_STATE_ID:
            try:
                for path in lattice.paths().ostrings():
                    analyses.add(path)
            except Exception as exc:
                # Path enumeration can fail when the lattice is not enumerable.
                # Keep whatever was collected, but leave a trace instead of
                # hiding the failure.
                logger.debug("Path enumeration failed for %r: %s", word, exc)

        return sorted(analyses) if analyses else [f"No analysis found for: {word}"]

    except Exception as e:
        return [f"Error: {e}"]


# -----------------------------------------------------------------------------
# Context-Aware Disambiguation (Viterbi)
# -----------------------------------------------------------------------------
@dataclass
class Candidate:
    """One candidate reading of a token, as considered by the disambiguator."""

    # Token text — presumably the surface form as it appears in the sentence; confirm against callers.
    word: str
    # Analysis string for this reading (e.g. as produced by analyze_word).
    analysis: str
    # Coarse tag assigned to this reading.
    tag: str
    # Origin of the reading.
    source: str = "fst"   # "fst" | "oov"


class ContextAwareDisambiguator:
    """
    Simple Viterbi decoder using a hand-written bigram POS transition model
    + morphology-based heuristic boosts.

    Each token adds ``log(transition weight) + heuristic_weight(...)`` to a
    path score; ``decode_sentence`` returns the highest-scoring path through
    the per-token candidate lists.
    """

    # Prefixes of the marker strings analyze_word() returns in place of real
    # analyses ("nothing found" and "analysis raised").
    _FAILURE_PREFIXES = ("No analysis found for:", "Error:")

    def __init__(self, analyzer: pynini.Fst):
        """
        Args:
            analyzer: FST passed to analyze_word() for every token.
        """
        self.analyzer = analyzer

        # transitions[prev_tag][tag] -> hand-tuned weight. A pair missing from
        # its row scores 0.001; a previous tag without a row uses "DEFAULT".
        # NOTE(review): "NUM" is not in self.tags, so get_tag_from_analysis()
        # never returns it and the "NUM" row is only reachable through direct
        # get_transition_prob() calls.
        self.transitions = {
            "START": {"NOUN": 0.4, "PRON": 0.3, "ADV": 0.1, "VERB": 0.1, "ADJ": 0.1},
            "ADJ": {"NOUN": 0.9, "ADJ": 0.1, "VERB": 0.01},
            "NOUN": {"VERB": 0.4, "NOUN": 0.2, "CONJ": 0.1, "POSTP": 0.2, "ADV": 0.1},
            "PRON": {"VERB": 0.5, "NOUN": 0.2, "POSTP": 0.2, "ADJ": 0.1},
            "ADV": {"VERB": 0.6, "ADJ": 0.3, "ADV": 0.1},
            "NUM": {"NOUN": 0.95},
            "VERB": {"PUNCT": 0.8, "CONJ": 0.1, "NOUN": 0.05, "PRON": 0.05},
            "QUES": {"PUNCT": 0.9, "VERB": 0.1},
            "UNKNOWN": {"NOUN": 0.3, "VERB": 0.3, "ADJ": 0.1, "ADV": 0.1, "PRON": 0.1, "PUNCT": 0.1},
            "DEFAULT": {"NOUN": 0.3, "VERB": 0.3, "ADJ": 0.1, "ADV": 0.1, "PRON": 0.1, "PUNCT": 0.1},
        }

        # Tried in this order by get_tag_from_analysis(); the first hit wins.
        self.tags = ["NOUN", "VERB", "ADJ", "ADV", "PRON", "POSTP", "CONJ", "QUES", "INTERJ", "UNKNOWN"]

    def get_tag_from_analysis(self, analysis: str) -> str:
        """
        Map an analysis string to a coarse tag.

        Failure markers from analyze_word() and "+UNK"/"+OOV" analyses map to
        "UNKNOWN". "+QUES" and "+PUNCT" are checked next, then every entry of
        ``self.tags`` in list order (substring match on "+TAG"). Anything
        else falls back to "NOUN".
        """
        # An "Error: ..." marker is not an analysis; without this check it
        # would reach the "NOUN" fallback at the bottom.
        if "No analysis" in analysis or analysis.startswith("Error:"):
            return "UNKNOWN"
        if "+UNK" in analysis or "+OOV" in analysis:
            return "UNKNOWN"
        if "+QUES" in analysis:
            return "QUES"
        if "+PUNCT" in analysis:
            return "PUNCT"

        for tag in self.tags:
            if f"+{tag}" in analysis:
                return tag
        return "NOUN"

    def get_transition_prob(self, prev_tag: str, current_tag: str) -> float:
        """
        Return the weight for ``prev_tag -> current_tag``.

        Uses the "DEFAULT" row when ``prev_tag`` has no row, and 0.001 when
        the chosen row has no entry for ``current_tag`` (so the result is
        always positive and safe to pass to ``math.log``).
        """
        row = self.transitions.get(prev_tag, self.transitions["DEFAULT"])
        return row.get(current_tag, 0.001)

    def heuristic_weight(self, word: str, analysis: str, tag: str, position: int, sentence_len: int) -> float:
        """
        Return an additive (log-space) bonus/penalty for one candidate.

        Args:
            word: Surface token; suffix checks are case-sensitive.
            analysis: Analysis string of the candidate (currently unused).
            tag: Coarse tag of the candidate.
            position: 0-based index of the token in the sentence.
            sentence_len: Number of tokens in the sentence.

        NOTE(review): "last position" is ``sentence_len - 1`` over all tokens
        the caller passes in. If those include a trailing punctuation token,
        the sentence-final bonus applies to the punctuation, not to the word
        before it -- confirm this is intended.
        """
        score = 0.0
        if tag == "UNKNOWN":
            score -= 3.0

        # Morphology hints
        if tag == "VERB":
            if word.endswith(("yor", "yorum", "yorsun", "dı", "di", "du", "dü", "acak", "ecek", "malı", "meli")):
                score += 2.0
        elif tag == "NOUN":
            if word.endswith(("lar", "ler", "in", "un", "nın", "nin", "da", "de", "dan", "den")):
                score += 1.5
        elif tag == "QUES":
            if word.lower().startswith(("mi", "mı", "mu", "mü")):
                score += 5.0

        # Turkish is often SOV; verbs are likely at end
        if position == sentence_len - 1:
            if tag in {"VERB", "QUES"}:
                score += 1.5
            if tag == "NOUN":
                score -= 0.5

        # Very short tokens rarely function as verbs mid-sentence
        if len(word) <= 2 and tag == "VERB" and position != sentence_len - 1:
            score -= 1.0

        return score

    def _candidates_for(self, word: str) -> List[Candidate]:
        """Return the candidate readings of one token (never an empty list)."""
        usable = [
            ana
            for ana in analyze_word(word, self.analyzer)
            if not ana.startswith(self._FAILURE_PREFIXES)
        ]
        if not usable:
            # Nothing found, or the analyzer raised: emit one OOV placeholder
            # instead of treating the marker string as an analysis.
            return [Candidate(word=word, analysis=f"{word}+UNK+OOV", tag="UNKNOWN", source="oov")]
        return [
            Candidate(word=word, analysis=ana, tag=self.get_tag_from_analysis(ana), source="fst")
            for ana in usable
        ]

    def decode_sentence(self, sentence_tokens: List[str]) -> List[Candidate]:
        """
        Pick one candidate per token with Viterbi decoding.

        Args:
            sentence_tokens: Tokens of one sentence, in order.

        Returns:
            One ``Candidate`` per input token, in input order; ``[]`` for an
            empty token list. On equal scores the candidate with the lower
            index in its list wins.
        """
        lattice: List[List[Candidate]] = [self._candidates_for(word) for word in sentence_tokens]

        n = len(lattice)
        if n == 0:
            return []

        # best_scores[t][i]: best score of any path ending in lattice[t][i].
        # backpointers[t][i]: index into lattice[t - 1] on that best path.
        best_scores: List[List[float]] = [[
            math.log(self.get_transition_prob("START", cand.tag))
            + self.heuristic_weight(cand.word, cand.analysis, cand.tag, 0, n)
            for cand in lattice[0]
        ]]
        backpointers: List[List[int]] = [[]]

        # Forward pass
        for t in range(1, n):
            prev_scores = best_scores[t - 1]
            scores_t: List[float] = []
            back_t: List[int] = []

            for curr in lattice[t]:
                # Depends only on the current candidate, so compute it once
                # rather than once per previous candidate.
                heuristic = self.heuristic_weight(curr.word, curr.analysis, curr.tag, t, n)

                max_score = -float("inf")
                best_prev = -1
                for j, prev in enumerate(lattice[t - 1]):
                    trans_prob = self.get_transition_prob(prev.tag, curr.tag)
                    score = prev_scores[j] + math.log(trans_prob) + heuristic
                    if score > max_score:
                        max_score = score
                        best_prev = j

                scores_t.append(max_score)
                back_t.append(best_prev)

            best_scores.append(scores_t)
            backpointers.append(back_t)

        # Backtracking
        final_scores = best_scores[n - 1]
        curr_idx = max(range(len(final_scores)), key=final_scores.__getitem__)

        path: List[Candidate] = []
        for t in range(n - 1, -1, -1):
            path.append(lattice[t][curr_idx])
            if t > 0:
                curr_idx = backpointers[t][curr_idx]

        path.reverse()
        return path


def tokenize(sentence: str) -> List[str]:
    """
    Basic tokenizer: words + punctuation.

    The text is first NFC-normalized and typographic apostrophes / backticks
    are mapped to ``'`` (the same two steps normalize_text() applies, minus
    lowercasing). Without this, a decomposed letter such as "o" + combining
    diaeresis, or a word like "Ali’nin", would be split in the middle,
    because neither combining marks nor "’" match the word pattern.

    Args:
        sentence: Raw sentence text.

    Returns:
        Tokens in order of appearance. A token is either a run of word
        characters and apostrophes, or a single one of ``. , ! ? ; :``.
        Any other character is dropped. Case is preserved.
    """
    text = unicodedata.normalize("NFC", sentence)
    text = text.replace("’", "'").replace("`", "'")
    return re.findall(r"[\w']+|[.,!?;:]", text)


def analyze_sentence_context_aware(sentence: str, disambiguator: ContextAwareDisambiguator):
    """
    Tokenize a sentence, decode it, and report the selected reading per token.

    Args:
        sentence: Raw sentence text; split with tokenize().
        disambiguator: Decoder whose decode_sentence() picks one candidate
            per token.

    Returns:
        A list with one dict per decoded candidate, in sentence order, with
        the keys "token", "best_analysis", "tag", "source" and "is_oov"
        (True exactly when the candidate's source is "oov").
    """
    decoded = disambiguator.decode_sentence(tokenize(sentence))

    rows = []
    for chosen in decoded:
        origin = chosen.source
        rows.append({
            "token": chosen.word,
            "best_analysis": chosen.analysis,
            "tag": chosen.tag,
            "source": origin,
            "is_oov": origin == "oov",
        })
    return rows


def save_fst(analyzer: pynini.Fst, filename: str) -> None:
    """
    Write the analyzer FST to ``filename`` and log the destination.

    Args:
        analyzer: FST to serialize through its ``write`` method.
        filename: Destination path, passed to ``analyzer.write`` unchanged.
    """
    analyzer.write(filename)
    # Reached only after the write call has returned.
    logger.info("FST saved to %s", filename)


# -----------------------------------------------------------------------------
# Debug / CLI
# -----------------------------------------------------------------------------
def main():
    """
    Demo driver: build the analyzer and print sample analyses.

    Configures logging, loads and normalizes the lexicon, builds the analyzer
    and its disambiguator, logs the size of each lexicon category, then
    prints up to five analyses for each sample word and the decoder's
    selected analysis for every token of two ambiguous sentences.
    """
    logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

    lexicon = normalize_lexicon(load_lexicon())
    fst = build_analyzer(lexicon)
    decoder = ContextAwareDisambiguator(fst)

    logger.info("Lexicon loaded")
    # (log format, lexicon key) pairs, reported in this order.
    category_reports = (
        ("Nouns: %d", "nouns"),
        ("Verbs: %d", "verbs"),
        ("Adjectives: %d", "adjectives"),
        ("Pronouns: %d", "pronouns"),
        ("Adverbs: %d", "adverbs"),
        ("Conjunctions: %d", "conjunctions"),
        ("Postpositions: %d", "postpositions"),
        ("Proper nouns: %d", "proper_nouns"),
    )
    for message, key in category_reports:
        logger.info(message, len(lexicon.get(key, [])))

    # Sample tests
    sample_words = (
        "git",
        "gitsin",
        "giderim",
        "gittim",
        "gideceğim",
        "gidiyorum",
        "gitmeliyim",
        "yapabilir",
        "yapacaksın",
        "anlasana",
        "duysalar",
        "baksaydım",
        "çocuklardan",
        "çantamdan",
        "okuldakiler",
    )
    sample_sentences = (
        "Çay demledim ama çay kenarında yürümeyi de seviyorum; hangisini önce yapalım diye düşünürken zaman geçti.",
        "Bana gül deyince çiçekten mi bahsediyorsun yoksa sadece gülümsememi mi istiyorsun, karar veremedim.",
    )

    section_rule = "-" * 60

    print("\nSINGLE WORD ANALYSIS")
    print(section_rule)
    for surface in sample_words:
        print(f"\n{surface}:")
        # Show at most the first five analyses per word.
        shown = analyze_word(surface, fst)[:5]
        for analysis in shown:
            print(f"  {analysis}")

    print("\nCONTEXT-AWARE ANALYSIS (Viterbi)")
    print(section_rule)
    for text in sample_sentences:
        print(f"\nSentence: '{text}'")
        rows = analyze_sentence_context_aware(text, decoder)

        print(f" {'Word':<15} | {'Tag':<6} | {'Selected Analysis'}")
        print("-" * 70)
        for row in rows:
            print(f" {row['token']:<15} | {row['tag']:<6} | {row['best_analysis']}")


# Run the demo when this cell/file is executed as the main module.
if __name__ == "__main__":
    main()

IOStream.flush timed out



SINGLE WORD ANALYSIS
------------------------------------------------------------

git:
  git+VERB+IMP+2SG

gitsin:
  git+VERB+IMP+3SG

giderim:
  gider+NOUN+POSS.1SG
  git+VERB+AOR+1SG

gittim:
  git+VERB+PAST+1SG

gideceğim:
  git+VERB+FUT+1SG+3SG

gidiyorum:
  git+VERB+PRES.CONT+1SG

gitmeliyim:
  git+VERB+NEC+1SG

yapabilir:
  yap+VERB+ABIL+AOR+3SG

yapacaksın:
  yap+VERB+FUT+2SG

anlasana:
  anla+VERB+OPT+2SG+EMPH

duysalar:
  d+NOUN+ACC+COP.COND+3PL
  d+NOUN+POSS.3SG+COP.COND+3PL
  d+PROPN+ACC+COP.COND+3PL
  d+PROPN+POSS.3SG+COP.COND+3PL
  duy+NOUN+COP.COND+3PL

baksaydım:
  No analysis found for: baksaydım

çocuklardan:
  çocuk+NOUN+PL+ABL

çantamdan:
  çanta+NOUN+POSS.1SG+ABL

okuldakiler:
  okul+NOUN+LOC+KI+PL

CONTEXT-AWARE ANALYSIS (Viterbi)
------------------------------------------------------------

Sentence: 'Çay demledim ama çay kenarında yürümeyi de seviyorum; hangisini önce yapalım diye düşünürken zaman geçti.'
 Word            | Tag    | Selected Analysis
----------