In [1]:
# Setup
!pip install --only-binary :all: pynini
!pip install wurlitzer
import pynini
%load_ext wurlitzer

Collecting pynini
  Downloading pynini-2.1.7-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.7 kB)
Downloading pynini-2.1.7-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (165.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m165.5/165.5 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynini
Successfully installed pynini-2.1.7


In [None]:
import pynini
import json
import math
from collections import defaultdict

# ===== LOAD LEXICON FROM JSON =====
def load_lexicon(json_file='turkish_lexicon.json'):
    """Read the Turkish lexicon stored as JSON in ``json_file``.

    Returns the parsed JSON content. If the file does not exist, a warning is
    printed and a small built-in lexicon is returned instead.
    """
    try:
        handle = open(json_file, 'r', encoding='utf-8')
    except FileNotFoundError:
        print(f"Warning: {json_file} not found. Using default minimal lexicon.")
        # Minimal fallback so the rest of the notebook can still be built.
        return dict(
            nouns=["ev", "kitap", "masa"],
            verbs=["gel", "git", "oku"],
            adjectives=["güzel", "iyi"],
            pronouns=["ben", "sen", "o"],
            adverbs=["çok", "az"],
            conjunctions=["ve", "da", "de"],
            postpositions=["gibi", "için"],
            proper_nouns=[],
        )
    with handle:
        return json.load(handle)

# Load lexicon
# Module-level lexicon used by the sections below: whatever load_lexicon()
# returns for its default path (the parsed JSON file, or the built-in fallback
# when that file is missing).
lexicon = load_lexicon()

# ===== HELPER FUNCTIONS =====
def extract_verb_root(verb_infinitive):
    """Return the verb root: the infinitive without its final -mak/-mek.

    Strings that do not end in -mak/-mek are returned unchanged (they may
    already be roots).
    """
    has_infinitive_ending = verb_infinitive.endswith(('mak', 'mek'))
    return verb_infinitive[:-3] if has_infinitive_ending else verb_infinitive

def create_alternating_roots(words, tag):
    """
    Create FST roots with consonant softening (ünsüz yumuşaması).

    For every word an arc ``word -> "word+TAG"`` is built. Words of two or
    more letters ending in p, ç, t or k additionally get an arc from the
    softened surface form to the same analysis (p→b, ç→c, t→d, k→ğ after a
    vowel, k→g after a consonant), so the root is also recognised before
    vowel-initial suffixes.

    Args:
        words: iterable of root strings. Empty entries are skipped: an empty
            root would match every bare suffix sequence (this can happen when
            extract_verb_root() reduces "mak"/"mek" to "").
        tag: category label appended to the analysis, e.g. 'NOUN'.

    Returns:
        The union of all root transducers, or ``pynini.cross("", "")`` (which
        maps the empty string to itself) when no usable word was given.
    """
    softened_final = {'p': 'b', 'ç': 'c', 't': 'd'}
    vowels = 'aeıioöuü'
    roots = []

    for word in words:
        if not word:
            # An empty root would turn every suffix chain into a "word".
            continue

        analysis = f"{word}+{tag}"
        # Original form (before consonant-initial suffixes)
        roots.append(pynini.cross(word, analysis))

        if len(word) < 2:
            continue

        stem, final = word[:-1], word[-1]
        if final == 'k':
            # k → ğ after vowels, k → g after consonants
            replacement = 'ğ' if stem[-1] in vowels else 'g'
        else:
            replacement = softened_final.get(final)

        if replacement is not None:
            # Softened form (before vowel-initial suffixes)
            roots.append(pynini.cross(stem + replacement, analysis))

    return pynini.union(*roots) if roots else pynini.cross("", "")

# ===== ROOTS BY LEXICAL CATEGORY =====

def _softening_category(key, tag):
    """Roots for lexicon[key] including softened variants.

    An absent or empty category yields pynini.cross("", ""), i.e. a transducer
    that maps the empty string to itself.
    """
    entries = lexicon.get(key)
    if not entries:
        return pynini.cross("", "")
    return create_alternating_roots(entries, tag)


def _plain_category(key, tag):
    """Roots for lexicon[key] without any consonant alternation.

    An absent or empty category yields pynini.cross("", "").
    """
    entries = lexicon.get(key)
    if not entries:
        return pynini.cross("", "")
    return pynini.union(*[pynini.cross(entry, f"{entry}+{tag}") for entry in entries])


# Nouns (Ad) - with consonant softening
noun_roots = _softening_category('nouns', 'NOUN')

# Adjectives (Sıfat) - with consonant softening
adj_roots = _softening_category('adjectives', 'ADJ')

# Verbs (Fiil) - the lexicon stores infinitives; strip -mak/-mek first, then
# apply consonant softening to the resulting roots
verb_root_list = list(map(extract_verb_root, lexicon.get('verbs', [])))
if verb_root_list:
    verb_roots = create_alternating_roots(verb_root_list, 'VERB')
else:
    verb_roots = pynini.cross("", "")

# Pronouns (Zamir) - no consonant softening
pronoun_roots = _plain_category('pronouns', 'PRON')

# Adverbs (Zarf) - no inflection
adverb_roots = _plain_category('adverbs', 'ADV')

# Postpositions (Edat) - no inflection
postposition_roots = _plain_category('postpositions', 'POSTP')

# Interjections (Ünlem) - no inflection
interjection_roots = _plain_category('interjections', 'INTERJ')

# Conjunctions (Bağlaç) - no inflection
conjunction_roots = _plain_category('conjunctions', 'CONJ')

# Proper Nouns (Özel İsimler) - with consonant softening
proper_noun_roots = _softening_category('proper_nouns', 'PROPN')

# Question Particles
# Each template is expanded for the four harmony vowels; the analysis always
# starts with the bare particle ("mi", "mı", "mu", "mü") followed by the tags.
_QUESTION_TEMPLATES = (
    ("m{v}", "QUES"),
    ("m{v}s{v}n", "QUES+2SG"),
    ("m{v}y{v}m", "QUES+1SG"),
    ("m{v}y{v}z", "QUES+1PL"),
    ("m{v}s{v}n{v}z", "QUES+2PL"),
)

question_particles = pynini.union(*[
    pynini.cross(template.format(v=vowel), f"m{vowel}+{tags}")
    for template, tags in _QUESTION_TEMPLATES
    for vowel in "iıuü"
])

# ===== DERIVATIONAL SUFFIXES =====
# Every listed suffix is tagged with its own surface form ("lık" -> "+DER.lık").
# The trailing empty arc makes the derivational slot optional.
_DERIVATIONAL_FORMS = (
    "lık", "lik", "luk", "lük",
    "cı", "ci", "cu", "cü",
    "çı", "çi", "çu", "çü",
    "sız", "siz", "suz", "süz",
    "lı", "li", "lu", "lü",
)

derivational = pynini.union(
    *[pynini.cross(form, f"+DER.{form}") for form in _DERIVATIONAL_FORMS],
    pynini.cross("", ""),
)

# ===== NOUN/ADJECTIVE MORPHOLOGY =====
def _nominal_suffixes(rows, optional=False):
    """Build the union of pynini.cross(form, tag) for all forms in ``rows``.

    ``rows`` is a sequence of (tag, "form form ...") pairs; forms are separated
    by whitespace. With ``optional=True`` the empty suffix is accepted as well.
    """
    arcs = [pynini.cross(form, tag) for tag, forms in rows for form in forms.split()]
    if optional:
        arcs.append(pynini.cross("", ""))
    return pynini.union(*arcs)


# Include proper nouns that can take case markers
nominal_roots = pynini.union(noun_roots, adj_roots, pronoun_roots, proper_noun_roots)
nominal_derived = nominal_roots + derivational

plural = _nominal_suffixes([("+PL", "lar ler")], optional=True)

nominal_pl = nominal_derived + plural

possessive = _nominal_suffixes([
    ("+POSS.1PL", "imiz ımız umuz ümüz"),
    ("+POSS.2PL", "iniz ınız unuz ünüz"),
    ("+POSS.3PL", "leri ları"),
    ("+POSS.1SG", "im ım um üm"),
    ("+POSS.2SG", "in ın un ün"),
    ("+POSS.3SG", "si sı su sü"),
])

# Case endings allowed after a possessive suffix (includes the n-buffered forms).
case_after_poss = _nominal_suffixes([
    ("+ABL", "dan den tan ten ndan nden ntan nten"),
    ("+GEN", "nın nin nun nün"),
    ("+LOC", "da de ta te nda nde nta nte"),
    ("+DAT", "ya ye na ne"),
    ("+ACC", "yı yi yu yü nı ni nu nü"),
    ("+INS", "yla yle"),
    ("+EQU", "ca ce"),
], optional=True)

ki_suffix = _nominal_suffixes([("+KI", "ki kü")], optional=True)

plural_after_ki = _nominal_suffixes([("+PL", "ler lar")], optional=True)

# root (+derivation) (+plural) + possessive (+case) (+ki) (+plural)
possessive_path = nominal_pl + possessive + case_after_poss + ki_suffix + plural_after_ki

# Case endings attached directly to the (possibly plural) stem.
case_no_poss = _nominal_suffixes([
    ("+GEN", "ların lerin"),
    ("+ABL", "dan den tan ten"),
    ("+GEN", "nın nin nun nün"),
    ("+LOC", "da de ta te"),
    ("+DAT", "ya ye a e"),
    ("+ACC", "yı yi yu yü ı i u ü"),
    ("+INS", "la le yla yle"),
    ("+EQU", "ca ce"),
], optional=True)

ki_suffix_case = _nominal_suffixes([("+KI", "ki kü")], optional=True)

plural_after_ki_case = _nominal_suffixes([("+PL", "ler lar")], optional=True)

# root (+derivation) (+plural) (+case) (+ki) (+plural)
case_only_path = nominal_pl + case_no_poss + ki_suffix_case + plural_after_ki_case

nominal_base = pynini.union(possessive_path, case_only_path).optimize()

copula = _nominal_suffixes([
    ("+COP.PAST", "ydi ydı ydu ydü"),
    ("+COP.EVID", "ymış ymiş ymuş ymüş"),
    ("+COP.COND", "yse ysa"),
    ("+COP.PAST", "di dı du dü ti tı tu tü"),
    ("+COP.EVID", "miş mış muş müş"),
    ("+COP.COND", "se sa"),
    ("+COP.PRES", "dir dır dur dür tir tır tur tür"),
])

person = _nominal_suffixes([
    ("+1SG", "im ım um üm"),
    ("+2SG", "in ın un ün"),
    ("+1PL", "ız iz uz üz"),
    ("+2PL", "nız niz nuz nüz"),
    ("+3PL", "lar ler"),
], optional=True)

# A nominal may end in copula (+person) or in nothing at all.
nominal_with_cop = copula + person
nominal_without_cop = pynini.cross("", "")
_nominal_ending = pynini.union(nominal_with_cop, nominal_without_cop).optimize()
nominal_complete = nominal_base + _nominal_ending

# ===== VERB MORPHOLOGY =====
def _verb_suffixes(rows, optional=False):
    """Build the union of pynini.cross(form, tag) for all forms in ``rows``.

    ``rows`` is a sequence of (tag, "form form ...") pairs; forms are separated
    by whitespace. With ``optional=True`` the empty suffix is accepted as well.
    """
    arcs = [pynini.cross(form, tag) for tag, forms in rows for form in forms.split()]
    if optional:
        arcs.append(pynini.cross("", ""))
    return pynini.union(*arcs)


# Voice, Ability, Negation (all optional)
voice = _verb_suffixes([
    ("+PASS", "ıl il ul ül"),
    ("+REFL", "ın in un ün lan len"),
    ("+RECIP", "ış iş uş üş"),
    ("+CAUS", "t d dır dir dur dür tır tir tur tür"),
], optional=True)

ability = _verb_suffixes([("+ABIL", "ebil abil")], optional=True)

negation = _verb_suffixes([("+NEG", "ma me")], optional=True)

# Bildirme Kipleri (Indicative)
indicative_tense = _verb_suffixes([
    ("+PRES.CONT", "iyor ıyor uyor üyor"),
    ("+FUT", "ecek acak"),
    ("+AOR", "ır ir ur ür ar er r"),
    ("+PAST", "dı di du dü tı ti tu tü"),
    ("+INFER", "mış miş muş müş"),
])

# The absence of a person ending is analysed as third person singular.
indicative_person = pynini.union(
    _verb_suffixes([
        ("+1SG", "um üm ım im m"),
        ("+2SG", "sun sün sın sin n"),
        ("+1PL", "uz üz ız iz k"),
        ("+2PL", "sunuz sünüz sınız siniz nız niz nuz nüz"),
        ("+3PL", "lar ler"),
    ]),
    pynini.cross("", "+3SG"),
)

# Dilek/Tasarlama Kipleri (Subjunctive/Optative)

# İstek Kipi (Optative)
optative_mood_person = _verb_suffixes([
    ("+OPT+1SG", "ayım eyim ayum eyüm"),
    ("+OPT+2SG", "asın esin asun esün"),
    ("+OPT+2SG+EMPH", "asana esene sana sene"),
    ("+OPT+2SG", "asan esen"),
    ("+OPT+3SG", "a e"),
    ("+OPT+1PL", "alım elim alum elüm"),
    ("+OPT+2PL", "asınız esiniz asunuz esünüz"),
    ("+OPT+3PL", "alar eler"),
])

# Dilek-Koşul Kipi (Conditional)
conditional_mood_person = _verb_suffixes([
    ("+COND+1SG", "sam sem"),
    ("+COND+2SG", "san sen"),
    ("+COND+3SG", "sa se"),
    ("+COND+1PL", "sak sek"),
    ("+COND+2PL", "sanız seniz sanuz senüz"),
    ("+COND+3PL", "salar seler"),
])

# Gereklilik Kipi (Necessitative)
necessitative_mood_person = _verb_suffixes([
    ("+NEC+1SG", "malıyım meliyim malıyum meliyüm"),
    ("+NEC+2SG", "malısın melisin malısun melisün"),
    ("+NEC+3SG", "malı meli"),
    ("+NEC+1PL", "malıyız meliyiz malıyuz meliyüz"),
    ("+NEC+2PL", "malısınız melisiniz malısunuz melisünüz"),
    ("+NEC+3PL", "malılar meliler"),
])

# Emir Kipi (Imperative)
imperative_mood_person = _verb_suffixes([
    ("+IMP+3SG", "sin sın sun sün"),
    ("+IMP+2PL", "in ın un ün iniz ınız unuz ünüz"),
    ("+IMP+3PL", "sinler sınlar sunlar sünler"),
])

# The bare stem is the second person singular imperative.
imperative_2sg_bare = pynini.cross("", "+IMP+2SG")

# Build verb structure: root (+voice) (+ability) (+negation)
verb_base = verb_roots + voice + ability + negation

# Verb paths: one per mood, each continuing from the shared verb base
verb_indicative = verb_base + indicative_tense + indicative_person
verb_optative = verb_base + optative_mood_person
verb_conditional = verb_base + conditional_mood_person
verb_necessitative = verb_base + necessitative_mood_person
_imperative_endings = pynini.union(imperative_mood_person, imperative_2sg_bare)
verb_imperative = verb_base + _imperative_endings

# Combine all verb paths
_verb_paths = (
    verb_indicative,
    verb_optative,
    verb_conditional,
    verb_necessitative,
    verb_imperative,
)
verb_complete = pynini.union(*_verb_paths).optimize()

# ===== PUNCTUATION =====
# Punctuation mark -> name used in the "+PUNCT.<name>" tag. The extra empty
# arc makes trailing punctuation optional.
_PUNCTUATION_NAMES = {
    ".": "period",
    ",": "comma",
    "?": "question",
    "!": "exclamation",
    ":": "colon",
    ";": "semicolon",
}

punctuation = pynini.union(
    *[pynini.cross(mark, f"+PUNCT.{name}") for mark, name in _PUNCTUATION_NAMES.items()],
    pynini.cross("", ""),
)

# ===== COMPLETE ANALYZER =====
# Categories that take no suffixes at all.
simple_categories = pynini.union(
    adverb_roots,
    postposition_roots,
    interjection_roots,
    conjunction_roots,
    question_particles,
)

# Every top-level branch may be followed by one optional punctuation mark.
nominal_fst, verb_fst, simple_fst = (
    (core + punctuation).optimize()
    for core in (nominal_complete, verb_complete, simple_categories)
)

turkish_analyzer = pynini.union(nominal_fst, verb_fst, simple_fst).optimize()

def analyze(word):
    """Analyze a Turkish word and return all possible analyses.

    Args:
        word: a single surface token (a word, optionally followed by one
            punctuation mark, or a bare punctuation mark).

    Returns:
        A sorted list of distinct analysis strings. When nothing matches, the
        one-element list ``["No analysis found for: <word>"]`` is returned;
        when composing the lattice fails, ``["Error: <message>"]``.
    """
    try:
        # '[', ']' and '\\' have a special meaning when pynini compiles a
        # string; escape them so arbitrary user input is taken literally.
        lattice = pynini.compose(pynini.escape(word), turkish_analyzer)
        analyses = []
        seen = set()
        try:
            for path in lattice.paths().ostrings():
                # Empty lexicon categories are compiled as pynini.cross("", ""),
                # i.e. an empty root, which lets bare suffixes such as "da"
                # through as root-less analyses like "+LOC". Only bare
                # punctuation is a legitimate analysis without a root.
                if path.startswith("+") and not path.startswith("+PUNCT"):
                    continue
                if path not in seen:
                    analyses.append(path)
                    seen.add(path)
        except Exception:
            # Best effort: if the lattice cannot be enumerated, keep whatever
            # was collected so far (possibly nothing). KeyboardInterrupt and
            # SystemExit are deliberately not swallowed.
            pass
        return sorted(analyses) if analyses else [f"No analysis found for: {word}"]
    except Exception as e:
        return [f"Error: {str(e)}"]
# ===== CONTEXT AWARENESS & DISAMBIGUATION =====

class ContextAwareDisambiguator:
    """Choose one analysis per token using tag bigrams plus rule-based boosts.

    Every token is analysed with ``analyze()``; each candidate analysis is
    reduced to a coarse part-of-speech tag, and a Viterbi search picks the
    candidate sequence with the best total score
    (log transition weight + heuristic boost).
    """

    def __init__(self):
        # 1. Define Transition Probabilities (Bigram Model)
        # What is the probability of Tag B following Tag A?
        # Higher number = more likely. The values are hand-set weights.
        self.transitions = {
            # Start of sentence usually starts with Noun, Pronoun, or Adverb
            'START': {'NOUN': 0.4, 'PRON': 0.3, 'ADV': 0.1, 'VERB': 0.1, 'ADJ': 0.1},

            # Adjectives almost always modify Nouns
            'ADJ': {'NOUN': 0.9, 'ADJ': 0.1, 'VERB': 0.01},

            # Nouns can be followed by almost anything, but often Postpositions or Verbs
            'NOUN': {'VERB': 0.4, 'NOUN': 0.2, 'CONJ': 0.1, 'POSTP': 0.2, 'ADV': 0.1},

            # Pronouns act like Nouns
            'PRON': {'VERB': 0.5, 'NOUN': 0.2, 'POSTP': 0.2, 'ADJ': 0.1},

            # Adverbs modify Verbs or Adjectives
            'ADV': {'VERB': 0.6, 'ADJ': 0.3, 'ADV': 0.1},

            # Numbers (if you had them) precede Nouns
            'NUM': {'NOUN': 0.95},

            # Verbs usually end clauses, followed by Punctuation or Conjunctions
            'VERB': {'PUNCT': 0.8, 'CONJ': 0.1, 'NOUN': 0.05, 'PRON': 0.05},

            # Question particles
            'QUES': {'PUNCT': 0.9, 'VERB': 0.1},

            # Default fallback (used for any previous tag without its own row)
            'DEFAULT': {'NOUN': 0.3, 'VERB': 0.3, 'ADJ': 0.1, 'ADV': 0.1, 'PRON': 0.1, 'PUNCT': 0.1}
        }

        # Tags we care about extracting from the analysis string
        self.tags = ['NOUN', 'VERB', 'ADJ', 'ADV', 'PRON', 'POSTP', 'CONJ', 'QUES', 'INTERJ']

    def get_tag_from_analysis(self, analysis_str):
        """Extract the primary POS tag from an analysis string.

        Returns "UNKNOWN" for a "No analysis ..." message, "QUES" for question
        particles, otherwise the first tag of ``self.tags`` present in the
        analysis. "PUNCT" is returned only when no lexical tag is present, so
        a word with attached punctuation ("git+VERB+IMP+2SG+PUNCT.period")
        keeps its own category. Falls back to "NOUN".
        """
        if "No analysis" in analysis_str:
            return "UNKNOWN"

        # Question particles take precedence over everything else
        if "+QUES" in analysis_str:
            return "QUES"

        # Search for standard tags
        for tag in self.tags:
            if f"+{tag}" in analysis_str:
                return tag

        # Bare punctuation: no lexical category was found above
        if "+PUNCT" in analysis_str:
            return "PUNCT"
        return "NOUN"  # Default fallback

    def get_transition_prob(self, prev_tag, current_tag):
        """Return the weight of ``current_tag`` following ``prev_tag``.

        Unlisted tag pairs get 0.001 instead of 0 so that math.log() stays
        defined; previous tags without their own row use the 'DEFAULT' row.
        """
        row = self.transitions.get(prev_tag, self.transitions['DEFAULT'])
        return row.get(current_tag, 0.001)

    def heuristic_weight(self, word, analysis, tag, position, sentence_len):
        """
        Apply rule-based boosts (Stronger Heuristics).

        Args:
            word: surface form of the token.
            analysis: analysis string of the candidate (currently unused; kept
                for interface compatibility).
            tag: coarse tag of the candidate.
            position: 0-based index of the token in the sentence.
            sentence_len: number of tokens in the sentence.

        Returns a score boost that is added to the log-probability score.
        """
        score = 0.0

        # Rule 1: Suffix matching (Morphology hints)
        if tag == "VERB":
            if word.endswith(("yor", "yorum", "yorsun", "dı", "di", "du", "dü", "acak", "ecek", "malı", "meli")):
                score += 2.0
        elif tag == "NOUN":
            if word.endswith(("lar", "ler", "in", "un", "nın", "nin", "da", "de", "dan", "den")):
                score += 1.5
        elif tag == "QUES":
            # All four vowel-harmony variants of the particle (mi/mı/mu/mü),
            # matching the forms the analyzer emits as +QUES.
            if word.lower().startswith(("mi", "mı", "mu", "mü")):
                score += 5.0  # Massive boost, almost certainly a question particle

        # Rule 2: Sentence Position
        # Turkish is SOV (Subject-Object-Verb). Verbs strictly preferred at end.
        if position == sentence_len - 1:
            if tag == "VERB": score += 1.5
            if tag == "QUES": score += 1.5
            if tag == "NOUN": score -= 0.5  # Sentences rarely end in raw nouns (unless copula)

        # Rule 3: Length heuristic (very short words usually aren't verbs unless imperative)
        if len(word) <= 2 and tag == "VERB" and position != sentence_len - 1:
            score -= 1.0

        return score

    def decode_sentence(self, sentence_tokens):
        """
        Viterbi Algorithm for finding the most likely sequence of analyses.

        Args:
            sentence_tokens: list of surface tokens.

        Returns:
            One dict per token, in sentence order, with the keys 'analysis',
            'tag' and 'word'. Tokens without a usable analysis are returned as
            "<word>+NOUN+UNKNOWN" with tag 'NOUN'. An empty input gives [].
        """
        # 1. Get all possible analyses for every word
        # structure: [ [{'analysis': '...', 'tag': 'NOUN'}, ...], ... ]
        lattice = []
        for word in sentence_tokens:
            raw_analyses = analyze(word)
            first = raw_analyses[0] if raw_analyses else ""

            # If the FST finds nothing (or analyze() reports an error),
            # treat the token as an unknown noun.
            if not raw_analyses or "No analysis" in first or first.startswith("Error:"):
                word_candidates = [{'analysis': f"{word}+NOUN+UNKNOWN", 'tag': 'NOUN', 'word': word}]
            else:
                word_candidates = [
                    {'analysis': ana, 'tag': self.get_tag_from_analysis(ana), 'word': word}
                    for ana in raw_analyses
                ]
            lattice.append(word_candidates)

        n = len(lattice)
        if n == 0:
            return []

        # 2. Viterbi Initialization
        # best_scores[i][candidate_index] = max_score
        # backpointers[i][candidate_index] = index_of_best_prev_candidate
        best_scores = [{} for _ in range(n)]
        backpointers = [{} for _ in range(n)]

        # Step 0: Start of sentence
        for i, candidate in enumerate(lattice[0]):
            trans_prob = self.get_transition_prob('START', candidate['tag'])
            heuristic = self.heuristic_weight(candidate['word'], candidate['analysis'], candidate['tag'], 0, n)

            # Work in log space so long sentences do not underflow
            best_scores[0][i] = math.log(trans_prob) + heuristic

        # 3. Viterbi Recursion (Forward Pass)
        for t in range(1, n):
            for i, curr_cand in enumerate(lattice[t]):
                max_score = -float('inf')
                best_prev_idx = -1

                # The heuristic depends only on the current candidate, so it
                # is computed once instead of once per predecessor.
                heuristic = self.heuristic_weight(curr_cand['word'], curr_cand['analysis'], curr_cand['tag'], t, n)

                # Try coming from every possible analysis of the previous word
                for j, prev_cand in enumerate(lattice[t-1]):
                    # Score = Previous Score + Transition(Prev->Curr) + Heuristic(Curr)
                    prev_score = best_scores[t-1][j]
                    trans_prob = self.get_transition_prob(prev_cand['tag'], curr_cand['tag'])

                    score = prev_score + math.log(trans_prob) + heuristic

                    if score > max_score:
                        max_score = score
                        best_prev_idx = j

                best_scores[t][i] = max_score
                backpointers[t][i] = best_prev_idx

        # 4. Backtracking (Backward Pass)
        # Find best end state
        best_last_idx = max(best_scores[n-1], key=best_scores[n-1].get)

        result_path = []
        curr_idx = best_last_idx

        for t in range(n-1, -1, -1):
            cand = lattice[t][curr_idx]
            result_path.append(cand)
            if t > 0:
                curr_idx = backpointers[t][curr_idx]

        return result_path[::-1]  # Reverse to get correct order

# Create instance
# Shared module-level disambiguator; analyze_sentence_context_aware() calls
# its decode_sentence() method.
disambiguator = ContextAwareDisambiguator()

def analyze_sentence_context_aware(sentence):
    """Tokenize ``sentence`` and return the disambiguated analysis of each token.

    Tokens are runs of word characters/apostrophes, or a single punctuation
    mark out of ``. , ! ? ; :`` (the marks the analyzer has punctuation
    entries for). Any other character acts as a separator and is dropped.

    Returns:
        A list with one dict per token, in order, with the keys 'token',
        'best_analysis' and 'tag'.
    """
    import re

    # Simple tokenization: words and punctuation marks become separate tokens.
    # ':' is included so that a colon is not silently discarded.
    tokens = re.findall(r"[\w']+|[.,!?;:]", sentence)

    results = disambiguator.decode_sentence(tokens)

    return [
        {
            'token': item['word'],
            'best_analysis': item['analysis'],
            'tag': item['tag'],
        }
        for item in results
    ]

def save_fst(filename):
    """Write the module-level ``turkish_analyzer`` FST to ``filename`` and report it."""
    turkish_analyzer.write(filename)
    confirmation = f"FST saved to {filename}"
    print(confirmation)

# Test
# Demo / smoke test: prints lexicon statistics, a few single-word analyses,
# the raw per-token analyses of some sentences, and the context-aware result.
if __name__ == "__main__":
    import re

    print("="*60)
    print("LEXICON LOADED")
    print("="*60)
    print(f"Nouns: {len(lexicon.get('nouns', []))} words")
    print(f"Verbs: {len(lexicon.get('verbs', []))} words (infinitives → roots extracted)")
    print(f"Adjectives: {len(lexicon.get('adjectives', []))} words")
    print(f"Pronouns: {len(lexicon.get('pronouns', []))} words")
    print(f"Adverbs: {len(lexicon.get('adverbs', []))} words")
    print(f"Conjunctions: {len(lexicon.get('conjunctions', []))} words")
    print(f"Postpositions: {len(lexicon.get('postpositions', []))} words")
    print(f"Proper Nouns: {len(lexicon.get('proper_nouns', []))} words")

    # Show some verb root examples
    if lexicon.get('verbs'):
        print("\nVerb root examples:")
        for verb in lexicon.get('verbs', [])[:5]:
            root = extract_verb_root(verb)
            print(f"  {verb} → {root}")

    print("\n" + "="*60)
    print("DEBUG: Testing verb compositions")
    print("="*60)

    # Compose directly with the imperative sub-FST to check it in isolation.
    test = "gel"
    try:
        lattice = pynini.compose(test, verb_imperative)
        print(f"\n'{test}' + verb_imperative:")
        for path in lattice.paths().ostrings():
            print(f"  ✓ {path}")
    except Exception as e:
        print(f"  ✗ Error: {e}")

    test2 = "gelsin"
    try:
        lattice = pynini.compose(test2, verb_imperative)
        print(f"\n'{test2}' + verb_imperative:")
        for path in lattice.paths().ostrings():
            print(f"  ✓ {path}")
    except Exception as e:
        print(f"  ✗ Error: {e}")

    print("\n" + "="*60)
    print("SINGLE WORD ANALYSIS")
    print("="*60)

    test_words = [
        # Dilek Kipleri
        "gel",            # come! (imperative 2sg)
        "gelin",          # come! (imperative 2pl)
        "gelsin",         # let him/her come (imperative 3sg)
        "gelsem",         # if I come (conditional)
        "geleyim",        # let me come (optative)
        "gelmeli",        # must come (necessitative)
        "yazmalıyım",     # I must write
        "okusana",        # read then! (optative emphatic)
        "görseler",       # if they see (conditional)
        # Bildirme Kipleri
        "okudum",         # I read (past)
        "gelebilecek",    # will be able to come (future)
        "geliyorum",      # I am coming (present continuous)
        # Nouns
        "kitaplardan",    # from books
        "kalemlik",       # pencil case
        "evdekiler",      # those in the house
    ]

    for word in test_words:
        print(f"\n{word}:")
        analyses = analyze(word)
        for analysis in analyses[:5]:
            print(f"  {analysis}")
        if len(analyses) > 5:
            print(f"  ... and {len(analyses) - 5} more analyses")

    print("\n" + "="*60)
    print("MULTI-WORD ANALYSIS")
    print("="*60)

    test_sentences = [
        "evde misin?",
        "kitap da güzel",
        "gel buraya!",
        "oraya git",
        "yukarı çık",
        "ben yemeğe gidiyorum gelecek misin?",
        "kitabı aldım",
        "ağaca çıktı",
    ]

    for sentence in test_sentences:
        print(f"\n'{sentence}':")
        # No analyze_sentence() helper is defined in this notebook, so split
        # the sentence into word / punctuation tokens here and list the raw
        # (not disambiguated) analyses of every token.
        for token in re.findall(r"[\w']+|[.,!?;]", sentence):
            token_analyses = analyze(token)
            print(f"  {token}:")
            for analysis in token_analyses[:3]:
                print(f"    {analysis}")
            if len(token_analyses) > 3:
                print(f"    ... and {len(token_analyses) - 3} more")

    print("\n" + "="*60)
    print("CONTEXT-AWARE ANALYSIS (Viterbi)")
    print("="*60)

    # These sentences contain ambiguous words
    ambiguous_sentences = [
        "yüzü güzel",           # yüz: Face (Noun) vs Swim (Verb) -> Expect NOUN
        "denizde yüz",          # yüz: Face (Noun) vs Swim (Verb) -> Expect VERB (Imperative)
        "bana gül",             # gül: Rose (Noun) vs Smile (Verb) -> Expect VERB
        "kırmızı gül",          # gül: Rose (Noun) vs Smile (Verb) -> Expect NOUN (Adj modifies Noun)
        "okula git",            # git: Expect VERB
        "güzel bir ev",         # güzel: Adj, bir: Det, ev: Noun
        "evde misin",           # misin: Expect QUES
        "kitap okumayı severim" # severim: Expect VERB (End of sentence)
    ]

    for sent in ambiguous_sentences:
        print(f"\nSentence: '{sent}'")
        results = analyze_sentence_context_aware(sent)

        # Print formatted table
        print(f" {'Word':<15} | {'Tag':<6} | {'Selected Analysis'}")
        print("-" * 50)
        for res in results:
            print(f" {res['token']:<15} | {res['tag']:<6} | {res['best_analysis']}")

LEXICON LOADED
Nouns: 56691 words
Verbs: 3455 words (infinitives → roots extracted)
Adjectives: 5504 words
Pronouns: 50 words
Adverbs: 1300 words
Conjunctions: 53 words
Postpositions: 0 words
Proper Nouns: 0 words

Verb root examples:
  polenlemek → polenle
  kubaşmak → kubaş
  sayıklamak → sayıkla
  ölçeklemek → ölçekle
  atkılamak → atkıla

DEBUG: Testing verb compositions

'gel' + verb_imperative:
  ✓ gel+VERB+IMP+2SG

'gelsin' + verb_imperative:
  ✓ gel+VERB+IMP+3SG

SINGLE WORD ANALYSIS

gel:
  gel+VERB+IMP+2SG

gelin:
  gel+VERB+IMP+2PL
  gel+VERB+REFL+IMP+2SG
  gelin+NOUN

gelsin:
  gel+VERB+IMP+3SG

gelsem:
  gel+VERB+COND+1SG

geleyim:
  gel+VERB+OPT+1SG

gelmeli:
  gel+VERB+NEC+3SG

yazmalıyım:
  yaz+VERB+NEC+1SG

okusana:
  oku+VERB+OPT+2SG+EMPH

görseler:
  gör+VERB+COND+3PL

okudum:
  oku+VERB+PAST+1SG

gelebilecek:
  gel+VERB+ABIL+FUT+3SG

geliyorum:
  gel+VERB+PRES.CONT+1SG

kitaplardan:
  kitap+NOUN+PL+ABL

kalemlik:
  kalem+NOUN+DER.lik

evdekiler:
  ev+NOUN+LOC+KI+PL


In [8]:
# Compare the raw analyze() output with the disambiguated output for a few inputs.
# NOTE(review): analyze_with_disambiguation is not defined in the visible part
# of this notebook — confirm that the cell defining it is run before this one.
# NOTE(review): the first two inputs contain a space; the recorded output of
# this cell shows that analyze() finds no analysis for them.
for word in ["gelecek misin", "yapacak mısın", "gidiyorum", "çıktı", "gelecek", "ben"]:
    print(f"\n{word}:")
    raw = analyze(word)
    filtered = analyze_with_disambiguation(word)
    print(f"  RAW ({len(raw)}): {raw}")
    print(f"  FILTERED ({len(filtered)}): {filtered}")



gelecek misin:
  RAW (1): ['No analysis found for: gelecek misin']
  FILTERED (1): ['No analysis found for: gelecek misin']

yapacak mısın:
  RAW (1): ['No analysis found for: yapacak mısın']
  FILTERED (1): ['No analysis found for: yapacak mısın']

gidiyorum:
  RAW (1): ['git+VERB+PRES.CONT+1SG']
  FILTERED (1): ['git+VERB+PRES.CONT+1SG']

çıktı:
  RAW (2): ['çık+VERB+PAST+3SG', 'çıktı+NOUN']
  FILTERED (3): ['çık+VERB+PAST+3SG', 'çık+VERB+PAST+3SG', 'çıktı+NOUN']

gelecek:
  RAW (3): ['gel+VERB+FUT+3SG', 'gelecek+ADJ', 'gelecek+NOUN']
  FILTERED (6): ['gel+VERB+FUT+3SG', 'gel+VERB+FUT+3SG', 'gelecek+ADJ', 'gelecek+ADJ', 'gelecek+NOUN', 'gelecek+NOUN']

ben:
  RAW (2): ['ben+NOUN', 'ben+PRON']
  FILTERED (3): ['ben+NOUN', 'ben+PRON', 'ben+PRON']
