In [1]:
# Setup
!pip install --only-binary :all: pynini
!pip install wurlitzer
import pynini
%load_ext wurlitzer

Collecting pynini
  Downloading pynini-2.1.7-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.7 kB)
Downloading pynini-2.1.7-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (165.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m165.5/165.5 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynini
Successfully installed pynini-2.1.7


In [2]:
import pynini
import json

# ===== LOAD LEXICON FROM JSON =====
def load_lexicon(json_file='turkish_lexicon.json'):
    """Load Turkish lexicon from JSON file."""
    try:
        with open(json_file, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"Warning: {json_file} not found. Using default minimal lexicon.")
        return {
            "nouns": ["ev", "kitap", "masa"],
            "verbs": ["gel", "git", "oku"],
            "adjectives": ["güzel", "iyi"],
            "pronouns": ["ben", "sen", "o"],
            "adverbs": ["çok", "az"],
            "conjunctions": ["ve", "da", "de"],
            "postpositions": ["gibi", "için"],
            "proper_nouns": []
        }

# Load lexicon
lexicon = load_lexicon()

# ===== HELPER FUNCTIONS =====
def extract_verb_root(verb_infinitive):
    """Extract verb root from infinitive form (remove -mak/-mek)."""
    if verb_infinitive.endswith('mak'):
        return verb_infinitive[:-3]
    elif verb_infinitive.endswith('mek'):
        return verb_infinitive[:-3]
    else:
        # If it doesn't end with -mak/-mek, return as is (might be already a root)
        return verb_infinitive

def create_alternating_roots(words, tag):
    """
    Create FST roots with consonant softening (ünsüz yumuşaması).
    p→b, ç→c, t→d, k→g/ğ when followed by vowel-initial suffix.
    """
    roots = []

    for word in words:
        # Original form (before consonant-initial suffixes)
        roots.append(pynini.cross(word, f"{word}+{tag}"))

        # Check if word ends with p, ç, t, k (sert ünsüz)
        if len(word) > 1 and word[-1] in ['p', 'ç', 't', 'k']:
            # Get the stem without final consonant
            stem = word[:-1]
            final = word[-1]

            # Determine softened form (yumuşak ünsüz)
            if final == 'p':
                softened = stem + 'b'
            elif final == 'ç':
                softened = stem + 'c'
            elif final == 't':
                softened = stem + 'd'
            elif final == 'k':
                # k → ğ after vowels, k → g after consonants
                if len(stem) > 0 and stem[-1] in 'aeıioöuü':
                    softened = stem + 'ğ'
                else:
                    softened = stem + 'g'

            # Add softened form (before vowel-initial suffixes)
            roots.append(pynini.cross(softened, f"{word}+{tag}"))

    return pynini.union(*roots) if roots else pynini.cross("", "")

# ===== ROOTS BY LEXICAL CATEGORY =====

# Nouns (Ad) - with consonant softening
noun_roots = create_alternating_roots(
    lexicon.get('nouns', []),
    'NOUN'
) if lexicon.get('nouns') else pynini.cross("", "")

# Adjectives (Sıfat) - with consonant softening
adj_roots = create_alternating_roots(
    lexicon.get('adjectives', []),
    'ADJ'
) if lexicon.get('adjectives') else pynini.cross("", "")

# Verbs (Fiil) - extract roots and apply consonant softening
verb_root_list = [extract_verb_root(word) for word in lexicon.get('verbs', [])]
verb_roots = create_alternating_roots(
    verb_root_list,
    'VERB'
) if verb_root_list else pynini.cross("", "")

# Pronouns (Zamir) - usually don't undergo consonant softening, but include for completeness
pronoun_roots = pynini.union(*[
    pynini.cross(word, f"{word}+PRON")
    for word in lexicon.get('pronouns', [])
]) if lexicon.get('pronouns') else pynini.cross("", "")

# Adverbs (Zarf) - no inflection
adverb_roots = pynini.union(*[
    pynini.cross(word, f"{word}+ADV")
    for word in lexicon.get('adverbs', [])
]) if lexicon.get('adverbs') else pynini.cross("", "")

# Postpositions (Edat) - no inflection
postposition_roots = pynini.union(*[
    pynini.cross(word, f"{word}+POSTP")
    for word in lexicon.get('postpositions', [])
]) if lexicon.get('postpositions') else pynini.cross("", "")

# Interjections (Ünlem) - no inflection
interjection_roots = pynini.union(*[
    pynini.cross(word, f"{word}+INTERJ")
    for word in lexicon.get('interjections', [])
]) if lexicon.get('interjections') else pynini.cross("", "")

# Conjunctions (Bağlaç) - no inflection
conjunction_roots = pynini.union(*[
    pynini.cross(word, f"{word}+CONJ")
    for word in lexicon.get('conjunctions', [])
]) if lexicon.get('conjunctions') else pynini.cross("", "")

# Proper Nouns (Özel İsimler) - with consonant softening
proper_noun_roots = create_alternating_roots(
    lexicon.get('proper_nouns', []),
    'PROPN'
) if lexicon.get('proper_nouns') else pynini.cross("", "")

# Question Particles
question_particles = pynini.union(
    pynini.cross("mi", "mi+QUES"),
    pynini.cross("mı", "mı+QUES"),
    pynini.cross("mu", "mu+QUES"),
    pynini.cross("mü", "mü+QUES"),
    pynini.cross("misin", "mi+QUES+2SG"),
    pynini.cross("mısın", "mı+QUES+2SG"),
    pynini.cross("musun", "mu+QUES+2SG"),
    pynini.cross("müsün", "mü+QUES+2SG"),
    pynini.cross("miyim", "mi+QUES+1SG"),
    pynini.cross("mıyım", "mı+QUES+1SG"),
    pynini.cross("muyum", "mu+QUES+1SG"),
    pynini.cross("müyüm", "mü+QUES+1SG"),
    pynini.cross("miyiz", "mi+QUES+1PL"),
    pynini.cross("mıyız", "mı+QUES+1PL"),
    pynini.cross("muyuz", "mu+QUES+1PL"),
    pynini.cross("müyüz", "mü+QUES+1PL"),
    pynini.cross("misiniz", "mi+QUES+2PL"),
    pynini.cross("mısınız", "mı+QUES+2PL"),
    pynini.cross("musunuz", "mu+QUES+2PL"),
    pynini.cross("müsünüz", "mü+QUES+2PL")
)

# ===== DERIVATIONAL SUFFIXES =====
derivational = pynini.union(
    pynini.cross("lık", "+DER.lık"), pynini.cross("lik", "+DER.lik"),
    pynini.cross("luk", "+DER.luk"), pynini.cross("lük", "+DER.lük"),
    pynini.cross("cı", "+DER.cı"), pynini.cross("ci", "+DER.ci"),
    pynini.cross("cu", "+DER.cu"), pynini.cross("cü", "+DER.cü"),
    pynini.cross("çı", "+DER.çı"), pynini.cross("çi", "+DER.çi"),
    pynini.cross("çu", "+DER.çu"), pynini.cross("çü", "+DER.çü"),
    pynini.cross("sız", "+DER.sız"), pynini.cross("siz", "+DER.siz"),
    pynini.cross("suz", "+DER.suz"), pynini.cross("süz", "+DER.süz"),
    pynini.cross("lı", "+DER.lı"), pynini.cross("li", "+DER.li"),
    pynini.cross("lu", "+DER.lu"), pynini.cross("lü", "+DER.lü"),
    pynini.cross("", "")
)

# ===== NOUN/ADJECTIVE MORPHOLOGY =====
# Include proper nouns that can take case markers
nominal_roots = pynini.union(noun_roots, adj_roots, pronoun_roots, proper_noun_roots)
nominal_derived = nominal_roots + derivational

plural = pynini.union(
    pynini.cross("lar", "+PL"),
    pynini.cross("ler", "+PL"),
    pynini.cross("", "")
)

nominal_pl = nominal_derived + plural

possessive = pynini.union(
    pynini.cross("imiz", "+POSS.1PL"), pynini.cross("ımız", "+POSS.1PL"),
    pynini.cross("umuz", "+POSS.1PL"), pynini.cross("ümüz", "+POSS.1PL"),
    pynini.cross("iniz", "+POSS.2PL"), pynini.cross("ınız", "+POSS.2PL"),
    pynini.cross("unuz", "+POSS.2PL"), pynini.cross("ünüz", "+POSS.2PL"),
    pynini.cross("leri", "+POSS.3PL"), pynini.cross("ları", "+POSS.3PL"),
    pynini.cross("im", "+POSS.1SG"), pynini.cross("ım", "+POSS.1SG"),
    pynini.cross("um", "+POSS.1SG"), pynini.cross("üm", "+POSS.1SG"),
    pynini.cross("in", "+POSS.2SG"), pynini.cross("ın", "+POSS.2SG"),
    pynini.cross("un", "+POSS.2SG"), pynini.cross("ün", "+POSS.2SG"),
    pynini.cross("si", "+POSS.3SG"), pynini.cross("sı", "+POSS.3SG"),
    pynini.cross("su", "+POSS.3SG"), pynini.cross("sü", "+POSS.3SG")
)

case_after_poss = pynini.union(
    pynini.cross("dan", "+ABL"), pynini.cross("den", "+ABL"),
    pynini.cross("tan", "+ABL"), pynini.cross("ten", "+ABL"),
    pynini.cross("ndan", "+ABL"), pynini.cross("nden", "+ABL"),
    pynini.cross("ntan", "+ABL"), pynini.cross("nten", "+ABL"),
    pynini.cross("nın", "+GEN"), pynini.cross("nin", "+GEN"),
    pynini.cross("nun", "+GEN"), pynini.cross("nün", "+GEN"),
    pynini.cross("da", "+LOC"), pynini.cross("de", "+LOC"),
    pynini.cross("ta", "+LOC"), pynini.cross("te", "+LOC"),
    pynini.cross("nda", "+LOC"), pynini.cross("nde", "+LOC"),
    pynini.cross("nta", "+LOC"), pynini.cross("nte", "+LOC"),
    pynini.cross("ya", "+DAT"), pynini.cross("ye", "+DAT"),
    pynini.cross("na", "+DAT"), pynini.cross("ne", "+DAT"),
    pynini.cross("yı", "+ACC"), pynini.cross("yi", "+ACC"),
    pynini.cross("yu", "+ACC"), pynini.cross("yü", "+ACC"),
    pynini.cross("nı", "+ACC"), pynini.cross("ni", "+ACC"),
    pynini.cross("nu", "+ACC"), pynini.cross("nü", "+ACC"),
    pynini.cross("yla", "+INS"), pynini.cross("yle", "+INS"),
    pynini.cross("ca", "+EQU"), pynini.cross("ce", "+EQU"),
    pynini.cross("", "")
)

ki_suffix = pynini.union(
    pynini.cross("ki", "+KI"),
    pynini.cross("kü", "+KI"),
    pynini.cross("", "")
)

plural_after_ki = pynini.union(
    pynini.cross("ler", "+PL"),
    pynini.cross("lar", "+PL"),
    pynini.cross("", "")
)

possessive_path = nominal_pl + possessive + case_after_poss + ki_suffix + plural_after_ki

case_no_poss = pynini.union(
    pynini.cross("ların", "+GEN"), pynini.cross("lerin", "+GEN"),
    pynini.cross("dan", "+ABL"), pynini.cross("den", "+ABL"),
    pynini.cross("tan", "+ABL"), pynini.cross("ten", "+ABL"),
    pynini.cross("nın", "+GEN"), pynini.cross("nin", "+GEN"),
    pynini.cross("nun", "+GEN"), pynini.cross("nün", "+GEN"),
    pynini.cross("da", "+LOC"), pynini.cross("de", "+LOC"),
    pynini.cross("ta", "+LOC"), pynini.cross("te", "+LOC"),
    pynini.cross("ya", "+DAT"), pynini.cross("ye", "+DAT"),
    pynini.cross("a", "+DAT"), pynini.cross("e", "+DAT"),
    pynini.cross("yı", "+ACC"), pynini.cross("yi", "+ACC"),
    pynini.cross("yu", "+ACC"), pynini.cross("yü", "+ACC"),
    pynini.cross("ı", "+ACC"), pynini.cross("i", "+ACC"),
    pynini.cross("u", "+ACC"), pynini.cross("ü", "+ACC"),
    pynini.cross("la", "+INS"), pynini.cross("le", "+INS"),
    pynini.cross("yla", "+INS"), pynini.cross("yle", "+INS"),
    pynini.cross("ca", "+EQU"), pynini.cross("ce", "+EQU"),
    pynini.cross("", "")
)

ki_suffix_case = pynini.union(
    pynini.cross("ki", "+KI"),
    pynini.cross("kü", "+KI"),
    pynini.cross("", "")
)

plural_after_ki_case = pynini.union(
    pynini.cross("ler", "+PL"),
    pynini.cross("lar", "+PL"),
    pynini.cross("", "")
)

case_only_path = nominal_pl + case_no_poss + ki_suffix_case + plural_after_ki_case

nominal_base = pynini.union(possessive_path, case_only_path).optimize()

copula = pynini.union(
    pynini.cross("ydi", "+COP.PAST"), pynini.cross("ydı", "+COP.PAST"),
    pynini.cross("ydu", "+COP.PAST"), pynini.cross("ydü", "+COP.PAST"),
    pynini.cross("ymış", "+COP.EVID"), pynini.cross("ymiş", "+COP.EVID"),
    pynini.cross("ymuş", "+COP.EVID"), pynini.cross("ymüş", "+COP.EVID"),
    pynini.cross("yse", "+COP.COND"), pynini.cross("ysa", "+COP.COND"),
    pynini.cross("di", "+COP.PAST"), pynini.cross("dı", "+COP.PAST"),
    pynini.cross("du", "+COP.PAST"), pynini.cross("dü", "+COP.PAST"),
    pynini.cross("ti", "+COP.PAST"), pynini.cross("tı", "+COP.PAST"),
    pynini.cross("tu", "+COP.PAST"), pynini.cross("tü", "+COP.PAST"),
    pynini.cross("miş", "+COP.EVID"), pynini.cross("mış", "+COP.EVID"),
    pynini.cross("muş", "+COP.EVID"), pynini.cross("müş", "+COP.EVID"),
    pynini.cross("se", "+COP.COND"), pynini.cross("sa", "+COP.COND"),
    pynini.cross("dir", "+COP.PRES"), pynini.cross("dır", "+COP.PRES"),
    pynini.cross("dur", "+COP.PRES"), pynini.cross("dür", "+COP.PRES"),
    pynini.cross("tir", "+COP.PRES"), pynini.cross("tır", "+COP.PRES"),
    pynini.cross("tur", "+COP.PRES"), pynini.cross("tür", "+COP.PRES")
)

person = pynini.union(
    pynini.cross("im", "+1SG"), pynini.cross("ım", "+1SG"),
    pynini.cross("um", "+1SG"), pynini.cross("üm", "+1SG"),
    pynini.cross("in", "+2SG"), pynini.cross("ın", "+2SG"),
    pynini.cross("un", "+2SG"), pynini.cross("ün", "+2SG"),
    pynini.cross("ız", "+1PL"), pynini.cross("iz", "+1PL"),
    pynini.cross("uz", "+1PL"), pynini.cross("üz", "+1PL"),
    pynini.cross("nız", "+2PL"), pynini.cross("niz", "+2PL"),
    pynini.cross("nuz", "+2PL"), pynini.cross("nüz", "+2PL"),
    pynini.cross("lar", "+3PL"), pynini.cross("ler", "+3PL"),
    pynini.cross("", "")
)

nominal_with_cop = copula + person
nominal_without_cop = pynini.cross("", "")
nominal_complete = nominal_base + pynini.union(nominal_with_cop, nominal_without_cop).optimize()

# ===== VERB MORPHOLOGY =====

# Voice, Ability, Negation (all optional)
voice = pynini.union(
    pynini.cross("ıl", "+PASS"), pynini.cross("il", "+PASS"),
    pynini.cross("ul", "+PASS"), pynini.cross("ül", "+PASS"),
    pynini.cross("ın", "+REFL"), pynini.cross("in", "+REFL"),
    pynini.cross("un", "+REFL"), pynini.cross("ün", "+REFL"),
    pynini.cross("lan", "+REFL"), pynini.cross("len", "+REFL"),
    pynini.cross("ış", "+RECIP"), pynini.cross("iş", "+RECIP"),
    pynini.cross("uş", "+RECIP"), pynini.cross("üş", "+RECIP"),
    pynini.cross("t", "+CAUS"), pynini.cross("d", "+CAUS"),
    pynini.cross("dır", "+CAUS"), pynini.cross("dir", "+CAUS"),
    pynini.cross("dur", "+CAUS"), pynini.cross("dür", "+CAUS"),
    pynini.cross("tır", "+CAUS"), pynini.cross("tir", "+CAUS"),
    pynini.cross("tur", "+CAUS"), pynini.cross("tür", "+CAUS"),
    pynini.cross("", "")
)

ability = pynini.union(
    pynini.cross("ebil", "+ABIL"),
    pynini.cross("abil", "+ABIL"),
    pynini.cross("", "")
)

negation = pynini.union(
    pynini.cross("ma", "+NEG"),
    pynini.cross("me", "+NEG"),
    pynini.cross("", "")
)

# Bildirme Kipleri (Indicative)
indicative_tense = pynini.union(
    pynini.cross("iyor", "+PRES.CONT"), pynini.cross("ıyor", "+PRES.CONT"),
    pynini.cross("uyor", "+PRES.CONT"), pynini.cross("üyor", "+PRES.CONT"),
    pynini.cross("ecek", "+FUT"), pynini.cross("acak", "+FUT"),
    pynini.cross("ır", "+AOR"), pynini.cross("ir", "+AOR"),
    pynini.cross("ur", "+AOR"), pynini.cross("ür", "+AOR"),
    pynini.cross("ar", "+AOR"), pynini.cross("er", "+AOR"),
    pynini.cross("r", "+AOR"),
    pynini.cross("dı", "+PAST"), pynini.cross("di", "+PAST"),
    pynini.cross("du", "+PAST"), pynini.cross("dü", "+PAST"),
    pynini.cross("tı", "+PAST"), pynini.cross("ti", "+PAST"),
    pynini.cross("tu", "+PAST"), pynini.cross("tü", "+PAST"),
    pynini.cross("mış", "+INFER"), pynini.cross("miş", "+INFER"),
    pynini.cross("muş", "+INFER"), pynini.cross("müş", "+INFER")
)

indicative_person = pynini.union(
    pynini.cross("um", "+1SG"), pynini.cross("üm", "+1SG"),
    pynini.cross("ım", "+1SG"), pynini.cross("im", "+1SG"),
    pynini.cross("m", "+1SG"),
    pynini.cross("sun", "+2SG"), pynini.cross("sün", "+2SG"),
    pynini.cross("sın", "+2SG"), pynini.cross("sin", "+2SG"),
    pynini.cross("n", "+2SG"),
    pynini.cross("uz", "+1PL"), pynini.cross("üz", "+1PL"),
    pynini.cross("ız", "+1PL"), pynini.cross("iz", "+1PL"),
    pynini.cross("k", "+1PL"),
    pynini.cross("sunuz", "+2PL"), pynini.cross("sünüz", "+2PL"),
    pynini.cross("sınız", "+2PL"), pynini.cross("siniz", "+2PL"),
    pynini.cross("nız", "+2PL"), pynini.cross("niz", "+2PL"),
    pynini.cross("nuz", "+2PL"), pynini.cross("nüz", "+2PL"),
    pynini.cross("lar", "+3PL"), pynini.cross("ler", "+3PL"),
    pynini.cross("", "+3SG")
)

# Dilek/Tasarlama Kipleri (Subjunctive/Optative)

# İstek Kipi (Optative)
optative_mood_person = pynini.union(
    pynini.cross("ayım", "+OPT+1SG"), pynini.cross("eyim", "+OPT+1SG"),
    pynini.cross("ayum", "+OPT+1SG"), pynini.cross("eyüm", "+OPT+1SG"),
    pynini.cross("asın", "+OPT+2SG"), pynini.cross("esin", "+OPT+2SG"),
    pynini.cross("asun", "+OPT+2SG"), pynini.cross("esün", "+OPT+2SG"),
    pynini.cross("asana", "+OPT+2SG+EMPH"), pynini.cross("esene", "+OPT+2SG+EMPH"),
    pynini.cross("sana", "+OPT+2SG+EMPH"), pynini.cross("sene", "+OPT+2SG+EMPH"),
    pynini.cross("asan", "+OPT+2SG"), pynini.cross("esen", "+OPT+2SG"),
    pynini.cross("a", "+OPT+3SG"), pynini.cross("e", "+OPT+3SG"),
    pynini.cross("alım", "+OPT+1PL"), pynini.cross("elim", "+OPT+1PL"),
    pynini.cross("alum", "+OPT+1PL"), pynini.cross("elüm", "+OPT+1PL"),
    pynini.cross("asınız", "+OPT+2PL"), pynini.cross("esiniz", "+OPT+2PL"),
    pynini.cross("asunuz", "+OPT+2PL"), pynini.cross("esünüz", "+OPT+2PL"),
    pynini.cross("alar", "+OPT+3PL"), pynini.cross("eler", "+OPT+3PL")
)

# Dilek-Koşul Kipi (Conditional)
conditional_mood_person = pynini.union(
    pynini.cross("sam", "+COND+1SG"), pynini.cross("sem", "+COND+1SG"),
    pynini.cross("san", "+COND+2SG"), pynini.cross("sen", "+COND+2SG"),
    pynini.cross("sa", "+COND+3SG"), pynini.cross("se", "+COND+3SG"),
    pynini.cross("sak", "+COND+1PL"), pynini.cross("sek", "+COND+1PL"),
    pynini.cross("sanız", "+COND+2PL"), pynini.cross("seniz", "+COND+2PL"),
    pynini.cross("sanuz", "+COND+2PL"), pynini.cross("senüz", "+COND+2PL"),
    pynini.cross("salar", "+COND+3PL"), pynini.cross("seler", "+COND+3PL")
)

# ...


# Gereklilik Kipi (Necessitative)
necessitative_mood_person = pynini.union(
    pynini.cross("malıyım", "+NEC+1SG"), pynini.cross("meliyim", "+NEC+1SG"),
    pynini.cross("malıyum", "+NEC+1SG"), pynini.cross("meliyüm", "+NEC+1SG"),
    pynini.cross("malısın", "+NEC+2SG"), pynini.cross("melisin", "+NEC+2SG"),
    pynini.cross("malısun", "+NEC+2SG"), pynini.cross("melisün", "+NEC+2SG"),
    pynini.cross("malı", "+NEC+3SG"), pynini.cross("meli", "+NEC+3SG"),
    pynini.cross("malıyız", "+NEC+1PL"), pynini.cross("meliyiz", "+NEC+1PL"),
    pynini.cross("malıyuz", "+NEC+1PL"), pynini.cross("meliyüz", "+NEC+1PL"),
    pynini.cross("malısınız", "+NEC+2PL"), pynini.cross("melisiniz", "+NEC+2PL"),
    pynini.cross("malısunuz", "+NEC+2PL"), pynini.cross("melisünüz", "+NEC+2PL"),
    pynini.cross("malılar", "+NEC+3PL"), pynini.cross("meliler", "+NEC+3PL")
)

# Emir Kipi (Imperative)
imperative_mood_person = pynini.union(
    pynini.cross("sin", "+IMP+3SG"), pynini.cross("sın", "+IMP+3SG"),
    pynini.cross("sun", "+IMP+3SG"), pynini.cross("sün", "+IMP+3SG"),
    pynini.cross("in", "+IMP+2PL"), pynini.cross("ın", "+IMP+2PL"),
    pynini.cross("un", "+IMP+2PL"), pynini.cross("ün", "+IMP+2PL"),
    pynini.cross("iniz", "+IMP+2PL"), pynini.cross("ınız", "+IMP+2PL"),
    pynini.cross("unuz", "+IMP+2PL"), pynini.cross("ünüz", "+IMP+2PL"),
    pynini.cross("sinler", "+IMP+3PL"), pynini.cross("sınlar", "+IMP+3PL"),
    pynini.cross("sunlar", "+IMP+3PL"), pynini.cross("sünler", "+IMP+3PL")
)

imperative_2sg_bare = pynini.cross("", "+IMP+2SG")

# Build verb structure
verb_base = verb_roots + voice + ability + negation

# Verb paths
verb_indicative = verb_base + indicative_tense + indicative_person
verb_optative = verb_base + optative_mood_person
verb_conditional = verb_base + conditional_mood_person
verb_necessitative = verb_base + necessitative_mood_person
verb_imperative = verb_base + pynini.union(imperative_mood_person, imperative_2sg_bare)

# Combine all verb paths
verb_complete = pynini.union(
    verb_indicative,
    verb_optative,
    verb_conditional,
    verb_necessitative,
    verb_imperative
).optimize()

# ===== PUNCTUATION =====
punctuation = pynini.union(
    pynini.cross(".", "+PUNCT.period"),
    pynini.cross(",", "+PUNCT.comma"),
    pynini.cross("?", "+PUNCT.question"),
    pynini.cross("!", "+PUNCT.exclamation"),
    pynini.cross(":", "+PUNCT.colon"),
    pynini.cross(";", "+PUNCT.semicolon"),
    pynini.cross("", "")
)

# ===== COMPLETE ANALYZER =====
simple_categories = pynini.union(
    adverb_roots,
    postposition_roots,
    interjection_roots,
    conjunction_roots,
    question_particles
)

nominal_fst = (nominal_complete + punctuation).optimize()
verb_fst = (verb_complete + punctuation).optimize()
simple_fst = (simple_categories + punctuation).optimize()

turkish_analyzer = pynini.union(nominal_fst, verb_fst, simple_fst).optimize()

def analyze(word):
    """Analyze a Turkish word and return all possible analyses."""
    try:
        lattice = pynini.compose(word, turkish_analyzer)
        analyses = []
        seen = set()
        try:
            for path in lattice.paths().ostrings():
                if path not in seen:
                    analyses.append(path)
                    seen.add(path)
        except:
            pass
        return sorted(analyses) if analyses else [f"No analysis found for: {word}"]
    except Exception as e:
        return [f"Error: {str(e)}"]

def disambiguate_analyses(word, analyses, prefer_verb=False):
    """
    Apply simple heuristics to reduce ambiguity.
    This is a basic rule-based approach. For production use,
    statistical/neural methods are recommended.
    """
    if not analyses or analyses[0].startswith("No analysis"):
        return analyses

    filtered = []

    for analysis in analyses:
        skip = False

        # Rule 1: "misin", "misin?", etc. are question particles, not "mis+POSS"
        if any(q in word.lower() for q in ['misin', 'mısın', 'musun', 'müsün', 'miyim', 'mıyım']):
            if "+QUES+" in analysis:
                filtered.append(analysis)
                skip = True
            elif "+NOUN+POSS" in analysis:
                skip = True  # Skip mis+NOUN+POSS

        # Rule 2: Words ending with -dım/-dim/-dum/-düm are likely VERB+PAST+1SG
        if not skip and word.endswith(('dım', 'dim', 'dum', 'düm')):
            if "+VERB+PAST+1SG" in analysis:
                filtered.append(analysis)
                skip = True
            else:
                skip = True  # Skip other analyses

        # Rule 3: Words ending with -dı/-di/-du/-dü/-tı/-ti/-tu/-tü are likely VERB+PAST
        if not skip and word.endswith(('dı', 'di', 'du', 'dü', 'tı', 'ti', 'tu', 'tü')):
            if "+VERB+PAST" in analysis:
                filtered.append(analysis)
            # Don't skip others completely, but prefer verb

        # Rule 4: Words ending with -yor/-yorum/-yorsun are definitely VERB+PRES.CONT
        elif not skip and any(word.endswith(x) for x in ['yor', 'yorum', 'yorsun', 'iyorum', 'ıyorum', 'uyorum', 'üyorum']):
            if "+VERB+PRES.CONT" in analysis:
                filtered.append(analysis)
                skip = True
            else:
                skip = True

        # Rule 5: Words ending with -ecek/-acak could be verb or noun, prefer verb in context
        elif not skip and word.endswith(('ecek', 'acak', 'eceğim', 'acağım')):
            if prefer_verb and "+VERB+" in analysis:
                filtered.append(analysis)
            elif not prefer_verb:
                filtered.append(analysis)

        # Rule 6: Single syllable pronouns like "ben" are likely PRON, not NOUN
        elif not skip and word.lower() in ['ben', 'sen', 'biz', 'siz']:
            if "+PRON" in analysis:
                filtered.append(analysis)
            # Don't completely skip NOUN, but prefer PRON

        # Add anything not caught by rules
        if not skip:
            filtered.append(analysis)

    # If filtering removed everything, return original
    return filtered if filtered else analyses

def analyze_with_disambiguation(word, prefer_verb=False):
    """Analyze word and apply basic disambiguation."""
    raw_analyses = analyze(word)
    return disambiguate_analyses(word, raw_analyses, prefer_verb)

def analyze_sentence(sentence, use_disambiguation=True):
    """
    Analyze a sentence by tokenizing and analyzing each word.
    Optionally applies context-based disambiguation.
    """
    tokens = sentence.split()
    results = []

    for i, token in enumerate(tokens):
        # Simple context: if previous word was a noun, current might be verb
        prefer_verb = False
        if i > 0 and results:
            prev_analyses = results[-1]['analyses']
            if any('+NOUN+' in a or '+ADJ+' in a for a in prev_analyses):
                prefer_verb = True

        if use_disambiguation:
            analyses = analyze_with_disambiguation(token, prefer_verb)
        else:
            analyses = analyze(token)

        results.append({
            'token': token,
            'analyses': analyses
        })

    return results

def save_fst(filename):
    """Save the compiled FST to a file."""
    turkish_analyzer.write(filename)
    print(f"FST saved to {filename}")

# Test
if __name__ == "__main__":
    print("="*60)
    print("LEXICON LOADED")
    print("="*60)
    print(f"Nouns: {len(lexicon.get('nouns', []))} words")
    print(f"Verbs: {len(lexicon.get('verbs', []))} words (infinitives → roots extracted)")
    print(f"Adjectives: {len(lexicon.get('adjectives', []))} words")
    print(f"Pronouns: {len(lexicon.get('pronouns', []))} words")
    print(f"Adverbs: {len(lexicon.get('adverbs', []))} words")
    print(f"Conjunctions: {len(lexicon.get('conjunctions', []))} words")
    print(f"Postpositions: {len(lexicon.get('postpositions', []))} words")
    print(f"Proper Nouns: {len(lexicon.get('proper_nouns', []))} words")

    # Show some verb root examples
    if lexicon.get('verbs'):
        print("\nVerb root examples:")
        for verb in lexicon.get('verbs', [])[:5]:
            root = extract_verb_root(verb)
            print(f"  {verb} → {root}")

    print("\n" + "="*60)
    print("DEBUG: Testing verb compositions")
    print("="*60)

    test = "gel"
    try:
        lattice = pynini.compose(test, verb_imperative)
        print(f"\n'{test}' + verb_imperative:")
        for path in lattice.paths().ostrings():
            print(f"  ✓ {path}")
    except Exception as e:
        print(f"  ✗ Error: {e}")

    test2 = "gelsin"
    try:
        lattice = pynini.compose(test2, verb_imperative)
        print(f"\n'{test2}' + verb_imperative:")
        for path in lattice.paths().ostrings():
            print(f"  ✓ {path}")
    except Exception as e:
        print(f"  ✗ Error: {e}")

    print("\n" + "="*60)
    print("SINGLE WORD ANALYSIS")
    print("="*60)

    test_words = [
        # Dilek Kipleri
        "gel",            # come! (imperative 2sg)
        "gelin",          # come! (imperative 2pl)
        "gelsin",         # let him/her come (imperative 3sg)
        "gelsem",         # if I come (conditional)
        "geleyim",        # let me come (optative)
        "gelmeli",        # must come (necessitative)
        "yazmalıyım",     # I must write
        "okusana",        # read then! (optative emphatic)
        "görseler",       # if they see (conditional)
        # Bildirme Kipleri
        "okudum",         # I read (past)
        "gelebilecek",    # will be able to come (future)
        "geliyorum",      # I am coming (present continuous)
        # Nouns
        "kitaplardan",    # from books
        "kalemlik",       # pencil case
        "evdekiler",      # those in the house
    ]

    for word in test_words:
        print(f"\n{word}:")
        analyses = analyze(word)
        for analysis in analyses[:5]:
            print(f"  {analysis}")
        if len(analyses) > 5:
            print(f"  ... and {len(analyses) - 5} more analyses")

    print("\n" + "="*60)
    print("MULTI-WORD ANALYSIS")
    print("="*60)

    test_sentences = [
        "evde misin?",
        "kitap da güzel",
        "gel buraya!",
        "oraya git",
        "yukarı çık",
        "ben yemeğe gidiyorum gelecek misin?",
        "kitabı aldım",
        "ağaca çıktı",
    ]

    for sentence in test_sentences:
        print(f"\n'{sentence}':")
        results = analyze_sentence(sentence)
        for result in results:
            print(f"  {result['token']}:")
            for analysis in result['analyses'][:3]:
                print(f"    {analysis}")
            if len(result['analyses']) > 3:
                print(f"    ... and {len(result['analyses']) - 3} more")

LEXICON LOADED
Nouns: 56691 words
Verbs: 3455 words (infinitives → roots extracted)
Adjectives: 5504 words
Pronouns: 50 words
Adverbs: 1300 words
Conjunctions: 53 words
Postpositions: 0 words
Proper Nouns: 0 words

Verb root examples:
  polenlemek → polenle
  kubaşmak → kubaş
  sayıklamak → sayıkla
  ölçeklemek → ölçekle
  atkılamak → atkıla

DEBUG: Testing verb compositions

'gel' + verb_imperative:
  ✓ gel+VERB+IMP+2SG

'gelsin' + verb_imperative:
  ✓ gel+VERB+IMP+3SG

SINGLE WORD ANALYSIS

gel:
  gel+VERB+IMP+2SG

gelin:
  gel+VERB+IMP+2PL
  gel+VERB+REFL+IMP+2SG
  gelin+NOUN

gelsin:
  gel+VERB+IMP+3SG

gelsem:
  gel+VERB+COND+1SG

geleyim:
  gel+VERB+OPT+1SG

gelmeli:
  gel+VERB+NEC+3SG

yazmalıyım:
  yaz+VERB+NEC+1SG

okusana:
  oku+VERB+OPT+2SG+EMPH

görseler:
  gör+VERB+COND+3PL

okudum:
  oku+VERB+PAST+1SG

gelebilecek:
  gel+VERB+ABIL+FUT+3SG

geliyorum:
  gel+VERB+PRES.CONT+1SG

kitaplardan:
  kitap+NOUN+PL+ABL

kalemlik:
  kalem+NOUN+DER.lik

evdekiler:
  ev+NOUN+LOC+KI+PL


In [8]:
for word in ["gelecek misin", "yapacak mısın", "gidiyorum", "çıktı", "gelecek", "ben"]:
    print(f"\n{word}:")
    raw = analyze(word)
    filtered = analyze_with_disambiguation(word)
    print(f"  RAW ({len(raw)}): {raw}")
    print(f"  FILTERED ({len(filtered)}): {filtered}")



gelecek misin:
  RAW (1): ['No analysis found for: gelecek misin']
  FILTERED (1): ['No analysis found for: gelecek misin']

yapacak mısın:
  RAW (1): ['No analysis found for: yapacak mısın']
  FILTERED (1): ['No analysis found for: yapacak mısın']

gidiyorum:
  RAW (1): ['git+VERB+PRES.CONT+1SG']
  FILTERED (1): ['git+VERB+PRES.CONT+1SG']

çıktı:
  RAW (2): ['çık+VERB+PAST+3SG', 'çıktı+NOUN']
  FILTERED (3): ['çık+VERB+PAST+3SG', 'çık+VERB+PAST+3SG', 'çıktı+NOUN']

gelecek:
  RAW (3): ['gel+VERB+FUT+3SG', 'gelecek+ADJ', 'gelecek+NOUN']
  FILTERED (6): ['gel+VERB+FUT+3SG', 'gel+VERB+FUT+3SG', 'gelecek+ADJ', 'gelecek+ADJ', 'gelecek+NOUN', 'gelecek+NOUN']

ben:
  RAW (2): ['ben+NOUN', 'ben+PRON']
  FILTERED (3): ['ben+NOUN', 'ben+PRON', 'ben+PRON']
