In [3]:
# Setup
!pip install --only-binary :all: pynini
!pip install wurlitzer
import pynini
%load_ext wurlitzer

Collecting pynini
  Downloading pynini-2.1.7-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.7 kB)
Downloading pynini-2.1.7-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (165.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m165.5/165.5 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynini
Successfully installed pynini-2.1.7


In [4]:
import pynini
import json
import math
from collections import defaultdict

# ===== LOAD LEXICON FROM JSON =====
def load_lexicon(json_file='turkish_lexicon.json'):
    """Read the Turkish lexicon from a JSON file.

    Args:
        json_file: Path of the UTF-8 encoded JSON lexicon file.

    Returns:
        The parsed JSON content. When the file does not exist, a warning is
        printed and a small built-in lexicon (a dict mapping category names
        to word lists) is returned instead.
    """
    fallback_lexicon = {
        "nouns": ["ev", "kitap", "masa"],
        "verbs": ["gel", "git", "oku"],
        "adjectives": ["güzel", "iyi"],
        "pronouns": ["ben", "sen", "o"],
        "adverbs": ["çok", "az"],
        "conjunctions": ["ve", "da", "de"],
        "postpositions": ["gibi", "için"],
        "proper_nouns": []
    }

    try:
        handle = open(json_file, 'r', encoding='utf-8')
    except FileNotFoundError:
        print(f"Warning: {json_file} not found. Using default minimal lexicon.")
        return fallback_lexicon

    # The file exists: parse it and make sure the handle is closed afterwards.
    with handle:
        return json.load(handle)

# Load the lexicon once when the cell runs; the root FSTs defined further down
# read their word lists from this object via lexicon.get(...).
lexicon = load_lexicon()

# ===== HELPER FUNCTIONS =====
def extract_verb_root(verb_infinitive):
    """Strip the infinitive ending (-mak / -mek) from a verb.

    Args:
        verb_infinitive: Verb as listed in the lexicon.

    Returns:
        The input without its final three characters when it ends in
        "mak" or "mek"; otherwise the input unchanged (it may already be
        a bare root).
    """
    infinitive_endings = ('mak', 'mek')
    if not verb_infinitive.endswith(infinitive_endings):
        return verb_infinitive
    return verb_infinitive[:-3]

def create_alternating_roots(words, tag):
    """Build a root transducer that also accepts consonant-softened stems.

    Every word is mapped to "<word>+<tag>". A word longer than one character
    that ends in p, ç, t or k additionally gets a softened surface variant
    (p→b, ç→c, t→d, k→ğ after a vowel, k→g otherwise) that maps to the same
    "<word>+<tag>" analysis, so the stem is recognised before vowel-initial
    suffixes.

    Args:
        words: Iterable of root strings.
        tag: Category tag appended to the analysis (e.g. "NOUN").

    Returns:
        The union of all root transducers, or the empty-string transducer
        when no words are given.
    """
    vowels = 'aeıioöuü'
    voiced_counterpart = {'p': 'b', 'ç': 'c', 't': 'd'}

    arcs = []
    for surface in words:
        analysis = f"{surface}+{tag}"
        # Listed form, used before consonant-initial suffixes
        arcs.append(pynini.cross(surface, analysis))

        if len(surface) <= 1:
            continue

        body, last = surface[:-1], surface[-1]
        if last == 'k':
            # k → ğ after a vowel, k → g after a consonant
            voiced = 'ğ' if body[-1] in vowels else 'g'
        elif last in voiced_counterpart:
            voiced = voiced_counterpart[last]
        else:
            continue

        # Softened form, used before vowel-initial suffixes
        arcs.append(pynini.cross(body + voiced, analysis))

    if not arcs:
        return pynini.cross("", "")
    return pynini.union(*arcs)

# ===== ROOTS BY LEXICAL CATEGORY =====

def _softening_roots(category, tag):
    """Roots of a lexicon category, including consonant-softened stem variants.

    Returns the empty-string transducer when the category is missing or empty.
    """
    if not lexicon.get(category):
        return pynini.cross("", "")
    return create_alternating_roots(lexicon.get(category, []), tag)


def _plain_roots(category, tag):
    """Roots of a lexicon category, each mapped to "<entry>+<tag>" as listed.

    Returns the empty-string transducer when the category is missing or empty.
    """
    if not lexicon.get(category):
        return pynini.cross("", "")
    return pynini.union(*[
        pynini.cross(entry, f"{entry}+{tag}")
        for entry in lexicon.get(category, [])
    ])


# Nouns - with consonant softening
noun_roots = _softening_roots('nouns', 'NOUN')

# Adjectives - with consonant softening
adj_roots = _softening_roots('adjectives', 'ADJ')

# Verbs - strip the infinitive ending first, then apply consonant softening
verb_root_list = [
    extract_verb_root(infinitive) for infinitive in lexicon.get('verbs', [])
]
if verb_root_list:
    verb_roots = create_alternating_roots(verb_root_list, 'VERB')
else:
    verb_roots = pynini.cross("", "")

# Pronouns - taken as listed (no softening), plus the irregular dative forms
irregular_pronouns = pynini.union(
    pynini.cross("bana", "ben+PRON+DAT"),
    pynini.cross("sana", "sen+PRON+DAT")
)
pronoun_roots = pynini.union(_plain_roots('pronouns', 'PRON'), irregular_pronouns)

# Uninflected categories: adverbs, postpositions, interjections, conjunctions
adverb_roots = _plain_roots('adverbs', 'ADV')
postposition_roots = _plain_roots('postpositions', 'POSTP')
interjection_roots = _plain_roots('interjections', 'INTERJ')
conjunction_roots = _plain_roots('conjunctions', 'CONJ')

# Proper nouns - with consonant softening
proper_noun_roots = _softening_roots('proper_nouns', 'PROPN')

# Question particles. The analysis keeps the bare particle (with its vowel)
# followed by +QUES and, for the inflected forms, a person tag.
_BARE_QUESTION_PARTICLES = ("mi", "mı", "mu", "mü")
_QUESTION_PARTICLE_ROWS = (
    # (person tag, surface forms in the same vowel order as the bare particles)
    ("", _BARE_QUESTION_PARTICLES),
    ("+2SG", ("misin", "mısın", "musun", "müsün")),
    ("+1SG", ("miyim", "mıyım", "muyum", "müyüm")),
    ("+1PL", ("miyiz", "mıyız", "muyuz", "müyüz")),
    ("+2PL", ("misiniz", "mısınız", "musunuz", "müsünüz")),
)
question_particles = pynini.union(*[
    pynini.cross(surface, f"{bare}+QUES{person_tag}")
    for person_tag, surfaces in _QUESTION_PARTICLE_ROWS
    for bare, surface in zip(_BARE_QUESTION_PARTICLES, surfaces)
])

# ===== DERIVATIONAL SUFFIXES (YAPIM EKLERİ) =====

# A) NOUN-TO-NOUN derivation
# Covers -lik, -ci, -li, -siz, -ki and -ce. Every harmonic variant of a suffix
# maps to ONE canonical tag (e.g. lık/lik/luk/lük -> +DER.lik) so that code
# matching on the tag string sees the same marker regardless of vowel harmony.
_DERIV_N_N_FORMS = (
    # -lik (place, instrument, profession, abstract noun)
    ("+DER.lik", ("lık", "lik", "luk", "lük")),
    # -ci (profession, habit); ç-initial variants follow voiceless consonants
    ("+DER.ci", ("cı", "ci", "cu", "cü", "çı", "çi", "çu", "çü")),
    # -li (possessing)
    ("+DER.li", ("lı", "li", "lu", "lü")),
    # -siz (lacking)
    ("+DER.siz", ("sız", "siz", "suz", "süz")),
    # -ki (belonging; usually attaches to locative or time nouns)
    ("+DER.ki", ("ki", "kü")),
    # -ce (language name, diminutive)
    ("+DER.ce", ("ca", "ce", "ça", "çe")),
)
deriv_n_n = pynini.union(
    *[
        pynini.cross(surface, tag)
        for tag, surfaces in _DERIV_N_N_FORMS
        for surface in surfaces
    ],
    # The derivational suffix is optional
    pynini.cross("", ""),
)

# B) VERB-TO-NOUN derivation
# Rows: infinitive -mak, verbal noun -ma, -iş, -im, -gin, agentive -ici and
# participle -an. Each harmonic variant maps to one canonical tag.
# NOTE(review): the -gin row has no "kun"/"kün" variants — confirm whether
# that omission is intentional.
_DERIV_V_N_FORMS = (
    # -mak (infinitive)
    ("+DER.mak", ("mak", "mek")),
    # -ma (verbal noun, not the negation marker) - needs disambiguation later
    ("+DER.ma", ("ma", "me")),
    # -iş (lexicalised verbal noun)
    ("+DER.iş", ("ış", "iş", "uş", "üş")),
    # -im (deverbal noun)
    ("+DER.im", ("ım", "im", "um", "üm")),
    # -gin (adjective / noun)
    ("+DER.gin", ("gın", "gin", "gun", "gün", "kın", "kin")),
    # -ici (agent)
    ("+DER.ici", ("ıcı", "ici", "ucu", "ücü")),
    # -an (participle)
    ("+DER.an", ("an", "en")),
)
deriv_v_n = pynini.union(
    *[
        pynini.cross(surface, tag)
        for tag, surfaces in _DERIV_V_N_FORMS
        for surface in surfaces
    ],
    # The derivational suffix is optional
    pynini.cross("", ""),
)

# C) NOUN-TO-VERB derivation
_DERIV_N_V_FORMS = (
    # -le (denominal verb)
    ("+DER.la", ("la", "le")),
    # -len (reflexive)
    ("+DER.lan", ("lan", "len")),
    # -leş (becoming)
    ("+DER.laş", ("laş", "leş")),
)
deriv_n_v = pynini.union(
    *[
        pynini.cross(surface, tag)
        for tag, surfaces in _DERIV_N_V_FORMS
        for surface in surfaces
    ],
    # The derivational suffix is optional
    pynini.cross("", ""),
)

# D) VERB-TO-VERB derivation (voice)
# Note: these usually go BEFORE the tense markers.
# The causative and reflexive rows list the complete harmonic sets
# (including tur/tür and un/ün), matching the `voice` table in this file.
_DERIV_V_V_FORMS = (
    # -t (causative)
    ("+VOICE.t", ("t",)),
    # -dir (causative); t-initial variants follow voiceless consonants
    ("+VOICE.dir", ("dır", "dir", "dur", "dür", "tır", "tir", "tur", "tür")),
    # -il (passive)
    ("+VOICE.il", ("ıl", "il", "ul", "ül")),
    # -in / -n (reflexive)
    ("+VOICE.in", ("ın", "in", "un", "ün", "n")),
)
deriv_v_v = pynini.union(
    *[
        pynini.cross(surface, tag)
        for tag, surfaces in _DERIV_V_V_FORMS
        for surface in surfaces
    ],
    # The voice suffix is optional
    pynini.cross("", ""),
)

# ===== NOUN/ADJECTIVE MORPHOLOGY =====
# Root classes that can head a nominal word form: nouns, adjectives,
# pronouns and proper nouns.
nominal_roots = pynini.union(noun_roots, adj_roots, pronoun_roots, proper_noun_roots)

# 1. Nominal path: a nominal root followed by a noun-to-noun derivational
#    suffix (deriv_n_n has an empty alternative, so the suffix may be absent).
nominal_derived = (nominal_roots + deriv_n_n).optimize()
# Verbs can also become nominal bases (e.g. "okumayı"): verb root, then a
# verb-to-verb (voice) suffix, then a verb-to-noun suffix.
verb_to_noun_derived = (verb_roots + deriv_v_v + deriv_v_n).optimize()

# Both kinds of base are valid starting points for nominal inflection
nominal_base_all = pynini.union(nominal_derived, verb_to_noun_derived)

# Optional plural marker (-lar / -ler)
plural = pynini.union(
    pynini.cross("lar", "+PL"),
    pynini.cross("ler", "+PL"),
    pynini.cross("", "")
)

# Nominal base plus optional plural; possessive and case paths attach to this
nominal_pl = nominal_base_all + plural

# Possessive suffixes, grouped by person/number. There is no empty
# alternative: a path through this table always carries a possessive marker.
_POSSESSIVE_FORMS = (
    ("+POSS.1PL", ("imiz", "ımız", "umuz", "ümüz")),
    ("+POSS.2PL", ("iniz", "ınız", "unuz", "ünüz")),
    ("+POSS.3PL", ("leri", "ları")),
    ("+POSS.1SG", ("im", "ım", "um", "üm")),
    ("+POSS.2SG", ("in", "ın", "un", "ün")),
    ("+POSS.3SG", ("si", "sı", "su", "sü")),
)
possessive = pynini.union(*[
    pynini.cross(surface, tag)
    for tag, surfaces in _POSSESSIVE_FORMS
    for surface in surfaces
])

# Case suffixes used after a possessive suffix. Besides the plain forms the
# table lists variants with a leading "n" or "y"; the case is optional.
_CASE_AFTER_POSS_FORMS = (
    ("+ABL", ("dan", "den", "tan", "ten", "ndan", "nden", "ntan", "nten")),
    ("+GEN", ("nın", "nin", "nun", "nün")),
    ("+LOC", ("da", "de", "ta", "te", "nda", "nde", "nta", "nte")),
    ("+DAT", ("ya", "ye", "na", "ne")),
    ("+ACC", ("yı", "yi", "yu", "yü", "nı", "ni", "nu", "nü")),
    ("+INS", ("yla", "yle")),
    ("+EQU", ("ca", "ce")),
)
case_after_poss = pynini.union(
    *[
        pynini.cross(surface, tag)
        for tag, surfaces in _CASE_AFTER_POSS_FORMS
        for surface in surfaces
    ],
    # No case marker
    pynini.cross("", ""),
)

# Optional -ki / -kü after the case marker
ki_suffix = pynini.union(
    *[pynini.cross(form, "+KI") for form in ("ki", "kü")],
    pynini.cross("", ""),
)

# Optional plural after -ki
plural_after_ki = pynini.union(
    *[pynini.cross(form, "+PL") for form in ("ler", "lar")],
    pynini.cross("", ""),
)

# Nominal (+plural) + possessive + optional case + optional -ki + optional plural
possessive_path = nominal_pl + possessive + case_after_poss + ki_suffix + plural_after_ki

# Case suffixes used directly on a nominal (no possessive in between).
# The groups are listed in sequence; +GEN appears twice because the
# "ların"/"lerin" forms are listed separately from the n-initial forms.
_CASE_NO_POSS_FORMS = (
    ("+GEN", ("ların", "lerin")),
    ("+ABL", ("dan", "den", "tan", "ten")),
    ("+GEN", ("nın", "nin", "nun", "nün")),
    ("+LOC", ("da", "de", "ta", "te")),
    ("+DAT", ("ya", "ye", "a", "e")),
    ("+ACC", ("yı", "yi", "yu", "yü", "ı", "i", "u", "ü")),
    ("+INS", ("la", "le", "yla", "yle")),
    ("+EQU", ("ca", "ce")),
)
case_no_poss = pynini.union(
    *[
        pynini.cross(surface, tag)
        for tag, surfaces in _CASE_NO_POSS_FORMS
        for surface in surfaces
    ],
    # No case marker
    pynini.cross("", ""),
)

# Optional -ki / -kü after the case marker (case-only path)
ki_suffix_case = pynini.union(
    *[pynini.cross(form, "+KI") for form in ("ki", "kü")],
    pynini.cross("", ""),
)

# Optional plural after -ki (case-only path)
plural_after_ki_case = pynini.union(
    *[pynini.cross(form, "+PL") for form in ("ler", "lar")],
    pynini.cross("", ""),
)

# Nominal (+plural) + optional case + optional -ki + optional plural
case_only_path = nominal_pl + case_no_poss + ki_suffix_case + plural_after_ki_case

# A nominal word form follows either the possessive path or the case-only path
nominal_final_base = pynini.union(possessive_path, case_only_path).optimize()

# 2. Verb stems come from two sources:
#    - a nominal root followed by a noun-to-verb derivational suffix, and
#    - a verb root followed by a verb-to-verb (voice) suffix.
#    Both suffix tables contain an empty alternative, so the suffix is optional.
noun_to_verb_stem = (nominal_roots + deriv_n_v).optimize()
verb_to_verb_stem = (verb_roots + deriv_v_v).optimize()
verb_stems_all = pynini.union(noun_to_verb_stem, verb_to_verb_stem)

# Copula markers on nominals. The y-initial forms are listed first, followed
# by the forms without "y"; there is no empty alternative in this table.
_COPULA_FORMS = (
    ("+COP.PAST", ("ydi", "ydı", "ydu", "ydü")),
    ("+COP.EVID", ("ymış", "ymiş", "ymuş", "ymüş")),
    ("+COP.COND", ("yse", "ysa")),
    ("+COP.PAST", ("di", "dı", "du", "dü", "ti", "tı", "tu", "tü")),
    ("+COP.EVID", ("miş", "mış", "muş", "müş")),
    ("+COP.COND", ("se", "sa")),
    ("+COP.PRES", ("dir", "dır", "dur", "dür", "tir", "tır", "tur", "tür")),
)
copula = pynini.union(*[
    pynini.cross(surface, tag)
    for tag, surfaces in _COPULA_FORMS
    for surface in surfaces
])

# Person endings that may follow the copula; the ending is optional
_COPULA_PERSON_FORMS = (
    ("+1SG", ("im", "ım", "um", "üm")),
    ("+2SG", ("in", "ın", "un", "ün")),
    ("+1PL", ("ız", "iz", "uz", "üz")),
    ("+2PL", ("nız", "niz", "nuz", "nüz")),
    ("+3PL", ("lar", "ler")),
)
person = pynini.union(
    *[
        pynini.cross(surface, tag)
        for tag, surfaces in _COPULA_PERSON_FORMS
        for surface in surfaces
    ],
    pynini.cross("", ""),
)

# A complete nominal is the inflected base, optionally followed by a copula
# marker with its (optional) person ending.
nominal_with_cop = copula + person
nominal_without_cop = pynini.cross("", "")
nominal_complete = nominal_final_base + pynini.union(nominal_with_cop, nominal_without_cop).optimize()

# ===== VERB MORPHOLOGY =====

# Voice markers (optional): passive, reflexive, reciprocal and causative.
# NOTE(review): this table is not referenced by the verb paths visible in
# this section — confirm whether it is used elsewhere.
_VOICE_FORMS = (
    ("+PASS", ("ıl", "il", "ul", "ül")),
    ("+REFL", ("ın", "in", "un", "ün", "lan", "len")),
    ("+RECIP", ("ış", "iş", "uş", "üş")),
    ("+CAUS", ("t", "d", "dır", "dir", "dur", "dür", "tır", "tir", "tur", "tür")),
)
voice = pynini.union(
    *[
        pynini.cross(surface, tag)
        for tag, surfaces in _VOICE_FORMS
        for surface in surfaces
    ],
    pynini.cross("", ""),
)

# Ability marker -ebil / -abil (optional)
ability = pynini.union(
    *[pynini.cross(form, "+ABIL") for form in ("ebil", "abil")],
    pynini.cross("", ""),
)

# Negation marker -ma / -me (optional)
negation = pynini.union(
    *[pynini.cross(form, "+NEG") for form in ("ma", "me")],
    pynini.cross("", ""),
)

# Indicative tenses: present continuous, future, aorist, past and
# inferential. A tense marker is mandatory on this path (no empty alternative).
_INDICATIVE_TENSE_FORMS = (
    ("+PRES.CONT", ("iyor", "ıyor", "uyor", "üyor")),
    ("+FUT", ("ecek", "acak")),
    ("+AOR", ("ır", "ir", "ur", "ür", "ar", "er", "r")),
    ("+PAST", ("dı", "di", "du", "dü", "tı", "ti", "tu", "tü")),
    ("+INFER", ("mış", "miş", "muş", "müş")),
)
indicative_tense = pynini.union(*[
    pynini.cross(surface, tag)
    for tag, surfaces in _INDICATIVE_TENSE_FORMS
    for surface in surfaces
])

# Person endings after an indicative tense marker. The absence of an ending
# is analysed as third person singular.
_INDICATIVE_PERSON_FORMS = (
    ("+1SG", ("um", "üm", "ım", "im", "m")),
    ("+2SG", ("sun", "sün", "sın", "sin", "n")),
    ("+1PL", ("uz", "üz", "ız", "iz", "k")),
    ("+2PL", ("sunuz", "sünüz", "sınız", "siniz", "nız", "niz", "nuz", "nüz")),
    ("+3PL", ("lar", "ler")),
)
indicative_person = pynini.union(
    *[
        pynini.cross(surface, tag)
        for tag, surfaces in _INDICATIVE_PERSON_FORMS
        for surface in surfaces
    ],
    pynini.cross("", "+3SG"),
)

# Subjunctive / optative moods

# Optative: mood and person are fused into a single suffix. The groups are
# listed in sequence; +OPT+2SG appears twice because the "asan"/"esen" forms
# follow the emphatic forms.
_OPTATIVE_FORMS = (
    ("+OPT+1SG", ("ayım", "eyim", "ayum", "eyüm")),
    ("+OPT+2SG", ("asın", "esin", "asun", "esün")),
    ("+OPT+2SG+EMPH", ("asana", "esene", "sana", "sene")),
    ("+OPT+2SG", ("asan", "esen")),
    ("+OPT+3SG", ("a", "e")),
    ("+OPT+1PL", ("alım", "elim", "alum", "elüm")),
    ("+OPT+2PL", ("asınız", "esiniz", "asunuz", "esünüz")),
    ("+OPT+3PL", ("alar", "eler")),
)
optative_mood_person = pynini.union(*[
    pynini.cross(surface, tag)
    for tag, surfaces in _OPTATIVE_FORMS
    for surface in surfaces
])

# Conditional mood: mood and person fused into a single suffix
_CONDITIONAL_FORMS = (
    ("+COND+1SG", ("sam", "sem")),
    ("+COND+2SG", ("san", "sen")),
    ("+COND+3SG", ("sa", "se")),
    ("+COND+1PL", ("sak", "sek")),
    ("+COND+2PL", ("sanız", "seniz", "sanuz", "senüz")),
    ("+COND+3PL", ("salar", "seler")),
)
conditional_mood_person = pynini.union(*[
    pynini.cross(surface, tag)
    for tag, surfaces in _CONDITIONAL_FORMS
    for surface in surfaces
])

# ...


# Necessitative mood (-malı / -meli): mood and person fused into a single suffix
_NECESSITATIVE_FORMS = (
    ("+NEC+1SG", ("malıyım", "meliyim", "malıyum", "meliyüm")),
    ("+NEC+2SG", ("malısın", "melisin", "malısun", "melisün")),
    ("+NEC+3SG", ("malı", "meli")),
    ("+NEC+1PL", ("malıyız", "meliyiz", "malıyuz", "meliyüz")),
    ("+NEC+2PL", ("malısınız", "melisiniz", "malısunuz", "melisünüz")),
    ("+NEC+3PL", ("malılar", "meliler")),
)
necessitative_mood_person = pynini.union(*[
    pynini.cross(surface, tag)
    for tag, surfaces in _NECESSITATIVE_FORMS
    for surface in surfaces
])

# Imperative mood: explicit endings for 3SG, 2PL and 3PL
_IMPERATIVE_FORMS = (
    ("+IMP+3SG", ("sin", "sın", "sun", "sün")),
    ("+IMP+2PL", ("in", "ın", "un", "ün", "iniz", "ınız", "unuz", "ünüz")),
    ("+IMP+3PL", ("sinler", "sınlar", "sunlar", "sünler")),
)
imperative_mood_person = pynini.union(*[
    pynini.cross(surface, tag)
    for tag, surfaces in _IMPERATIVE_FORMS
    for surface in surfaces
])

# Second person singular imperative has no ending: the bare stem
imperative_2sg_bare = pynini.cross("", "+IMP+2SG")

# Shared verb prefix: stem, then the ability marker, then the negation marker.
# Both marker tables contain an empty alternative, so each is optional.
verb_base = verb_stems_all + ability + negation

# One path per mood. The indicative path takes a tense marker followed by a
# person ending; the other moods use suffixes that already encode the person.
verb_indicative = verb_base + indicative_tense + indicative_person
verb_optative = verb_base + optative_mood_person
verb_conditional = verb_base + conditional_mood_person
verb_necessitative = verb_base + necessitative_mood_person
# Imperative: either an explicit ending or the bare stem (2SG)
verb_imperative = verb_base + pynini.union(imperative_mood_person, imperative_2sg_bare)

# Union of all verb paths, optimized once
verb_complete = pynini.union(
    verb_indicative,
    verb_optative,
    verb_conditional,
    verb_necessitative,
    verb_imperative
).optimize()

# ===== PUNCTUATION =====
# Optional trailing punctuation mark, rewritten to a +PUNCT.<name> tag
_PUNCTUATION_TAGS = (
    (".", "+PUNCT.period"),
    (",", "+PUNCT.comma"),
    ("?", "+PUNCT.question"),
    ("!", "+PUNCT.exclamation"),
    (":", "+PUNCT.colon"),
    (";", "+PUNCT.semicolon"),
)
punctuation = pynini.union(
    *[pynini.cross(mark, tag) for mark, tag in _PUNCTUATION_TAGS],
    # No punctuation
    pynini.cross("", ""),
)

# ===== COMPLETE ANALYZER =====
# Categories that take no suffixes in this grammar: their roots are used as-is
simple_categories = pynini.union(
    adverb_roots,
    postposition_roots,
    interjection_roots,
    conjunction_roots,
    question_particles
)

# Every word class may be followed by the `punctuation` transducer (which
# includes an empty alternative)
nominal_fst = (nominal_complete + punctuation).optimize()
verb_fst = (verb_complete + punctuation).optimize()
simple_fst = (simple_categories + punctuation).optimize()

# Final analyzer: surface form -> analysis string, over all word classes
turkish_analyzer = pynini.union(nominal_fst, verb_fst, simple_fst).optimize()

def analyze(word):
    """Return all morphological analyses of a single token.

    The token is looked up as written first. If that yields nothing, a
    Turkish-aware lowercase form is tried (İ→i, I→ı), and finally the plain
    ``str.lower()`` form, so that both "İstanbul" and ASCII-typed
    "Internet" can reach lowercase lexicon entries.

    Args:
        word: Surface token to analyze.

    Returns:
        A sorted list of unique analysis strings. If no analysis exists the
        list holds the single message "No analysis found for: <word>"; if
        the lookup raises, it holds "Error: <message>".
    """
    try:
        # Candidate spellings in lookup order, without duplicates.
        turkish_lower = word.replace("\u0130", "i").replace("I", "\u0131").lower()
        candidates = []
        for surface in (word, turkish_lower, word.lower()):
            if surface not in candidates:
                candidates.append(surface)

        for surface in candidates:
            # escape() keeps "[", "]" and "\" from being read as pynini
            # generated-symbol syntax when the string is compiled.
            lattice = pynini.compose(pynini.escape(surface), turkish_analyzer)
            if lattice.start() == pynini.NO_STATE_ID:
                # Empty composition: this spelling has no analysis.
                continue
            analyses = sorted(set(lattice.paths().ostrings()))
            if analyses:
                return analyses

        return [f"No analysis found for: {word}"]
    except Exception as e:
        return [f"Error: {str(e)}"]
# ===== CONTEXT AWARENESS & DISAMBIGUATION =====

class ContextAwareDisambiguator:
    """Choose one analysis per token with a Viterbi search.

    Each candidate analysis is reduced to a coarse tag. A path is scored by
    the sum of log tag-to-tag transition probabilities (``self.transitions``)
    plus hand-written heuristic bonuses (``heuristic_weight``).
    """

    # Tags without their own entries in the transition table borrow the
    # probabilities of another tag.
    _TRANSITION_ALIASES = {'PROPN': 'NOUN'}

    # Probability used for any transition missing from the table
    _UNSEEN_TRANSITION_PROB = 0.001

    def __init__(self):
        # Transition table: previous tag -> {current tag: probability}
        self.transitions = {
            'START': {'NOUN': 0.3, 'PRON': 0.3, 'ADJ': 0.15, 'VERB': 0.1, 'ADV': 0.1},

            # Adjectives transition to nouns or other adjectives
            'ADJ': {'NOUN': 0.6, 'ADJ': 0.35, 'VERB': 0.05},

            # Nouns
            'NOUN': {'VERB': 0.3, 'NOUN': 0.15, 'CONJ': 0.15, 'POSTP': 0.15, 'QUES': 0.2, 'PUNCT': 0.05},

            # Pronouns
            'PRON': {'VERB': 0.4, 'NOUN': 0.1, 'CONJ': 0.2, 'POSTP': 0.2, 'QUES': 0.1},

            # Conjunctions (de/da) transition to verbs or nouns
            'CONJ': {'NOUN': 0.3, 'VERB': 0.4, 'ADJ': 0.1, 'PRON': 0.1, 'ADV': 0.1},

            # Question particles
            'QUES': {'PUNCT': 0.9, 'VERB': 0.1},

            # Verbs (end of clause/sentence)
            'VERB': {'PUNCT': 0.6, 'CONJ': 0.1, 'QUES': 0.25, 'NOUN': 0.05},

            # Used when the previous tag has no row of its own
            'DEFAULT': {'NOUN': 0.2, 'VERB': 0.2}
        }
        self.tags = ['NOUN', 'VERB', 'ADJ', 'ADV', 'PRON', 'POSTP', 'CONJ', 'QUES']

    @staticmethod
    def _lower_tr(text):
        """Lowercase with the Turkish dotted/dotless I mapping (İ→i, I→ı)."""
        return text.replace("\u0130", "i").replace("I", "\u0131").lower()

    def get_tag_from_analysis(self, analysis_str):
        """Map an analysis string to a coarse tag.

        Markers are checked in priority order; the first match wins. Analyses
        carrying "+PROPN" are reported as "PROPN" so that the capitalisation
        rule in ``heuristic_weight`` can apply to them.

        Returns:
            One of "UNKNOWN", "QUES", "CONJ", "POSTP", "PRON", "ADJ", "ADV",
            "NOUN", "PROPN" or "VERB".
        """
        if "No analysis" in analysis_str:
            return "UNKNOWN"

        # Closed-class markers take priority
        for marker, tag in (("+QUES", "QUES"), ("+CONJ", "CONJ"),
                            ("+POSTP", "POSTP"), ("+PRON", "PRON")):
            if marker in analysis_str:
                return tag

        # The -ki suffix creates adjectives
        if "+KI" in analysis_str or "+DER.ki" in analysis_str:
            return "ADJ"

        if "+ADV" in analysis_str:
            return "ADV"

        # Nominalising derivations yield nouns whatever the root category was
        if any(x in analysis_str for x in ["+DER.ma", "+DER.mak", "+DER.lik", "+DER.iş"]):
            return "NOUN"

        if "+PROPN" in analysis_str:
            return "PROPN"
        if "+ADJ" in analysis_str:
            return "ADJ"
        if "+VERB" in analysis_str:
            return "VERB"

        # Detached suffix chains (like 'daki' analysed as '+LOC+KI')
        if analysis_str.startswith("+") and "KI" in analysis_str:
            return "ADJ"

        return "NOUN"

    def get_transition_prob(self, prev_tag, current_tag):
        """Probability of ``current_tag`` following ``prev_tag``.

        Aliased tags (PROPN) use the NOUN entries. An unknown previous tag
        falls back to the 'DEFAULT' row; a missing transition gets 0.001.
        """
        prev_tag = self._TRANSITION_ALIASES.get(prev_tag, prev_tag)
        current_tag = self._TRANSITION_ALIASES.get(current_tag, current_tag)
        row = self.transitions.get(prev_tag, self.transitions['DEFAULT'])
        return row.get(current_tag, self._UNSEEN_TRANSITION_PROB)

    def heuristic_weight(self, word, analysis, tag, position, sentence_len, next_token=None):
        """Rule-based score adjustment for one candidate.

        Args:
            word: Surface token.
            analysis: Candidate analysis string (not used by the current rules).
            tag: Coarse tag of the candidate.
            position: Zero-based index of the token in the sentence.
            sentence_len: Number of tokens in the sentence.
            next_token: Surface form of the following token, if any.

        Returns:
            The sum of all rule bonuses and penalties (0.0 when no rule fires).
        """
        score = 0.0
        word_lower = self._lower_tr(word)
        # word[:1] instead of word[0] so that an empty token does not raise
        is_capitalized = word[:1].isupper()

        # Rule 1: high-confidence pronouns
        if word_lower in ["ben", "sen", "o", "biz", "siz", "onlar", "bana", "sana"]:
            if tag == "PRON": score += 5.0
            if tag == "NOUN": score -= 5.0

        # Rule 2: "bir"
        if word_lower == "bir":
            if tag == "ADJ": score += 2.0
            if tag == "ADV": score += 1.0
            if tag == "NOUN": score -= 1.0

        # Rule 3: de/da
        if word_lower in ["de", "da"]:
            if tag == "CONJ": score += 5.0
            if tag == "NOUN": score -= 5.0

        # Rule 4: a following question particle favours a verb reading
        if next_token:
            next_is_question = self._lower_tr(next_token).startswith(("mi", "mı", "mu", "mü"))
            if next_is_question and tag == "VERB": score += 4.0
            if next_is_question and tag == "NOUN": score -= 1.0

        # Rule 5: the last token of the sentence is preferably a verb
        if position == sentence_len - 1:
            if tag == "VERB": score += 3.0
            if tag == "NOUN": score -= 1.0

        # Rule 6: first-person future forms
        if word_lower.endswith("eceğim") or word_lower.endswith("acağım"):
            if tag == "VERB": score += 2.0

        # Rule 7: common adjectives
        if word_lower in ["güzel", "kırmızı", "mavi", "büyük"]:
            if tag == "ADJ": score += 3.0

        # Rule 8: a capitalised word that is not sentence-initial is
        # preferably a proper noun. For a sentence-initial word the
        # capitalisation carries no information, so no adjustment is made.
        if is_capitalized and position > 0:
            if tag == "PROPN": score += 4.0
            if tag == "NOUN": score -= 0.5  # slight penalty for the common-noun reading

        return score

    def decode_sentence(self, sentence_tokens):
        """Select the best-scoring analysis for each token.

        Args:
            sentence_tokens: List of surface tokens.

        Returns:
            One dict per token, in order, with keys 'analysis', 'tag' and
            'word'. Tokens without a usable analysis (no analysis found, or
            the analyzer reported an error) get the single candidate
            "<word>+NOUN+UNKNOWN". An empty input yields an empty list.
        """
        # 1. Collect the candidate analyses of every token
        lattice = []
        for word in sentence_tokens:
            raw_analyses = analyze(word)
            unusable = (
                not raw_analyses
                or "No analysis" in raw_analyses[0]
                or raw_analyses[0].startswith("Error:")
            )
            if unusable:
                word_candidates = [
                    {'analysis': f"{word}+NOUN+UNKNOWN", 'tag': 'NOUN', 'word': word}
                ]
            else:
                word_candidates = [
                    {'analysis': ana, 'tag': self.get_tag_from_analysis(ana), 'word': word}
                    for ana in raw_analyses
                ]
            lattice.append(word_candidates)

        n = len(lattice)
        if n == 0:
            return []

        best_scores = [{} for _ in range(n)]
        backpointers = [{} for _ in range(n)]

        # 2. Initialisation: transition from START into the first token
        next_tok = lattice[1][0]['word'] if n > 1 else None
        for i, candidate in enumerate(lattice[0]):
            trans_prob = self.get_transition_prob('START', candidate['tag'])
            heuristic = self.heuristic_weight(
                candidate['word'], candidate['analysis'], candidate['tag'], 0, n, next_tok)
            best_scores[0][i] = math.log(trans_prob) + heuristic

        # 3. Viterbi recursion over the remaining tokens
        for t in range(1, n):
            next_tok = lattice[t + 1][0]['word'] if t < n - 1 else None
            for i, curr_cand in enumerate(lattice[t]):
                max_score = -float('inf')
                best_prev_idx = -1

                # The heuristic depends only on the current candidate
                heuristic = self.heuristic_weight(
                    curr_cand['word'], curr_cand['analysis'], curr_cand['tag'], t, n, next_tok)

                for j, prev_cand in enumerate(lattice[t - 1]):
                    trans_prob = self.get_transition_prob(prev_cand['tag'], curr_cand['tag'])
                    score = best_scores[t - 1][j] + math.log(trans_prob) + heuristic

                    if score > max_score:
                        max_score = score
                        best_prev_idx = j

                best_scores[t][i] = max_score
                backpointers[t][i] = best_prev_idx

        # 4. Backtrack from the best final candidate
        curr_idx = max(best_scores[n - 1], key=best_scores[n - 1].get)
        result_path = []
        for t in range(n - 1, -1, -1):
            result_path.append(lattice[t][curr_idx])
            if t > 0:
                curr_idx = backpointers[t][curr_idx]

        return result_path[::-1]

# Module-level disambiguator instance, (re)created whenever this cell runs
disambiguator = ContextAwareDisambiguator()

def post_process_results(results):
    """
    Merge person agreement from a question particle into the preceding verb.

    When a token tagged 'VERB' is immediately followed by a token tagged
    'QUES' whose analysis carries a person tag, that person is copied onto
    the verb's analysis.
    Example: 'yapacak' (3SG) + 'mısın' (2SG) -> 'yapacak' is updated to 2SG.

    Rules applied to the verb's 'best_analysis':
      * a default "+3SG" is replaced by the particle's person tag;
      * if the verb has no person tag at all, the particle's tag is appended;
      * if the verb already has an explicit non-3SG person tag, it is left
        untouched (appending would yield two person tags on one verb).

    Args:
        results: list of dicts, each with at least the keys 'tag' and
            'best_analysis'. The dicts are modified in place.

    Returns:
        The same ``results`` list that was passed in.
    """
    # Checked in this order; the first tag found in an analysis string wins.
    person_tags = ("+1SG", "+2SG", "+1PL", "+2PL", "+3PL")

    # Walk adjacent (token, following token) pairs looking for VERB + QUES.
    for curr, next_tok in zip(results, results[1:]):
        if curr['tag'] != 'VERB' or next_tok['tag'] != 'QUES':
            continue

        # 1. Detect the person carried by the question particle.
        qs_analysis = next_tok['best_analysis']
        person_marker = next((p for p in person_tags if p in qs_analysis), None)
        if person_marker is None:
            # No person tag on the particle: nothing to merge.
            continue

        # 2. Update the verb analysis.
        current_analysis = curr['best_analysis']
        if "+3SG" in current_analysis:
            # Replace the default 3SG with the actual person from the particle.
            curr['best_analysis'] = current_analysis.replace("+3SG", person_marker)
        elif not any(p in current_analysis for p in person_tags):
            # Verb analysis ends without a person tag: append the particle's.
            curr['best_analysis'] = current_analysis + person_marker
        # Otherwise the verb already states its own person; keep it as is.

    return results

def analyze_sentence_context_aware(sentence):
    """Tokenize ``sentence``, disambiguate it in context, and return one
    ``{'token', 'best_analysis', 'tag'}`` dict per token.

    Wrapper intended to replace the old analyze_sentence.
    """
    import re

    # Loose tokenization: a run of word characters/apostrophes, or a single
    # punctuation mark out of . , ! ? ;
    words = re.findall(r"[\w']+|[.,!?;]", sentence)

    decoded = disambiguator.decode_sentence(words)

    # Rename the decoder's keys to the public output schema.
    rows = [
        {
            'token': hit['word'],
            'best_analysis': hit['analysis'],
            'tag': hit['tag'],
        }
        for hit in decoded
    ]

    # Fold question-particle person marking back into the preceding verb.
    return post_process_results(rows)

def save_fst(filename):
    """Write the module-level ``turkish_analyzer`` FST to ``filename`` and print a confirmation."""
    turkish_analyzer.write(filename)
    confirmation = f"FST saved to {filename}"
    print(confirmation)

# Smoke test / demo. Prints, in order: lexicon sizes, two direct FST
# compositions, single-word analyses, and context-aware sentence analyses.
# The names bound here (test, lattice, word, results, ...) are module-level.
if __name__ == "__main__":
    # --- 1. Lexicon summary: number of entries per category ---
    print("="*60)
    print("LEXICON LOADED")
    print("="*60)
    print(f"Nouns: {len(lexicon.get('nouns', []))} words")
    print(f"Verbs: {len(lexicon.get('verbs', []))} words (infinitives → roots extracted)")
    print(f"Adjectives: {len(lexicon.get('adjectives', []))} words")
    print(f"Pronouns: {len(lexicon.get('pronouns', []))} words")
    print(f"Adverbs: {len(lexicon.get('adverbs', []))} words")
    print(f"Conjunctions: {len(lexicon.get('conjunctions', []))} words")
    print(f"Postpositions: {len(lexicon.get('postpositions', []))} words")
    print(f"Proper Nouns: {len(lexicon.get('proper_nouns', []))} words")

    # Show the first (up to) five verbs next to the root extract_verb_root() yields
    if lexicon.get('verbs'):
        print("\nVerb root examples:")
        for verb in lexicon.get('verbs', [])[:5]:
            root = extract_verb_root(verb)
            print(f"  {verb} → {root}")

    # --- 2. Debug: compose a surface form directly with verb_imperative,
    #        without going through analyze() ---
    print("\n" + "="*60)
    print("DEBUG: Testing verb compositions")
    print("="*60)

    test = "gel"
    try:
        lattice = pynini.compose(test, verb_imperative)
        # The header is printed only after compose() succeeded.
        print(f"\n'{test}' + verb_imperative:")
        for path in lattice.paths().ostrings():
            print(f"  ✓ {path}")
    except Exception as e:
        # Best-effort debug output: report the failure and keep going.
        print(f"  ✗ Error: {e}")

    # Same check for an inflected form.
    test2 = "gelsin"
    try:
        lattice = pynini.compose(test2, verb_imperative)
        print(f"\n'{test2}' + verb_imperative:")
        for path in lattice.paths().ostrings():
            print(f"  ✓ {path}")
    except Exception as e:
        print(f"  ✗ Error: {e}")

    # --- 3. Single-word analysis through analyze() ---
    print("\n" + "="*60)
    print("SINGLE WORD ANALYSIS")
    print("="*60)

    test_words = [
        # Desiderative moods (Turkish: "dilek kipleri")
        "gel",            # come! (imperative 2sg)
        "gelin",          # come! (imperative 2pl)
        "gelsin",         # let him/her come (imperative 3sg)
        "gelsem",         # if I come (conditional)
        "geleyim",        # let me come (optative)
        "gelmeli",        # must come (necessitative)
        "yazmalıyım",     # I must write
        "okusana",        # read then! (optative emphatic)
        "görseler",       # if they see (conditional)
        # Indicative moods (Turkish: "bildirme kipleri")
        "okudum",         # I read (past)
        "gelebilecek",    # will be able to come (future)
        "geliyorum",      # I am coming (present continuous)
        # Nouns
        "kitaplardan",    # from books
        "kalemlik",       # pencil case
        "evdekiler",      # those in the house
    ]

    for word in test_words:
        print(f"\n{word}:")
        analyses = analyze(word)
        # Print at most five analyses per word, then a count of the rest.
        for analysis in analyses[:5]:
            print(f"  {analysis}")
        if len(analyses) > 5:
            print(f"  ... and {len(analyses) - 5} more analyses")

    # --- 4. Sentence-level analysis with context-aware disambiguation ---
    print("\n" + "="*60)
    print("CONTEXT-AWARE ANALYSIS (Viterbi)")
    print("="*60)

    # These sentences contain ambiguous words; each comment states the tag
    # the disambiguator is expected to pick.
    ambiguous_sentences = [
        "yüzü güzel",           # yüz: Face (Noun) vs Swim (Verb) -> Expect NOUN
        "denizde yüz",          # yüz: Face (Noun) vs Swim (Verb) -> Expect VERB (Imperative)
        "bana gül",             # gül: Rose (Noun) vs Smile (Verb) -> Expect VERB
        "kırmızı gül",          # gül: Rose (Noun) vs Smile (Verb) -> Expect NOUN (Adj modifies Noun)
        "okula git",            # git: Expect VERB
        "güzel bir ev",         # güzel: Adj, bir: Det, ev: Noun
        "evde misin",           # misin: Expect QUES
        "kitap okumayı severim" # severim: Expect VERB (End of sentence)
    ]

    for sent in ambiguous_sentences:
        print(f"\nSentence: '{sent}'")
        results = analyze_sentence_context_aware(sent)

        # Print formatted table: one row per token
        print(f" {'Word':<15} | {'Tag':<6} | {'Selected Analysis'}")
        print("-" * 50)
        for res in results:
            print(f" {res['token']:<15} | {res['tag']:<6} | {res['best_analysis']}")

LEXICON LOADED
Nouns: 3 words
Verbs: 3 words (infinitives → roots extracted)
Adjectives: 2 words
Pronouns: 3 words
Adverbs: 2 words
Conjunctions: 3 words
Postpositions: 2 words
Proper Nouns: 0 words

Verb root examples:
  gel → gel
  git → git
  oku → oku

DEBUG: Testing verb compositions

'gel' + verb_imperative:
  ✓ gel+VERB+IMP+2SG

'gelsin' + verb_imperative:
  ✓ gel+VERB+IMP+3SG

SINGLE WORD ANALYSIS

gel:
  gel+VERB
  gel+VERB+IMP+2SG

gelin:
  gel+VERB+IMP+2PL
  gel+VERB+POSS.2SG
  gel+VERB+VOICE.in
  gel+VERB+VOICE.in+IMP+2SG

gelsin:
  gel+VERB+IMP+3SG

gelsem:
  gel+VERB+COND+1SG

geleyim:
  gel+VERB+OPT+1SG

gelmeli:
  gel+VERB+NEC+3SG

yazmalıyım:
  No analysis found for: yazmalıyım

okusana:
  oku+VERB+OPT+2SG+EMPH

görseler:
  No analysis found for: görseler

okudum:
  oku+VERB+PAST+1SG

gelebilecek:
  gel+VERB+ABIL+FUT+3SG

geliyorum:
  gel+VERB+PRES.CONT+1SG

kitaplardan:
  kitap+NOUN+PL+ABL

kalemlik:
  No analysis found for: kalemlik

evdekiler:
  ev+NOUN+LOC+KI+PL

C

In [9]:
# Compare the raw analyze() output with the context-aware result for each probe.
# NOTE(review): analyze() is handed the whole string, spaces included; in the
# recorded output the multi-word probes come back as a single
# 'No analysis found for: ...' entry, so RAW is only informative for single words.
probe_inputs = (
    "gelecek misin",
    "yapacak mısın",
    "gidiyorum",
    "ali çıktı",
    "gelecek",
    "ben",
    "kitapda",
    "kitap daki",
    "kitapta",
    "bende",
    "ben de",
)

for word in probe_inputs:
    print(f"\n{word}:")
    raw = analyze(word)
    filtered = analyze_sentence_context_aware(word)
    print(f"  RAW ({len(raw)}): {raw}")
    print(f"  FILTERED ({len(filtered)}): {filtered}")



gelecek misin:
  RAW (1): ['No analysis found for: gelecek misin']
  FILTERED (2): [{'token': 'gelecek', 'best_analysis': 'gel+VERB+FUT+2SG', 'tag': 'VERB'}, {'token': 'misin', 'best_analysis': 'mi+QUES+2SG', 'tag': 'QUES'}]

yapacak mısın:
  RAW (1): ['No analysis found for: yapacak mısın']
  FILTERED (2): [{'token': 'yapacak', 'best_analysis': 'yapacak+NOUN+UNKNOWN', 'tag': 'NOUN'}, {'token': 'mısın', 'best_analysis': 'mı+QUES+2SG', 'tag': 'QUES'}]

gidiyorum:
  RAW (1): ['git+VERB+PRES.CONT+1SG']
  FILTERED (1): [{'token': 'gidiyorum', 'best_analysis': 'git+VERB+PRES.CONT+1SG', 'tag': 'VERB'}]

ali çıktı:
  RAW (1): ['No analysis found for: ali çıktı']
  FILTERED (2): [{'token': 'ali', 'best_analysis': 'ali+NOUN+UNKNOWN', 'tag': 'NOUN'}, {'token': 'çıktı', 'best_analysis': 'çıktı+NOUN+UNKNOWN', 'tag': 'NOUN'}]

gelecek:
  RAW (1): ['gel+VERB+FUT+3SG']
  FILTERED (1): [{'token': 'gelecek', 'best_analysis': 'gel+VERB+FUT+3SG', 'tag': 'VERB'}]

ben:
  RAW (2): ['ben+PRON', 'ben+PRON+I

In [10]:
# 'gül' is ambiguous (rose / smile); the ambiguity list in the test cell expects VERB here.
# NOTE(review): the recorded output shows gül+NOUN+UNKNOWN instead — presumably
# 'gül' is not in the loaded lexicon; confirm.
analyze_sentence_context_aware("bana gül")

[{'token': 'bana', 'best_analysis': 'ben+PRON+DAT', 'tag': 'PRON'},
 {'token': 'gül', 'best_analysis': 'gül+NOUN+UNKNOWN', 'tag': 'NOUN'}]

In [11]:
# The example from the post_process_results docstring (VERB + QUES person merge).
# NOTE(review): the recorded output tags 'yapacak' as NOUN+UNKNOWN, so the merge,
# which requires tag == 'VERB', does not fire for this input.
analyze_sentence_context_aware("yapacak mısın")

[{'token': 'yapacak', 'best_analysis': 'yapacak+NOUN+UNKNOWN', 'tag': 'NOUN'},
 {'token': 'mısın', 'best_analysis': 'mı+QUES+2SG', 'tag': 'QUES'}]

In [12]:
# Test case: conjunction between two nouns
print(analyze_sentence_context_aware("kitap da masada"))
# Expected:
# kitap  -> NOUN
# da     -> CONJ (because 'da' is in lexicon['conjunctions'])
# masada -> NOUN (masa + LOC)

# Test case: verb followed by a question particle
print(analyze_sentence_context_aware("gelecek misin"))
# Expected:
# gelecek -> VERB (FUT); post_process_results takes the person from the
#            particle, so the verb analysis should end in +2SG rather than +3SG
# misin   -> QUES (mi + 2SG)

[{'token': 'kitap', 'best_analysis': 'kitap+NOUN', 'tag': 'NOUN'}, {'token': 'da', 'best_analysis': 'da+CONJ', 'tag': 'CONJ'}, {'token': 'masada', 'best_analysis': 'masa+NOUN+LOC', 'tag': 'NOUN'}]
[{'token': 'gelecek', 'best_analysis': 'gel+VERB+FUT+2SG', 'tag': 'VERB'}, {'token': 'misin', 'best_analysis': 'mi+QUES+2SG', 'tag': 'QUES'}]
