In [1]:
# Setup
!pip install --only-binary :all: pynini
!pip install wurlitzer
import pynini
%load_ext wurlitzer

Collecting pynini
  Downloading pynini-2.1.7-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.7 kB)
Downloading pynini-2.1.7-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (165.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m165.5/165.5 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynini
Successfully installed pynini-2.1.7


In [63]:
# Roots
# Every root is rewritten to "<root>+<POS>" on the output (analysis) side.
_NOUN_STEMS = ("kiler", "kitap", "ev", "kalem", "okul", "oda", "baba")
_VERB_STEMS = ("gel", "yap", "oku", "git", "sev")

noun_roots = pynini.union(
    *(pynini.cross(stem, stem + "+NOUN") for stem in _NOUN_STEMS)
)

verb_roots = pynini.union(
    *(pynini.cross(stem, stem + "+VERB") for stem in _VERB_STEMS)
)

# ===== NOUN MORPHOLOGY =====
# Morpheme order: Root + Plural + (Possessive | Case) + Copula + Person + Question

# Plural: both surface variants map to +PL; the empty alternative makes the
# suffix optional.
plural = pynini.union(
    *(pynini.cross(surface, "+PL") for surface in ("lar", "ler")),
    pynini.cross("", ""),
)

# Noun root followed by an optional plural marker.
noun_pl = noun_roots + plural

# The noun grammar is split into two paths (with / without possessive) to
# avoid ambiguity.

# PATH 1: noun (+plural) + possessive + optional case + optional -ki + optional plural.
# Each table row pairs an analysis tag with every surface form that realises it.
_POSSESSIVE_FORMS = (
    ("+POSS.1PL", ("imiz", "ımız", "umuz", "ümüz")),
    ("+POSS.2PL", ("iniz", "ınız", "unuz", "ünüz")),
    ("+POSS.3PL", ("leri", "ları")),
    ("+POSS.1SG", ("im", "ım", "um", "üm")),
    ("+POSS.2SG", ("in", "ın", "un", "ün")),
    # 3SG is listed only with its s-buffer.
    ("+POSS.3SG", ("si", "sı", "su", "sü")),
)

possessive = pynini.union(
    *(
        pynini.cross(surface, tag)
        for tag, surfaces in _POSSESSIVE_FORMS
        for surface in surfaces
    )
)

# Case markers allowed after a possessive. Plain, voiceless-onset (t-) and
# n-buffered variants are all enumerated as separate surface forms.
_CASE_AFTER_POSS_FORMS = (
    ("+ABL", ("dan", "den", "tan", "ten", "ndan", "nden", "ntan", "nten")),
    ("+GEN", ("nın", "nin", "nun", "nün")),
    ("+LOC", ("da", "de", "ta", "te", "nda", "nde", "nta", "nte")),
    ("+DAT", ("ya", "ye", "na", "ne")),
    ("+ACC", ("yı", "yi", "yu", "yü", "nı", "ni", "nu", "nü")),
    ("+INS", ("yla", "yle")),
    ("+EQU", ("ca", "ce")),
)

case_after_poss = pynini.union(
    *(
        pynini.cross(surface, tag)
        for tag, surfaces in _CASE_AFTER_POSS_FORMS
        for surface in surfaces
    ),
    pynini.cross("", ""),  # case is optional
)

# Optional -ki suffix.
ki_suffix = pynini.union(
    *(pynini.cross(surface, "+KI") for surface in ("ki", "kü")),
    pynini.cross("", ""),
)

# Optional plural following -ki (e.g. "evimdekiler").
plural_after_ki = pynini.union(
    *(pynini.cross(surface, "+PL") for surface in ("ler", "lar")),
    pynini.cross("", ""),
)

possessive_path = (
    noun_pl + possessive + case_after_poss + ki_suffix + plural_after_ki
)

# PATH 2: Case only (no possessive)
# All case markers are listed as complete units (buffer consonants included).
# The final empty alternative makes the case slot optional.
case_no_poss = pynini.union(
    # Fused plural + genitive. These surface forms contain the plural marker,
    # so the analysis must carry +PL as well as +GEN (previously only +GEN was
    # emitted, silently dropping the plural from the output).
    pynini.cross("ların", "+PL+GEN"), pynini.cross("lerin", "+PL+GEN"),
    # Regular cases
    pynini.cross("dan", "+ABL"), pynini.cross("den", "+ABL"),
    pynini.cross("tan", "+ABL"), pynini.cross("ten", "+ABL"),  # After voiceless consonants
    pynini.cross("nın", "+GEN"), pynini.cross("nin", "+GEN"),
    pynini.cross("nun", "+GEN"), pynini.cross("nün", "+GEN"),
    pynini.cross("da", "+LOC"), pynini.cross("de", "+LOC"),
    pynini.cross("ta", "+LOC"), pynini.cross("te", "+LOC"),  # After voiceless consonants
    pynini.cross("ya", "+DAT"), pynini.cross("ye", "+DAT"),
    pynini.cross("a", "+DAT"), pynini.cross("e", "+DAT"),
    pynini.cross("yı", "+ACC"), pynini.cross("yi", "+ACC"),
    pynini.cross("yu", "+ACC"), pynini.cross("yü", "+ACC"),
    pynini.cross("ı", "+ACC"), pynini.cross("i", "+ACC"),
    pynini.cross("u", "+ACC"), pynini.cross("ü", "+ACC"),
    pynini.cross("la", "+INS"), pynini.cross("le", "+INS"),
    pynini.cross("yla", "+INS"), pynini.cross("yle", "+INS"),  # With y-buffer
    pynini.cross("ca", "+EQU"), pynini.cross("ce", "+EQU"),
    pynini.cross("", "")
)

# Optional -ki suffix for the case-only path.
ki_suffix_case = pynini.union(
    pynini.cross("ki", "+KI"),
    pynini.cross("kü", "+KI"),
    pynini.cross("", "")
)

# Optional plural after -ki for the case-only path.
plural_after_ki_case = pynini.union(
    pynini.cross("ler", "+PL"),
    pynini.cross("lar", "+PL"),
    pynini.cross("", "")
)

case_only_path = noun_pl + case_no_poss + ki_suffix_case + plural_after_ki_case

# Combine both noun paths into a single optimized transducer.
noun_base = pynini.union(possessive_path, case_only_path).optimize()

# Copula suffixes. This union has no empty alternative; optionality is
# provided where the copula is attached to the noun base.
copula = pynini.union(
    # With y-buffer
    pynini.cross("ydi", "+COP.PAST"), pynini.cross("ydı", "+COP.PAST"),
    pynini.cross("ydu", "+COP.PAST"), pynini.cross("ydü", "+COP.PAST"),
    # "ymiş" was previously missing: "ymış" had been listed twice instead.
    pynini.cross("ymiş", "+COP.EVID"), pynini.cross("ymış", "+COP.EVID"),
    pynini.cross("ymuş", "+COP.EVID"), pynini.cross("ymüş", "+COP.EVID"),
    pynini.cross("yse", "+COP.COND"), pynini.cross("ysa", "+COP.COND"),
    # Without buffer
    pynini.cross("di", "+COP.PAST"), pynini.cross("dı", "+COP.PAST"),
    pynini.cross("du", "+COP.PAST"), pynini.cross("dü", "+COP.PAST"),
    pynini.cross("ti", "+COP.PAST"), pynini.cross("tı", "+COP.PAST"),  # After voiceless
    pynini.cross("tu", "+COP.PAST"), pynini.cross("tü", "+COP.PAST"),
    pynini.cross("miş", "+COP.EVID"), pynini.cross("mış", "+COP.EVID"),
    pynini.cross("muş", "+COP.EVID"), pynini.cross("müş", "+COP.EVID"),
    pynini.cross("se", "+COP.COND"), pynini.cross("sa", "+COP.COND"),
    pynini.cross("dir", "+COP.PRES"), pynini.cross("dır", "+COP.PRES"),
    pynini.cross("dur", "+COP.PRES"), pynini.cross("dür", "+COP.PRES"),
    pynini.cross("tir", "+COP.PRES"), pynini.cross("tır", "+COP.PRES"),  # After voiceless
    pynini.cross("tur", "+COP.PRES"), pynini.cross("tür", "+COP.PRES")
)

# Person markers for nouns. They are only reachable through the copula branch
# below; the empty alternative makes the marker itself optional.
_NOUN_PERSON_FORMS = (
    ("+1SG", ("im", "ım", "um", "üm")),
    ("+2SG", ("in", "ın", "un", "ün")),
    ("+1PL", ("ız", "iz", "uz", "üz")),
    ("+2PL", ("nız", "niz", "nuz", "nüz")),
    ("+3PL", ("lar", "ler")),
)

person = pynini.union(
    *(
        pynini.cross(surface, tag)
        for tag, surfaces in _NOUN_PERSON_FORMS
        for surface in surfaces
    ),
    pynini.cross("", ""),
)

# Two continuations after the noun base: copula followed by an (optional)
# person marker, or nothing at all (so no person marker without a copula).
noun_with_cop_and_person = copula + person
noun_without_cop = pynini.cross("", "")

_noun_tail = pynini.union(noun_with_cop_and_person, noun_without_cop).optimize()
noun_with_person = noun_base + _noun_tail

# Optional question particle.
question = pynini.union(
    *(pynini.cross(surface, "+QUES") for surface in ("mi", "mı", "mu", "mü")),
    pynini.cross("", ""),
)

# Final noun FST
noun_fst = noun_with_person + question
noun_fst = noun_fst.optimize()

# ===== VERB MORPHOLOGY =====
# Structure: Root + Voice + Ability + Negation + Tense + Person + Question

# Voice suffixes. closure() is applied to the whole union, so zero or more
# voice markers may be stacked after the root.
voice = pynini.union(
    pynini.cross("ıl", "+PASS"), pynini.cross("il", "+PASS"),
    pynini.cross("ul", "+PASS"), pynini.cross("ül", "+PASS"),
    pynini.cross("ın", "+REFL"), pynini.cross("in", "+REFL"),
    pynini.cross("un", "+REFL"), pynini.cross("ün", "+REFL"),
    pynini.cross("lan", "+REFL"), pynini.cross("len", "+REFL"),
    pynini.cross("ış", "+RECIP"), pynini.cross("iş", "+RECIP"),
    pynini.cross("uş", "+RECIP"), pynini.cross("üş", "+RECIP"),
    pynini.cross("t", "+CAUS"), pynini.cross("d", "+CAUS"),
    pynini.cross("dır", "+CAUS"), pynini.cross("dir", "+CAUS"),
    pynini.cross("dur", "+CAUS"), pynini.cross("dür", "+CAUS"),
    pynini.cross("tır", "+CAUS"), pynini.cross("tir", "+CAUS"),
    pynini.cross("tur", "+CAUS"), pynini.cross("tür", "+CAUS"),
    pynini.cross("", "")
).closure()

# Ability (can/able to), optional
ability = pynini.union(
    pynini.cross("ebil", "+ABIL"),
    pynini.cross("abil", "+ABIL"),
    pynini.cross("", "")
)

# Negation, optional
negation = pynini.union(
    pynini.cross("ma", "+NEG"),
    pynini.cross("me", "+NEG"),
    pynini.cross("", "")
)

# Tense (includes all surface forms). Obligatory: there is no empty alternative.
tense = pynini.union(
    pynini.cross("iyor", "+PRES.CONT"), pynini.cross("ıyor", "+PRES.CONT"),
    pynini.cross("uyor", "+PRES.CONT"), pynini.cross("üyor", "+PRES.CONT"),
    pynini.cross("ecek", "+FUT"), pynini.cross("acak", "+FUT"),
    # Future with y-buffer, needed when the preceding morpheme ends in a vowel
    # (e.g. "oku-yacak", "yap-ma-yacak-lar"). Listed as extra surface forms,
    # the same way buffered case and copula allomorphs are enumerated.
    pynini.cross("yecek", "+FUT"), pynini.cross("yacak", "+FUT"),
    pynini.cross("ır", "+AOR"), pynini.cross("ir", "+AOR"),
    pynini.cross("ur", "+AOR"), pynini.cross("ür", "+AOR"),
    pynini.cross("ar", "+AOR"), pynini.cross("er", "+AOR"),
    pynini.cross("r", "+AOR"),
    pynini.cross("dı", "+PAST"), pynini.cross("di", "+PAST"),
    pynini.cross("du", "+PAST"), pynini.cross("dü", "+PAST"),
    pynini.cross("tı", "+PAST"), pynini.cross("ti", "+PAST"),
    pynini.cross("tu", "+PAST"), pynini.cross("tü", "+PAST"),
    pynini.cross("mış", "+INFER"), pynini.cross("miş", "+INFER"),
    pynini.cross("muş", "+INFER"), pynini.cross("müş", "+INFER")
)

# Verb stem up to and including tense; person and question are attached later.
verb_stem = verb_roots + voice + ability + negation + tense

# Person agreement for verbs. An empty surface form is analysed as +3SG, so
# every verb analysis carries a person tag.
_VERB_PERSON_FORMS = (
    ("+1SG", ("um", "üm", "ım", "im", "m")),
    ("+2SG", ("sun", "sün", "sın", "sin", "n")),
    ("+1PL", ("uz", "üz", "ız", "iz", "k")),
    ("+2PL", ("sunuz", "sünüz", "sınız", "siniz", "nız", "niz", "nuz", "nüz")),
    ("+3PL", ("lar", "ler")),
    ("+3SG", ("",)),
)

verb_person = pynini.union(
    *(
        pynini.cross(surface, tag)
        for tag, surfaces in _VERB_PERSON_FORMS
        for surface in surfaces
    )
)

verb_with_person = verb_stem + verb_person

# Final verb FST: person-marked verb plus the optional question particle.
verb_fst = verb_with_person + question
verb_fst = verb_fst.optimize()

# ===== COMPLETE ANALYZER =====
# A word is accepted if either the noun or the verb grammar analyses it.
turkish_analyzer = pynini.union(noun_fst, verb_fst)
turkish_analyzer = turkish_analyzer.optimize()

def analyze(word):
    """Analyze a Turkish word and return all possible analyses.

    Args:
        word: Surface form to analyze. A ``str`` is escaped and compiled into
            an acceptor; any other value is passed to ``pynini.compose``
            unchanged.

    Returns:
        A sorted list of the unique output strings produced by composing the
        word with ``turkish_analyzer``. If there are none, a one-element list
        ``["No analysis found for: <word>"]``. If the lookup raises, a
        one-element list ``["Error: <message>"]``.
    """
    try:
        # Escape characters pynini's string compiler treats specially so that
        # arbitrary input is matched literally rather than parsed as symbols.
        if isinstance(word, str):
            query = pynini.accep(pynini.escape(word))
        else:
            query = word

        # Compose input word with analyzer
        lattice = pynini.compose(query, turkish_analyzer)

        # Collect unique analyses
        analyses = set()
        if lattice.num_states() > 0:  # a lattice with no states has no paths
            try:
                for path in lattice.paths().ostrings():
                    analyses.add(path)
            except Exception:
                # Best effort: keep whatever was collected before the failure.
                # Narrowed from a bare `except:` so KeyboardInterrupt and
                # SystemExit are no longer swallowed.
                pass

        return sorted(analyses) if analyses else [f"No analysis found for: {word}"]
    except Exception as e:
        return [f"Error: {e}"]

# Test: print each sample word followed by its analyses, indented two spaces.
if __name__ == "__main__":
    test_words = [
        "kitaplardan",
        "kitapta",
        "kitaptaki",
        "evimdeki",
        "evimdekiler",
        "kalemim",
        "gelebiliyorum",
        "okudum",
    ]

    for surface in test_words:
        report_lines = [f"{surface}:"]
        report_lines.extend(f"  {parse}" for parse in analyze(surface))
        print("\n".join(report_lines))

kitaplardan:
  kitap+NOUN+PL+ABL
kitapta:
  kitap+NOUN+LOC
kitaptaki:
  kitap+NOUN+LOC+KI
evimdeki:
  ev+NOUN+POSS.1SG+LOC+KI
evimdekiler:
  ev+NOUN+POSS.1SG+LOC+KI+PL
kalemim:
  kalem+NOUN+POSS.1SG
gelebiliyorum:
  gel+VERB+ABIL+PRES.CONT+1SG
okudum:
  oku+VERB+PAST+1SG


In [27]:
# Valid Tests
# Each candidate is composed with `turkish_analyzer`, the combined noun/verb
# FST. The previous target, `turkish_word`, is not defined in any cell shown
# in this notebook, so the cell would fail with NameError on a fresh kernel.
candidate_words = [
    "kiler", "evindeydiler", "kilerler", "kitaplarımdan", "geldim", "gelecek",
    "okullarından", "yapılmış", "yapmayacaklar", "okul", "yapamayacaklar",
]

for s in candidate_words:
    result = pynini.accep(s) @ turkish_analyzer
    # A composition that yields no states means the analyzer rejects the word.
    if result.num_states() > 0:
        print(f"{s}: ✅ valid")
    else:
        print(f"{s}: ❌ invalid")


kiler: ✅ valid
evindeydiler: ✅ valid
kilerler: ✅ valid
kitaplarımdan: ✅ valid
geldim: ✅ valid
gelecek: ✅ valid
okullarından: ✅ valid
yapılmış: ✅ valid
yapmayacaklar: ✅ valid
okul: ✅ valid
yapamayacaklar: ❌ invalid
