In [5]:
import math

In [6]:
# Path of the word list read by the cells below (one word per line).
input_file = "brown_nouns.txt"

## Working stemming: split each word at the trie node with the highest branching factor

In [7]:
def make_node():
    """Create an empty trie node.

    Returns:
        dict: A new node whose "count" is 0 and whose "children" mapping is
        empty. Every call returns independent objects.
    """
    node = dict(count=0, children={})
    return node

def insert_word(trie, word):
    """Insert ``word`` into ``trie`` in place.

    Args:
        trie: Root node, a dict with "count" and "children" keys.
        word: Iterable of characters; one trie level is visited per character.

    The "count" of the root and of every node along the word's path is
    incremented by one, so the root is bumped even for an empty word.
    Missing nodes are created with ``make_node``.
    """
    current = trie
    current["count"] += 1  # the root is counted once per inserted word
    for letter in word:
        kids = current["children"]
        try:
            current = kids[letter]
        except KeyError:
            # First time this character is seen at this position.
            current = kids[letter] = make_node()
        current["count"] += 1

def build_trie(words, suffix=False):
    """Build a character trie containing every entry of ``words``.

    Args:
        words: Iterable of strings to insert.
        suffix: When true, each word is reversed before insertion, so paths in
            the trie run from a word's last character towards its first.

    Returns:
        dict: The root node of the populated trie.
    """
    trie_root = make_node()
    for entry in words:
        if suffix:
            entry = entry[::-1]
        insert_word(trie_root, entry)
    return trie_root

def find_split(word, trie, suffix=False):
    """Split ``word`` at the trie position with the highest branching factor.

    The word (reversed when ``suffix`` is true) is followed through ``trie``
    one character at a time, stopping at the first character with no matching
    child. The split point is the number of characters consumed on reaching
    the visited node with the most children; on ties the earliest node wins.
    The root node itself is never a candidate.

    Args:
        word: The string to split.
        trie: Root node of a trie (dict with "children" mappings).
        suffix: When true, walk the word from its end and cut that many
            characters off the end instead of the start.

    Returns:
        tuple: ``(stem, suff)``. ``(word, "")`` when no visited node has any
        children.
    """
    sequence = word[::-1] if suffix else word
    current = trie
    cut = 0      # number of characters on the walked side of the split
    widest = 0   # largest child count seen so far

    for position, letter in enumerate(sequence, start=1):
        kids = current["children"]
        if letter not in kids:
            break
        current = kids[letter]
        fanout = len(current["children"])
        if fanout > widest:  # strict comparison keeps the earliest maximum
            widest, cut = fanout, position

    if cut == 0:
        return word, ""
    if suffix:
        return word[:-cut], word[-cut:]
    return word[:cut], word[cut:]



# ================= MAIN ==================
# Load the vocabulary: one word per line, surrounding whitespace removed,
# blank lines skipped.
with open(input_file) as f:
    words = [token for token in (line.strip() for line in f) if token]

# One trie over the words as written, one over the reversed words.
prefix_trie = build_trie(words, suffix=False)
suffix_trie = build_trie(words, suffix=True)

# Write "<word> = <stem>+<suffix>" for every word using the prefix trie.
with open("Prefix Trie Splits.txt", "w") as f:
    for w in words:
        stem, suff = find_split(w, prefix_trie, suffix=False)
        f.write(f"{w} = {stem}+{suff}\n")
print(" Prefix Trie Splits written")

# Same report, but splitting from the end of each word via the suffix trie.
with open("Suffix Trie Splits.txt", "w") as f:
    for w in words:
        stem, suff = find_split(w, suffix_trie, suffix=True)
        f.write(f"{w} = {stem}+{suff}\n")

print("\nSuffix Trie Splits written")

 Prefix Trie Splits written

Suffix Trie Splits written


In [8]:
def make_node():
    """Return a new, empty trie node.

    The node is a dict with a "count" of 0 and an empty "children" dict.

    NOTE(review): an equivalent ``make_node`` is already defined in the
    previous cell; this redefinition rebinds the name to the same behavior.
    """
    fresh = {}
    fresh["count"] = 0
    fresh["children"] = {}
    return fresh

def insert_word(trie, word):
    """Add ``word`` to ``trie`` in place, one node per character.

    Args:
        trie: Root node (dict with "count" and "children").
        word: Iterable of characters to insert.

    Increments "count" on the root and on each node along the path, creating
    missing nodes with ``make_node``.

    NOTE(review): an equivalent ``insert_word`` is already defined in the
    previous cell; this redefinition rebinds the name to the same behavior.
    """
    trie["count"] += 1
    here = trie
    for symbol in word:
        branches = here["children"]
        if symbol not in branches:
            branches[symbol] = make_node()
        here = branches[symbol]
        here["count"] += 1

def build_trie(words, suffix=False):
    """Create a trie holding all of ``words``.

    Args:
        words: Iterable of strings.
        suffix: If true, every word is inserted reversed.

    Returns:
        dict: Root node of the new trie.

    NOTE(review): an equivalent ``build_trie`` is already defined in the
    previous cell; this redefinition rebinds the name to the same behavior.
    """
    root = make_node()
    # Reverse lazily so each word is flipped right before it is inserted.
    keys = (item[::-1] for item in words) if suffix else words
    for key in keys:
        insert_word(root, key)
    return root

def find_split(word, trie, suffix=False):
    """Split ``word`` at the highest-branching trie node and report its stats.

    The word (reversed when ``suffix`` is true) is followed through ``trie``
    until a character has no matching child. Among the visited nodes, the
    first one with the strictly largest number of children defines the split.

    Args:
        word: The string to split.
        trie: Root node of a trie whose nodes carry "count" and "children".
        suffix: When true, walk from the end of the word and cut characters
            off its end instead of its start.

    Returns:
        tuple: ``(stem, suff, max_branch, best_prob)`` where ``max_branch`` is
        the child count of the chosen node and ``best_prob`` is that child
        count divided by the node's "count". When no visited node has
        children the result is ``(word, "", 0, 0.0)``.
    """
    path = word[::-1] if suffix else word
    cursor = trie
    split_at, top_branch, top_prob = 0, 0, 0.0
    depth = 0  # characters consumed so far

    for symbol in path:
        if symbol not in cursor["children"]:
            break
        cursor = cursor["children"][symbol]
        depth += 1
        width = len(cursor["children"])
        if width > top_branch:
            top_branch = width
            split_at = depth
            # ratio of distinct continuations to words passing through here
            top_prob = width / cursor["count"]

    if not split_at:
        stem, suff = word, ""
    elif suffix:
        stem, suff = word[:-split_at], word[-split_at:]
    else:
        stem, suff = word[:split_at], word[split_at:]

    return stem, suff, top_branch, top_prob

# Load the vocabulary: one word per line, surrounding whitespace removed,
# blank lines skipped.
with open(input_file) as f:
    words = [w.strip() for w in f if w.strip()]

# One trie over the words as written, one over the reversed words.
prefix_trie = build_trie(words, suffix=False)
suffix_trie = build_trie(words, suffix=True)

# Each report line: "<word> = <stem>+<suffix>  | branching=<n>, prob=<p>".
# `freq` receives the third element returned by find_split, i.e. the child
# count (branching factor) of the chosen split node.
with open("Prefix Trie Splits Probability.txt", "w") as f:
    for w in words:
        stem, suff, freq, prob = find_split(w, prefix_trie, suffix=False)
        f.write(f"{w} = {stem}+{suff}  | branching={freq}, prob={prob:.2f}\n")

# "\n" here, matching the suffix message below; the earlier "\P" was an
# invalid escape sequence that printed a literal backslash.
print("\nPrefix Trie Splits probability written")

with open("Suffix Trie Splits Probability.txt", "w") as f:
    for w in words:
        stem, suff, freq, prob = find_split(w, suffix_trie, suffix=True)
        f.write(f"{w} = {stem}+{suff}  | branching={freq}, prob={prob:.2f}\n")

print("\nSuffix Trie Splits probability written")

\Prefix Trie Splits probability written

Suffix Trie Splits probability written
