In [1]:
# We build tries that hold every word of the vocabulary.
# The same structure serves both cases: pass suffix=True to get a suffix trie.
# With suffix=True each word is reversed first and then inserted exactly as in the prefix trie.
# Each node has three attributes: children, is_end and freq.
# freq stores how many inserted words pass through that node.
# To insert a word we walk it character by character: if the character is already a child of the
# current node we move to that child, otherwise we create a new child node first;
# in both cases we add 1 to the freq of the node we arrive at.
class TrieNode:
    """A single trie node: outgoing edges, an end-of-word flag and a visit counter."""

    def __init__(self):
        # Number of insertions that have stepped onto this node.
        self.freq = 0
        # True once some inserted sequence has ended exactly here.
        self.is_end = False
        # Maps one character to the child node reached through it.
        self.children = dict()


class Trie:
    """Character trie with per-node pass-through counts.

    With ``suffix=True`` every word is reversed before it is inserted or
    looked up, so the same structure indexes word endings instead of
    word beginnings.
    """

    def __init__(self, suffix=False):
        self.root = TrieNode()
        self.suffix = suffix

    def _oriented(self, word):
        """Return ``word`` in the order this trie walks it (reversed for a suffix trie)."""
        if self.suffix:
            return word[::-1]
        return word

    def insert(self, word):
        """Add ``word``, bumping ``freq`` on every node along its path.

        The root's ``freq`` is never incremented; the node reached by the
        last character is flagged with ``is_end``.
        """
        current = self.root
        for symbol in self._oriented(word):
            child = current.children.get(symbol)
            if child is None:
                child = TrieNode()
                current.children[symbol] = child
            child.freq += 1
            current = child
        current.is_end = True

    def build(self, words):
        """Insert every item of ``words`` in iteration order."""
        for entry in words:
            self.insert(entry)

    def stem_word(self, word):
        """Split ``word`` in two at the position with the largest frequency drop.

        Walking the trie along the word, each step compares the parent's
        ``freq`` with the child's ``freq``; the split is placed right after
        the child with the highest parent/child ratio (the deepest one on
        ties). A split at the very end, or no split at all, is replaced by
        ``max(1, len - 1)``.

        Returns a 2-tuple ``(stem, suffix)`` cut from the walked sequence.
        For a suffix trie both parts are reversed back to reading order, so
        the first element is the word's ending and the second its beginning.
        """
        chars = self._oriented(word)
        current = self.root

        top_ratio = -1
        cut = None

        for position, symbol in enumerate(chars, start=1):
            child = current.children.get(symbol)
            if child is None:
                # The walk leaves the trie here; keep the best cut found so far.
                break

            freq_above = current.freq
            current = child

            # The root carries freq 0, so the first character never yields a ratio.
            if current.freq <= 0 or freq_above <= 0:
                continue

            ratio = freq_above / current.freq
            is_tie = abs(ratio - top_ratio) < 1e-6

            # Strictly better ratio wins; on a tie the deeper position wins.
            if ratio > top_ratio or (is_tie and position > cut):
                top_ratio = ratio
                cut = position

        # Enforce at least one split: never leave the second part empty
        # when the sequence has two or more characters.
        if cut in (None, 0, len(chars)):
            cut = max(1, len(chars) - 1)

        head, tail = chars[:cut], chars[cut:]
        return (head[::-1], tail[::-1]) if self.suffix else (head, tail)



In [2]:
# Load the noun list: one word per line, lower-cased, blank lines dropped.
# Only the read happens inside the `with`, so the file is closed before the
# tries are built.
with open("brown_nouns.txt", "r", encoding="utf-8") as f:
    words = [line.strip().lower() for line in f if line.strip()]

# One trie over word beginnings, one over word endings, same vocabulary.
prefix_trie = Trie(suffix=False)
suffix_trie = Trie(suffix=True)

prefix_trie.build(words)
suffix_trie.build(words)

# Slice of the corpus shown below; the header reports the actual range so the
# label cannot drift away from the slice.
SAMPLE_START, SAMPLE_STOP = 50, 100
sample_words = words[SAMPLE_START:SAMPLE_STOP]

print(f"Stemming results (words[{SAMPLE_START}:{SAMPLE_STOP}], {len(sample_words)} words):\n")
for w in sample_words:
    p_stem, p_suff = prefix_trie.stem_word(w)
    s_stem, s_suff = suffix_trie.stem_word(w)
    print(f"Word: {w}")
    print(f"  Prefix Trie → {w} = {p_stem}+{p_suff}")
    # For the suffix trie, stem_word returns (word ending, word beginning),
    # so the beginning (s_suff) is printed first to keep reading order.
    print(f"  Suffix Trie → {w} = {s_suff}+{s_stem}")
    print()

# Single hand-picked query. The label is held in a variable because reusing
# double quotes inside a double-quoted f-string is a SyntaxError before
# Python 3.12.
query = "goes"
query_label = query.capitalize()
p_stem, p_suff = prefix_trie.stem_word(query)
s_stem, s_suff = suffix_trie.stem_word(query)
print(f"Word: {query}")
print(f"  Prefix Trie → {query_label} = {p_stem}+{p_suff}")
print(f"  Suffix Trie → {query_label} = {s_suff}+{s_stem}")
print()


Stemming results (first 20 words):

Word: city
  Prefix Trie → city = ci+ty
  Suffix Trie → city = c+ity

Word: personnel
  Prefix Trie → personnel = pe+rsonnel
  Suffix Trie → personnel = personn+el

Word: policies
  Prefix Trie → policies = polici+es
  Suffix Trie → policies = poli+cies

Word: city
  Prefix Trie → city = ci+ty
  Suffix Trie → city = c+ity

Word: steps
  Prefix Trie → steps = ste+ps
  Suffix Trie → steps = ste+ps

Word: problem
  Prefix Trie → problem = prob+lem
  Suffix Trie → problem = probl+em

Word: implementation
  Prefix Trie → implementation = impl+ementation
  Suffix Trie → implementation = implemen+tation

Word: automobile
  Prefix Trie → automobile = au+tomobile
  Suffix Trie → automobile = automob+ile

Word: title
  Prefix Trie → title = tit+le
  Suffix Trie → title = ti+tle

Word: law
  Prefix Trie → law = la+w
  Suffix Trie → law = l+aw

Word: jury
  Prefix Trie → jury = jur+y
  Suffix Trie → jury = j+ury

Word: funds
  Prefix Trie → funds = fu+nds
  Suff

In [13]:
def analyze_trie(trie):
    """Collect structural counts for ``trie`` by visiting every node once.

    Returns a dict with:
      * ``total_nodes``      - number of nodes visited, root included
      * ``total_children``   - sum of ``len(node.children)`` over all nodes
      * ``branching_depths`` - depth of every node with more than one child,
                               listed in pre-order (root is depth 0)
    """
    node_count = 0
    edge_count = 0
    fork_depths = []

    # Explicit stack instead of recursion; children are pushed in reverse so
    # they are popped in their original order, i.e. a pre-order walk.
    pending = [(trie.root, 0)]
    while pending:
        node, depth = pending.pop()
        fanout = len(node.children)

        node_count += 1
        edge_count += fanout
        if fanout > 1:  # branching occurs
            fork_depths.append(depth)

        for child in reversed(list(node.children.values())):
            pending.append((child, depth + 1))

    return {
        "total_nodes": node_count,
        "total_children": edge_count,
        "branching_depths": fork_depths,
    }


In [14]:
def compute_metrics(stats, total_characters):
    """Turn raw trie counts into summary ratios.

    Parameters
    ----------
    stats : dict
        Must provide ``"total_nodes"``, ``"total_children"`` and
        ``"branching_depths"`` (a list of depths).
    total_characters : int
        Denominator for the compression ratio.

    Returns
    -------
    dict
        ``avg_branching``    - ``total_children / total_nodes``
        ``avg_branch_depth`` - mean of ``branching_depths`` (0 if empty)
        ``compression``      - ``(total_characters - total_nodes) / total_characters``

    A zero denominator (no nodes, or no characters such as for an empty
    word list) yields 0.0 for that ratio instead of raising
    ZeroDivisionError.
    """
    node_count = stats["total_nodes"]
    depths = stats["branching_depths"]

    # NOTE(review): when stats comes from analyze_trie, every non-root node is
    # counted exactly once as somebody's child, so this is always (N-1)/N and
    # does not depend on the trie's shape. Dividing by the number of nodes
    # that have children would give a real branching factor - confirm intent.
    avg_branching = stats["total_children"] / node_count if node_count else 0.0

    avg_branch_depth = sum(depths) / len(depths) if depths else 0

    # total_nodes may include a root that stores no character - TODO confirm
    # whether it should be excluded from the compression numerator.
    if total_characters:
        compression = (total_characters - node_count) / total_characters
    else:
        compression = 0.0

    return {
        "avg_branching": avg_branching,
        "avg_branch_depth": avg_branch_depth,
        "compression": compression,
    }


In [15]:
# Small hand-made vocabulary used to compare the two trie orientations.
words = ["goes", "going", "played", "plays", "dogs", "cats"]
total_characters = sum(map(len, words))

# Fill the prefix trie and the suffix trie in a single pass over the words;
# the two tries are independent, so interleaving the inserts changes nothing.
prefix_trie = Trie(suffix=False)
suffix_trie = Trie(suffix=True)
for w in words:
    prefix_trie.insert(w)
    suffix_trie.insert(w)

# Raw structural counts, then the derived ratios, for each orientation.
prefix_stats = analyze_trie(prefix_trie)
prefix_metrics = compute_metrics(prefix_stats, total_characters)

suffix_stats = analyze_trie(suffix_trie)
suffix_metrics = compute_metrics(suffix_stats, total_characters)

print("Prefix Trie:", prefix_metrics)
print("Suffix Trie:", suffix_metrics)


Prefix Trie: {'avg_branching': 0.9565217391304348, 'avg_branch_depth': 2.0, 'compression': 0.17857142857142858}
Suffix Trie: {'avg_branching': 0.9615384615384616, 'avg_branch_depth': 0.5, 'compression': 0.07142857142857142}


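Prefix Trie is better on this sample → It compresses more (0.18 vs 0.07) and branches later (average branching depth 2.0 vs 0.5). This matches the expectation that English words share more prefixes than suffixes, although six words are too few to establish that in general.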

Suffix Trie didn’t help much → The words in this sample overlap little in their endings, so suffix-based morphology is weaker here.