In [70]:
class TrieNode:
    def __init__(self):
        self.children={}
        self.count=0
        self.is_end=False

class PrefixTrie:
    def __init__(self):
        self.root=TrieNode()
    
    def insert(self,word):
        node=self.root
        for char in word:
            if char not in node.children:
                node.children[char]=TrieNode()
            node=node.children[char]
            node.count+=1
        node.is_end=True
    
    def dfs(self,node,prefix="",depth=0):
        if node is None:
            node=self.root
        print(f"Prefix: {prefix} | Depth: {depth} | Count: {node.count}")
        for char,nxt in node.children.items():
            self.dfs(nxt,prefix+char,depth+1)

In [71]:
class SuffixTrie:
    def __init__(self):
        self.root=TrieNode()
    
    def insert(self,word):
        word=word[::-1]
        node=self.root
        for char in word:
            if char not in node.children:
                node.children[char]=TrieNode()
            node=node.children[char]
            node.count+=1
        node.is_end=True
    
    def dfs(self,node,prefix="",depth=0):
        if node is None:
            node=self.root
        print(f"Prefix: {prefix} | Depth: {depth} | Count: {node.count}")
        for char,nxt in node.children.items():
            self.dfs(nxt,prefix+char,depth+1)

In [72]:
path="C:\\Sahil\\NLP\\Lab3\\brown_nouns.txt"
trie=PrefixTrie()
with open(path,'r') as f:
    for line in f:
        words=line.split()
        for word in words:
            trie.insert(word)



In [73]:
total_count = 0
with open(path,'r') as f:
    for line in f:
        words=line.split()
        for word in words:
            total_count+=1
print(f"Total Count: {total_count}")

Total Count: 202793


In [74]:
import math

def find_best_split(word, trie, total_count,beta=1.0,min_count=1,min_frac=None):
    node = trie.root
    depth = 0
    parent_count = total_count
    scores = {}
    
    
    for ch in word:
        if ch not in node.children:
            break
        node = node.children[ch]
        depth += 1
        cnt = node.count

        if min_frac is not None and cnt < min_frac * total_count:
            continue
        if cnt < min_count:
            continue
        ratio = (cnt / (parent_count)) if parent_count > 0 else 0.0
        score = cnt * math.log(1 + depth) * (ratio ** beta)
        extra = {'ratio': ratio}
        scores[depth] = {
            'score': score,
            'prefix': word[:depth],
            'count': cnt,
            'depth': depth,
            'parent_count': parent_count,
            **extra
        }
        parent_count = cnt

    if not scores:
        return None
 
    best_depth, best_info = max(scores.items(), key=lambda kv: kv[1]['score'])
    return best_info


In [76]:
suffix_trie = SuffixTrie()
with open(path, 'r') as f:
    for line in f:
        words = line.split()
        for word in words:
            suffix_trie.insert(word) 

In [None]:
import csv

prefix_file = "best_splits.txt"
with open(path, 'r') as f, open(prefix_file, 'w', encoding='utf-8', newline='') as out:
    writer = csv.writer(out)
    for line in f:
        words = line.split()
        for word in words:
            best = find_best_split(word, trie, total_count, min_count=1, min_frac=0.0001)
            if best:
                writer.writerow([
                    word,
                    best['prefix'],
                    f"{best['score']:.2f}",
                    best['count'],
                    best['depth'],
                    f"{best['ratio']:.4f}"
                ])


suffix_file = "best_splits_suffix.txt"
with open(path, 'r') as f, open(suffix_file, 'w', encoding='utf-8', newline='') as out:
    writer = csv.writer(out)
    for line in f:
        words = line.split()
        for word in words:
            best = find_best_split(word[::-1], suffix_trie, total_count, min_count=1, min_frac=0.0001)
            if best:
                suffix = best['prefix'][::-1]
                writer.writerow([
                    word,
                    suffix,
                    f"{best['score']:.2f}",
                    best['count'],
                    best['depth'],
                    f"{best['ratio']:.4f}"
                ])


In [None]:
import pandas as pd

suffix_file = "best_splits_suffix.txt"
prefix_file = "best_splits.txt"

prefix_df = pd.read_csv(
    "best_splits.txt",
    names=["word", "prefix", "score_pre", "freq_pre", "len_pre", "ratio_pre"]
)

suffix_df = pd.read_csv(
    "best_splits_suffix.txt",
    names=["word", "suffix", "score_suf", "freq_suf", "len_suf", "ratio_suf"]
)

df = pd.merge(suffix_df, prefix_df, on="word", how="inner")

df["final_score"] = (
    (df["ratio_suf"] * df["len_suf"] * df["score_suf"])**0.5 +
    (df["ratio_pre"] * df["len_pre"] * df["score_pre"])**0.5
)

df["final_score"] = df["final_score"] / df["final_score"].max()


df_sorted = df.sort_values("final_score", ascending=False)

print(df_sorted[["word", "prefix", "suffix", "final_score"]].head(20))




                     word prefix suffix  final_score
14562523       production    pro    ion          1.0
14562522       production    pro    ion          1.0
14562521       production    pro    ion          1.0
14562520       production    pro    ion          1.0
14562519       production    pro    ion          1.0
14562518       production    pro    ion          1.0
14562517       production    pro    ion          1.0
14562516       production    pro    ion          1.0
14562515       production    pro    ion          1.0
14562514       production    pro    ion          1.0
14562481       production    pro    ion          1.0
14562480       production    pro    ion          1.0
14562511       production    pro    ion          1.0
14562510       production    pro    ion          1.0
14562509       production    pro    ion          1.0
2429090   prognostication    pro    ion          1.0
14562538       production    pro    ion          1.0
14562539       production    pro    ion       

In [80]:
df["better"] = df.apply(
    lambda row: "prefix" if row["score_pre"] > row["score_suf"] else "suffix",
    axis=1
)

In [None]:
df["prefix_strength"] = df["score_pre"] * df["ratio_pre"] * df["len_pre"]
df["suffix_strength"] = df["score_suf"] * df["ratio_suf"] * df["len_suf"]

df["better"] = df.apply(
    lambda row: "prefix" if row["prefix_strength"] > row["suffix_strength"] else "suffix",
    axis=1
)

In [82]:
print(df["better"].value_counts(normalize=True) * 100)


better
suffix    73.064366
prefix    26.935634
Name: proportion, dtype: float64


score = cnt * log(1 + depth) * (ratio ** beta)


Suffix analysis explains most word patterns, prefix explains fewer but still important ones.

Suffix is better if

final_score_suf(word)
>
final_score_pre(word)
final_score_suf(word)>final_score_pre(word)

Prefix is better if

final_score_pre(word)
>
final_score_suf(word)
final_score_pre(word)>final_score_suf(word)

side_score_suf=sqrt(ratio×len×score)suffix​



side_score_pre=sqrt(ratio×len×score)prefix​
​
​

#In english ,Suffixes are generally more informative than prefixes.