In [None]:
import pickle

class TrieNode:
    """One node of the character trie.

    Attributes are plain instance fields so the node can be pickled as-is:

    * ``children``     -- maps a single character to the child ``TrieNode``.
    * ``is_end``       -- set to ``True`` by ``Trie.insert`` on the node where
      an inserted word finishes.
    * ``prefix_count`` -- bumped by ``Trie.insert`` once for every insertion
      whose path goes through this node.
    """

    def __init__(self) -> None:
        # Counter and flag first, then the (per-instance) child mapping.
        self.prefix_count: int = 0
        self.is_end: bool = False
        self.children: dict = {}


class Trie:
    """Character trie in which every node counts the insertions passing through it."""

    def __init__(self):
        self.root = TrieNode()

    def insert(self, word):
        """Add ``word`` to the trie.

        Missing nodes are created on the way down, each visited node's
        ``prefix_count`` is incremented, and the final node is flagged
        with ``is_end``.
        """
        current = self.root
        for symbol in word:
            child = current.children.get(symbol)
            if child is None:
                child = TrieNode()
                current.children[symbol] = child
            child.prefix_count += 1
            current = child
        current.is_end = True

    def save(self, filename):
        """Pickle the root node (and everything reachable from it) into ``filename``."""
        with open(filename, 'wb') as sink:
            pickle.dump(self.root, sink)

    def load(self, filename):
        """Replace ``self.root`` with the object unpickled from ``filename``.

        NOTE(review): ``pickle.load`` can execute arbitrary code while
        unpickling, so only point this at files you produced yourself.
        """
        with open(filename, 'rb') as source:
            self.root = pickle.load(source)

    def most_common_branch(self):
        """Follow the highest-``prefix_count`` child from the root down to a leaf.

        Returns the characters along that path joined into a string (the
        empty string for an empty trie).  On equal counts the child that
        appears first in the ``children`` dict wins.
        """
        letters = []
        current = self.root
        while current.children:
            best = max(current.children,
                       key=lambda c: current.children[c].prefix_count)
            letters.append(best)
            current = current.children[best]
        return ''.join(letters)

    def _add_nodes(self, dot, node, node_id):
        """Recursively emit one graph node and one edge per descendant of ``node``.

        ``dot`` only needs ``node(name, label=..., shape=...)`` and
        ``edge(tail, head)`` methods; child ids are formed by appending
        ``_<char>`` to the parent's id.
        """
        for char in node.children:
            child = node.children[char]
            child_id = f'{node_id}_{char}'
            pieces = [f'{char}', f'Count:{child.prefix_count}']
            if child.is_end:
                pieces.append('(End)')
            dot.node(child_id, label='\n'.join(pieces), shape='circle')
            dot.edge(node_id, child_id)
            self._add_nodes(dot, child, child_id)


# Input word list: one entry per line.
filename = 'brown_nouns.txt'

# Build the trie from every non-blank line of the input file.
trie = Trie()
with open(filename, 'r') as f:
    for line in f:
        word = line.strip()
        if not word:
            continue  # ignore empty / whitespace-only lines
        trie.insert(word)

# Persist the populated trie so it can be reloaded without re-reading the text file.
trie.save('brown_trie.pkl')


# Greedy walk along the most frequently traversed child at every level.
most_common_prefix = trie.most_common_branch()
print(f'Most common prefix: {most_common_prefix}')


Most common prefix: statements


In [None]:
from collections import defaultdict

class TrieNode:
    """Node shared by the prefix and suffix tries in this cell.

    * ``children`` -- character -> child node.
    * ``is_end``   -- flagged by the tries' ``insert`` on the last node of a word.
    * ``count``    -- incremented by ``insert`` each time a word's path
      goes through this node.
    """

    def __init__(self) -> None:
        self.count: int = 0
        self.is_end: bool = False
        self.children: dict = {}

class Trie:
    def __init__(self):
        self.root = TrieNode()

    def insert(self, word):
        node = self.root
        for ch in word:
            if ch not in node.children:
                node.children[ch] = TrieNode()
            node = node.children[ch]
            node.count += 1
        node.is_end = True

    def find_prefix_split(self, word, words_set):
        """
        Try to split word into stem+suffix using prefix trie
        Only valid if stem exists in dataset and suffix branching is valid
        """
        node = self.root
        for i in range(len(word)):
            ch = word[i]
            if ch not in node.children:
                break
            node = node.children[ch]

            stem = word[:i+1]
            suffix = word[i+1:]
            if stem in words_set and node.count > 1 and suffix != "":
                return f"{stem}+{suffix}"
        return word


class SuffixTrie:
    """Trie over reversed words, so a path from the root spells a word ending."""

    def __init__(self):
        self.root = TrieNode()

    def insert(self, word):
        """Insert ``word`` right to left (last character first).

        Every node on the path gets its ``count`` incremented; the node
        reached after the first character of the word is marked ``is_end``.
        """
        current = self.root
        for letter in reversed(word):
            nxt = current.children.get(letter)
            if nxt is None:
                nxt = TrieNode()
                current.children[letter] = nxt
            nxt.count += 1
            current = nxt
        current.is_end = True

    def find_suffix_split(self, word, words_set):
        """Split ``word`` into ``"stem+suffix"`` using the shortest acceptable ending.

        The word is consumed from its last character backwards.  At each
        step the part already consumed is the candidate suffix and the
        remainder is the candidate stem; the split is accepted when the
        stem is in ``words_set``, the node for the suffix has ``count > 1``
        and the suffix is non-empty.  Otherwise the word is returned as-is.
        """
        current = self.root
        # ``cut`` is the index where the candidate suffix starts.
        for cut in range(len(word) - 1, -1, -1):
            current = current.children.get(word[cut])
            if current is None:
                break

            head, tail = word[:cut], word[cut:]

            if head in words_set and current.count > 1 and tail != "":
                return f"{head}+{tail}"
        return word


if __name__ == "__main__":
    # Load the word list: one entry per line, stripped and lower-cased,
    # blank lines dropped.
    with open("brown_nouns.txt") as f:
        words = [w.strip().lower() for w in f if w.strip()]

    # Set of known words, used by the split finders to test whether a
    # candidate stem is itself a word.
    words_set = set(words)

    prefix_trie = Trie()
    suffix_trie = SuffixTrie()

    for w in words:
        prefix_trie.insert(w)
        suffix_trie.insert(w)

    # Collect results
    prefix_results = []
    suffix_results = []

    for w in words:
        p_split = prefix_trie.find_prefix_split(w, words_set)
        s_split = suffix_trie.find_suffix_split(w, words_set)

        # Both finders hand back the input unchanged when no split was
        # found, so compare against the input.  Searching the result for
        # "+" would wrongly count any word that itself contains a "+".
        if p_split != w:  # valid prefix split
            prefix_results.append(p_split)
        if s_split != w:  # valid suffix split
            suffix_results.append(s_split)

    # Save to files (one split per line, no trailing newline)
    with open("prefix.txt", "w") as f:
        f.write("\n".join(prefix_results))

    with open("suffix.txt", "w") as f:
        f.write("\n".join(suffix_results))

    print(f"✅ Prefix splits found: {len(prefix_results)} saved in prefix.txt")
    print(f"✅ Suffix splits found: {len(suffix_results)} saved in suffix.txt")


✅ Prefix splits found: 92116 saved in prefix.txt
✅ Suffix splits found: 91740 saved in suffix.txt


In [11]:
# ---------------- CHECK SPECIFIC WORD ----------------
# Relies on `prefix_trie`, `suffix_trie` and `words_set` already living in the
# kernel, and on those tries being instances of classes that provide
# find_prefix_split / find_suffix_split.
query_word = "jury"

p_split = prefix_trie.find_prefix_split(query_word, words_set)
s_split = suffix_trie.find_suffix_split(query_word, words_set)

# A result equal to the query word means no split was found.
for label, value in (
    ("Word:", query_word),
    ("Prefix-based split:", p_split),
    ("Suffix-based split:", s_split),
):
    print(label, value)


Word: jury
Prefix-based split: jury
Suffix-based split: jury


In [13]:
from collections import defaultdict

class TrieNode:
    """Trie node used by both ``Trie`` and ``SuffixTrie`` in this cell.

    * ``children`` -- character -> child node.
    * ``is_end``   -- flagged by ``insert`` on the node where a word finishes.
    * ``count``    -- how many inserted words pass through this node.
    """

    def __init__(self) -> None:
        self.count: int = 0
        self.is_end: bool = False
        self.children: dict = {}

class Trie:
    """Forward (prefix) trie with per-node pass-through counts."""

    def __init__(self):
        self.root = TrieNode()

    def insert(self, word):
        """Insert ``word`` left to right.

        Missing nodes are created, every node on the path has its ``count``
        incremented, and the final node is marked with ``is_end``.
        """
        node = self.root
        for ch in word:
            if ch not in node.children:
                node.children[ch] = TrieNode()
            node = node.children[ch]
            node.count += 1
        node.is_end = True

    def top_k_splits(self, word, k=3):
        """
        Return split form and stats for top-k splits.

        The word is walked through the trie until a character is missing.
        Each matched character yields a candidate cut placed right after
        it, weighted by the ``count`` of the node reached.  The ``k``
        highest-weighted candidates are kept (ties keep word order because
        ``sorted`` is stable) and the word is cut at those positions.

        Returns:
            ``(split_str, freq_info)`` where ``split_str`` looks like
            ``"word=part1+part2+..."`` and ``freq_info`` lists the chosen
            cuts as ``"char@count"`` joined by ``" | "``, highest count
            first.  When nothing matches, ``split_str`` is ``"word=word"``
            and ``freq_info`` is empty.
        """
        node = self.root
        split_candidates = []

        for i, ch in enumerate(word):
            if ch not in node.children:
                break
            node = node.children[ch]
            split_candidates.append((i, node.count, ch))

        # top k by count
        top_splits = sorted(split_candidates, key=lambda x: -x[1])[:k]
        split_indices = sorted(idx for idx, _, _ in top_splits)

        # build parts: each chosen index closes a segment that ends on it
        parts = []
        last = 0
        for idx in split_indices:
            parts.append(word[last:idx + 1])
            last = idx + 1
        # Only add the remainder when there is one.  A cut on the final
        # character used to append an empty piece and produce a dangling
        # "+" (e.g. "ab=a+b+").  With no cuts at all the whole word is the
        # single part, as before.
        if last < len(word) or not parts:
            parts.append(word[last:])

        # make split string
        split_str = f"{word}=" + "+".join(parts)

        # make frequency info
        freq_info = " | ".join([f"{ch}@{cnt}" for _, cnt, ch in top_splits])

        return split_str, freq_info


class SuffixTrie:
    """Trie over reversed words, so a path from the root spells a word ending."""

    def __init__(self):
        self.root = TrieNode()

    def insert(self, word):
        """Insert ``word`` right to left (last character first).

        Missing nodes are created, every node on the path has its ``count``
        incremented, and the node reached last is marked with ``is_end``.
        """
        node = self.root
        for ch in reversed(word):  # insert reversed
            if ch not in node.children:
                node.children[ch] = TrieNode()
            node = node.children[ch]
            node.count += 1
        node.is_end = True

    def top_k_splits(self, word, k=3):
        """
        Return split form and stats for top-k splits (suffix trie).

        The word is walked backwards through the trie until a character is
        missing.  The node reached after consuming ``word[j:]`` carries the
        number of inserted words ending in that suffix, so each candidate
        is recorded with ``j`` -- the index where its suffix STARTS.  The
        ``k`` highest-count candidates are kept (ties favour shorter
        suffixes because ``sorted`` is stable) and the word is cut
        immediately BEFORE each chosen index, so every chosen suffix
        begins at a segment boundary.

        Returns:
            ``(split_str, freq_info)`` where ``split_str`` looks like
            ``"word=part1+part2+..."`` and ``freq_info`` lists the chosen
            candidates as ``"char@count"`` joined by ``" | "``, highest
            count first.  When nothing matches, ``split_str`` is
            ``"word=word"`` and ``freq_info`` is empty.
        """
        node = self.root
        split_candidates = []
        n = len(word)

        for i, ch in enumerate(reversed(word)):
            if ch not in node.children:
                break
            node = node.children[ch]
            # n - i - 1 is the position of ``ch`` in ``word``, i.e. the
            # start of the suffix matched so far.
            split_candidates.append((n - i - 1, node.count, ch))

        # top k by count
        top_splits = sorted(split_candidates, key=lambda x: -x[1])[:k]
        split_indices = sorted(idx for idx, _, _ in top_splits)

        # build parts: cut BEFORE each suffix start.  The previous slicing
        # cut after the index (as the prefix trie does), and since the
        # last character is always among the picks that left an empty
        # final segment and a dangling "+" (e.g. "cats=cats+").
        parts = []
        last = 0
        for idx in split_indices:
            if idx > last:  # idx == 0 means the suffix is the whole word
                parts.append(word[last:idx])
                last = idx
        parts.append(word[last:])

        # make split string
        split_str = f"{word}=" + "+".join(parts)

        # frequency info
        freq_info = " | ".join([f"{ch}@{cnt}" for _, cnt, ch in top_splits])

        return split_str, freq_info


# ---------------- MAIN ----------------
if __name__ == "__main__":
    # Read the word list: strip and lower-case each line, skipping blanks.
    words = []
    with open("brown_nouns.txt") as f:
        for w in f:
            if w.strip():
                words.append(w.strip().lower())

    # Populate a forward trie and a reversed-word trie from the same words.
    prefix_trie = Trie()
    suffix_trie = SuffixTrie()
    for w in words:
        prefix_trie.insert(w)
        suffix_trie.insert(w)

    # One record per word and trie: the split line, then the frequency line,
    # each terminated by a newline.
    prefix_results = []
    suffix_results = []
    for w in words:
        split_str, freq_info = prefix_trie.top_k_splits(w, k=3)
        prefix_results.append(f"{split_str}\n{freq_info}\n")

        split_str, freq_info = suffix_trie.top_k_splits(w, k=3)
        suffix_results.append(f"{split_str}\n{freq_info}\n")

    # Records already end in a newline, so joining with another one leaves a
    # blank line between consecutive records in the output files.
    with open("prefix.txt", "w") as f:
        f.write("\n".join(prefix_results))

    with open("suffix.txt", "w") as f:
        f.write("\n".join(suffix_results))

    print("✅ Done. Prefix splits in prefix.txt, suffix splits in suffix.txt")


✅ Done. Prefix splits in prefix.txt, suffix splits in suffix.txt
