In [1]:
def load_telugu_corpus(path="../ASSIGNMENT-1/telugu_dataset.txt"):
    """Read the corpus file and return its non-blank lines, stripped.

    Args:
        path: Location of the text file. Defaults to the dataset path this
            notebook has always used, so existing calls are unaffected.

    Returns:
        A list of strings, one per non-blank line, with leading and trailing
        whitespace removed.

    Raises:
        OSError: If the file cannot be opened.
    """
    # "utf-8-sig" decodes plain UTF-8 identically but also drops a leading
    # byte-order mark, which would otherwise stay attached to the first token
    # (str.strip() does not treat U+FEFF as whitespace).
    # errors="replace" turns undecodable bytes into U+FFFD instead of raising.
    with open(path, "r", encoding="utf-8-sig", errors="replace") as f:
        stripped = (line.strip() for line in f)
        return [line for line in stripped if line]

def whitespace_tokenize(text):
    """Split ``text`` on runs of whitespace and return the pieces as a list.

    Leading and trailing whitespace produce no empty tokens; an empty or
    all-whitespace string yields an empty list.
    """
    tokens = text.split()
    return tokens


In [2]:
from collections import Counter, defaultdict

class FastBPE:
    """Byte-pair-encoding tokenizer trained with incrementally updated counts.

    A word is represented as a tuple of symbols ending in the marker
    ``"</w>"``. ``train`` repeatedly fuses the most frequent adjacent symbol
    pair; ``encode_word`` replays the learned merges, in order, on a new word.
    """

    def __init__(self):
        # Learned merges as (left, right) tuples, in the order they were learned.
        self.merges = []
        # Symbols known to the tokenizer: characters, "</w>" and merge products.
        self.vocab = set()

    @staticmethod
    def _merge_pair(symbols, pair):
        """Return ``symbols`` with each left-to-right occurrence of ``pair`` fused."""
        left, right = pair[0], pair[1]
        fused = []
        i = 0
        last = len(symbols) - 1
        while i <= last:
            if i < last and symbols[i] == left and symbols[i + 1] == right:
                fused.append(left + right)
                i += 2
            else:
                fused.append(symbols[i])
                i += 1
        return tuple(fused)

    def train(self, corpus, num_merges=32000):
        """Learn up to ``num_merges`` merges from ``corpus``.

        Args:
            corpus: Iterable of text lines; each line is split with
                ``whitespace_tokenize``.
            num_merges: Maximum number of merges to learn. Training stops
                earlier once no adjacent symbol pair is left.

        Learned merges are appended to ``self.merges``; the characters seen and
        every merge product are added to ``self.vocab``.
        """
        word_freqs = defaultdict(int)
        for line in corpus:
            for word in whitespace_tokenize(line):
                word_freqs[tuple(word) + ("</w>",)] += 1

        for symbols in word_freqs:
            self.vocab.update(symbols)

        # pair_stats: frequency of every adjacent pair over the whole corpus.
        # pair_words: pair -> words currently containing it. Dicts are used as
        # insertion-ordered sets so training is reproducible between runs
        # (plain sets of strings iterate in a hash-seed dependent order).
        pair_stats = Counter()
        pair_words = defaultdict(dict)
        for symbols, freq in word_freqs.items():
            for pair in zip(symbols, symbols[1:]):
                pair_stats[pair] += freq
                pair_words[pair][symbols] = None

        for _ in range(num_merges):
            if not pair_stats:
                break

            best = pair_stats.most_common(1)[0][0]
            self.merges.append(best)
            # Record the merge product immediately: a later merge may absorb it
            # in every training word, yet encode_word can still emit it.
            self.vocab.add(best[0] + best[1])

            for symbols in pair_words.pop(best, {}):
                freq = word_freqs.pop(symbols, 0)
                if not freq:
                    continue

                # Withdraw the old segmentation's contribution. Only words
                # containing `best` are touched; counts coming from all other
                # words must survive the merge untouched.
                for pair in zip(symbols, symbols[1:]):
                    remaining = pair_stats[pair] - freq
                    if remaining > 0:
                        pair_stats[pair] = remaining
                    else:
                        pair_stats.pop(pair, None)
                    holders = pair_words.get(pair)
                    if holders is not None:
                        holders.pop(symbols, None)
                        if not holders:
                            del pair_words[pair]

                merged = self._merge_pair(symbols, best)
                word_freqs[merged] += freq

                # Register the new segmentation's pairs.
                for pair in zip(merged, merged[1:]):
                    pair_stats[pair] += freq
                    pair_words[pair][merged] = None

            # A merged word never contains `best` again, so its count is
            # already gone; the pop guarantees the same pair cannot be picked
            # twice even if the index and the counts ever disagreed.
            pair_stats.pop(best, None)

    def encode_word(self, word):
        """Split one word into subword tokens by replaying the learned merges.

        A token that ends the word keeps its ``"</w>"`` suffix; a stand-alone
        trailing ``"</w>"`` marker is dropped from the result.
        """
        tokens = tuple(word) + ("</w>",)
        for pair in self.merges:
            if len(tokens) == 1:
                # Nothing left to merge; later merges cannot apply.
                break
            tokens = self._merge_pair(tokens, pair)
        if tokens[-1] == "</w>":
            tokens = tokens[:-1]
        return list(tokens)

    def encode(self, text):
        """Tokenize ``text``: split on whitespace, then encode each word."""
        out = []
        for w in whitespace_tokenize(text):
            out.extend(self.encode_word(w))
        return out



In [3]:
from collections import defaultdict

class WordPiece:
    """Greedy longest-match subword tokenizer with ``##`` continuation pieces.

    The first piece of a word is stored as plain text; every later piece is
    stored with a ``"##"`` prefix. ``train`` grows the vocabulary by repeatedly
    fusing the most frequent adjacent pair of pieces (raw pair frequency).
    """

    def __init__(self):
        self.vocab = set()       # membership lookups
        self.vocab_list = []     # same tokens, in the order they were added
        self.unk = "[UNK]"

    def _add_token(self, token):
        """Add ``token`` to both vocabulary containers if it is not present."""
        if token not in self.vocab:
            self.vocab.add(token)
            self.vocab_list.append(token)

    def tokenize_word(self, word):
        """Split ``word`` greedily into the longest pieces found in the vocab.

        Returns:
            The list of pieces, or ``[self.unk]`` when some position of the
            word matches no vocabulary entry. An empty word gives ``[]``.
        """
        sub_tokens = []
        start = 0
        length = len(word)

        while start < length:
            match = None
            end = length
            while end > start:
                piece = word[start:end]
                if start > 0:
                    piece = "##" + piece
                if piece in self.vocab:
                    match = piece
                    break
                end -= 1

            if match is None:
                return [self.unk]

            sub_tokens.append(match)
            # The match always covers word[start:end]. Deriving the step from
            # the token text instead would miscount a word-initial piece that
            # itself begins with the characters "##" and could stall the loop.
            start = end

        return sub_tokens

    def tokenize(self, text):
        """Tokenize ``text``: split on whitespace, then split each word."""
        result = []
        for w in whitespace_tokenize(text):
            result.extend(self.tokenize_word(w))
        return result

    def train(self, corpus, vocab_size=32000):
        """Build a vocabulary of at most ``vocab_size`` tokens from ``corpus``.

        Args:
            corpus: Iterable of text lines; each line is split with
                ``whitespace_tokenize``.
            vocab_size: Growth stops once the vocabulary reaches this size or
                no adjacent pair of pieces is left to fuse.

        ``self.vocab`` and ``self.vocab_list`` are rebuilt from scratch.
        """
        word_freq = defaultdict(int)
        for line in corpus:
            for w in whitespace_tokenize(line):
                word_freq[w] += 1

        alphabet = sorted({ch for w in word_freq for ch in w})

        self.vocab = set()
        self.vocab_list = []
        self._add_token(self.unk)
        for ch in alphabet:
            self._add_token(ch)
        # tokenize_word looks up every non-initial piece with a "##" prefix, so
        # the continuation form of each character must be present as well;
        # otherwise every multi-character word is [UNK] and no pair is found.
        for ch in alphabet:
            self._add_token("##" + ch)

        # Cache each distinct word's segmentation and keep the pair counts in
        # step with it, instead of re-tokenizing every word per new token.
        segments = {w: self.tokenize_word(w) for w in word_freq}
        pair_freqs = defaultdict(int)
        for w, toks in segments.items():
            for pair in zip(toks, toks[1:]):
                pair_freqs[pair] += word_freq[w]

        while len(self.vocab) < vocab_size and pair_freqs:
            best = max(pair_freqs, key=pair_freqs.get)
            left, right = best
            # The fused token keeps the left piece's form (plain or "##") and
            # appends the right piece without its continuation prefix.
            new_tok = left + (right[2:] if right.startswith("##") else right)

            if new_tok in self.vocab:
                # Fusing this pair adds nothing new. Drop it so another pair is
                # chosen; retrying with unchanged state would never terminate.
                del pair_freqs[best]
                continue

            self._add_token(new_tok)

            # A word's segmentation can change only if the new token can match
            # one of tokenize_word's lookups for it: verbatim at the start of
            # the word, or (for a "##" token) as text at a later position.
            is_continuation = new_tok.startswith("##")
            inner_text = new_tok[2:]
            changed = []
            for w, old_toks in segments.items():
                if w.startswith(new_tok) or (
                    is_continuation and w.find(inner_text, 1) != -1
                ):
                    new_toks = self.tokenize_word(w)
                    if new_toks != old_toks:
                        changed.append((w, old_toks, new_toks))

            for w, old_toks, new_toks in changed:
                freq = word_freq[w]
                for pair in zip(old_toks, old_toks[1:]):
                    pair_freqs[pair] -= freq
                    if pair_freqs[pair] <= 0:
                        del pair_freqs[pair]
                for pair in zip(new_toks, new_toks[1:]):
                    pair_freqs[pair] += freq
                segments[w] = new_toks

In [None]:
if __name__ == "__main__":
    # Inputs shared by both tokenizer demos.
    sample_text = "నేను మంచి పిల్లను"
    target_size = 32000
    corpus = load_telugu_corpus()

    # BPE: learn merges from the corpus, then tokenize the sample sentence.
    print("\nTraining FAST BPE... this will be MUCH faster...")
    bpe = FastBPE()
    bpe.train(corpus, num_merges=target_size)
    bpe_tokens = bpe.encode(sample_text)
    print("BPE vocab size:", len(bpe.vocab))
    print("BPE sample:", bpe_tokens)

    # WordPiece: build the vocabulary, then tokenize the same sentence.
    print("\nTraining WordPiece...")
    wp = WordPiece()
    wp.train(corpus, vocab_size=target_size)
    wp_tokens = wp.tokenize(sample_text)
    print("WordPiece vocab:", len(wp.vocab))
    print("WP sample:", wp_tokens)


Training FAST BPE... this will be MUCH faster...
