## Name - Sayak Rana
## Roll No. - CS2427

In [1]:
from collections import Counter
from pathlib import Path
import json

class SimpleBPE:
    """Minimal byte-pair-encoding (BPE) tokenizer trained on whitespace-split words.

    Each word is split into its individual characters (Python code points)
    followed by the end-of-word marker ``"</w>"``.  Training repeatedly merges
    the most frequent adjacent symbol pair; encoding replays the learned merges
    in the order they were learned.

    Attributes:
        num_merges: Upper bound on the number of merges learned by ``learn_bpe``.
        min_freq: Training stops once the best pair occurs fewer times than this.
        merges: Learned merges as ``(left, right)`` tuples, earliest first.
        token_to_id: Mapping from token string to integer id.
        word_freq: Counter of symbol tuples -> word frequency (``None`` until
            ``build_word_freq`` or ``learn_bpe`` has run).
    """

    def __init__(self, num_merges=1000, min_freq=2):
        self.num_merges = num_merges
        self.min_freq = min_freq
        self.merges = []
        self.token_to_id = {}
        self.word_freq = None

    @staticmethod
    def _word_to_symbols(word):
        """Return ``word`` as a tuple of characters terminated by ``"</w>"``."""
        return tuple(word) + ("</w>",)

    def build_word_freq(self, lines):
        """Count whitespace-separated words in ``lines``, keyed by symbol tuple.

        The result is stored on ``self.word_freq`` and also returned.
        """
        freqs = Counter()
        for line in lines:
            # str.split() with no argument never yields empty strings, so no
            # extra emptiness check is needed here.
            for w in line.strip().split():
                freqs[self._word_to_symbols(w)] += 1
        self.word_freq = freqs
        return freqs

    @staticmethod
    def get_pair_counts(word_freq):
        """Return a Counter of adjacent symbol pairs weighted by word frequency."""
        pairs = Counter()
        for seq, freq in word_freq.items():
            for left, right in zip(seq, seq[1:]):
                pairs[(left, right)] += freq
        return pairs

    @staticmethod
    def merge_pair_in_vocab(pair, word_freq):
        """Return a new Counter in which every occurrence of ``pair`` is merged.

        Occurrences are replaced left to right without overlapping; words that
        collapse to the same symbol tuple have their frequencies summed.
        """
        a, b = pair
        merged = a + b
        new_freq = Counter()
        for seq, freq in word_freq.items():
            new_seq = []
            i = 0
            last = len(seq) - 1
            while i < len(seq):
                if i < last and seq[i] == a and seq[i + 1] == b:
                    new_seq.append(merged)
                    i += 2
                else:
                    new_seq.append(seq[i])
                    i += 1
            new_freq[tuple(new_seq)] += freq
        return new_freq

    def learn_bpe(self, lines):
        """Learn up to ``self.num_merges`` merges from ``lines`` and build the vocabulary.

        Training stops early when no adjacent pairs remain or when the most
        frequent pair occurs fewer than ``self.min_freq`` times.  Progress is
        printed every 100 merges.

        Sets ``self.merges``, ``self.token_to_id`` and ``self.word_freq`` and
        returns the list of merges.
        """
        word_freq = self.build_word_freq(lines)
        # Every token the encoder can emit for text built from training
        # characters: the starting alphabet plus each merge product.
        known_tokens = {sym for seq in word_freq for sym in seq}
        known_tokens.add("</w>")
        merges = []
        for i in range(self.num_merges):
            pair_counts = self.get_pair_counts(word_freq)
            if not pair_counts:
                break
            best_pair, best_count = pair_counts.most_common(1)[0]
            if best_count < self.min_freq:
                break
            merges.append(best_pair)
            known_tokens.add(best_pair[0] + best_pair[1])
            word_freq = self.merge_pair_in_vocab(best_pair, word_freq)
            if (i+1) % 100 == 0:
                unique_symbols = len({sym for seq in word_freq for sym in seq})
                print(f"learned {i+1} merges — unique symbols: {unique_symbols}")
        self.merges = merges
        # build token_to_id
        final_tokens = {sym for seq in word_freq for sym in seq}
        final_tokens.add("</w>")
        # Tokens still present in the final segmentation keep the ids they have
        # always had (sorted order).  Base characters and intermediate merge
        # products that were fully absorbed by later merges are appended after
        # them, so encoding an unseen word no longer has to invent ids for them
        # at encode time.
        ordered = sorted(final_tokens) + sorted(known_tokens - final_tokens)
        self.token_to_id = {t: idx for idx, t in enumerate(ordered)}
        self.word_freq = word_freq
        return merges

    def encode_word(self, word, rank=None):
        """Segment a single word into BPE tokens.

        Args:
            word: The word to segment (no surrounding whitespace).
            rank: Optional precomputed ``{pair: merge_index}`` mapping.  When
                omitted it is derived from ``self.merges``; ``encode`` passes
                it in so the mapping is built once per text, not once per word.

        Returns:
            The list of token strings; the last one ends with ``"</w>"``.
        """
        symbols = list(word) + ["</w>"]
        if rank is None:
            rank = {pair: idx for idx, pair in enumerate(self.merges)}
        while len(symbols) >= 2:
            best_rank = None
            best_pos = None
            for pos in range(len(symbols) - 1):
                r = rank.get((symbols[pos], symbols[pos + 1]))
                # Strict "<" keeps the leftmost occurrence of the best-ranked pair.
                if r is not None and (best_rank is None or r < best_rank):
                    best_rank = r
                    best_pos = pos
            if best_pos is None:
                break
            symbols[best_pos:best_pos + 2] = [symbols[best_pos] + symbols[best_pos + 1]]
        return symbols

    def encode(self, text):
        """Encode ``text`` into ``(tokens, ids)``.

        The text is split on whitespace and each word is segmented with
        ``encode_word``.  A token that is not in ``self.token_to_id`` (for
        example a character never seen during training) is assigned the next
        free id and stored, so this method can grow the vocabulary.
        """
        rank = {pair: idx for idx, pair in enumerate(self.merges)}
        tokens = []
        for w in text.strip().split():
            tokens.extend(self.encode_word(w, rank))
        ids = []
        for t in tokens:
            if t not in self.token_to_id:
                self.token_to_id[t] = len(self.token_to_id)
            ids.append(self.token_to_id[t])
        return tokens, ids

    def save(self, prefix="bengali_bpe_demo"):
        """Write ``<prefix>.merges.txt`` (one merge per line) and ``<prefix>.vocab.json``."""
        merges_text = "\n".join(" ".join(pair) for pair in self.merges)
        Path(prefix + ".merges.txt").write_text(merges_text, encoding="utf-8")
        vocab_text = json.dumps(self.token_to_id, ensure_ascii=False, indent=2)
        Path(prefix + ".vocab.json").write_text(vocab_text, encoding="utf-8")
        print("Saved:", prefix + ".merges.txt", prefix + ".vocab.json")

    def load(self, prefix="bengali_bpe_demo"):
        """Load merges and vocabulary previously written by ``save``.

        A missing file leaves the corresponding attribute empty; a warning is
        printed so the caller notices instead of silently getting an empty model.

        Raises:
            ValueError: If a non-blank line of the merges file does not contain
                exactly two whitespace-separated symbols.
        """
        merges_path = Path(prefix + ".merges.txt")
        vocab_path = Path(prefix + ".vocab.json")
        merges = []
        if merges_path.exists():
            merge_lines = merges_path.read_text(encoding="utf-8").splitlines()
            for line_no, ln in enumerate(merge_lines, start=1):
                parts = ln.split()
                if not parts:
                    continue
                if len(parts) != 2:
                    raise ValueError(
                        f"{merges_path}: line {line_no} should contain exactly "
                        f"two symbols, got {len(parts)}"
                    )
                merges.append((parts[0], parts[1]))
        else:
            print("Warning: merges file not found:", merges_path)
        if vocab_path.exists():
            token_to_id = json.loads(vocab_path.read_text(encoding="utf-8"))
        else:
            print("Warning: vocab file not found:", vocab_path)
            token_to_id = {}
        self.merges = merges
        self.token_to_id = token_to_id
        print("Loaded model from", prefix)


In [2]:
from datasets import load_dataset
import re

# Fetch the training split via `load_dataset` and report its columns.
print("Loading dataset mHossain/bengali_sentiment_v2 ...")
ds = load_dataset("mHossain/bengali_sentiment_v2", split="train")
print("Columns:", ds.column_names)

# Prefer the "text" column; otherwise fall back to the first column.
if "text" in ds.column_names:
    text_col = "text"
else:
    text_col = ds.column_names[0]

def clean_line(s):
    """Collapse every run of whitespace in ``s`` to one space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", s)
    return collapsed.strip()

# Keep every row whose text is non-blank, with its whitespace normalised.
lines = [
    clean_line(text)
    for text in (str(row[text_col]) for row in ds if row[text_col])
    if text.strip()
]
print("Total loaded lines:", len(lines))

# Cap the corpus at the first `sample_size` lines (a falsy value disables the cap).
sample_size = 10000
if sample_size and len(lines) > sample_size:
    lines = lines[:sample_size]
print("Using", len(lines), "lines for training.")

# Train
bpe = SimpleBPE(num_merges=1000, min_freq=2)
print("Starting training...")
merges = bpe.learn_bpe(lines)
print("Training done. Merges learned:", len(merges))
print("Sample merges (first 20):", merges[:20])


Loading dataset mHossain/bengali_sentiment_v2 ...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


train.csv: 0.00B [00:00, ?B/s]

val.csv: 0.00B [00:00, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/11881 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1486 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1485 [00:00<?, ? examples/s]

Columns: ['Unnamed: 0', 'text', 'label']
Total loaded lines: 11881
Using 10000 lines for training.
Starting training...
learned 100 merges — unique symbols: 245
learned 200 merges — unique symbols: 343
learned 300 merges — unique symbols: 443
learned 400 merges — unique symbols: 542
learned 500 merges — unique symbols: 641
learned 600 merges — unique symbols: 740
learned 700 merges — unique symbols: 838
learned 800 merges — unique symbols: 938
learned 900 merges — unique symbols: 1037
learned 1000 merges — unique symbols: 1136
Training done. Merges learned: 1000
Sample merges (first 20): [('ে', '</w>'), ('র', '</w>'), ('া', '</w>'), ('ন', '</w>'), ('ি', '</w>'), ('ে', 'র</w>'), ('।', '</w>'), ('া', 'র</w>'), ('্', 'য'), ('া', 'র'), ('ই', '</w>'), ('ক', 'র'), ('্', 'র'), ('ব', 'া'), ('য়', '</w>'), ('া', 'ন'), ('দ', 'ে'), ('্', 'ত'), ('ো', '</w>'), ('ক', '</w>')]


In [3]:
# Sentences used to spot-check the trained tokenizer.
examples = [
    "বাংলা ভাষা সুন্দর।",
    "আমি মেশিন লার্নিং শিখছি",
    "আপনার নাম কি?",
    "প্রযুক্তি আমাদের ভবিষ্যৎ।"
]

# Print each sentence followed by its tokens and ids.
for s in examples:
    toks, ids = bpe.encode(s)
    for label, value in (("\nINPUT:", s), ("TOKENS:", toks), ("IDS   :", ids)):
        print(label, value)



INPUT: বাংলা ভাষা সুন্দর।
TOKENS: ['বাং', 'লা</w>', 'ভা', 'ষ', 'া</w>', 'সু', 'ন্দ', 'র', '।</w>']
IDS   : [624, 812, 695, 861, 964, 907, 543, 781, 28]

INPUT: আমি মেশিন লার্নিং শিখছি
TOKENS: ['আমি</w>', 'মে', 'শি', 'ন</w>', 'ল', 'ার্', 'নি', 'ং</w>', 'শি', 'খ', 'ছি</w>']
IDS   : [79, 752, 846, 499, 807, 993, 515, 31, 846, 230, 307]

INPUT: আপনার নাম কি?
TOKENS: ['আপনার</w>', 'নাম</w>', 'কি', '?</w>']
IDS   : [70, 511, 191, 10]

INPUT: প্রযুক্তি আমাদের ভবিষ্যৎ।
TOKENS: ['প্র', 'যু', 'ক্তি</w>', 'আমাদের</w>', 'ভ', 'বিষ', '্য', 'ৎ', '।</w>']
IDS   : [582, 772, 220, 77, 693, 661, 1033, 1043, 28]


In [4]:
from google.colab import drive
import shutil
from pathlib import Path

# Mount Google Drive at /content/drive (Colab only).
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Files that `bpe.save` writes into the current working directory.
local_merges = Path("bengali_bpe_demo.merges.txt")
local_vocab  = Path("bengali_bpe_demo.vocab.json")

bpe.save(prefix="bengali_bpe_demo")

# Destination folder on the mounted Drive; created if it does not exist yet.
drive_target = Path("/content/drive/MyDrive/ISI/3rd_sem/nlp/assign1")
drive_target.mkdir(parents=True, exist_ok=True)

# Copy each artifact that exists locally and report any that are missing.
copied = []
for artifact, missing_msg in (
    (local_merges, "Local merges file not found:"),
    (local_vocab, "Local vocab file not found:"),
):
    if not artifact.exists():
        print(missing_msg, artifact)
        continue
    shutil.copy(artifact, drive_target / artifact.name)
    copied.append(artifact.name)

if not copied:
    print("No files copied. Ensure training completed and files exist locally.")
else:
    print("Copied files to Drive folder:", drive_target)
    print(copied)


Saved: bengali_bpe_demo.merges.txt bengali_bpe_demo.vocab.json
Copied files to Drive folder: /content/drive/MyDrive/ISI/3rd_sem/nlp/assign1
['bengali_bpe_demo.merges.txt', 'bengali_bpe_demo.vocab.json']
