# Benchmarking MoR on Bangla Dataset (Kaggle)

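This notebook benchmarks the Mixture-of-Recursions (MoR) Transformer on the Bangla dataset.
It is designed to run end-to-end in a Kaggle kernel: it clones the repository, patches the Bangla data loader, installs the requirements, runs training, and prints the results.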

In [None]:
# 1. Setup Repository
import os

REPO_URL = "https://github.com/ShMazumder/Benchmarking-MoR-on-fine-tuned-SLM.git"
REPO_DIR = "Benchmarking-MoR-on-fine-tuned-SLM"

if not os.path.exists(REPO_DIR):
    print(f"Cloning repository from {REPO_URL}...")
    !git clone {REPO_URL}
else:
    print("Repository already exists.")

# Change working directory to the code folder
CODE_DIR = os.path.join(REPO_DIR, 'code')
if os.path.exists(CODE_DIR):
    os.chdir(CODE_DIR)
    print(f"Changed directory to {os.getcwd()}")
else:
    # If we are already inside the repo (e.g. uploaded as dataset)
    if os.path.exists('code'):
        os.chdir('code')
        print(f"Changed directory to {os.getcwd()}")

In [None]:
# 2. HOST FIX: Patch bangla.py (Fixing duplicate code and path issues)
# This ensures the code works even if the repo hasn't been updated yet.
#
# The replacement module source is held in a string and written over
# data/bangla.py (relative to the current working directory).
# The outer literal is delimited with ''' because the embedded source starts
# with a """ docstring; delimiting it with """ as well terminates the literal
# immediately and makes the whole cell a syntax error.

bangla_py_content = '''"""Bangla SLM Dataset Loader with configurable tokenization"""
import os
from pathlib import Path
import torch
from torch.utils.data import Dataset, DataLoader
from config import Config

try:
    from data.tokenizers import load_sentencepiece, train_sentencepiece, build_word_vocab_from_text, encode_with_sp
except Exception:
    from tokenizers import load_sentencepiece, train_sentencepiece, build_word_vocab_from_text, encode_with_sp


class BanglaSLMDataset(Dataset):
    def __init__(self, seq_length=64, split='train', split_ratio=0.9, tokenization='char', tokenizer_model=None, vocab_size=None, data_file=None):
        self.seq_length = seq_length
        cfg = Config()

        # Determine data path: use argument if provided, else use config default
        if data_file:
            data_path = Path(data_file)
        else:
            # Fallback to config
            data_path = Path(cfg.bangla_data_file)

            # Ensure the directory exists if we are likely to write to it (though we just read usually)
            if not data_path.parent.exists():
                 data_path.parent.mkdir(parents=True, exist_ok=True)

        if not data_path.exists():
            # If on Kaggle, check input directory for common patterns
            kaggle_path = Path('/kaggle/input/bangla-slm/bangla_slm.txt')
            if kaggle_path.exists():
                print(f"Found dataset at: {kaggle_path}")
                data_path = kaggle_path
            else:
                raise FileNotFoundError(f"Bangla dataset file not found: {data_path}. Please place a UTF-8 text file at that path or in /kaggle/input.")

        text = data_path.read_text(encoding='utf-8')

        self.tokenization = tokenization
        if tokenization == 'char':
            chars = sorted(list(set(text)))
            self.vocab_size = len(chars)
            self.stoi = {ch: i for i, ch in enumerate(chars)}
            self.itos = {i: ch for i, ch in enumerate(chars)}
            data_ids = [self.stoi[ch] for ch in text]

        elif tokenization == 'word':
            stoi, itos = build_word_vocab_from_text(text)
            self.stoi = stoi
            self.itos = itos
            self.vocab_size = len(stoi)
            data_ids = [self.stoi[w] for w in text.split()]

        elif tokenization == 'subword':
            model_path = tokenizer_model or cfg.tokenizer_model_bangla
            model_file = Path(model_path)

            # Ensure tokenizer directory exists
            model_file.parent.mkdir(parents=True, exist_ok=True)

            if not model_file.exists():
                print(f"Training SentencePiece model for Bangla at {model_file}...")
                model_prefix = str(model_file.with_suffix(''))
                train_sentencepiece(str(data_path), model_prefix, vocab_size or cfg.subword_vocab_size)
                model_file = Path(model_prefix + '.model')

            sp = load_sentencepiece(str(model_file))
            self.vocab_size = sp.get_piece_size()
            self.stoi = None
            self.itos = None
            data_ids = encode_with_sp(sp, text)

        else:
            raise ValueError(f'Unknown tokenization: {tokenization}')

        data = torch.tensor(data_ids, dtype=torch.long)

        split_idx = int(len(data) * split_ratio)
        if split == 'train':
            self.data = data[:split_idx]
        else:
            self.data = data[split_idx:]

    def __len__(self):
        # We need at least seq_length + 1 tokens to form one sample (x, y)
        if len(self.data) <= self.seq_length:
            return 0
        return len(self.data) - self.seq_length

    def __getitem__(self, idx):
        x = self.data[idx:idx + self.seq_length]
        y = self.data[idx + 1:idx + self.seq_length + 1]
        return x, y


def get_bangla_loaders(batch_size=64, seq_length=64, split_ratio=0.9, tokenization=None, tokenizer_model=None, vocab_size=None, data_file=None):
    cfg = Config()
    tokenization = tokenization or cfg.tokenization
    tokenizer_model = tokenizer_model or (cfg.tokenizer_model_bangla if tokenization == 'subword' else None)
    vocab_size = vocab_size or cfg.subword_vocab_size

    # Pass data_file to the dataset class
    train_dataset = BanglaSLMDataset(seq_length=seq_length, split='train', split_ratio=split_ratio,
                                     tokenization=tokenization, tokenizer_model=tokenizer_model, vocab_size=vocab_size, data_file=data_file)
    test_dataset = BanglaSLMDataset(seq_length=seq_length, split='test', split_ratio=split_ratio,
                                    tokenization=tokenization, tokenizer_model=tokenizer_model, vocab_size=vocab_size, data_file=data_file)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True)

    return train_loader, test_loader, train_dataset.vocab_size
'''

# Overwrite the repository's loader with the patched source. The encoding is
# pinned so the written file does not depend on the kernel's locale.
with open('data/bangla.py', 'w', encoding='utf-8') as f:
    f.write(bangla_py_content)
print("Patched data/bangla.py")

In [None]:
# 3. Install Requirements
# Installs requirements.txt (read from the current working directory) with the
# same interpreter that runs this kernel, then reports whether pip succeeded.
import subprocess
import sys

# `sys.executable -m pip` targets the kernel's own environment; a bare `pip`
# found on PATH may belong to a different Python installation.
pip_result = subprocess.run(
    [sys.executable, '-m', 'pip', 'install', '-r', 'requirements.txt', '--quiet']
)

# Only claim success when pip actually exited cleanly. A failure is reported
# but not raised, so a kernel that already has the packages can still continue.
if pip_result.returncode == 0:
    print("Requirements installed.")
else:
    print(f"WARNING: pip install exited with code {pip_result.returncode}; "
          "requirements may be missing. Check the pip output above.")

In [None]:
# 4. Run Training
# Runs train_amp.py (from the current working directory) in a subprocess using
# the kernel's own interpreter.
#   --dataset bangla
#   --experiment mor_exp1 (MoR N=12) or baseline_12 (Baseline N=12)
#
# NOTE: Make sure your data is in the expected path (e.g. /kaggle/input) or update Config in config.py
# If using Kaggle input, you might need to symlink or copy it if you don't edit the configured path.

import subprocess
import sys

# torch is used below only to choose the device. It has to be imported in this
# cell: no other cell imports it into the notebook namespace, so without this
# the device expression raises NameError.
import torch

print("Starting training...")

# Example command
cmd = [
    sys.executable, 'train_amp.py',
    '--dataset', 'bangla',
    '--experiment', 'mor_exp1',
    '--tokenization', 'subword',  # Bangla works best with subword/sentencepiece
    '--subword_vocab_size', '4000',
    '--epochs', '3',  # Set this higher for real training
    '--device', 'cuda' if torch.cuda.is_available() else 'cpu',
]

# Execute; check_call raises CalledProcessError if training exits non-zero.
print("Running command:", " ".join(cmd))
subprocess.check_call(cmd)

In [None]:
# 5. View Results
# Pretty-prints the JSON results file for the bangla / mor_exp1 run, or a hint
# when that file does not exist.
import json

results_path = 'results/bangla_mor_exp1.json'
try:
    results_file = open(results_path, 'r')
except FileNotFoundError:
    print("Results file not found. Check training output.")
else:
    # The file opened successfully: parse it, close it, then print the report.
    with results_file:
        results = json.load(results_file)
    print(json.dumps(results, indent=2))