In [3]:
# =============================================================================
# 1. SETUP & IMPORTS
# =============================================================================

import os
import json
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import itertools
from pathlib import Path
from dataclasses import dataclass
from contextlib import nullcontext
from tqdm.auto import tqdm
from google.colab import userdata

from datasets import Dataset, DatasetDict, load_dataset, disable_caching
from transformers import AutoTokenizer
from tokenizers import Tokenizer, models, pre_tokenizers, trainers
from huggingface_hub import login, list_repo_files, hf_hub_download

# Install required packages (if needed)
import subprocess
import sys

def install_package(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", package])

# Uncomment if packages are not available
# install_package("huggingface_hub")
# install_package("datasets")
# install_package("transformers")
# install_package("tokenizers")

In [4]:
# =============================================================================
# 2. CONFIGURATION & HYPERPARAMETERS
# =============================================================================

# Dataset Configuration
HF_TOKEN = userdata.get("HF_TOKEN")
TOKENIZER = "csebuetnlp/banglat5"  # Bangla-T5 SentencePiece
TEST_SPLIT = 0.05  # 5% validation

# Processing Configuration
NUM_PROC = 4
WRITE_BATCH = 1024
BIN_DTYPE = np.uint16  # OK since vocab < 65536
BLOCK_SIZE = 128

# Model Configuration
N_LAYER = 8
N_HEAD = 8
N_EMBD = 512

# Training Configuration
BATCH_SIZE = 64
LR = 3e-4
WARMUP_ITERS = 1000
MAX_ITERS = 20000
EVAL_INTERVAL = 500
EVAL_ITERS = 200
GRAD_CLIP = 1.0

# Kaggle Directory Configuration
KAGGLE_WORKING_DIR = "/kaggle/working"
KAGGLE_INPUT_DIR = "/kaggle/input"
SAVE_DIR = f"{KAGGLE_WORKING_DIR}/bangla_gpt_data"
CKPT_DIR = f"{KAGGLE_WORKING_DIR}/bangla_gpt_ckpt"

# Device Configuration
Device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# DType = torch.float16 if torch.cuda.is_available() else torch.float32
DType = torch.float32

print(f"Using device: {Device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name()}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

Using device: cuda
GPU: Tesla T4
GPU Memory: 14.7 GB


In [5]:
# =============================================================================
# 3. KAGGLE ENVIRONMENT SETUP
# =============================================================================

# Create necessary directories
os.makedirs(SAVE_DIR, exist_ok=True)
os.makedirs(CKPT_DIR, exist_ok=True)

# Function to get HuggingFace token from Kaggle secrets or environment
def get_hf_token():
    # Try to get token from Kaggle secrets
    try:
        from kaggle_secrets import UserSecretsClient
        user_secrets = UserSecretsClient()
        return user_secrets.get_secret("HF_TOKEN")
    except:
        # Fallback to environment variable or manual input
        token = os.environ.get("HF_TOKEN", HF_TOKEN)
        if not token:
            print("Warning: No HuggingFace token found. Please set HF_TOKEN.")
        return token

# HF_TOKEN = get_hf_token()

In [8]:
# Login to Hugging Face (if token is available)
if HF_TOKEN:
    login(token=HF_TOKEN)
    print("✔ Logged in to HuggingFace")
else:
    print("⚠ No HuggingFace token - using public datasets only")

full_df = pd.read_csv('/content/book-reviews.csv')
full_df.head()

✔ Logged in to HuggingFace


Unnamed: 0,id,Book_Name,Writer_Name,Category,Rating,Review,Site,sentiment,label
0,1,JAIBB Banking Diploma Series (Paperback),Professor Mojher Uddin Ahamed,Banking Diploma,1,।,Rokomari,negative,0
1,2,"অপেক্ষা ১ম, ২য় ও ৩য় খন্ড রকমারি কালেকশন (হার্...",রেদোয়ান মাসুদ,সমকালীন উপন্যাস,5,রেদোয়ান মাসুদের এই বইটা অসাধারণ! বাস্তব এবং বর...,Rokomari,positive,2
2,3,মুসলিমজাতি বিশ্বকে কী দিয়েছে ১-৪ খণ্ড (হার্ডক...,ড. রাগিব সারজানি,ইসলামি ইতিহাস ও ঐতিহ্য,5,ভাল বই,Rokomari,positive,2
3,4,অফুরন্ত বিশ্বাস (পেপারব্যাক),মাওলানা রুহুল আমিন সিরাজী,ইসলামি আদর্শ ও মতবাদ,5,নাস্তিকতার ভিত কাঁপাবে এবং মুখোশ উন্মোচন করে দ...,Rokomari,positive,2
4,5,একটা দেশ যেভাবে দাঁড়ায় (হার্ডকভার) প্রথম আলো...,রউফুল আলম,শিক্ষা বিষয়ক,5,ভাল,Rokomari,positive,2


In [9]:
# =============================================================================
# 5. DATASET PREPROCESSING
# =============================================================================

# Prepare dataset for training
text_col = 'Review'
print(f"Using column '{text_col}' as text.")

# Check if column exists
if text_col not in full_df.columns:
    print("Available columns:", full_df.columns.tolist())
    # Try to find a text column automatically
    text_cols = [col for col in full_df.columns if full_df[col].dtype == 'object']
    if text_cols:
        text_col = text_cols[0]
        print(f"Using '{text_col}' as text column instead")
    else:
        raise ValueError("No suitable text column found")

# Clean and filter data
full_df = full_df.dropna(subset=[text_col])
full_df = full_df[full_df[text_col].str.len() > 10]  # Remove very short texts
print(f"After cleaning: {len(full_df)} samples")

# Create HuggingFace dataset
ds_full = Dataset.from_pandas(full_df[[text_col]].rename(columns={text_col: "text"}))
ds_full = ds_full.shuffle(seed=42)

# Split into train/validation
splits = ds_full.train_test_split(test_size=TEST_SPLIT, seed=42)
ds = DatasetDict({"train": splits["train"], "validation": splits["test"]})
print(ds)

Using column 'Review' as text.
After cleaning: 83593 samples
DatasetDict({
    train: Dataset({
        features: ['text', '__index_level_0__'],
        num_rows: 79413
    })
    validation: Dataset({
        features: ['text', '__index_level_0__'],
        num_rows: 4180
    })
})


In [10]:
# =============================================================================
# 6. TOKENIZER SETUP
# =============================================================================

print("\n>>> Loading tokenizer…")
try:
    Tokenizer = AutoTokenizer.from_pretrained(
        TOKENIZER,
        cache_dir=f"{KAGGLE_WORKING_DIR}/tokenizer_cache"
    )
    print(f"✔ Loaded tokenizer: {TOKENIZER}")
    print(f"Vocab size: {Tokenizer.vocab_size}")
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    # Fallback to a different tokenizer if needed
    raise

assert Tokenizer.vocab_size < np.iinfo(BIN_DTYPE).max, "Vocab > uint16, switch dtype!"


>>> Loading tokenizer…


tokenizer_config.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


✔ Loaded tokenizer: csebuetnlp/banglat5
Vocab size: 32100


In [11]:
def tokenize_function(batch):
    """Tokenize batch of texts"""
    texts = [str(x) if x is not None else "" for x in batch["text"]]
    out = Tokenizer(
        texts,
        add_special_tokens=False,
        truncation=True,
        max_length=512
    )
    return {
        "ids": out["input_ids"],
        "len": [len(x) for x in out["input_ids"]],
    }

print(">>> Tokenising…")
TokDS = ds.map(
    tokenize_function,
    batched=True,
    num_proc=NUM_PROC,
    remove_columns=["text"],
    desc="Tokenising",
)

>>> Tokenising…


Tokenising (num_proc=4):   0%|          | 0/79413 [00:00<?, ? examples/s]

Tokenising (num_proc=4):   0%|          | 0/4180 [00:00<?, ? examples/s]

In [12]:
# =============================================================================
# 7. BINARY DATASET PREPARATION
# =============================================================================

print(">>> Creating binary dataset files...")

# Save tokenized data as binary files
for split in ("train", "validation"):
    tokens = TokDS[split]
    tot_len = int(np.sum(tokens["len"], dtype=np.uint64))
    mmap = np.memmap(Path(SAVE_DIR)/f"{split}.bin", dtype=BIN_DTYPE, mode="w+", shape=(tot_len,))

    idx = 0
    num_shards = min(WRITE_BATCH, len(tokens))
    for shard in tqdm(range(num_shards), desc=f"Writing {split}.bin"):
        part = tokens.shard(num_shards=num_shards, index=shard, contiguous=True)
        ids_pylist = part["ids"]
        flat_ids = list(itertools.chain.from_iterable(ids_pylist))
        buf = np.asarray(flat_ids, dtype=BIN_DTYPE)
        mmap[idx:idx+len(buf)] = buf
        idx += len(buf)

    mmap.flush()
    print(f"✔ saved {split}.bin – {tot_len:,} tokens")

>>> Creating binary dataset files...


Writing train.bin:   0%|          | 0/1024 [00:00<?, ?it/s]

✔ saved train.bin – 1,736,635 tokens


Writing validation.bin:   0%|          | 0/1024 [00:00<?, ?it/s]

✔ saved validation.bin – 90,210 tokens


In [13]:
# Save metadata
meta = {
    "tokenizer": TOKENIZER,
    "vocab_size": Tokenizer.vocab_size,
    "train_tokens": int(np.sum(TokDS["train"]["len"])),
    "val_tokens": int(np.sum(TokDS["validation"]["len"])),
    "block_size": BLOCK_SIZE,
    "bin_dtype": "uint16",
}

with open(Path(SAVE_DIR)/"meta.json", "w") as f:
    json.dump(meta, f, indent=2)
print("✔ meta.json saved")

✔ meta.json saved


In [14]:
# =============================================================================
# 8. MODEL DEFINITION (nanoGPT-style GPT)
# =============================================================================

class GPTConfig:
    """GPT model configuration"""
    def __init__(self):
        self.vocab_size = meta["vocab_size"]
        self.block_size = BLOCK_SIZE
        self.n_layer = N_LAYER
        self.n_head = N_HEAD
        self.n_embd = N_EMBD
        self.dropout = 0.2

class CausalSelfAttention(nn.Module):
    """Causal self-attention mechanism"""
    def __init__(self, cfg):
        super().__init__()
        self.n_head = cfg.n_head
        self.key = nn.Linear(cfg.n_embd, cfg.n_embd, bias=False)
        self.query = nn.Linear(cfg.n_embd, cfg.n_embd, bias=False)
        self.value = nn.Linear(cfg.n_embd, cfg.n_embd, bias=False)
        self.proj = nn.Linear(cfg.n_embd, cfg.n_embd, bias=False)
        self.attn_drop = nn.Dropout(cfg.dropout)
        self.resid_drop = nn.Dropout(cfg.dropout)

        # Causal mask
        mask = torch.tril(torch.ones(cfg.block_size, cfg.block_size)).view(1, 1, cfg.block_size, cfg.block_size)
        self.register_buffer("mask", mask)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x).view(B, T, self.n_head, C//self.n_head).transpose(1, 2)
        q = self.query(x).view(B, T, self.n_head, C//self.n_head).transpose(1, 2)
        v = self.value(x).view(B, T, self.n_head, C//self.n_head).transpose(1, 2)

        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.mask[:, :, :T, :T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.attn_drop(att)

        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.resid_drop(self.proj(y))
        return y

class MLP(nn.Module):
    """Multi-layer perceptron"""
    def __init__(self, cfg):
        super().__init__()
        self.fc1 = nn.Linear(cfg.n_embd, 4*cfg.n_embd)
        self.fc2 = nn.Linear(4*cfg.n_embd, cfg.n_embd)
        self.act = nn.GELU()
        self.drop = nn.Dropout(cfg.dropout)

    def forward(self, x):
        return self.drop(self.fc2(self.act(self.fc1(x))))

class Block(nn.Module):
    """Transformer block"""
    def __init__(self, cfg):
        super().__init__()
        self.ln1 = nn.LayerNorm(cfg.n_embd)
        self.ln2 = nn.LayerNorm(cfg.n_embd)
        self.attn = CausalSelfAttention(cfg)
        self.mlp = MLP(cfg)

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x

class GPT(nn.Module):
    """GPT model"""
    def __init__(self, cfg):
        super().__init__()
        self.token_emb = nn.Embedding(cfg.vocab_size, cfg.n_embd)
        self.pos_emb = nn.Embedding(cfg.block_size, cfg.n_embd)
        self.drop = nn.Dropout(cfg.dropout)
        self.blocks = nn.ModuleList([Block(cfg) for _ in range(cfg.n_layer)])
        self.ln_f = nn.LayerNorm(cfg.n_embd)
        self.head = nn.Linear(cfg.n_embd, cfg.vocab_size, bias=False)
        self.block_size = cfg.block_size

    def forward(self, idx, targets=None):
        B, T = idx.size()
        pos = torch.arange(0, T, device=idx.device)
        x = self.drop(self.token_emb(idx)+ self.pos_emb(pos))

        for blk in self.blocks:
            x = blk(x)

        x = self.ln_f(x)
        logits = self.head(x)

        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))

        return logits, loss

In [15]:
# =============================================================================
# 9. TRAINING UTILITIES
# =============================================================================

def get_batch(split: str, block: int = BLOCK_SIZE, batch: int = BATCH_SIZE, device=Device):
    """Get a batch of data"""
    data = np.memmap(Path(SAVE_DIR)/f"{split}.bin", dtype=BIN_DTYPE, mode="r")
    ix = torch.randint(len(data)-block, (batch,))
    x = torch.stack([torch.from_numpy(data[i:i+block].astype(np.int64)) for i in ix])
    y = torch.stack([torch.from_numpy(data[i+1:i+1+block].astype(np.int64)) for i in ix])

    if device.type == "cuda":
        x = x.pin_memory().to(device, non_blocking=True)
        y = y.pin_memory().to(device, non_blocking=True)
    else:
        x, y = x.to(device), y.to(device)

    return x, y

def lr_scheduler(iteration):
    """Learning rate scheduler with warmup and cosine decay"""
    if iteration < WARMUP_ITERS:
        return LR * iteration / WARMUP_ITERS
    pct = (iteration - WARMUP_ITERS) / max(1, MAX_ITERS - WARMUP_ITERS)
    return 0.1 * LR + 0.9 * LR * 0.5 * (1 + math.cos(math.pi * pct))

@torch.no_grad()
def evaluate_loss():
    """Evaluate model loss on train and validation sets"""
    model.eval()
    out = {}
    for split in ("train", "validation"):
        losses = torch.zeros(EVAL_ITERS)
        for k in range(EVAL_ITERS):
            x, y = get_batch(split, BLOCK_SIZE, BATCH_SIZE, Device)
            _, loss = model(x, y)
            losses[k] = loss.item()
        out[split] = losses.mean().item()
    model.train()
    return out

In [16]:
# =============================================================================
# 10. MODEL INITIALIZATION
# =============================================================================

# Initialize model
cfg = GPTConfig()
model = GPT(cfg).to(Device).to(dtype=DType)
print(f"Model params: {sum(p.numel() for p in model.parameters())/1e6:.2f}M")

# Initialize optimizer
opt = torch.optim.AdamW(model.parameters(), lr=LR, betas=(0.9, 0.95), weight_decay=0.1)

Model params: 58.14M


In [17]:
# =============================================================================
# 11. TRAINING LOOP
# =============================================================================

print("\n>>> Training…")
disable_caching()

# Track training metrics
train_losses = []
val_losses = []

# Kaggle-specific: Monitor memory usage
def print_memory_usage():
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / 1024**3
        reserved = torch.cuda.memory_reserved() / 1024**3
        print(f"GPU Memory - Allocated: {allocated:.2f}GB, Reserved: {reserved:.2f}GB")

for it in range(1, MAX_ITERS+1):
    # Update learning rate
    for g in opt.param_groups:
        g["lr"] = lr_scheduler(it)

    # Forward pass
    x, y = get_batch("train", BLOCK_SIZE, BATCH_SIZE, Device)
    logits, loss = model(x, y)

    # Backward pass
    opt.zero_grad(set_to_none=True)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
    opt.step()

    # Evaluation and checkpointing
    if it % EVAL_INTERVAL == 0 or it == 1:
        losses = evaluate_loss()
        train_losses.append(losses['train'])
        val_losses.append(losses['validation'])

        print(f"iter {it:6d} | train {losses['train']:.4f} | val {losses['validation']:.4f} | lr {opt.param_groups[0]['lr']:.2e}")
        print_memory_usage()

        # Save checkpoint
        ckpt = {
            "model": model.state_dict(),
            "cfg": cfg.__dict__,
            "iter": it,
            "meta": meta,
            "train_losses": train_losses,
            "val_losses": val_losses,
        }
        torch.save(ckpt, Path(CKPT_DIR)/f"ckpt_{it}.pt")
        print("✔ checkpoint saved")

        # Early stopping check (optional)
        if len(val_losses) > 2 and val_losses[-1] > val_losses[-2]:
            print("⚠ Validation loss increased - consider stopping soon")

print(" Training complete. Checkpoints at", CKPT_DIR)


>>> Training…
iter      1 | train 10.5387 | val 10.5399 | lr 3.00e-07
GPU Memory - Allocated: 1.86GB, Reserved: 7.94GB
✔ checkpoint saved
iter    500 | train 6.5269 | val 6.6160 | lr 1.50e-04
GPU Memory - Allocated: 1.86GB, Reserved: 7.94GB
✔ checkpoint saved
iter   1000 | train 5.7481 | val 5.9649 | lr 3.00e-04
GPU Memory - Allocated: 1.86GB, Reserved: 7.94GB
✔ checkpoint saved
iter   1500 | train 5.2052 | val 5.5662 | lr 3.00e-04
GPU Memory - Allocated: 1.86GB, Reserved: 7.94GB
✔ checkpoint saved
iter   2000 | train 4.8277 | val 5.3507 | lr 2.98e-04
GPU Memory - Allocated: 1.86GB, Reserved: 7.94GB
✔ checkpoint saved
iter   2500 | train 4.5270 | val 5.1867 | lr 2.96e-04
GPU Memory - Allocated: 1.86GB, Reserved: 7.94GB
✔ checkpoint saved
iter   3000 | train 4.2694 | val 5.0849 | lr 2.93e-04
GPU Memory - Allocated: 1.86GB, Reserved: 7.94GB
✔ checkpoint saved
iter   3500 | train 4.0569 | val 5.0109 | lr 2.89e-04
GPU Memory - Allocated: 1.86GB, Reserved: 7.94GB
✔ checkpoint saved
iter   

KeyboardInterrupt: 

In [18]:
# =============================================================================
# Load checkpoint for generation (fixed to iter_8500)
# =============================================================================
target_iter = 8000
target_checkpoint = Path(CKPT_DIR) / f"ckpt_{target_iter}.pt"

if target_checkpoint.exists():
    print(f"Loading checkpoint: {target_checkpoint}")
    checkpoint = torch.load(target_checkpoint, map_location=Device)
    model.load_state_dict(checkpoint['model'])
    model.to(Device)
else:
    print(f"Checkpoint ckpt_{target_iter}.pt not found, using current model state")

Loading checkpoint: /kaggle/working/bangla_gpt_ckpt/ckpt_8000.pt


In [19]:
@torch.no_grad()
def generate(prompt: str, max_new_tokens: int = 50, temperature: float = 1.0, top_k: int = None):
    """Generate text from a given prompt"""
    model.eval()
    ids = Tokenizer(prompt, return_tensors="pt")["input_ids"].to(Device)
    out = ids

    for _ in range(max_new_tokens):
        idx_cond = out[:, -BLOCK_SIZE:]
        logits, _ = model(idx_cond)
        logits = logits[:, -1, :] / temperature

        if top_k is not None:
            v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
            logits[logits < v[:, [-1]]] = -float('Inf')

        probs = torch.softmax(logits, dim=-1)
        next_id = torch.multinomial(probs, 1)
        out = torch.cat([out, next_id], dim=1)

    return Tokenizer.decode(out[0].tolist(),skip_special_tokens=True)

In [23]:
# Test generation with different settings
print("\n>>> Testing text generation:")
test_prompts = [
    "চোখের বালি ", "রবীন্দ্রনাথ ", "আমার আছে জল "
]

for prompt in test_prompts:
    print(f"\nPrompt: {prompt}")
    print("Generated:", generate(prompt, max_new_tokens=40, temperature=0.9, top_k=40))


>>> Testing text generation:

Prompt: চোখের বালি 
Generated: চোখের বালি।সাথে রয়েছে সূক্ষ্মাতিসূক্ষ্ম বর্ণনা। বইটি পড়ে পাঠক খুব ভালো লাগলো। বইটি পড়ে খুব সুন্দর লেগেছে। তবে লেখকের লেখনীতে বেশ কিছুটা গতি রয়েছে। আরো ভালো বই চাই। এই বিষয়টি নিয়ে আমার

Prompt: রবীন্দ্রনাথ 
Generated: রবীন্দ্রনাথী হলে সম্ভবত সবচেয়ে বেশি রোমাঞ্চ অনুভব করতাম।এ বইটি তাদের জন্য যারা দীর্ঘদিন ধরে মানসিক দক্ষতার সাথে জড়িত।এই বইয়ের বিষয়বস্তুগুলি সত্যই সহায়ক।আমি আমার জন্য এই বইগুলি পড়তে খুব উপভোগ করি।আমি

Prompt: আমার আছে জল 
Generated: আমার আছে জল। তার এই সব বই সম্পর্কে কিছু বলা একধরনের ধৃষ্টতা । ওমর খৈয়ামের ভাষায় কিছু বই অনন্ত যৌবনের বই, যাদের কোন ক্ষয় নেই। এটা তেমনি একটি বই। যারা এখনো পড়েন নি নিঃসন্দেহে নিঃসং
