In [29]:
from dataclasses import dataclass


@dataclass
class Config:
    pretrain_raw_data = "dataset/tamil_pretrain.txt"
    context_length = 512
    filtered_data = "dataset/filtered_cl_tamil_pretrain.txt"
    vocab_size = 30000
    tokenizer_path = "artifacts/tokenizer.json"

## Data Preprocessing

In [14]:
import mmap
import re

def clean_whitespaces(text):
    return re.sub(r"\s+", " ", text).strip()


def filter_sentences(file_path, max_len, output_file):
    with open(file_path, "r", encoding="utf-8") as f:

        mmapped_file = mmap.mmap(
            f.fileno(),
            0,
            access=mmap.ACCESS_READ
        )
        with open(output_file, "w") as out_file:
            for line in iter(mmapped_file.readline, b""):
                sentence = line.decode("utf-8").strip()
                sentence = clean_whitespaces(sentence)
                if sentence:
                    if len(sentence) >= max_len:
                        out_file.write(sentence + "\n")
        mmapped_file.close()

In [15]:
filter_sentences(Config.pretrain_raw_data, Config.context_length, "dataset/filtered_cl_tamil_pretrain.txt")

## Training a custom tokenizer
This block helps to train a BPE tokenizer from scratch for the new filtered dataset.

You can learn how to build a tokenizer from scratch block by block here: [Link](https://huggingface.co/learn/nlp-course/en/chapter6/8)

In [19]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

In [20]:
tamil_tokenizer = Tokenizer(models.BPE())

In [21]:
tamil_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

In [22]:
#tamil sample sentence
sample_text = "வணக்கம் உலகம்"
tamil_tokenizer.pre_tokenizer.pre_tokenize_str(sample_text)

[('à®µà®£à®ķ', (0, 3)),
 ('à¯į', (3, 4)),
 ('à®ķà®®', (4, 6)),
 ('à¯į', (6, 7)),
 ('Ġà®īà®²à®ķà®®', (7, 12)),
 ('à¯į', (12, 13))]

In [24]:
trainer = trainers.BpeTrainer(vocab_size=Config.vocab_size, special_tokens=["<|endoftext|>"])
tamil_tokenizer.model = models.BPE()
tamil_tokenizer.train([Config.filtered_data], trainer=trainer)






In [25]:
tamil_tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)

In [26]:
tamil_tokenizer.decoder = decoders.ByteLevel()

In [27]:
tamil_tokenizer.decode(
    tamil_tokenizer.encode("வணக்கம் உலகம்").ids
)

'வணக்கம் உலகம்'

In [30]:
tamil_tokenizer.save(Config.tokenizer_path)

## Training pipeline setup

In [32]:
from transformers import PreTrainedTokenizerFast
from datasets import load_dataset

In [33]:
#loading the splits
data_files ={
    "train": "dataset/v2/tamil_train.txt",
    "test": "dataset/v2/tamil_test.txt",
}

raw_dataset = load_dataset(
    "text",
    data_files=data_files
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [35]:
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file=Config.tokenizer_path,
)

outputs = tokenizer(
    raw_dataset["train"][:2]['text'],
    truncation=True,
    max_length=Config.context_length,
    return_overflowing_tokens=True,
    return_length=True

)

print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {outputs['length']}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")

Input IDs length: 3
Input chunk lengths: [512, 390, 496]
Chunk mapping: [0, 0, 1]


In [41]:
raw_dataset['train'][:2]['text']

['ஃபாயெட்டெ (1825) ஆகியவற்றால் சாமுவேல் மோர்ஸுக்கு ஜனநாயக தேசிய உணர்வின் மீது குழப்பம் ஏற்பட்டது. செயல் முறையில் அமெரிக்க மக்களாட்சியைக் காண்பிப்பதற்காக அமெரிக்க பிரதிநிதிகள் அவையை அவர் ஓவியமாக வரையத் தேர்ந்தெடுத்தார். அவர் புதிய அறைகளின் கட்டமைப்பை வரைவதற்கு வாஷிங்டன் D.C.க்கு பயணித்தார். அந்த ஓவியத்தில் கவனமாக எண்பது பேரை வரைந்திருந்தார். மேலும் அதில் இடம் பெற்றிருந்த இரவுக்காட்சி பொருத்தமானதாக இருப்பதாக நம்பினார். அவர் வட்டமான மண்டபத்தின் விளக்குகளின் வெளிச்சத்தின் குவிதிறனுக்கு ஏற்ப அதில் இடம் பெற்றிருந்தோரை வெற்றிகரமாக சீராக வரைந்திருந்தார். இணைந்திருக்கும் மக்கள், தனியாக நின்றிருப்போர், தங்கள் மேஜைகளின் மீது சாய்ந்திருக்கும் தனியாளர் போன்றவை சாதாரணமாக வரையப்பட்டிருந்தது. ஆனால் சிறப்பியல்புடன் கூடிய முகங்களாக இருந்தது. சாமுவெல் மோர்ஸ் மக்களாட்சி கொள்கைகளின் மீது காங்கிரஸ்சின் அர்ப்பணிப்பு காலத்தின் அனுபவ வரம்பைத் தாண்டியதை வெளிப்படுத்துவதற்கு இரவு நேரத்தைத் தேர்ந்தெடுத்தார். எனினும் காங்கிரஸ் நியூயார்க் நகரத்தில் மக்கள் கூட்டத்தைப் பெறத் தவறியது. 1820 ஆம் ஆண்டில் ஜான் ட்ரம்புலின் 

In [44]:
def tokenize(element):
    outputs = tokenizer(
        element["text"],
        truncation=True,
        max_length=Config.context_length,
        return_overflowing_tokens=True,
        return_length=True
    )

    input_batch = []
    for length, input_ids in zip(
        outputs['length'], outputs['input_ids']
    ):
        if length == Config.context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}

tokenized_dataset = raw_dataset.map(
    tokenize,
    batched=True,
    remove_columns=["text"],
)

Map:   0%|          | 0/1184417 [00:00<?, ? examples/s]

Map:   0%|          | 0/345494 [00:00<?, ? examples/s]

In [45]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 1528958
    })
    test: Dataset({
        features: ['input_ids'],
        num_rows: 445316
    })
})

In [47]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=Config.context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [48]:
model = GPT2LMHeadModel(config)
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

GPT-2 size: 108.9M parameters


In [None]:
# 1. First, add the pad token more explicitly
if tokenizer.pad_token is None:
    # Method 1: Set pad_token to eos_token
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
    
    # Method 2: If Method 1 doesn't work, try adding a special token
    # This is more reliable as it modifies the tokenizer's vocabulary
    special_tokens_dict = {'pad_token': '[PAD]'}
    num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
    print(f"Added {num_added_toks} special tokens: {special_tokens_dict}")
    
    # If working with a model, resize embeddings to match new vocabulary size
    # model.resize_token_embeddings(len(tokenizer))

# 2. Verify that pad token is set
print(f"Pad token: '{tokenizer.pad_token}', ID: {tokenizer.pad_token_id}")

Added 1 special tokens: {'pad_token': '[PAD]'}
Pad token: '[PAD]', ID: 30000


In [64]:
# 2. Create data collator with explicit padding
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False, 
)

# 3. Try processing your batch
out = data_collator([tokenized_dataset["train"][i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

input_ids shape: torch.Size([5, 512])
attention_mask shape: torch.Size([5, 512])
labels shape: torch.Size([5, 512])


In [65]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="artifacts",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=5_000,
    fp16=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

  trainer = Trainer(


In [None]:
trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 192.00 MiB. GPU 0 has a total capacity of 7.60 GiB of which 139.06 MiB is free. Process 2120 has 102.55 MiB memory in use. Including non-PyTorch memory, this process has 7.10 GiB memory in use. Of the allocated memory 6.92 GiB is allocated by PyTorch, and 55.95 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

: 

## Transformer from scratch

In [4]:
1/8

0.125

In [3]:
64 ** -0.5

0.125

In [1]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

## Model

In [2]:
#Block:
    # Masked Multi-head Attention
    # Residual connection
    # Layer Normalization
    # Feed Forward
    # Residual connection
    # Layer Normalization

#Output Head:
    # Linear layer
    # Softmax
class MultiHeadAttention(nn.Module):
    
    def __init__(self, num_heads: int, dim: int, dropout:float):
        super().__init__()
        
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads

        self.qkv = nn.Linear(dim, dim * 3)
        self.proj = nn.Linear(dim, dim)
        self.attn_dropout = nn.Dropout(dropout)
        self.out_dropout = nn.Dropout(dropout)
        self.scale = self.head_dim ** -0.5
    
    def forward(self, x, mask=None):
        batch_size, seq_len, dim = x.size()
        qkv = self.qkv(x).chunk(3, dim=-1)
        q, k, v = map(lambda t: t.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2), qkv)

        #attention scores
        attention = (q @ k.transpose(-2, -1)) * self.scale

        if mask is not None:
            mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool().to(x.device)
            attention.masked_fill_(mask[None, None, ...], float("-inf"))
        
        attention = F.softmax(attention, dim=-1)
        attention = self.attn_dropout(attention)

        out = (attention @ v).transpose(1, 2).reshape(batch_size, seq_len, dim)
        out = self.proj(out)
        out = self.out_dropout(out)

        return out

class FeedForward(nn.Module):
    def __init__(self, dim: int = 512) -> None:
        super().__init__()

        self.fc1 = nn.Linear(dim, dim * 4)
        self.fc2 = nn.Linear(dim * 4, dim)
        self.dropout = nn.Dropout(0.1)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

class Transformer(nn.Module):
    def __init__(self, dim:int=512, num_heads:int=8, dropout:float=0.1):
        super().__init__()

        self.attn_norm = nn.LayerNorm(dim)
        self.attention = MultiHeadAttention(dim=dim, num_heads=num_heads, dropout=dropout)

        self.ff = FeedForward(dim)
        self.ff_norm = nn.LayerNorm(dim)

        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x, mask=None):

        residual = x
        x = self.attn_norm(x)
        x = self.attention(x, mask=mask)
        x = residual + self.dropout(x)
        
        residual = x
        x = self.ff_norm(x)
        x = self.ff(x)
        x = residual + self.dropout(x)
        return x
    
class SinusoidalPositionalEncoding(nn.Module):
    def __init__(self, dim: int, context_length: int = 256):
        super().__init__()
        
        # Create positional encodings
        pe = torch.zeros(context_length, dim)
        position = torch.arange(0, context_length).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, dim, 2) * -(math.log(10000.0) / dim))
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        
        # Register as buffer (not a parameter)
        self.register_buffer('pe', pe)
    
    def forward(self, x):
        # x shape: [batch_size, seq_len, dim]
        return self.pe[:, :x.size(1)]


class GPT(nn.Module):
    def __init__(self, num_tokens: int, dim: int, num_layers:int = 6, context_length:int = 256):
        super().__init__()
        self.num_tokens = num_tokens
        self.dim = dim
        self.token_emb = nn.Embedding(num_tokens, dim)
        self.pos_emb = SinusoidalPositionalEncoding(dim=dim, context_length=context_length)
        self.transformer = nn.ModuleList([Transformer(dim=dim) for _ in range(num_layers)])
        self.final_norm = nn.LayerNorm(dim)
        self.head = nn.Linear(dim, num_tokens)
        self.dropout = nn.Dropout(0.1)
    
    def forward(self, x):
        wte = self.token_emb(x)
        wpe = self.pos_emb(x)
        x = wte + wpe

        x = self.dropout(x)
        for layer in self.transformer:
            x = layer(x)
        x = self.final_norm(x)
        x = self.head(x)
        return x

## Tokenization

In [3]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer
)

In [4]:
tokenizer = Tokenizer(models.BPE())

In [5]:
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

In [6]:
tokenizer.pre_tokenizer.pre_tokenize_str("Let's test pre-tokenization!")

[('Let', (0, 3)),
 ("'s", (3, 5)),
 ('Ġtest', (5, 10)),
 ('Ġpre', (10, 14)),
 ('-', (14, 15)),
 ('tokenization', (15, 27)),
 ('!', (27, 28))]

In [7]:
tokenizer.model = models.BPE()
trainer = trainers.BpeTrainer(vocab_size=30000, special_tokens=["<|endoftext|>"])
tokenizer.train(["data.txt"], trainer=trainer)






In [8]:
encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)

['Let', "'s", 'Ġtest', 'Ġthis', 'Ġtoken', 'iz', 'er', '.']


In [9]:
tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)

In [10]:
sentence = "Let's test this tokenizer."
encoding = tokenizer.encode(sentence)
start, end = encoding.offsets[4]
sentence[start:end]

' token'

In [11]:
tokenizer.decoder = decoders.ByteLevel()

In [12]:
tokenizer.decode(encoding.ids)

"Let's test this tokenizer."

In [13]:
tokenizer.save("tokenizer.json")

## Dataset

In [3]:
data = [x for x in open("data.txt", "r").readlines() if x.strip()]

In [23]:
import token
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset
from transformers import PreTrainedTokenizerFast
from tokenizers import Tokenizer

data = [x for x in open("data.txt", "r").readlines() if x.strip()]

tokenizer = Tokenizer.from_file("tokenizer.json")
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    bos_token="<|endoftext|>",
    eos_token="<|endoftext|>",
    pad_token="<|endoftext|>",
)

def create_batches(data, batch_size=8):
    for i in range(0, len(data), batch_size):
        x = data[i: i + batch_size]
        y = data[i+1: i + batch_size + 1]

        xtokens = wrapped_tokenizer(x, padding="max_length", truncation=True, max_length=256, return_tensors="pt")["input_ids"]
        ytokens = wrapped_tokenizer(y, padding="max_length", truncation=True, max_length=256, return_tensors="pt")["input_ids"]

        yield xtokens, ytokens


class TextDataset(Dataset):
    def __init__(self, filename: str, context_length: int = 256):
        super().__init__()
        self.filename = filename
        self.file = [x for x in open(filename, "r").readlines() if x.strip()]
        self.tokenizer = Tokenizer.from_file("tokenizer.json")
        self.wrapped_tokenizer = PreTrainedTokenizerFast(
                                    tokenizer_object=tokenizer,
                                    bos_token="<|endoftext|>",
                                    eos_token="<|endoftext|>",
                                    pad_token="<|endoftext|>",
                                )
        self.context_length = context_length
    
    def preprocess(self, text):
        return wrapped_tokenizer(
            text=text,
            return_tensors="pt",
            padding=True,
            truncation=True,
        )

    def __len__(self):
        return len(self.file)
    
    def __getitem__(self, idx):

        x = self.file[idx: idx + self.context_length]
        y = self.file[idx + 1: idx + self.context_length + 1]

        xtokens = self.preprocess(x)["input_ids"]
        ytokens = self.preprocess(y)["input_ids"]

        return xtokens, ytokens



In [28]:
x= TextDataset("data.txt", context_length=256)
x[0][0].shape, x[0][1].shape

(torch.Size([256, 17]), torch.Size([256, 17]))

In [5]:
def generate(seed_token, model, context_length, max_new_tokens, temperature=1.0):
    model.eval()
    gen_tokens = []
    tokens = wrapped_tokenizer(seed_token, padding="max_length", truncation=True, max_length=context_length, return_tensors="pt")["input_ids"]
    
    for _ in range(max_new_tokens):
        # Prepare input
        x = tokens[-context_length:].to("cuda:0").long()
        
        # Get predictions
        logits = model(x)
        
        # Sample next token (from the last position)
        logits = logits[0, -1, :] / temperature
        probs = F.softmax(logits, dim=0)
        next_token = torch.multinomial(probs, num_samples=1).item()
        
        gen_tokens.append(next_token)
    
    # Convert to text
    return ''.join([wrapped_tokenizer.decode([t]) for t in gen_tokens])

In [10]:
model = GPT(num_tokens=len(wrapped_tokenizer), dim=512, num_layers=6, context_length=256).to("cuda:0")
model.eval()
generate(
    "Love",
    model, 
    context_length=256,
    max_new_tokens=10,
)

' lightseys Once infringe babe pageant AdamAlackortal shortness'

In [None]:
from tqdm import tqdm

In [7]:
from tqdm import tqdm
import numpy as np


batch_size = 32
lr = 1e-6
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss_fn = nn.CrossEntropyLoss()
context = 256
model = GPT(len(wrapped_tokenizer), 512).to("cuda")


for epoch in tqdm(range(2)):
    step_loss = 0.0
    step_count = 0

    for xtokens, ytokens in create_batches(data=data, batch_size=batch_size):
        xtokens, ytokens = xtokens.to("cuda").long(), ytokens.to("cuda").long()
        optimizer.zero_grad()
        logits = model(xtokens)
        loss = loss_fn(logits.view(-1, logits.size(-1)), ytokens.view(-1))
        loss.backward()
        optimizer.step()
        step_loss += loss.item()
        step_count += 1

        if step_count % 1000 == 0:
            print(f"Epoch {epoch}, Step {step_count}, Loss {step_loss/step_count:.3f}")
            generate("Love", model, context, 100)
        

  0%|          | 0/2 [00:00<?, ?it/s]

Epoch 0, Step 1000, Loss 10.397


  0%|          | 0/2 [04:48<?, ?it/s]


ValueError: Expected input batch_size (2304) to match target batch_size (2048).

In [36]:
dataset = TamilDataset("data.txt")

In [37]:
from torch.utils.data import DataLoader

dataloader = DataLoader(dataset, batch_size=32)

In [38]:
data = next(iter(dataloader))

In [39]:
data.shape

torch.Size([32, 256])