# Data Import

In [1]:
from dotenv import load_dotenv
# Load the .env file; override=True forces a reload of values that were
# already set earlier in this process.
load_dotenv(override=True)


True

In [2]:
import os

# Sanity-check which connection settings were loaded. The password is masked
# on purpose: cell output is saved inside the notebook file, so printing the
# raw value would leak the credential to anyone who can read the notebook.
print(
    os.getenv('DB_HOST'),
    os.getenv('DB_USER'),
    '********' if os.getenv('DB_PASSWORD') else None,
    os.getenv('DB_NAME'),
)


127.0.0.1 root ******** text_dataset_db


In [3]:
import pymysql
from dotenv import load_dotenv
import os

# ✅ Load environment variables from the .env file
load_dotenv()

# Connection settings, each read from its environment variable
db_config = dict(
    host=os.getenv('DB_HOST'),
    user=os.getenv('DB_USER'),
    password=os.getenv('DB_PASSWORD'),
    database=os.getenv('DB_NAME'),
)

# Open the MySQL connection with PyMySQL and create the cursor used below
conn = pymysql.connect(**db_config)
cursor = conn.cursor()

# Helper: fetch every `content` value stored for one level
def fetch_data_by_level(level):
    """Return the `content` column of all `text_data` rows whose level is `level`.

    Runs a parameterized query on the notebook-level `cursor` and collects
    the first column of every returned row into a list (empty if no rows).
    """
    cursor.execute("SELECT content FROM text_data WHERE level = %s", (level,))
    contents = []
    for record in cursor.fetchall():
        contents.append(record[0])
    return contents

# Fetch data for each level. The try/finally guarantees that the cursor and
# the connection are released even when one of the queries raises.
try:
    L1 = fetch_data_by_level('L1')
    L2 = fetch_data_by_level('L2')
    L3 = fetch_data_by_level('L3')
    L4 = fetch_data_by_level('L4')
finally:
    # Close the cursor first; still close the connection if that fails.
    try:
        cursor.close()
    finally:
        conn.close()

# Print samples (optional): the first document of each level, truncated
print("L1 Data Sample:", L1[0][:5000] if L1 else "No data found")
print("L2 Data Sample:", L2[0][:500] if L2 else "No data found")
print("L3 Data Sample:", L3[0][:500] if L3 else "No data found")
print("L4 Data Sample:", L4[0][:5000] if L4 else "No data found")


L1 Data Sample:  Once upon a time in the land of Policymia, there lived two leaders named Majora and Minoro. Their job was to make sure all the citizens had beautiful parks, clean water, and top-notch schools. But there were so many things to fix! How would they ever decide where to start?
L2 Data Sample: usually , he would be tearing around the living room , playing with his toys .
L3 Data Sample: LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as s
L4 Data Sample: Research on Neural Machine Translation Model


In [12]:
# Merge each level's documents into one space-separated string
# (a level without any documents becomes the empty string).
L1_combined = " ".join(L1 or [])
L2_combined = " ".join(L2 or [])
L3_combined = " ".join(L3 or [])
L4_combined = " ".join(L4 or [])

In [16]:
# Preview the first 500 characters of every combined level, one per line
print(
    f"L1 Data Sample: {L1_combined[:500]}",
    f"L2 Data Sample: {L2_combined[:500]}",
    f"L3 Data Sample: {L3_combined[:500]}",
    f"L4 Data Sample: {L4_combined[:500]}",
    sep="\n",
)

L1 Data Sample:  Once upon a time in the land of Policymia, there lived two leaders named Majora and Minoro. Their job was to make sure all the citizens had beautiful parks, clean water, and top-notch schools. But there were so many things to fix! How would they ever decide where to start?  Majora, being the wise leader she was, knew just what to do. She invited her fellow policymakers for a big meeting at the Roundtable of Representatives. There, they discussed the most important problems Policymia faced. This was called identifying "key policy areas." It meant figuring out which topics needed attention first.  Next came assessing support – finding out if everyone agreed on the solutions. Some people thought building more playgrounds was the way to go, while others wanted better libraries. To understand everyone's thoughts, Majora used something called 'polling.' Just like taking a vote, polling helped her see what ideas were popular among her friends (the majority) and also those acr

## Hugging Face Dataset

In [5]:
# import requests

# with open("Data/L1_ChildrenStories.txt", "r", encoding="utf-8") as f:
#     L1 = f.read()
# with open("Data/L2_BookCorpus.txt", "r", encoding="utf-8") as f:
#     L2 = f.read()
# with open("Data/L3_CNN_DailyMail.txt", "r", encoding="utf-8") as f:
#     L3 = f.read()
# with open("Data/L4_S2ORC.txt", "r", encoding="utf-8") as f:
#     L4 = f.read()

# Data Cleaning

In [13]:
!pip install emoji
import re
import emoji

# Define allowed character sets (each is a single-character class)
english_regex = r"[a-zA-Z0-9\s]"                   # English letters, digits, whitespace
math_symbols  = r"[\+\-\*/=<>∑∫√πθΣ∂∞]"             # Add more math symbols if needed
special_chars = r"[\.,!?;:'\"()\[\]{}#@%^&*_~]"     # Common special characters

# Function to clean a given text
def clean_text(text):
    """Return `text` with every character outside the allow-lists removed.

    A character is kept when it matches `english_regex`, `math_symbols` or
    `special_chars`, or when `emoji.is_emoji` reports it as an emoji.
    The relative order of the kept characters is unchanged.

    Args:
        text: The string to filter.

    Returns:
        A new string made of the allowed characters only ("" for empty input).
    """
    # Build one combined matcher per call instead of issuing three separate
    # re.match(pattern_string, c) calls for every single character; the
    # corpora cleaned here are hundreds of millions of characters long.
    is_allowed = re.compile(
        "|".join((english_regex, math_symbols, special_chars))
    ).match
    # The emoji lookup only runs for characters the regex rejected.
    return "".join(c for c in text if is_allowed(c) or emoji.is_emoji(c))

# Clean the concatenated (*_combined) text of every level, in level order
L1_cleaned, L2_cleaned, L3_cleaned, L4_cleaned = map(
    clean_text,
    (L1_combined, L2_combined, L3_combined, L4_combined),
)
print("All levels cleaned and stored in *_cleaned variables!")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


All levels cleaned and stored in *_cleaned variables!


In [14]:
# Whole corpus: the four cleaned levels back to back, no separator inserted
full_text = "".join((L1_cleaned, L2_cleaned, L3_cleaned, L4_cleaned))
D = len(full_text)  # corpus length in characters
D

419268673

# Data Stats

In [15]:
# Character-level vocabulary: the distinct characters of the corpus, sorted
chars = sorted(set(full_text))
vocab_size = len(chars)

# UTF-8 bytes of the corpus; the pair statistics are computed over these bytes
token = full_text.encode("utf-8")
def get_stats(ids):
    """Count how often each pair of adjacent items occurs in `ids`.

    Returns a dict mapping `(item, next_item)` to its number of occurrences;
    keys appear in order of first occurrence. Sequences with fewer than two
    items give an empty dict.
    """
    pair_counts = {}
    following = ids[1:]
    for left, right in zip(ids, following):
        key = (left, right)
        if key in pair_counts:
            pair_counts[key] += 1
        else:
            pair_counts[key] = 1
    return pair_counts
# Show the character vocabulary, then find the most frequent adjacent byte pair
print(vocab_size)
print("".join(chars))
stats = get_stats(token)
top_pair = max(stats, key=lambda pair: stats[pair])
top_pair

208
	
 !"#%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]^_abcdefghijklmnopqrstuvwxyz{}~ ©®Σθπ  ™↔↘↪∂∑√∞∫▪▫▶☀☁☄☺♀♣♥♦♻⚙⚠⚡✏✔✨❤🌈🌍🌏🌐🌞🌟🌬🌱🌳🌸🌻🌿🍃🍇🍊🍞🍡🍭🎉🎮🎶🏠🏡🏰🏼🏽🐇🐛🐦🐰🐿👑👧👨💁💎💕💖💚💡💪💻💾📃📊📚📝📡📣📱📲🔒🔗🔬🕊🖥🗣😁😂😃😄😊😎😐😔😢😮🚀🚧🚯🤖🤩🤯🥗🥦🦉🦋🦝


(101, 32)

# BPE

In [9]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.decoders import ByteLevel as ByteLevelDecoder

# Cleaned text of every level: the tokenizer's training corpus
docs = [L1_cleaned, L2_cleaned, L3_cleaned, L4_cleaned]

# Feed the trainer one non-empty line at a time instead of whole documents
def chunked_docs():
    """Yield each non-blank line of every entry in `docs`, whitespace-stripped.

    Documents are split on the newline character only; lines that are empty
    after stripping are skipped.
    """
    for doc in docs:
        stripped_lines = (raw_line.strip() for raw_line in doc.split('\n'))
        yield from filter(None, stripped_lines)

# Initialize BPE tokenizer
# Byte-level pre-tokenization is paired with the byte-level decoder;
# presumably so that decode() can reverse what encode() produced.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = ByteLevel()
tokenizer.decoder = ByteLevelDecoder()

# Trainer with manageable vocab size
# NOTE(review): byte-level BPE setups commonly also pass
# initial_alphabet=ByteLevel.alphabet() so every byte gets a token and
# "[UNK]" is never produced; confirm whether that is wanted here.
trainer = BpeTrainer(vocab_size=2000, special_tokens=["[UNK]"])

# Train using iterator to save memory
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so the
# run captured in the notebook did not finish. The cells below need a fully
# trained `tokenizer`; re-run this cell to completion before continuing.
tokenizer.train_from_iterator(chunked_docs(), trainer=trainer)

# Check final vocab size
print("Actual vocab size:", tokenizer.get_vocab_size())







KeyboardInterrupt: 

In [None]:
def encode(text: str) -> list[int]:
    """Tokenize `text` with the notebook-level `tokenizer` and return the token ids."""
    return tokenizer.encode(text).ids

def decode(token_ids: list[int]) -> str:
    """Convert a list of token ids back to text with the notebook-level `tokenizer`."""
    return tokenizer.decode(token_ids)
# Round-trip a short sample (letters plus an emoji) through encode/decode
sample = "math is beautiful ✨"
ids = encode(sample)
print("→", ids)
print("←", decode(ids))

# GPT-like Transformer

In [None]:
import os
import torch
import torch.nn as nn
from torch.nn import functional as F

# ------------------ Hyperparameters ------------------
# Small configuration; the commented-out block below holds a larger GPU one.
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 32 # what is the maximum context length for predictions?
max_iters = 5000 # optimizer steps per level
eval_interval = 100 # estimate train/val loss every this many steps
learning_rate = 1e-3 # learning rate passed to AdamW
device = 'cuda' if torch.cuda.is_available() else 'cpu' # use the GPU when one is available
eval_iters = 200 # number of batches averaged per loss estimate
n_embd = 64 # embedding width
n_head = 4 # attention heads per transformer block
n_layer = 4 # number of transformer blocks
dropout = 0.0 # dropout probability used by every nn.Dropout in the model
# # ------------
# # # hyperparameters for GPU
# batch_size = 128 # how many independent sequences will we process in parallel?
# block_size = 512 # what is the maximum context length for predictions?
# max_iters = 10000
# eval_interval = 500
# learning_rate = 3e-4
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# eval_iters = 200
# n_embd = 384
# n_head = 6
# n_layer = 6
# dropout = 0.2
# # # ------------

torch.manual_seed(1337)

# ------------------ BPE tokenizer functions (assumed defined) ------------------
# encode(text) -> list of token ids
# decode(ids)   -> string
# merges, vocab, etc. already built above in your notebook

# ------------------ Data loader for transformer ------------------
def get_batch_transformer(data):
    """Sample `batch_size` random windows of `block_size` tokens from `data`.

    Returns `(x, y)` moved to `device`: row k of `x` is a window of `data`
    starting at a random offset, and row k of `y` is the same window shifted
    one position to the right (the next-token targets).
    """
    starts = torch.randint(len(data) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        stop = start + block_size
        inputs.append(data[start:stop])
        targets.append(data[start + 1:stop + 1])
    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

@torch.no_grad()
def estimate_loss_transformer(model, train_data, val_data):
    """Estimate the mean loss of `model` on the train and validation data.

    Puts the model in eval mode, averages the loss over `eval_iters` random
    batches for each split, then switches the model back to train mode.

    Returns:
        dict with keys 'train' and 'val' holding the mean losses as floats.
    """
    model.eval()
    splits = {'train': train_data, 'val': val_data}
    out = {}
    for split, data in splits.items():
        running_total = 0
        for _ in range(eval_iters):
            X, Y = get_batch_transformer(data)
            _, batch_loss = model(X, Y)
            running_total += batch_loss.item()
        out[split] = running_total / eval_iters
    model.train()
    return out

def compute_perplexity(loss):
    """Return exp(`loss`) as a tensor: the perplexity for a mean cross-entropy loss."""
    return torch.tensor(loss).exp()

# ------------------ Transformer model classes ------------------
class Head(nn.Module):
    """One head of causal (masked) self-attention.

    Projects the input to keys, queries and values of width `head_size`,
    prevents each position from attending to later positions, and returns
    the attention-weighted values.
    """
    def __init__(self, head_size):
        """Create the key/query/value projections for a head of width `head_size`."""
        super().__init__()
        self.key   = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular causal mask, registered as a buffer rather than a
        # parameter so the optimizer never updates it.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)
    def forward(self, x):
        """Apply masked attention to `x` of shape (B, T, C); returns (B, T, head_size)."""
        _, T, _ = x.shape
        k = self.key(x)
        q = self.query(x)
        # Scaled dot-product attention divides by sqrt(d_k), the key/query
        # width (head_size), not by the embedding width C of the input.
        # Scaling by C**-0.5 shrinks the scores by an extra sqrt(n_head).
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5
        # Future positions get -inf so softmax assigns them zero weight.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        return wei @ v

class MultiHeadAttention(nn.Module):
    """Several attention heads run in parallel, concatenated and projected to `n_embd`."""
    def __init__(self, num_heads, head_size):
        """Create `num_heads` heads of width `head_size` plus the output projection."""
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The concatenated head outputs are num_heads * head_size wide. That
        # equals n_embd only when n_embd is divisible by the number of heads,
        # so size the projection input from the heads themselves.
        self.proj  = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)
    def forward(self, x):
        """Concatenate every head's output on the last dim, then project and apply dropout."""
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        return self.dropout(self.proj(out))

class FeedForward(nn.Module):
    """Per-position MLP: expand to 4x the width, ReLU, project back, dropout."""
    def __init__(self, n_embd):
        super().__init__()
        hidden_width = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)
    def forward(self, x):
        """Run `x` through the MLP."""
        return self.net(x)

class Block(nn.Module):
    """Transformer block: layer-normed self-attention and feed-forward, each with a residual add."""
    def __init__(self, n_embd, n_head):
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedForward(n_embd)
        self.ln1, self.ln2 = nn.LayerNorm(n_embd), nn.LayerNorm(n_embd)
    def forward(self, x):
        """Return `x` after the attention and feed-forward residual updates."""
        # Residual around self-attention (norm is applied before the sublayer)
        attended = x + self.sa(self.ln1(x))
        # Residual around the feed-forward network
        return attended + self.ffwd(self.ln2(attended))

class GPTLanguageModel(nn.Module):
    """Decoder-only transformer language model over `vocab_size` token ids."""
    def __init__(self, vocab_size):
        """Build embeddings, `n_layer` transformer blocks, final norm and output head."""
        super().__init__()
        self.token_embedding_table    = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head) for _ in range(n_layer)])
        self.ln_f   = nn.LayerNorm(n_embd)
        self.lm_head= nn.Linear(n_embd, vocab_size)
        self.apply(self._init_weights)
    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and Linear biases to zero."""
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)
    def forward(self, idx, targets=None):
        """Compute next-token logits and, when `targets` is given, the loss.

        Args:
            idx: (B, T) tensor of token ids; T must not exceed the number of
                position embeddings.
            targets: optional (B, T) tensor of target token ids.

        Returns:
            `(logits, loss)`. Without targets, logits are (B, T, vocab) and
            loss is None. With targets, logits are flattened to (B*T, vocab)
            and loss is the mean cross-entropy.
        """
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)
        # Build the position indices on the input's own device rather than the
        # notebook-level `device`, so the model works wherever it was moved.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)
        loss = None
        if targets is not None:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets= targets.view(B*T)
            loss   = F.cross_entropy(logits, targets)
        return logits, loss
    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Sample `max_new_tokens` tokens autoregressively, appended to `idx`.

        Args:
            idx: (B, T) tensor of token ids used as the prompt.
            max_new_tokens: number of tokens to sample.

        Returns:
            (B, T + max_new_tokens) tensor of token ids.

        Gradients are not tracked: sampling never needs them, and building an
        autograd graph on every step only costs memory.
        """
        # Longest context the position table supports. This is the block_size
        # the model was built with, read from the model instead of the global.
        context_len = self.position_embedding_table.num_embeddings
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -context_len:]
            logits, _ = self(idx_cond)
            # Only the last position predicts the next token.
            logits = logits[:, -1, :]
            probs  = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

# ------------------ Incremental Training Pipeline ------------------
import matplotlib.pyplot as plt

def incremental_training_pipeline(level_texts, model, checkpoint_dir="checkpoints"):
    """Train `model` on each text in `level_texts`, one level after another.

    For every level the text is tokenized with `encode`, split 90/10 into
    train and validation ids, and trained for `max_iters` steps with a fresh
    AdamW optimizer. Loss and perplexity are recorded every `eval_interval`
    steps. After each level a 500-token sample is printed and the model's
    state_dict is saved to `checkpoint_dir`.

    Args:
        level_texts: iterable of strings, one per level, in training order.
        model: the model to train in place; it must return `(logits, loss)`
            when called with inputs and targets and provide `generate`.
        checkpoint_dir: directory for the per-level checkpoints; created if
            it does not exist.

    Returns:
        `(model, all_metrics)`, where `all_metrics` is a dict of parallel
        lists: "level", "iters", "train_loss", "val_loss", "train_ppl",
        "val_ppl".
    """
    os.makedirs(checkpoint_dir, exist_ok=True)

    # Store metrics for plotting
    all_metrics = {
        "level": [],
        "iters": [],
        "train_loss": [],
        "val_loss": [],
        "train_ppl": [],
        "val_ppl": [],
    }

    for level_i, level_text in enumerate(level_texts, start=1):
        print(f"\n=== LEVEL {level_i} TRAINING ===")
        data_ids   = torch.tensor(encode(level_text), dtype=torch.long).to(device)
        n          = int(0.9 * data_ids.size(0))
        train_data = data_ids[:n]
        val_data   = data_ids[n:]

        # A new optimizer per level: optimizer state does not carry over.
        optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
        for it in range(max_iters):
            if it % eval_interval == 0:
                losses = estimate_loss_transformer(model, train_data, val_data)
                ppl_train = compute_perplexity(losses['train'])
                ppl_val   = compute_perplexity(losses['val'])

                # Store for later plotting
                all_metrics["level"].append(level_i)
                all_metrics["iters"].append(it)
                all_metrics["train_loss"].append(losses['train'])
                all_metrics["val_loss"].append(losses['val'])
                all_metrics["train_ppl"].append(ppl_train.item())
                all_metrics["val_ppl"].append(ppl_val.item())

                print(f" it={it:4d} | train loss {losses['train']:.4f}  ppl {ppl_train:.2f}  | "
                      f"val loss {losses['val']:.4f}  ppl {ppl_val:.2f}")

            xb, yb = get_batch_transformer(train_data)
            logits, loss = model(xb, yb)
            optimizer.zero_grad(set_to_none=True)
            loss.backward()
            optimizer.step()

        # final evaluation
        losses = estimate_loss_transformer(model, train_data, val_data)
        ppl_train = compute_perplexity(losses['train'])
        ppl_val   = compute_perplexity(losses['val'])

        print(f"\n*** Level {level_i} complete.")
        print(f"    Final train loss {losses['train']:.4f}, ppl {ppl_train:.2f}")
        print(f"    Final   val loss {losses['val']:.4f}, ppl {ppl_val:.2f}")

        # sample generation: run in eval mode (so dropout is off) and without
        # gradient tracking, then restore train mode for the next level
        context = torch.zeros((1,1), dtype=torch.long, device=device)
        model.eval()
        with torch.no_grad():
            sample_ids = model.generate(context, max_new_tokens=500)[0].tolist()
        model.train()
        print("    Sample:", decode(sample_ids))

        # save checkpoint
        cp = os.path.join(checkpoint_dir, f"model_level{level_i}.pt")
        torch.save(model.state_dict(), cp)
        print(f"    → Saved checkpoint: {cp}")

    return model, all_metrics


# ------------------ Usage ------------------
# Requires L1_cleaned..L4_cleaned and the trained `tokenizer` from earlier cells
level_texts = [L1_cleaned, L2_cleaned, L3_cleaned, L4_cleaned]

# The model's vocabulary size is taken from the BPE tokenizer's final vocabulary.
# Note: this rebinds `vocab_size`, which the Data Stats cell had set to the
# number of distinct characters.
vocab_size = tokenizer.get_vocab_size()
model = GPTLanguageModel(vocab_size).to(device)

model,all_metrics = incremental_training_pipeline(level_texts, model)


In [None]:
# Estimate how many passes over each level's training split one run makes.
# Both sides of the ratio must be in tokens: every step consumes
# batch_size * block_size token ids, so the denominator is the number of
# token ids in the 90% training split, not the character length of the text.
# This tokenizes each level again, which is slow on large corpora.
total_tokens_seen = block_size * batch_size * max_iters
for i, level in enumerate(level_texts, start=1):
    n_train_tokens = int(0.9 * len(encode(level)))  # same split as the training pipeline
    if n_train_tokens == 0:
        print(f"Total Epochs for Level {i}: n/a (no training tokens)")
        continue
    epochs = total_tokens_seen / n_train_tokens
    print(f"Total Epochs for Level {i}: {epochs:.4f}")



In [None]:
# # For Shakespeare input prompt
# # # Example Shakespearean prompt
# prompt = "Who is the President of USA "
#
# # # Encode the prompt using your trained BPE tokenizer
# encoded = encode(prompt)
#
# # # Truncate if the prompt is longer than block_size
# if len(encoded) > block_size:
#     encoded = encoded[-block_size:]
#
# # # Create context tensor
# context = torch.tensor([encoded], dtype=torch.long, device=device)
#
# # # Generate continuation
# generated_ids = model.generate(context, max_new_tokens=1000)[0].tolist()
#
# # # Decode and print the generated text
# print(decode(generated_ids))

In [None]:
import matplotlib.pyplot as plt

def plot_metrics(all_metrics):
    """Show one figure per level with validation loss and perplexity vs. iteration.

    `all_metrics` is the dict of parallel lists returned by the training
    pipeline; the "level", "iters", "val_loss" and "val_ppl" entries are used.
    Loss is drawn on the left y-axis (blue), perplexity on the right (green).
    """
    level_column = all_metrics["level"]
    for level in sorted(set(level_column)):
        # Positions of the records that belong to this level
        rows = [pos for pos, lvl in enumerate(level_column) if lvl == level]

        def pick(key):
            return [all_metrics[key][pos] for pos in rows]

        iters = pick("iters")
        val_loss = pick("val_loss")
        val_ppl = pick("val_ppl")

        # Left axis: validation loss
        fig, loss_ax = plt.subplots(figsize=(10, 5))
        loss_ax.plot(iters, val_loss, color='blue', label='Validation Loss')
        loss_ax.set_xlabel('Iterations')
        loss_ax.set_ylabel('Validation Loss', color='blue')
        loss_ax.tick_params(axis='y', labelcolor='blue')

        # Right axis (shares x): validation perplexity
        ppl_ax = loss_ax.twinx()
        ppl_ax.plot(iters, val_ppl, color='green', label='Validation Perplexity')
        ppl_ax.set_ylabel('Validation Perplexity', color='green')
        ppl_ax.tick_params(axis='y', labelcolor='green')

        plt.title(f'Level {level} - Validation Loss & Perplexity')
        fig.tight_layout()
        plt.grid(True)
        plt.show()


In [None]:
# Draw the validation curves recorded during training, one figure per level
plot_metrics(all_metrics)

In [None]:
import joblib
# Persist the whole trained model object to transformer.joblib.
# NOTE(review): joblib serialises with pickle, so loading this file needs the
# model classes from this notebook and must only be done with trusted files.
# The training pipeline already saves state_dict checkpoints with torch.save;
# consider whether this extra full-object dump is needed.
joblib.dump(model, 'transformer.joblib')
