In [None]:
# Notebook cell: install the runtime dependencies. The leading `!` is an
# IPython shell escape, so this line only works inside a notebook kernel.
!pip install torch transformers datasets gradio tqdm ipywidgets huggingface_hub

In [None]:

# ===============================
# MiniLLM Training on Stanford OVAL Wikipedia
# Seq2Seq, Gradio Demo - Optimized for Colab
# With Streaming Mode Implementation
# ===============================

# First, install required packages
# The `!pip` lines are IPython shell escapes (notebook-only syntax).
print("Installing required packages...")
!pip install torch transformers datasets gradio tqdm huggingface_hub
!pip install 'datasets[streaming]' aiohttp  # Required for streaming

# NOTE: printed unconditionally; the exit status of the pip commands above is
# not checked here.
print("All packages installed successfully!")

# Import libraries
import torch
from torch import nn
from torch.utils.data import IterableDataset, DataLoader
from transformers import GPT2Tokenizer
from datasets import load_dataset
import gradio as gr
import os
import time
from huggingface_hub import login
from tqdm import tqdm
from google.colab import userdata  # For accessing Colab secrets

# -------------------------------
# Authentication with Hugging Face
# -------------------------------
# Best-effort login to the Hugging Face Hub. Token sources are tried in order:
# the Colab secret named HF_TOKEN, then the HF_TOKEN environment variable.
# The script continues unauthenticated if neither source yields a working
# token. HF_TOKEN ends up holding the last token that was tried (or None).
print("Authenticating with Hugging Face Hub...")
HF_TOKEN = None
_authenticated = False

# Reading the secret is kept separate from logging in so that a token rejected
# by the Hub is not misreported as a missing secret.
try:
    _colab_token = userdata.get('HF_TOKEN')
except Exception as e:
    _colab_token = None
    print(f"Colab secret not found: {e}")

for _source, _candidate in (
    ("Colab secret", _colab_token),
    ("environment variable", os.environ.get('HF_TOKEN')),
):
    if not _candidate:
        # Never call login() without a token: it would fall back to an
        # interactive prompt and stall an unattended run.
        continue
    HF_TOKEN = _candidate
    try:
        login(token=HF_TOKEN)
    except Exception as e:
        print(f"Authentication failed using {_source}: {e}")
        continue
    print(f"Authentication successful using {_source}!")
    _authenticated = True
    break

if not _authenticated:
    if HF_TOKEN is None:
        print("No HF_TOKEN found. Continuing without authentication...")
    else:
        print("Continuing without authentication - may have limited access")

# -------------------------------
# Device detection
# -------------------------------
# Pick the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
print("Detecting available hardware...")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print(f"Running on CUDA GPU: {torch.cuda.get_device_name(0)}")
else:
    print("Running on CPU")

# -------------------------------
# Tokenizer
# -------------------------------
print("Loading tokenizer...")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# GPT-2 has no padding token, so register one for fixed-length batching.
# The EOS token is deliberately left at the tokenizer's built-in value:
# replacing it with an empty string gives the target encoding and the
# generation stop test no usable end-of-sequence marker.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# -------------------------------
# Streaming Dataset Implementation
# -------------------------------
def _encode_pair(tokenizer, question, answer, max_len):
    """Tokenize one (question, answer) pair into two fixed-length id tensors.

    Both tensors have length ``max_len // 2`` and are right-padded with the
    tokenizer's pad id.

    Args:
        tokenizer: Tokenizer providing ``encode``, ``eos_token_id`` and
            ``pad_token_id``.
        question: Source text.
        answer: Target text.
        max_len: Combined length budget; each side gets half of it.

    Returns:
        Tuple ``(input_ids, target_ids)`` of 1-D ``torch.long`` tensors.
    """
    half_len = max_len // 2

    input_ids = tokenizer.encode(
        question,
        max_length=half_len,
        truncation=True,
        padding='max_length',
        return_tensors='pt'
    ).squeeze(0)

    # Append EOS by id *after* truncating, reserving one slot for it.
    # Concatenating the EOS text before truncation drops it for every answer
    # that fills the window, so the model would almost never see it.
    eos_id = tokenizer.eos_token_id
    reserved = 0 if eos_id is None else 1
    target = tokenizer.encode(answer, max_length=half_len, truncation=True)
    target = target[:max(half_len - reserved, 0)]
    if eos_id is not None and len(target) < half_len:
        target.append(eos_id)
    target.extend([tokenizer.pad_token_id] * (half_len - len(target)))
    target_ids = torch.tensor(target, dtype=torch.long)

    return input_ids, target_ids


class WikipediaStreamingDataset(IterableDataset):
    """Iterable dataset yielding tokenized (title, content) pairs from a stream.

    Each record is read with ``item.get("document_title")`` and
    ``item.get("content")``; records where either is missing or empty are
    skipped. NOTE(review): assumes the upstream records are dict-like with
    these keys - confirm against the dataset card.

    Args:
        hf_dataset: Iterable of dict-like records.
        tokenizer: Tokenizer used to encode both sides.
        max_samples: Maximum number of pairs yielded per iteration pass.
        max_len: Combined length budget; each side is ``max_len // 2`` tokens.
    """

    def __init__(self, hf_dataset, tokenizer, max_samples=50000, max_len=256):
        self.hf_dataset = hf_dataset
        self.tokenizer = tokenizer
        self.max_samples = max_samples
        self.max_len = max_len

    def __iter__(self):
        count = 0
        for item in self.hf_dataset:
            if count >= self.max_samples:
                break

            # The title plays the role of the "question", the body the "answer".
            question = item.get("document_title", "")
            answer = item.get("content", "")
            if not (question and answer):
                continue

            yield _encode_pair(self.tokenizer, question, answer, self.max_len)
            count += 1


class DummyDataset(IterableDataset):
    """Synthetic fallback dataset producing templated question/answer pairs.

    Args:
        tokenizer: Tokenizer used to encode both sides.
        num_samples: Number of pairs yielded per iteration pass.
        max_len: Combined length budget; each side is ``max_len // 2`` tokens.
    """

    def __init__(self, tokenizer, num_samples=1000, max_len=256):
        self.tokenizer = tokenizer
        self.num_samples = num_samples
        self.max_len = max_len

    def __iter__(self):
        for i in range(self.num_samples):
            question = f"What is topic {i}?"
            answer = f"This is a detailed explanation about topic {i}."
            yield _encode_pair(self.tokenizer, question, answer, self.max_len)


print("Loading dataset with streaming...")
try:
    # Load dataset with streaming enabled
    dataset = load_dataset("stanford-oval/wikipedia", "20241101", split="train", streaming=True)

    # Streaming is lazy, so pull one record now: connectivity or access
    # problems then surface here and trigger the fallback instead of
    # crashing in the middle of training.
    next(iter(dataset))

    streaming_dataset = WikipediaStreamingDataset(dataset, tokenizer)
    print("Streaming dataset created successfully!")

except Exception as e:
    print(f"Error loading streaming dataset: {e}")
    print("Falling back to dummy data...")
    streaming_dataset = DummyDataset(tokenizer)
    print("Using dummy dataset as fallback")

# -------------------------------
# MiniLLM Model
# -------------------------------
class MiniLLM(nn.Module):
    """Small Transformer that maps token ids to per-position vocabulary logits.

    The network is a token embedding plus a learned positional embedding,
    followed by ``n_layers`` ``nn.TransformerEncoderLayer`` blocks, a final
    ``LayerNorm`` and a linear projection to the vocabulary.

    NOTE(review): the encoder layers are called without any attention or
    padding mask, so every position attends to every other position,
    including padding.

    Args:
        vocab_size: Number of rows in the token embedding / output logits.
        emb_size: Model width.
        n_layers: Number of encoder layers.
        n_heads: Attention heads per layer.
        ff_size: Hidden size of each layer's feed-forward sub-block.
        max_len: Number of learned positions, i.e. the longest accepted input.
    """

    def __init__(self, vocab_size, emb_size=512, n_layers=8, n_heads=8, ff_size=2048, max_len=256):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, emb_size)
        self.pos_embed = nn.Embedding(max_len, emb_size)
        self.layers = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=emb_size,
                nhead=n_heads,
                dim_feedforward=ff_size,
                activation='gelu',
                batch_first=True
            )
            for _ in range(n_layers)
        ])
        self.ln = nn.LayerNorm(emb_size)
        self.head = nn.Linear(emb_size, vocab_size)

    def forward(self, x):
        """Compute logits for a batch of token ids.

        Args:
            x: Integer tensor of token ids, indexed as (batch, sequence).

        Returns:
            Tensor of logits with shape (batch, sequence, vocab_size).

        Raises:
            ValueError: If the sequence is longer than the positional table.
        """
        seq_len = x.size(1)
        max_positions = self.pos_embed.num_embeddings
        # Fail early with a readable message; an out-of-range embedding lookup
        # otherwise surfaces as an opaque index error (a device-side assert
        # on CUDA).
        if seq_len > max_positions:
            raise ValueError(
                f"Input sequence length {seq_len} exceeds the model's "
                f"maximum of {max_positions} positions"
            )

        positions = torch.arange(0, seq_len, device=x.device).unsqueeze(0)
        x = self.embed(x) + self.pos_embed(positions)
        for layer in self.layers:
            x = layer(x)
        x = self.ln(x)
        logits = self.head(x)
        return logits

# Size the vocabulary with len(tokenizer) so that tokens added after loading
# (such as the padding token) are counted, then build the model on the
# selected device.
vocab_size = len(tokenizer)
model = MiniLLM(vocab_size).to(device)
print("MiniLLM initialized with vocab size:", vocab_size)

# -------------------------------
# Training Setup
# -------------------------------
# Run-length settings, kept small for demonstration purposes.
num_epochs = 3
target_batches = 1000

# Smaller batches when no GPU is available.
batch_size = 4 if torch.cuda.is_available() else 2

# Optimizer and loss; padded target positions are excluded from the loss.
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

# Batches are drawn from the iterable dataset in the main process.
loader = DataLoader(streaming_dataset, batch_size=batch_size, num_workers=0)

print(f"Starting training for {num_epochs} epochs with target of {target_batches} batches...")
print(f"Using batch size: {batch_size}")

# -------------------------------
# Training Loop
# -------------------------------
# Train for up to `num_epochs` passes over `loader`, stopping early once
# `target_batches` optimizer steps have been taken in total. A checkpoint is
# written after every epoch and a final one after the loop.
#
# NOTE(review): the loss compares the logits at position i of the input batch
# with target token i (no shift between inputs and targets).
start_time = time.time()
batch_count = 0  # optimizer steps taken across all epochs

for epoch in range(num_epochs):
    # Set training mode explicitly: generation code switches the model to
    # eval mode, which would otherwise persist if this cell is re-run.
    model.train()
    epoch_loss = 0.0
    epoch_batches = 0  # steps taken in this epoch only
    progress_bar = tqdm(loader, desc=f"Epoch {epoch+1}/{num_epochs}")

    for x, y in progress_bar:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        logits = model(x)
        # Flatten (batch, seq, vocab) -> (batch*seq, vocab) using the tensor's
        # own last dimension rather than a separately tracked global.
        loss = loss_fn(logits.reshape(-1, logits.size(-1)), y.reshape(-1))

        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        batch_count += 1
        epoch_batches += 1
        epoch_loss += batch_loss

        # Update progress bar
        progress_bar.set_postfix({"Loss": f"{batch_loss:.4f}"})

        # Check if we've reached the target number of batches
        if batch_count >= target_batches:
            break

    # An empty loader would otherwise fail when averaging; stop cleanly.
    if epoch_batches == 0:
        print(f"Epoch {epoch+1}/{num_epochs} produced no batches; stopping training.")
        break

    # Calculate average epoch loss
    avg_epoch_loss = epoch_loss / epoch_batches
    print(f"Epoch {epoch+1}/{num_epochs} completed. Average Loss: {avg_epoch_loss:.4f}")

    # Save checkpoint
    torch.save(model.state_dict(), f"minillm_epoch_{epoch+1}.pt")
    print(f"Checkpoint saved for epoch {epoch+1}")

    if batch_count >= target_batches:
        break

end_time = time.time()
training_time = end_time - start_time
print(f"Training completed in {training_time/60:.2f} minutes!")

# Save final model
torch.save(model.state_dict(), "minillm_final.pt")
print("Final model saved.")

# -------------------------------
# Gradio Demo
# -------------------------------
def answer_question(question, max_new_tokens=100, top_k=50, temperature=0.8):
    """Generate a continuation of ``question`` by sampling from the model.

    Tokens are appended one at a time. Each step keeps the ``top_k`` most
    probable tokens from the last position's distribution and samples one of
    them; generation stops at ``max_new_tokens`` or when the tokenizer's EOS
    id is produced.

    Args:
        question: Prompt text. ``None`` or text that encodes to no tokens
            yields an empty string.
        max_new_tokens: Upper bound on the number of sampled tokens.
        top_k: Number of candidates kept per step (clamped to
            ``[1, vocab size]``).
        temperature: Softmax temperature. Values ``<= 0`` select the most
            probable token deterministically instead of sampling.

    Returns:
        The decoded prompt followed by the generated tokens, with special
        tokens removed.
    """
    model.eval()

    prompt_ids = tokenizer.encode(question or "")
    if not prompt_ids:
        # With no prompt tokens there is no "last position" to sample from.
        return ""

    generated = torch.tensor([prompt_ids], dtype=torch.long, device=device)
    # The model can only embed this many positions; longer inputs would
    # index past its positional table.
    max_positions = model.pos_embed.num_embeddings

    with torch.no_grad():
        for _ in range(max_new_tokens):
            # Feed only the most recent window so long prompts or long
            # generations never exceed the positional table.
            context = generated[:, -max_positions:]
            next_token_logits = model(context)[:, -1, :]

            if temperature > 0:
                probs = torch.softmax(next_token_logits / temperature, dim=-1)
                k = max(1, min(int(top_k), probs.size(-1)))
                top_probs, top_idx = torch.topk(probs, k)
                # multinomial renormalizes the truncated distribution itself.
                choice = torch.multinomial(top_probs[0], 1)
                next_token = top_idx[0, choice]
            else:
                # Dividing by a non-positive temperature is meaningless;
                # fall back to greedy decoding.
                next_token = next_token_logits.argmax(dim=-1)

            generated = torch.cat([generated, next_token.unsqueeze(0)], dim=1)
            if next_token.item() == tokenizer.eos_token_id:
                break

    output = tokenizer.decode(generated[0], skip_special_tokens=True)
    return output

# Create and launch Gradio interface
# Build the Gradio UI: a single text input wired to answer_question and a
# single text output.
_question_box = gr.Textbox(label="Question")
_answer_box = gr.Textbox(label="Answer")

demo = gr.Interface(
    fn=answer_question,
    inputs=_question_box,
    outputs=_answer_box,
    title="MiniLLM Question Answering Demo",
    description="Ask a question and see how the MiniLLM model responds!",
)

# share=True creates a public link
demo.launch(share=True)
