# LLM Project - Colab Setup

This notebook demonstrates how to use the modular LLM project in Google Colab.


In [None]:
# Install the notebook's third-party dependencies into the kernel's environment (-q keeps pip quiet).
# NOTE(review): versions are unpinned, so results may drift between runs — consider pinning.
%pip install -q transformers torch matplotlib scikit-learn numpy python-dateutil


In [None]:
import sys
import torch
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

# Put the repo root at the front of the import path so the project modules
# imported below resolve. Any existing occurrence is dropped first, so
# re-running this cell does not pile up duplicate sys.path entries.
REPO_ROOT = '.'  # adjust if your module path differs
sys.path[:] = [REPO_ROOT] + [entry for entry in sys.path if entry != REPO_ROOT]

# Project-local modules (not among the packages installed by pip in this notebook) —
# presumably resolved through the '.' entry on sys.path; verify against your checkout.
from llm_tokenizers import BaseTokenizerWrapper
from llm_models import Seq2SeqModelLoader

# Reaching this line means every import in this cell succeeded.
print("✅ Imports ready")


In [None]:
# Initialize tokenizer and model from one shared checkpoint name, so the two
# cannot drift apart when the checkpoint is changed.
MODEL_NAME = "t5-small"
tokenizer = BaseTokenizerWrapper(MODEL_NAME)
model = Seq2SeqModelLoader(MODEL_NAME)
print("✅ Tokenizer and model loaded")


## Encoding and Decoding
Convert text to token IDs and back.


In [None]:
# Round trip: text -> token IDs -> text.
text = "hello, this is a sentence!"
encoded = tokenizer.encode(text)
print("Text:", text)

# Pull the IDs out once and reuse them for both the printout and the decode.
token_ids = encoded['input_ids']
print("Token IDs:", token_ids)
print("Decoded:", tokenizer.decode(token_ids))


## Forward Pass (Encoder-Decoder)
Run a single forward step with the decoder starting at `<pad>`.


In [None]:
# Single forward step: the encoder receives the prompt, the decoder receives
# exactly one token (the tokenizer's pad token).
inp = "translate english to german: hello, how are you?"
toks = tokenizer.encode(inp, return_tensors="pt")

pad_id = tokenizer.tokenizer.pad_token_id
decoder_input_ids = torch.tensor([[pad_id]])  # one row holding one token

with torch.no_grad():  # no gradient tracking for this pass
    out = model(decoder_input_ids=decoder_input_ids, **toks)

print("Logits shape:", out.logits.shape)
print("Keys:", [*out.keys()])


## Text Generation (Greedy)
Use `model.generate()` to produce a translation.


In [None]:
# Generate from the encoded prompt (`toks`, built in the forward-pass cell),
# capping the output at max_length.
with torch.no_grad():
    gen_ids = model.generate(max_length=20, **toks)

print("Generated IDs:", gen_ids)

# Only the first generated sequence is decoded back to text.
first_sequence = gen_ids[0]
print("Generated text:", tokenizer.decode(first_sequence, skip_special_tokens=True))


## Token Embeddings: PCA and Cosine Similarity
Project token embeddings to 2D (PCA) and visualize pairwise cosine similarity.


add Markdown cells to it and do