# 6.4610 Research Project

## Overview
In this file, we evaluate different transformer models trained on OpenWebText.

## Imports

In [None]:
import torch
import torch.nn as nn
from typing import Tuple, Union, Optional, List, Dict
import math
import numpy as np
from dataclasses import dataclass
from transformers import AutoTokenizer
import datasets
from tqdm import tqdm
import os
import json
from itertools import islice
import sys
import matplotlib.pyplot as plt
import re
import pandas as pd
import random
from transformers import AutoConfig, PreTrainedModel
from transformers.modeling_outputs import CausalLMOutputWithPast
!git clone https://github.com/EleutherAI/lm-evaluation-harness
%cd lm-evaluation-harness
!pip install -e .
from transformers import AutoModelForCausalLM, PretrainedConfig
from transformers.modeling_outputs import CausalLMOutputWithPast
from lm_eval.evaluator import simple_evaluate
# Import the Python SDK
import google.genai as genai
# Used to securely store your API key
from google.colab import userdata
client = genai.Client(api_key=userdata.get('GOOGLE_API_KEY'))

In [None]:
# Base directory that every os.path.join(path, ...) below is resolved against
# (training logs, checkpoints, HF wrapper folders). It is empty here, so those
# paths are relative to the current working directory.
# NOTE(review): the imports cell runs `%cd lm-evaluation-harness`, so the working
# directory may not be the project root at this point — confirm.
path = ""
sys.path.append(path)

# Execute the companion notebook in this namespace; presumably it defines the
# model/config classes used below (TransformerConfig, TrainingConfig,
# CausalLanguageModel, ...) — TODO confirm.
%run "All_Models.ipynb"

## Preliminaries

In [3]:
def evaluate_model(model, tokenizer, test_prompts: List[str], temperature: float = 0.7):
    """Print one generated continuation for each prompt.

    Args:
        model: Module exposing ``eval()``, ``parameters()`` and
            ``generate(input_ids, max_new_tokens=..., temperature=...)``.
        tokenizer: Object exposing ``encode(text, return_tensors='pt')`` and
            ``decode(ids, skip_special_tokens=True)``.
        test_prompts: Prompts to continue, processed in order.
        temperature: Value forwarded unchanged to ``model.generate``.

    Returns:
        None. All output is written with ``print``.
    """
    model.eval()

    # Prompts have to be placed on whatever device holds the model weights.
    target_device = next(model.parameters()).device

    print("Generating samples from trained model:")
    print("=" * 60)

    for number, prompt in enumerate(test_prompts, start=1):
        print(f"\nPrompt {number}: '{prompt}'")
        print("-" * 40)

        prompt_ids = tokenizer.encode(prompt, return_tensors='pt').to(target_device)

        # Generation needs no gradients; up to 150 new tokens per prompt.
        with torch.no_grad():
            sampled_ids = model.generate(
                prompt_ids,
                max_new_tokens=150,
                temperature=temperature,
            )

        # Only the first sequence of the returned batch is decoded and shown.
        completion = tokenizer.decode(sampled_ids[0], skip_special_tokens=True)
        print(f"Temperature {temperature}: {completion}")
        print()

def calculate_perplexity(model, tokenizer, test_prompts_iter, batch_size: int = 4, max_length: int = 512):
    """
    Calculate the token-level perplexity of ``model`` over a set of test prompts.

    The prompts are tokenized together (truncated to ``max_length`` and padded to
    the longest sequence), run through the model in batches, and the next-token
    cross-entropy is averaged over all positions whose attention mask is 1, so
    padding does not contribute to the result.

    Args:
        model: Causal LM called as ``model(input_ids)``. It may return a logits
            tensor, a tuple whose first element is the logits, or an object with
            a ``.logits`` attribute (e.g. a Hugging Face model output).
        tokenizer: Callable tokenizer returning a mapping with ``'input_ids'``
            and, optionally, ``'attention_mask'``.
        test_prompts_iter: Iterable of mappings, each with a ``'text'`` entry.
        batch_size: Number of prompts per forward pass.
        max_length: Truncation length passed to the tokenizer.

    Returns:
        The perplexity ``exp(mean next-token loss)`` as a float.

    Raises:
        ValueError: If ``test_prompts_iter`` yields no samples.
    """
    model.eval()
    device = next(model.parameters()).device

    print("Calculating perplexity of model with test prompts (batched):")
    print("=" * 60)

    test_prompts = [sample['text'] for sample in test_prompts_iter]
    if not test_prompts:
        # Without data the average below would be 0 and the function would
        # report a meaningless perplexity of 1.0.
        raise ValueError("calculate_perplexity received no test prompts")

    # Tokenize all prompts at once (batched)
    encodings = tokenizer(
        test_prompts,
        max_length=max_length,
        truncation=True,
        padding='longest',
        return_tensors='pt'
    )
    # Keep the full corpus where the tokenizer put it and move one batch at a
    # time to the model's device, so device memory only holds the current batch.
    input_ids = encodings['input_ids']
    attention_mask = encodings.get('attention_mask', None)

    # Per-token losses are needed so that padded positions can be zeroed out.
    loss_fct = torch.nn.CrossEntropyLoss(reduction='none')

    num_samples = input_ids.size(0)
    total_loss = 0.0
    total_tokens = 0
    with torch.inference_mode():
        for start in tqdm(range(0, num_samples, batch_size)):
            end = min(start + batch_size, num_samples)
            batch_input_ids = input_ids[start:end].to(device)

            # Forward pass; accept tensor, tuple, or output-object results.
            outputs = model(batch_input_ids)
            if isinstance(outputs, tuple):
                logits = outputs[0]
            else:
                logits = getattr(outputs, "logits", outputs)

            # Shift logits and labels for causal language modeling:
            # position t predicts token t+1.
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = batch_input_ids[..., 1:].contiguous()

            # Compute the loss in float32 even if the model runs in reduced precision.
            loss = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)).float(),
                shift_labels.view(-1)
            )  # (batch * seq_len-1,)

            if attention_mask is not None:
                # The mask is shifted like the labels: a position counts only
                # if the token being predicted is a real (non-padding) token.
                shift_mask = attention_mask[start:end][..., 1:].contiguous().to(device)
                loss = loss * shift_mask.view(-1).float()
                num_valid = shift_mask.sum().item()
            else:
                num_valid = shift_labels.numel()

            total_loss += loss.sum().item()
            total_tokens += num_valid

    # max(1, ...) guards against division by zero when every sequence has length 1.
    avg_loss = total_loss / max(1, total_tokens)
    avg_perplexity = float(torch.exp(torch.tensor(avg_loss)))
    print(f"Average Perplexity: {avg_perplexity:.4f}")
    return avg_perplexity


def save_model(model, tokenizer, save_path: str):
    """Write the model weights, its config (if any) and the tokenizer to ``save_path``.

    Args:
        model: Module whose ``state_dict()`` is stored as ``model.pt``.
        tokenizer: Object exposing ``save_pretrained(directory)``.
        save_path: Target directory; created if it does not exist.
    """
    os.makedirs(save_path, exist_ok=True)

    weights_file = os.path.join(save_path, "model.pt")
    torch.save(model.state_dict(), weights_file)

    if hasattr(model, 'config'):
        # NOTE(review): despite its ".json" name this file is written with
        # torch.save, so it has to be read back with torch.load, not json.load.
        config_file = os.path.join(save_path, "config.json")
        torch.save(model.config.__dict__, config_file)

    tokenizer.save_pretrained(save_path)
    print(f"Model saved to {save_path}")

def load_model(model, load_path: str):
    """Load weights from ``<load_path>/model.pt`` into ``model`` in place.

    Args:
        model: Module whose architecture matches the stored state dict.
        load_path: Directory that contains ``model.pt``.

    Raises:
        FileNotFoundError: If ``model.pt`` does not exist in ``load_path``.
        RuntimeError: If the stored keys/shapes do not match ``model``
            (raised by ``load_state_dict``).
    """
    checkpoint_file = os.path.join(load_path, "model.pt")
    # Deserialize onto the CPU first: a checkpoint saved from GPU tensors would
    # otherwise try to restore onto that GPU and fail on a CPU-only machine.
    # load_state_dict then copies the values into the model's own parameters,
    # wherever they currently live.
    state_dict = torch.load(checkpoint_file, map_location="cpu")
    model.load_state_dict(state_dict)
    print(f"Model loaded from {load_path}")

## Loading Datasets And Declaring Global Constants

In [None]:
print("Loading OpenWebText dataset...")
# streaming=True requests an iterable dataset instead of a fully downloaded one.
# NOTE(review): only the "train" split is loaded; the perplexity evaluation below
# takes its first TEST_DATASET_SIZE samples — confirm these were held out of training.
dataset = datasets.load_dataset("openwebtext", split="train", streaming=True, trust_remote_code=True)

In [None]:
# Load a pre-trained tokenizer (GPT-2 tokenizer works well for English text)
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# The padding token is set to the EOS token, so both share one token id;
# attention masks are what distinguish padding from real tokens.
tokenizer.pad_token = tokenizer.eos_token  # Set padding token

print(f"Tokenizer loaded with vocab size: {tokenizer.vocab_size}")
print(f"Special tokens: PAD={tokenizer.pad_token_id}, EOS={tokenizer.eos_token_id}")

# TrainingConfig and TransformerConfig are not defined in this notebook;
# presumably they come from All_Models.ipynb — TODO confirm.
# The architecture hyper-parameters are copied from the training config so the
# models built here match the shapes of the saved checkpoints.
training_config = TrainingConfig(vocab_size=tokenizer.vocab_size)
model_config = TransformerConfig(
    vocab_size=training_config.vocab_size,
    hidden_size=training_config.hidden_size,
    num_attention_heads=training_config.num_attention_heads,
    num_hidden_layers=training_config.num_hidden_layers,
    intermediate_size=training_config.intermediate_size,
    max_position_embeddings=training_config.max_position_embeddings
)


In [6]:
# Global constants shared by the evaluation helpers below.
DATA_SET = dataset  # streaming OpenWebText dataset loaded above
TEST_DATASET_SIZE = 10000  # number of leading samples used for perplexity evaluation
TOKENIZER = tokenizer  # GPT-2 tokenizer with pad token set to EOS
MODEL_CONFIG = model_config  # architecture config used to instantiate every model
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # GPU when available, else CPU

## Model Evaluation and Generation

## Training Log Analysis

In [None]:
def load_training_log(model):
    """Load the (step, loss) history of one model from its training log.

    Reads ``<path>/logs_<model>/training_log.jsonl``, extracts every
    ``{...}`` fragment on each line, parses it as JSON and keeps the records
    that have a ``batch_idx`` value.

    Args:
        model: One of ``"transformer"``, ``"transformer_ph"``,
            ``"transformer_laplacian"``.

    Returns:
        A pandas DataFrame with the columns ``step`` and ``loss``.

    Raises:
        AssertionError: If ``model`` is not one of the supported names.
        KeyError: If no parsed record provides ``batch_idx``/``step``/``loss``.
    """
    assert model in {"transformer", "transformer_ph", "transformer_laplacian"}, "Invalid Model"

    # A line may hold several objects; the non-greedy pattern picks out each
    # brace-delimited fragment (nested objects are not supported by it).
    json_fragment = re.compile(r"\{.*?\}")

    records = []
    log_file = os.path.join(path, f"logs_{model}/training_log.jsonl")
    with open(log_file, "r", encoding="utf-8") as f:
        for line in f:
            for obj_str in json_fragment.findall(line):
                try:
                    records.append(json.loads(obj_str))
                except json.JSONDecodeError:
                    # Best effort: skip fragments that are not valid JSON, but
                    # let every other kind of error surface.
                    continue

    df = pd.DataFrame(records)
    # Rows without batch_idx are dropped; only the per-batch entries are kept.
    df = df.dropna(subset=["batch_idx"])
    df = df[["step", "loss"]]
    return df


def plot_training_loss(dfs, start = 0, end = 150000, step = 1000, opacity = 0.5):
    """Plot training loss against step for several models on one figure.

    Args:
        dfs: Iterable of ``(label, dataframe)`` pairs; each dataframe provides
            ``"step"`` and ``"loss"`` columns.
        start: First index of the slice taken from each column.
        end: Last index of the slice (inclusive).
        step: Stride of the slice, i.e. every ``step``-th entry is drawn.
        opacity: Alpha value of the plotted lines.
    """
    # The same start:end+1:step slice is applied to both columns of every frame.
    window = slice(start, end + 1, step)

    plt.figure()
    for label, frame in dfs:
        steps = frame["step"][window]
        losses = frame["loss"][window]
        plt.plot(steps, losses, label = label, alpha = opacity)

    plt.title("Training Loss VS. Steps")
    plt.xlabel("Steps")
    plt.ylabel("Training Loss")
    plt.legend()
    plt.grid()
    plt.show()
    plt.close()


In [None]:
# Load the training log of every architecture and compare their loss curves.
models = ["transformer", "transformer_laplacian", "transformer_ph"]
dfs = [(m, load_training_log(m)) for m in models]
# The slice starts at index 10000 and draws every 2500th logged entry.
plot_training_loss(dfs, start = 10000, step = 2500)

In [None]:
def get_perplexity_at_checkpoints(model_name, save = True, checkpoints = None):
    """Compute the test perplexity of one model at a series of checkpoints.

    For every checkpoint the weights in ``<path>/<model_name>/checkpoint-<cp>``
    are loaded into a single model instance and evaluated on the first
    ``TEST_DATASET_SIZE`` samples of ``DATA_SET``.

    Args:
        model_name: One of ``"transformer"``, ``"transformer_ph"``,
            ``"transformer_laplacian"``.
        save: When true, the results gathered so far are written to
            ``<path>/<model_name>/perplexity_data`` (JSON) after every
            checkpoint, so partial progress survives an interruption.
        checkpoints: Checkpoint identifiers (strings). Defaults to
            ``"5000", "10000", ..., "150000"``.

    Returns:
        Dict mapping each checkpoint identifier to its perplexity.

    Raises:
        AssertionError: If ``model_name`` is not one of the supported names.
    """
    assert model_name in {"transformer", "transformer_ph", "transformer_laplacian"}, "Invalid Model"

    if checkpoints is None:
        checkpoints = [str(i) for i in range(5000, 150001, 5000)]

    model_classes = {
        "transformer": CausalLanguageModel,
        "transformer_laplacian": CausalLanguageModelL,
        "transformer_ph": CausalLanguageModelPH,
    }
    record = {}

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model_classes[model_name](MODEL_CONFIG).to(device)

    # The evaluation samples are identical for every checkpoint, so read them
    # from the dataset once instead of re-reading them per checkpoint.
    test_samples = list(islice(DATA_SET, 0, TEST_DATASET_SIZE))

    for cp in checkpoints:
        print(f"Loading {model_name} at checkpoint-{cp}")
        load_model(model, os.path.join(path, f"{model_name}/checkpoint-{cp}"))
        record[cp] = calculate_perplexity(model, TOKENIZER, test_samples)

        print("\n")

        if save:
            # Rewritten after each checkpoint on purpose (incremental save).
            with open(os.path.join(path, f"{model_name}/perplexity_data"), "w") as f:
                json.dump(record, f)
    return record

In [None]:
# Evaluate the baseline transformer at every default checkpoint and persist the results.
get_perplexity_at_checkpoints("transformer", save = True)

In [None]:
def load_perplexity_log(model):
    """Load the saved checkpoint perplexities of one model.

    Reads the JSON file ``<path>/<model>/perplexity_data`` written by
    ``get_perplexity_at_checkpoints``.

    Args:
        model: One of ``"transformer"``, ``"transformer_ph"``,
            ``"transformer_laplacian"``.

    Returns:
        Dict with two parallel lists: ``"checkpoints"`` (the JSON keys, in file
        order) and ``"perplexities"`` (the corresponding values).

    Raises:
        AssertionError: If ``model`` is not one of the supported names.
        ValueError: If the file does not contain valid JSON.
    """
    assert model in {"transformer", "transformer_ph", "transformer_laplacian"}, "Invalid Model"

    log_file = os.path.join(path, f"{model}/perplexity_data")
    with open(log_file, "r", encoding="utf-8") as f:
        try:
            data = json.load(f)
        except json.JSONDecodeError as err:
            # Report the real problem instead of failing later on an
            # undefined variable.
            raise ValueError(f"{log_file} does not contain valid JSON") from err

    df = {"checkpoints": list(data.keys()), "perplexities": list(data.values())}
    return df


def plot_perplexity(dfs, opacity = 0.5):
    """Plot perplexity against checkpoint for several models on one figure.

    Args:
        dfs: Iterable of ``(label, log)`` pairs where ``log`` provides the
            parallel sequences ``log["checkpoints"]`` and
            ``log["perplexities"]``.
        opacity: Alpha value of the plotted lines.
    """
    plt.figure()

    # Remember the checkpoints of the last plotted series explicitly; they
    # define the x tick labels below.
    tick_source = None
    for l, df in dfs:
        plt.plot(df["checkpoints"], df["perplexities"], label = l, alpha = opacity)
        tick_source = df["checkpoints"]

    plt.legend()
    plt.grid()
    plt.title("Perplexity VS. Steps")
    plt.ylabel("Perplexity")
    plt.xlabel("Steps")
    if tick_source is not None:
        # Label only every third checkpoint to keep the axis readable.
        plt.xticks(tick_source[2::3], rotation=45)
    plt.show()
    plt.close()

# Load the saved perplexity curves of all three architectures.
# Note: this rebinds `dfs`, which previously held the training-loss frames.
models = ["transformer", "transformer_laplacian", "transformer_ph"]
dfs = [(m, load_perplexity_log(m)) for m in models]

In [None]:
# Draw the perplexity-vs-checkpoint comparison for all loaded models.
plot_perplexity(dfs)

## Generative Analysis


In [None]:
# Baseline transformer: build the architecture, then load the weights stored in <path>/transformer/model.pt.
model_original = CausalLanguageModel(model_config)
load_model(model_original, os.path.join(path, "transformer"))
# count_parameters is not defined in this notebook; presumably from All_Models.ipynb — TODO confirm.
print(f"Model initialized with {count_parameters(model_original):,} parameters")

In [None]:
# Laplacian variant: build the architecture, then load the weights stored in <path>/transformer_laplacian/model.pt.
model_laplacian = CausalLanguageModelL(model_config)
load_model(model_laplacian, os.path.join(path, "transformer_laplacian"))
print(f"Model initialized with {count_parameters(model_laplacian):,} parameters")

In [None]:
# PH variant: build the architecture, then load the weights stored in <path>/transformer_ph/model.pt.
model_ph  = CausalLanguageModelPH(model_config)
load_model(model_ph, os.path.join(path, "transformer_ph"))
print(f"Model initialized with {count_parameters(model_ph):,} parameters")


In [None]:
# Move all three models to the evaluation device (GPU when available).
model_original.to(DEVICE)
model_laplacian.to(DEVICE)
model_ph.to(DEVICE)

# Qualitative Analysis

In [11]:
def generate_response(model, tokenizer, message, device, *,
                      max_new_tokens=100):
    """Generate a continuation of ``message`` and return only the new text.

    Args:
        model: Model exposing ``config.max_position_embeddings`` and
            ``generate(input_ids, max_new_tokens=...)``.
        tokenizer: Callable tokenizer that also provides ``decode``.
        message: Prompt text.
        device: Device the prompt tensor is moved to before generation.
        max_new_tokens: Number of tokens requested from ``model.generate``.

    Returns:
        The decoded tokens that follow the prompt, with special tokens skipped
        and surrounding whitespace stripped.
    """
    prompt_ids = tokenizer(message, return_tensors="pt")["input_ids"]

    # A prompt longer than the model's position limit keeps only its most
    # recent tokens.
    context_limit = model.config.max_position_embeddings
    if prompt_ids.size(1) > context_limit:
        prompt_ids = prompt_ids[:, -context_limit:]

    prompt_ids = prompt_ids.to(device)
    prompt_len = prompt_ids.shape[1]

    with torch.inference_mode():
        generated = model.generate(prompt_ids, max_new_tokens=max_new_tokens)

    # Everything after the prompt length in the first sequence is the answer.
    continuation = generated[0][prompt_len:]
    return tokenizer.decode(continuation, skip_special_tokens=True).strip()

## Grammar Checks
Given sentence starters, generate sentence completions with each model.
Paste the completions into Word/Docs/Grammarly to check for grammar mistakes.

In [None]:
def generate_random_response(model, count):
    """Print ``count`` generated replies to the fixed prompt "Tell me something.".

    Args:
        model: Model passed on to ``generate_response``.
        count: Number of replies to generate and print.
    """
    prompt = "Tell me something."
    for index in range(count):
        print(f"Generated Text {index}")
        reply = generate_response(model, TOKENIZER, prompt, DEVICE)
        print(reply, "\n\n")

In [None]:
# Ten sample generations from the baseline transformer.
generate_random_response(model_original, 10)

In [None]:
# Ten sample generations from the Laplacian transformer.
generate_random_response(model_laplacian, 10)

In [None]:
# Ten sample generations from the PH transformer.
generate_random_response(model_ph, 10)

## LLM as Judge



In [None]:
def get_preferred_response_with_llm(prompt, response_1, response_2):
    """
    Ask the Gemini judge which of two responses to ``prompt`` is better.

    The two responses are shown to the judge in random order to avoid position
    bias; the judge's answer is then mapped back to the caller's ordering.

    Args:
        prompt: The original prompt.
        response_1: The first response to compare.
        response_2: The second response to compare.

    Returns:
        0 if ``response_1`` is preferred, 1 if ``response_2`` is preferred, or
        None if the judge's reply could not be parsed or the API call failed.
    """

    # Randomly assign A and B to avoid position bias.
    # flip=True  -> the judge sees response_1 as "1" and response_2 as "2".
    # flip=False -> the judge sees response_2 as "1" and response_1 as "2".
    flip = random.random() < 0.5
    response_a = response_1 if flip else response_2
    response_b = response_2 if flip else response_1

    # The judge is asked for a bare "1" or "2" so the reply is trivial to parse.
    judge_prompt = f"Given this prompt and the two responses:\n\
    Prompt:{prompt}\n\
    Reponse 1:{response_a}\n\
    Response 2:{response_b}\n\
    which response is better? Output 1 if response 1 is better. Output 2 if response 2 is better. Only output a single number, no explanation is needed."

    try:
        result = client.models.generate_content(
            model="gemini-2.0-flash-001",
            contents=judge_prompt,
            config=genai.types.GenerateContentConfig(
                temperature=0,
            ),
        )
        output = result.text.strip().upper()
    except Exception as e:
        # Best effort: a failed or empty judge call counts as "no verdict".
        print(f"Error: {e}")
        return None

    # The verdict is the last character of the reply; slicing (instead of
    # indexing) yields "" for an empty reply rather than raising.
    choice = output[-1:]
    if choice not in {"1", "2"}:
        return None

    # Map the judge's position back to the caller's ordering: the judge picked
    # response_1 exactly when it chose slot "1" and slot "1" held response_1
    # (flip), or chose slot "2" and slot "2" held response_1 (not flip).
    judge_picked_a = choice == "1"
    picked_response_1 = judge_picked_a == flip
    return 0 if picked_response_1 else 1

In [16]:
def run_LLM_Judge(m1, m2, messages):
    """Compare two models head-to-head on up to 20 prompts using the LLM judge.

    For each prompt both models generate a response, the judge picks the better
    one, and the prompt, both responses and the verdict are printed.

    Args:
        m1: First model (its wins are counted as verdict 0).
        m2: Second model (its wins are counted as verdict 1).
        messages: Sequence of prompt strings; only the first 20 are used.

    Returns:
        Tuple ``(model_1_wins, model_2_wins)``. Prompts without a valid verdict
        (``None``) count for neither model.
    """
    labels = {CausalLanguageModel: "Original Transformer", CausalLanguageModelL:"Laplacian Transformer", CausalLanguageModelPH:"PH Tranformer "}
    model_1 = m1
    model_2 = m2
    # Fall back to the class name so that other model types do not raise KeyError.
    label_1 = labels.get(type(model_1), type(model_1).__name__)
    label_2 = labels.get(type(model_2), type(model_2).__name__)

    num_examples = min(20, len(messages))
    results = []

    for i in range(num_examples):
        message = messages[i]

        # TOKENIZER is used rather than the global `tokenizer`, which later
        # cells rebind.
        response_1 = generate_response(model_1, TOKENIZER, message, DEVICE)
        response_2 = generate_response(model_2, TOKENIZER, message, DEVICE)

        # Use Gemini as judge to choose which response is better (0 = response_1 wins, 1 = response_2 wins)
        preferred_response = get_preferred_response_with_llm(
            prompt=message,
            response_1=response_1,
            response_2=response_2,
        )
        print(f"\n=====================================Example {i+1}==========================================")
        print("Prompt:", message)
        print(f"{label_1} Response:", response_1)
        print("\n")
        print(f"{label_2} Response:", response_2)
        print("\n")
        print("Winner:", preferred_response)

        results.append({
            "prompt": message,
            "response_model_1": response_1,
            "response_model_2": response_2,
            "preferred_response": preferred_response,
        })

    # Compute win rate; None verdicts are excluded from the total.
    model_1_wins = sum(1 for r in results if r["preferred_response"] == 0)
    model_2_wins = sum(1 for r in results if r["preferred_response"] == 1)
    total_valid = model_1_wins + model_2_wins

    print(f"\n{label_1} win rate: {model_1_wins}/{total_valid}")
    print(f"\n{label_2} win rate: {model_2_wins}/{total_valid}")

    return model_1_wins, model_2_wins

def run_iterations(m1, m2, messages, iter):
    """Repeat the LLM-judge comparison several times and print the totals.

    Args:
        m1: First model.
        m2: Second model.
        messages: Sequence of prompt strings passed to ``run_LLM_Judge``.
        iter: Number of times the full comparison is repeated.
    """
    labels = {CausalLanguageModel: "Original Transformer", CausalLanguageModelL:"Laplacian Transformer", CausalLanguageModelPH:"PH Tranformer "}
    # Resolve the labels up front, falling back to the class name, so an
    # unlisted model type cannot raise KeyError after all iterations have run.
    label_1 = labels.get(type(m1), type(m1).__name__)
    label_2 = labels.get(type(m2), type(m2).__name__)

    counter = [0, 0]
    for i in range(iter):
        print(f"++++++++++++++++++++++++++++++++++++++++++++++++++ITERATION {i}++++++++++++++++++++++++++++++++++++++++++++++++++")
        w1, w2 = run_LLM_Judge(m1, m2, messages)
        counter[0] += w1
        counter[1] += w2

    total = counter[0] + counter[1]
    print(f"\n{label_1} win rate: {counter[0]}/{total}")
    print(f"\n{label_2} win rate: {counter[1]}/{total}")


In [17]:
# Prompts for the LLM-as-judge comparison: five sentence completions followed
# by five open questions. The strings are sent to the models verbatim.
messages = ["Complete the following sentence:\n Alice was running from",
            "Complete the following sentence:\n The bakery had many",
            "Complete the following sentence:\n The winter is the perfect time for",
            "Complete the following sentence:\n Jack was driving on the",
            "Complete the following sentence:\n Jack was having a really great",
            "Answer the following question:\nWhat happens when you throw an egg?",
            "Answer the following question:\n Can you tell me a joke?",
            "Answer the following question:\n What are people doing on reddit?",
            "Answer the following question:\n What does the fox say?",
            "Answer the following question:\n Who are you and what am I?"]

In [None]:
# Baseline vs. Laplacian transformer, 10 repetitions over all prompts.
run_iterations(model_original, model_laplacian, messages, 10)

In [None]:
# Baseline vs. PH transformer, 10 repetitions over all prompts.
run_iterations(model_original, model_ph, messages, 10)

In [None]:
# PH vs. Laplacian transformer, 10 repetitions over all prompts.
run_iterations(model_ph, model_laplacian, messages, 10)

# Leaderboard Ranking - lm-evaluation-harness

## Converting current models to HF transformer models

In [None]:
class HFTransformerConfig(PretrainedConfig):
    """
    HuggingFace-compatible wrapper around the existing TransformerConfig.

    This is what gets saved to / loaded from config.json,
    and what the HF + lm-eval-harness ecosystem will see.

    The architecture variant is stored under two attribute names that always
    hold the same value: ``arch_type`` (the constructor parameter) and
    ``archtype`` (the spelling used by existing call sites and by configs that
    were saved earlier).
    """
    model_type = "custom_transformer"

    def __init__(
        self,
        vocab_size=50257,
        hidden_size=768,
        num_attention_heads=12,
        num_hidden_layers=12,
        intermediate_size=3072,
        max_position_embeddings=512,
        use_causal_mask=True,
        number_diffusion_kernels=4,
        arch_type="original", #original, laplacian, ph
        **kwargs,
    ):
        """
        Args:
            vocab_size: Size of the token vocabulary.
            hidden_size: Width of the hidden representation.
            num_attention_heads: Number of attention heads.
            num_hidden_layers: Number of transformer layers.
            intermediate_size: Width of the feed-forward layer.
            max_position_embeddings: Maximum sequence length.
            use_causal_mask: Forwarded to TransformerConfig.
            number_diffusion_kernels: Forwarded to TransformerConfig.
            arch_type: ``"original"``, ``"laplacian"`` or ``"ph"``.
            **kwargs: Passed on to ``PretrainedConfig``. A legacy ``archtype``
                keyword is accepted and takes precedence over ``arch_type``.
        """
        # Accept the legacy spelling explicitly so it cannot diverge from
        # arch_type; it wins because older saved configs only contain it.
        legacy_arch = kwargs.pop("archtype", None)
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.num_hidden_layers = num_hidden_layers
        self.intermediate_size = intermediate_size
        self.max_position_embeddings = max_position_embeddings
        self.use_causal_mask = use_causal_mask
        self.number_diffusion_kernels = number_diffusion_kernels
        # Always define both names, so a default-constructed config is usable
        # by code that reads either attribute.
        self.arch_type = legacy_arch if legacy_arch is not None else arch_type
        self.archtype = self.arch_type

    @classmethod
    def from_original(cls, cfg: TransformerConfig, **extra_kwargs):
        """
        Create HFTransformerConfig from an original TransformerConfig instance.

        Args:
            cfg: Source config; all of its instance attributes are forwarded
                as keyword arguments.
            **extra_kwargs: Additional or overriding keyword arguments, e.g.
                ``arch_type="laplacian"`` (or the legacy ``archtype=...``).

        Returns:
            A new HFTransformerConfig.
        """
        base = cfg.__dict__.copy()
        base.update(extra_kwargs)
        return cls(**base)

In [None]:
class HFTransformer(PreTrainedModel):
    """
    HuggingFace-style causal LM that mirrors CausalLanguageModel.

    - Uses HFTransformerConfig (for config.json, from_pretrained, etc.)
    - Internally builds a TransformerModel variant + lm_head, selected by the
      config's architecture type ("original", "laplacian" or "ph")
    """
    config_class = HFTransformerConfig

    def __init__(self, config: HFTransformerConfig):
        super().__init__(config)

        # Convert HF config -> the original dataclass
        orig_cfg = TransformerConfig(
            vocab_size=config.vocab_size,
            hidden_size=config.hidden_size,
            num_attention_heads=config.num_attention_heads,
            num_hidden_layers=config.num_hidden_layers,
            intermediate_size=config.intermediate_size,
            max_position_embeddings=config.max_position_embeddings,
            use_causal_mask=config.use_causal_mask,
            number_diffusion_kernels=getattr(config, "number_diffusion_kernels", 4),
        )

        # The variant may be stored as `archtype` (legacy keyword) or as
        # `arch_type` (constructor parameter); a config that has neither is
        # treated as the original architecture instead of raising.
        arch = getattr(config, "archtype", None) or getattr(config, "arch_type", "original")

        # Recreate the architecture
        if arch == "laplacian":
            self.transformer = TransformerModelL(orig_cfg)
        elif arch == "ph":
            self.transformer = TransformerModelPH(orig_cfg)
        else:
            self.transformer = TransformerModel(orig_cfg)

        self.lm_head = nn.Linear(orig_cfg.hidden_size, orig_cfg.vocab_size, bias=False)
        self.criterion = nn.CrossEntropyLoss()

        # HF utility to init any new params
        self.post_init()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        labels=None,
        **kwargs,
    ):
        """
        HF-style forward for causal LM.

        Args:
            input_ids: Token ids, [batch, seq].
            attention_mask: Accepted for API compatibility only; it is not
                passed to the underlying transformer.
            labels: Optional token ids, [batch, seq]. When given, the
                next-token cross-entropy loss is computed.
            **kwargs: Ignored.

        Returns:
            CausalLMOutputWithPast with ``logits`` and, if ``labels`` was
            given, ``loss``; all other fields are None.
        """

        hidden_states = self.transformer(input_ids)  # [B, T, H]
        logits = self.lm_head(hidden_states)         # [B, T, V]

        loss = None
        if labels is not None:
            # Training objective: predict token t+1 from token t
            shift_logits = logits[:, :-1, :].contiguous()
            shift_labels = labels[:, 1:].contiguous()

            loss = self.criterion(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1),
            )

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=None,
            hidden_states=None,
            attentions=None,
        )

    @torch.no_grad()
    def generate(
        self,
        input_ids=None,
        attention_mask=None,
        max_length=None,
        max_new_tokens=None,
        temperature: float = 1.0,
        **kwargs,
    ):
        """
        Simple autoregressive generation.

        Supports the common HF call pattern:
            model.generate(**tokenizer("Hello", return_tensors="pt"), max_length=40)

        Args:
            input_ids: Prompt token ids, [batch, seq]. Required.
            attention_mask: Optional mask; extended with ones for each new token.
            max_length: Total target length; used only when ``max_new_tokens``
                is None.
            max_new_tokens: Number of tokens to append. Defaults to 20 when
                neither length argument is given.
            temperature: Sampling temperature. A value <= 0 (or None) selects
                greedy decoding instead of dividing by zero.
            **kwargs: ``do_sample=False`` selects greedy decoding; everything
                else is ignored.

        Returns:
            Tensor [batch, seq + new tokens] containing prompt and continuation.

        Raises:
            ValueError: If ``input_ids`` is None.
        """
        if input_ids is None:
            raise ValueError("generate() requires input_ids")

        # Figure out how many new tokens to generate
        if max_new_tokens is None:
            if max_length is None:
                max_new_tokens = 20  # default
            else:
                max_new_tokens = max_length - input_ids.size(1)
                if max_new_tokens <= 0:
                    return input_ids
        # else: trust max_new_tokens

        # Ensure we're on the same device as the model
        device = next(self.parameters()).device
        input_ids = input_ids.to(device)
        if attention_mask is not None:
            attention_mask = attention_mask.to(device)

        # Sampling stays the default; greedy decoding is used only when the
        # caller asks for it explicitly or the temperature makes sampling
        # undefined.
        greedy = (
            kwargs.get("do_sample") is False
            or temperature is None
            or temperature <= 0
        )

        # The model is only ever fed the most recent max_position_embeddings
        # tokens, so generation can continue past that length. For shorter
        # sequences the slice is a no-op.
        max_context = self.config.max_position_embeddings

        for _ in range(max_new_tokens):
            context_ids = input_ids[:, -max_context:]
            context_mask = None
            if attention_mask is not None:
                context_mask = attention_mask[:, -max_context:]

            outputs = self(input_ids=context_ids, attention_mask=context_mask)

            # Take the last token's logits
            next_token_logits = outputs.logits[:, -1, :]  # [B, V]

            if greedy:
                next_token = next_token_logits.argmax(dim=-1, keepdim=True)  # [B, 1]
            else:
                probs = torch.nn.functional.softmax(next_token_logits / temperature, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)  # [B, 1]

            input_ids = torch.cat([input_ids, next_token], dim=1)

            if attention_mask is not None:
                new_mask_token = torch.ones_like(next_token)
                attention_mask = torch.cat([attention_mask, new_mask_token], dim=1)

        return input_ids

In [None]:
# Convert the trained baseline model into the Hugging Face wrapper format.
# NOTE(review): the evaluation cells below also load
# HFWrapper_transformer_laplacian_model and HFWrapper_transformer_ph_model;
# presumably this cell is re-run with archtype and the paths changed to
# produce them — TODO confirm.
hf_cfg = HFTransformerConfig.from_original(MODEL_CONFIG, archtype="original")
hf_model = HFTransformer(hf_cfg)
state = torch.load(os.path.join(path, "transformer/model.pt"), map_location="cpu")
# strict=False tolerates key mismatches; the two lists printed below show
# which parameters were not loaded and which stored keys were not used.
missing, unexpected = hf_model.load_state_dict(state, strict=False)
print("Missing keys:", missing)
print("Unexpected keys:", unexpected)
# Write model, tokenizer and config into one folder.
save_dir = os.path.join(path, "HFWrapper_transformer_model")
hf_model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
hf_cfg.save_pretrained(save_dir)

## Running lm-eval

In [None]:
# Register the config class for the "custom_transformer" model_type string and
# map that config class to HFTransformer for AutoModelForCausalLM.
AutoConfig.register("custom_transformer", HFTransformerConfig)
AutoModelForCausalLM.register(HFTransformerConfig, HFTransformer)


# model = AutoModelForCausalLM.from_pretrained(save_dir)
# tokenizer = AutoTokenizer.from_pretrained(save_dir)
# text = tokenizer("Hello world", return_tensors="pt")
# out = model.generate(**text, max_length=40)
# print(tokenizer.decode(out[0], skip_special_tokens=True))

### Normal Transformer

In [None]:
# Load the wrapped baseline model and its tokenizer from the saved folder.
# NOTE: `transformer_model` is not passed to simple_evaluate below, which
# receives the folder path through model_args instead; this also rebinds the
# global `tokenizer`.
transformer_model = AutoModelForCausalLM.from_pretrained(os.path.join(path, "HFWrapper_transformer_model"))
tokenizer = AutoTokenizer.from_pretrained(os.path.join(path, "HFWrapper_transformer_model"))

# Run the HellaSwag and PIQA tasks through lm-evaluation-harness, one example per batch.
results = simple_evaluate(
    model="hf",
    model_args= "pretrained="+os.path.join(path, "HFWrapper_transformer_model"),
    tasks=["hellaswag", "piqa"],
    batch_size=1,
)

In [None]:
# Display the complete lm-eval output for the baseline transformer.
results

In [None]:
# Display only the "results" entry of the baseline output.
results["results"]

### Laplacian Transformer

In [None]:
# Load the wrapped Laplacian model and its tokenizer from the saved folder.
# NOTE: the loaded model object is not passed to simple_evaluate below, which
# receives the folder path through model_args instead; this also rebinds the
# global `tokenizer`.
transformer_lapalacian_model = AutoModelForCausalLM.from_pretrained(os.path.join(path, "HFWrapper_transformer_laplacian_model"))
tokenizer = AutoTokenizer.from_pretrained(os.path.join(path, "HFWrapper_transformer_laplacian_model"))

# Run the HellaSwag and PIQA tasks through lm-evaluation-harness, one example per batch.
results_laplacian = simple_evaluate(
    model="hf",
    model_args= "pretrained="+os.path.join(path, "HFWrapper_transformer_laplacian_model"),
    tasks=["hellaswag", "piqa"],
    batch_size=1,
)


In [None]:
# Display the complete lm-eval output for the Laplacian transformer.
results_laplacian

In [None]:
# Display only the 'results' entry of the Laplacian output.
results_laplacian['results']

### PH Transformer

In [None]:
# Load the wrapped PH model and its tokenizer from the saved folder.
# NOTE: `transformer_ph_model` is not passed to simple_evaluate below, which
# receives the folder path through model_args instead; this also rebinds the
# global `tokenizer`.
transformer_ph_model = AutoModelForCausalLM.from_pretrained(os.path.join(path, "HFWrapper_transformer_ph_model"))
tokenizer = AutoTokenizer.from_pretrained(os.path.join(path, "HFWrapper_transformer_ph_model"))

# Run the HellaSwag and PIQA tasks through lm-evaluation-harness, one example per batch.
results_ph = simple_evaluate(
    model="hf",
    model_args= "pretrained="+os.path.join(path, "HFWrapper_transformer_ph_model"),
    tasks=["hellaswag", "piqa"],
    batch_size=1,
)

In [None]:
# Display the complete lm-eval output for the PH transformer.
results_ph

In [None]:
# Display only the 'results' entry of the PH output.
results_ph['results']