In [None]:
!pip install transformer_lens
try:
    import google.colab # type: ignore
    IN_COLAB = True
except:
    IN_COLAB = False

import os, sys

if IN_COLAB:
    # Install packages
    %pip install einops
    %pip install jaxtyping
    %pip install transformer_lens
    %pip install git+https://github.com/callummcdougall/CircuitsVis.git#subdirectory=python


In [None]:
!pip install --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [1]:
from transformers import GPT2Tokenizer
from torch.nn import CrossEntropyLoss
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from datasets import load_dataset


data_path = "/content/IMDB Dataset.csv"
# Load IMDB dataset (3000 train, 1000 test)
dataset = load_dataset('csv', data_files= data_path)
train_subset = dataset["train"].shuffle(seed=42).select(range(7000))
test_subset = dataset["train"].shuffle(seed=42).select(range(2000))
dataset = {"train": train_subset, "test": test_subset}

Generating train split: 0 examples [00:00, ? examples/s]

In [2]:
test_subset[:5]
# print how many positive and negative reviews are in the dataset
print("Number of positive reviews: ", sum(1 for sentiment in test_subset['sentiment'] if sentiment == 'positive'))
print("Number of negative reviews: ", sum(1 for sentiment in test_subset['sentiment'] if sentiment == 'negative'))

Number of positive reviews:  984
Number of negative reviews:  1016


In [36]:
from torch.utils.data import DataLoader, Dataset
label_map = {"negative": 0, "positive": 1}
class IMDBDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.texts = dataset["review"]
        self.labels = [label_map[label] for label in dataset["sentiment"]]
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encodings = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        input_ids = encodings["input_ids"].squeeze()
        attention_mask = encodings["attention_mask"].squeeze()
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return input_ids, attention_mask, label, idx

In [4]:
class SentimentClassifier(nn.Module):
    def __init__(self, transformer, hidden_dim=768, num_classes=2):
        super(SentimentClassifier, self).__init__()
        self.transformer = transformer
        self.classifier = nn.Linear(50257, num_classes)  # Maps hidden state → sentiment classes

    def forward(self, input_ids, attention_mask):
        outputs = self.transformer(input_ids)  # (batch, seq_len, hidden_dim)
        pooled_output = outputs.mean(dim=1)  # Mean pool across sequence length
        logits = self.classifier(pooled_output)  # Shape: (batch_size, 2)
        return logits

In [5]:
import torch
from transformer_lens import HookedTransformer
# load the base model and tokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Load HookedTransformer GPT-2 model
base_model = HookedTransformer.from_pretrained("gpt2-small", device=device)

# Create the sentiment classifier model
base_classifier = SentimentClassifier(base_model).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Loaded pretrained model gpt2-small into HookedTransformer


In [6]:
test_dataset = IMDBDataset(dataset["test"], tokenizer)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=True)

In [31]:
base_classifier.eval()
correct = 0
total = 0
# store the data points  where it predicts wrong
# make an empty dataset to store the wrong predictions
wrong_predictions = []
with torch.no_grad():
    for input_ids, attention_mask, labels, idx in test_loader:
        input_ids, attention_mask, labels, idx = input_ids.to(device), attention_mask.to(device), labels.to(device), idx.to(device)
        logits = base_classifier(input_ids, attention_mask)
        _, predicted = torch.max(logits, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        wrong_predictions.extend(idx[predicted != labels].tolist())

print(f"Accuracy: {correct / total:.2f}")
print(f"Number of wrong predictions: {total - correct}")
print("Correct predictions: ", correct)

Accuracy: 0.46
Number of wrong predictions: 1072
Correct predictions:  928


In [None]:
wrong_predictions

In [33]:
# convert the wrong predictions to a dataset
wrong_dataset = [test_dataset[i] for i in wrong_predictions]
wrong_loader = DataLoader(wrong_dataset, batch_size=4, shuffle=True)

In [None]:
for batch in wrong_loader:
    print(labels)
    break

tensor([0, 0, 1, 1], device='cuda:0')


In [34]:
loaded_model = SentimentClassifier(base_model).to(device)
loaded_model.load_state_dict(torch.load("/content/gpt2-small-imdb-finetuned.pt"))

<All keys matched successfully>

In [37]:
loaded_model.eval()
correct = 0
total = 0

with torch.no_grad():
    for input_ids, attention_mask, labels, idx in test_loader:
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        logits = loaded_model(input_ids, attention_mask)
        _, predicted = torch.max(logits, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"Accuracy: {correct / total:.2f}")
print("Correct: ", correct)
print("Total: ", total)

Accuracy: 0.74
Correct:  1484
Total:  2000


In [38]:
# do evaluation on the wrong dataset
correct = 0
total = 0
with torch.no_grad():
    for input_ids, attention_mask, labels, idx in wrong_loader:
        input_ids, attention_mask, labels, idx = input_ids.to(device), attention_mask.to(device), labels.to(device), idx.to(device)
        logits = loaded_model(input_ids, attention_mask)
        _, predicted = torch.max(logits, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"Accuracy: {correct / total:.2f}")
print("Correct: ", correct)
print("Total: ", total)

Accuracy: 0.62
Correct:  663
Total:  1072


In [39]:
gpt2_text = "Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br /><br />OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them."
# gpt2_tokens = base_classifier.transformer.to_tokens(gpt2_text)
# gpt2_logits, gpt2_cache = base_classifier.transformer.run_with_cache(gpt2_tokens, remove_batch_dim=True)

In [None]:
attn_patterns_layer_0 = gpt2_cache["pattern", 0]

In [None]:
gpt2_tokens.shape

torch.Size([1, 179])

In [None]:
attn_patterns_layer_0

tensor([[[1.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [9.3206e-01, 6.7942e-02, 0.0000e+00,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [4.8721e-01, 4.6577e-01, 4.7019e-02,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         ...,
         [5.8291e-03, 3.4683e-03, 1.2094e-03,  ..., 2.7702e-02,
          0.0000e+00, 0.0000e+00],
         [9.4315e-03, 3.8223e-03, 9.4298e-04,  ..., 3.7034e-02,
          9.2164e-03, 0.0000e+00],
         [8.1226e-03, 7.1643e-03, 6.2962e-04,  ..., 4.4826e-02,
          1.4836e-02, 2.3263e-03]],

        [[1.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [1.2923e-04, 9.9987e-01, 0.0000e+00,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [4.7447e-04, 1.4770e-02, 9.8476e-01,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         ...,
         [2.5892e-05, 1.1795e-04, 2.4392e-05,  ..., 9.9452e-01,
          0.000

In [None]:
q, k = gpt2_cache["q", 0], gpt2_cache["k", 0]
q.shape

torch.Size([179, 12, 64])

In [51]:
import torch as t
import einops

def get_top_k_attention(model,text,k=10):

    # tokenize the input text
    tokens = model.transformer.to_tokens(text)

    model_logits , model_cache = model.transformer.run_with_cache(tokens,remove_batch_dim = True)

    # we would analyse layer 1
    layer = 6
    attn_scores = model_cache["pattern", layer]  # Shape: (nhead, seq, seq)

    # Average over all attention heads to get a single matrix
    avg_attn_scores = attn_scores.mean(dim=0)  # Shape: (seq, seq)
    # Get top K attention scores for each token
    topk_values, topk_indices = t.topk(avg_attn_scores, k, dim=-1)

    return topk_indices, topk_values


k = 5
base_topk_indices, base_topk_values = get_top_k_attention(base_classifier, gpt2_text, k)
fine_topk_indices, fine_topk_values = get_top_k_attention(loaded_model, gpt2_text, k)

In [None]:
base_topk_values

In [None]:
fine_topk_values

In [52]:

def get_jaccard_scores(base_topk_indices, fine_topk_indices):
        seq_len = base_topk_indices.shape[0]
        jaccard_scores = []

        for i in range(seq_len):
            base_topk_set = set(base_topk_indices[i].tolist())
            fine_topk_set = set(fine_topk_indices[i].tolist())

            intersection = len(base_topk_set & fine_topk_set)  # Common elements
            union = len(base_topk_set | fine_topk_set)  # Total unique elements

            jaccard_coeff = intersection / union if union > 0 else 0  # Avoid division by zero
            jaccard_scores.append(jaccard_coeff)

        # Compute average Jaccard coefficient for the text
        average_jaccard = sum(jaccard_scores) / len(jaccard_scores)

        # Print results
        for i, (token, jaccard) in enumerate(zip(gpt2_text.split(), jaccard_scores)):
            print(f"Token: {token} | Jaccard Coefficient: {jaccard:.4f}")

        print(f"\nAverage Jaccard Coefficient for the text: {average_jaccard:.4f}")

In [53]:
get_jaccard_scores(base_topk_indices,fine_topk_indices)

Token: Basically | Jaccard Coefficient: 1.0000
Token: there's | Jaccard Coefficient: 1.0000
Token: a | Jaccard Coefficient: 1.0000
Token: family | Jaccard Coefficient: 1.0000
Token: where | Jaccard Coefficient: 1.0000
Token: a | Jaccard Coefficient: 1.0000
Token: little | Jaccard Coefficient: 1.0000
Token: boy | Jaccard Coefficient: 1.0000
Token: (Jake) | Jaccard Coefficient: 1.0000
Token: thinks | Jaccard Coefficient: 1.0000
Token: there's | Jaccard Coefficient: 1.0000
Token: a | Jaccard Coefficient: 1.0000
Token: zombie | Jaccard Coefficient: 1.0000
Token: in | Jaccard Coefficient: 1.0000
Token: his | Jaccard Coefficient: 1.0000
Token: closet | Jaccard Coefficient: 1.0000
Token: & | Jaccard Coefficient: 1.0000
Token: his | Jaccard Coefficient: 1.0000
Token: parents | Jaccard Coefficient: 1.0000
Token: are | Jaccard Coefficient: 1.0000
Token: fighting | Jaccard Coefficient: 1.0000
Token: all | Jaccard Coefficient: 1.0000
Token: the | Jaccard Coefficient: 1.0000
Token: time.<br | Jacca