In [None]:
# Cell 1: Install Necessary Libraries and Import Modules
!pip install datasets transformers

from datasets import load_dataset
import torch
import torch.nn as nn
from transformers import GPT2Tokenizer
from torch.utils.data import DataLoader, Dataset

print("Libraries installed and modules imported.")




Libraries installed and modules imported.


In [None]:
# Cell 2: Load Datasets and Print Data Structure
# Load ARC-Easy dataset
arc_dataset = load_dataset("allenai/ai2_arc", "ARC-Easy")
print("ARC-Easy dataset structure:")
print(arc_dataset)

# Load MMLU dataset
mmlu_dataset = load_dataset("cais/mmlu", "abstract_algebra")
print("MMLU dataset structure:")
print(mmlu_dataset)

# Load GSM8K dataset
gsm8k_dataset = load_dataset("openai/gsm8k", "main")
print("GSM8K dataset structure:")
print(gsm8k_dataset)

# Load OpenAI HumanEval dataset
humaneval_dataset = load_dataset("openai/openai_humaneval")
print("OpenAI HumanEval dataset structure:")
print(humaneval_dataset)

print("Datasets loaded successfully.")


ARC-Easy dataset structure:
DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'choices', 'answerKey'],
        num_rows: 2251
    })
    test: Dataset({
        features: ['id', 'question', 'choices', 'answerKey'],
        num_rows: 2376
    })
    validation: Dataset({
        features: ['id', 'question', 'choices', 'answerKey'],
        num_rows: 570
    })
})
MMLU dataset structure:
DatasetDict({
    test: Dataset({
        features: ['question', 'subject', 'choices', 'answer'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['question', 'subject', 'choices', 'answer'],
        num_rows: 11
    })
    dev: Dataset({
        features: ['question', 'subject', 'choices', 'answer'],
        num_rows: 5
    })
})
GSM8K dataset structure:
DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 7473
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 1319
    })
})
Open

In [None]:
# Cell 3: Define Model Classes, Connect to Google Drive, and Load Model
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Cell 3: Define Model Classes, Connect to Google Drive, and Load Model
from google.colab import drive
drive.mount('/content/drive')

class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)

        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(torch.sqrt(torch.tensor(2.0 / torch.pi)) * (x + 0.044715 * torch.pow(x, 3))))

class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg['emb_dim'], 4 * cfg['emb_dim']),
            GELU(),
            nn.Linear(4 * cfg['emb_dim'], cfg['emb_dim']),
            nn.Dropout(cfg['drop_rate'])
        )

    def forward(self, x):
        return self.layers(x)

class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, block_size, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)

        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer(
            'mask',
            torch.triu(torch.ones(block_size, block_size), diagonal=1)
        )

    def forward(self, x):
        b, num_tokens, d_in = x.shape

        queries = self.W_query(x)
        keys = self.W_key(x)
        values = self.W_value(x)

        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)

        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        attn_scores = queries @ keys.transpose(2, 3)

        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
        mask_unsqueezed = mask_bool.unsqueeze(0).unsqueeze(0)
        attn_scores.masked_fill_(mask_unsqueezed, -torch.inf)

        attn_weights = torch.softmax(attn_scores / keys.shape[-1] ** 0.5, dim=-1)

        attn_weights = self.dropout(attn_weights)

        context_vec = (attn_weights @ values).transpose(1, 2)
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec)

        return context_vec

class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()

        self.att = MultiHeadAttention(
            d_in=cfg['emb_dim'],
            d_out=cfg['emb_dim'],
            block_size=cfg['ctx_len'],
            num_heads=cfg['n_heads'],
            dropout=cfg['drop_rate'],
            qkv_bias=cfg['qkv_bias'],
        )

        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg['emb_dim'])
        self.norm2 = LayerNorm(cfg['emb_dim'])
        self.drop_resid = nn.Dropout(cfg['drop_rate'])

    def forward(self, x):
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)
        x = self.drop_resid(x)
        x = x + shortcut

        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_resid(x)
        x = x + shortcut

        return x

class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()

        self.tok_emb = nn.Embedding(cfg['vocab_size'], cfg['emb_dim'])
        self.pos_emb = nn.Embedding(cfg['ctx_len'], cfg['emb_dim'])
        self.drop_emb = nn.Dropout(cfg['drop_rate'])

        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg['n_layers'])]
        )

        self.final_norm = LayerNorm(cfg['emb_dim'])
        self.out_head = nn.Linear(cfg['emb_dim'], cfg['vocab_size'], bias=False)

    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds

        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)

        return logits

# Define the configuration
cfg = {
    'emb_dim': 768,
    'ctx_len': 1024,
    'n_heads': 12,
    'drop_rate': 0.1,
    'qkv_bias': False,
    'vocab_size': 50257,
    'n_layers': 12,
}

# Instantiate the model
model = GPTModel(cfg)

# Path to the model file
file_path = '/content/drive/MyDrive/model_and_optimizer (1).pth'

# Load the model state
try:
    model_state = torch.load(file_path)
    model.load_state_dict(model_state['model_state_dict'])
    print("Model loaded successfully.")
except Exception as e:
    print(f"Error loading model: {e}")

# Initialize the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Set padding token


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Model loaded successfully.


In [None]:
# Cell 4: Define Custom Dataset Classes and Evaluation Functions

# Custom dataset class for ARC
class ARCDataset(Dataset):
    def __init__(self, dataset, tokenizer):
        self.dataset = dataset
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        question = item['question']
        choices = item['choices']['text']
        answer_key = item['answerKey']

        if ord(answer_key) - ord('A') >= len(choices):
            print(f"Error: answerKey {answer_key} out of range for choices {choices}")
            # Use a default label text or skip this entry
            label_text = choices[0] if choices else ""
        else:
            label_text = choices[ord(answer_key) - ord('A')]  # Convert 'A', 'B', 'C', 'D' to index

        input_text = question + " " + " ".join(choices)
        inputs = self.tokenizer(input_text, return_tensors="pt", truncation=True, padding="max_length", max_length=1024)
        labels = self.tokenizer(label_text, return_tensors="pt", truncation=True, padding="max_length", max_length=1024)
        return inputs['input_ids'].squeeze(0), labels['input_ids'].squeeze(0)

# Custom dataset class for MMLU
class MMLUDataset(Dataset):
    def __init__(self, dataset, tokenizer):
        self.dataset = dataset
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        question = item['question']
        choices = item['choices']
        answer = item['answer']
        input_text = question + " " + " ".join(choices)
        inputs = self.tokenizer(input_text, return_tensors="pt", truncation=True, padding="max_length", max_length=1024)
        labels = self.tokenizer(choices[answer], return_tensors="pt", truncation=True, padding="max_length", max_length=1024)
        return inputs['input_ids'].squeeze(0), labels['input_ids'].squeeze(0)

# Custom dataset class for GSM8K
class GSM8KDataset(Dataset):
    def __init__(self, dataset, tokenizer):
        self.dataset = dataset
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        inputs = self.tokenizer(item['question'], return_tensors="pt", truncation=True, padding="max_length", max_length=1024)
        labels = self.tokenizer(item['answer'], return_tensors="pt", truncation=True, padding="max_length", max_length=1024)
        return inputs['input_ids'].squeeze(0), labels['input_ids'].squeeze(0)

# Custom dataset class for OpenAI HumanEval
class HumanEvalDataset(Dataset):
    def __init__(self, dataset, tokenizer):
        self.dataset = dataset
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        inputs = self.tokenizer(item['prompt'], return_tensors="pt", truncation=True, padding="max_length", max_length=1024)
        labels = self.tokenizer(item['canonical_solution'], return_tensors="pt", truncation=True, padding="max_length", max_length=1024)
        return inputs['input_ids'].squeeze(0), labels['input_ids'].squeeze(0)

# Define the evaluation function
def evaluate_model(model, dataloader, criterion):
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in dataloader:
            if batch is None:
                continue
            inputs, labels = batch
            inputs = inputs.to(model.device)
            labels = labels.to(model.device)
            outputs = model(inputs)
            loss = criterion(outputs.view(-1, outputs.size(-1)), labels.view(-1))
            total_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    avg_loss = total_loss / len(dataloader)
    accuracy = correct / total
    return avg_loss, accuracy

# Define the loss criterion
criterion = nn.CrossEntropyLoss()


In [None]:
# Cell 5: Run Evaluations and Print Results

# Prepare datasets
arc_dataloader = DataLoader(ARCDataset(arc_dataset['test'], tokenizer), batch_size=32, shuffle=False)
mmlu_dataloader = DataLoader(MMLUDataset(mmlu_dataset['test'], tokenizer), batch_size=32, shuffle=False)
gsm8k_dataloader = DataLoader(GSM8KDataset(gsm8k_dataset['test'], tokenizer), batch_size=32, shuffle=False)
humaneval_dataloader = DataLoader(HumanEvalDataset(humaneval_dataset['test'], tokenizer), batch_size=32, shuffle=False)

# Run evaluations
arc_avg_loss, arc_accuracy = evaluate_model(model, arc_dataloader, criterion)
print(f'ARC Challenge - Loss: {arc_avg_loss:.4f}, Accuracy: {arc_accuracy:.4f}')

mmlu_avg_loss, mmlu_accuracy = evaluate_model(model, mmlu_dataloader, criterion)
print(f'MMLU Benchmark - Loss: {mmlu_avg_loss:.4f}, Accuracy: {mmlu_accuracy:.4f}')

gsm8k_avg_loss, gsm8k_accuracy = evaluate_model(model, gsm8k_dataloader, criterion)
print(f'GSM8K Benchmark - Loss: {gsm8k_avg_loss:.4f}, Accuracy: {gsm8k_accuracy:.4f}')

humaneval_avg_loss, humaneval_accuracy = evaluate_model(model, humaneval_dataloader, criterion)
print(f'OpenAI HumanEval Benchmark - Loss: {humaneval_avg_loss:.4f}, Accuracy: {humaneval_accuracy:.4f}')


IndexError: list index out of range

In [1]:
# Step 1: Install the Necessary Libraries
!pip install datasets transformers

# Step 2: Import Libraries and Load the Dataset
from datasets import load_dataset
import torch
import torch.nn as nn
from transformers import GPT2Tokenizer

# Load the ARC-Easy dataset from Hugging Face
dataset = load_dataset("allenai/ai2_arc", "ARC-Easy")
print("Dataset loaded successfully.")

# Initialize the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Step 3: Define the Model Classes and Load the Model
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)

        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(torch.sqrt(torch.tensor(2.0 / torch.pi)) * (x + 0.044715 * torch.pow(x, 3))))

class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg['emb_dim'], 4 * cfg['emb_dim']),
            GELU(),
            nn.Linear(4 * cfg['emb_dim'], cfg['emb_dim']),
            nn.Dropout(cfg['drop_rate'])
        )

    def forward(self, x):
        return self.layers(x)

class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, block_size, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)

        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer(
            'mask',
            torch.triu(torch.ones(block_size, block_size), diagonal=1)
        )

    def forward(self, x):
        b, num_tokens, d_in = x.shape

        queries = self.W_query(x)
        keys = self.W_key(x)
        values = self.W_value(x)

        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)

        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        attn_scores = queries @ keys.transpose(2, 3)

        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
        mask_unsqueezed = mask_bool.unsqueeze(0).unsqueeze(0)
        attn_scores.masked_fill_(mask_unsqueezed, -torch.inf)

        attn_weights = torch.softmax(attn_scores / keys.shape[-1] ** 0.5, dim=-1)

        attn_weights = self.dropout(attn_weights)

        context_vec = (attn_weights @ values).transpose(1, 2)
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec)

        return context_vec

class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()

        self.att = MultiHeadAttention(
            d_in=cfg['emb_dim'],
            d_out=cfg['emb_dim'],
            block_size=cfg['ctx_len'],
            num_heads=cfg['n_heads'],
            dropout=cfg['drop_rate'],
            qkv_bias=cfg['qkv_bias'],
        )

        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg['emb_dim'])
        self.norm2 = LayerNorm(cfg['emb_dim'])
        self.drop_resid = nn.Dropout(cfg['drop_rate'])

    def forward(self, x):
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)
        x = self.drop_resid(x)
        x = x + shortcut

        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_resid(x)
        x = x + shortcut

        return x

class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()

        self.tok_emb = nn.Embedding(cfg['vocab_size'], cfg['emb_dim'])
        self.pos_emb = nn.Embedding(cfg['ctx_len'], cfg['emb_dim'])
        self.drop_emb = nn.Dropout(cfg['drop_rate'])

        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg['n_layers'])]
        )

        self.final_norm = LayerNorm(cfg['emb_dim'])
        self.out_head = nn.Linear(cfg['emb_dim'], cfg['vocab_size'], bias=False)

    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds

        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)

        return logits

# Define the configuration
cfg = {
    'emb_dim': 768,
    'ctx_len': 1024,
    'n_heads': 12,
    'drop_rate': 0.1,
    'qkv_bias': False,
    'vocab_size': 50257,
    'n_layers': 12,
}

# Instantiate the model
model = GPTModel(cfg)

# Path to the model file in Google Drive
file_path = '/content/drive/MyDrive/model_and_optimizer (1).pth'

# Load the model state
try:
    model_state = torch.load(file_path)
    model.load_state_dict(model_state['model_state_dict'])
    print("Model loaded successfully.")
except Exception as e:
    print(f"Error loading model: {e}")

# Step 4: Define the Benchmark Evaluation Function for ARC Challenge
def evaluate_arc_challenge(model, dataset):
    arc_score = 0
    total = len(dataset['test'])
    for example in dataset['test']:
        question = example['question']
        options = example['choices']['text']
        answer = example['choices']['label'].index(example['answerKey'])

        # Create inputs for each option and compute logits
        option_logits = []
        for option in options:
            input_text = question + " " + option
            inputs = tokenizer(input_text, return_tensors="pt")
            with torch.no_grad():
                logits = model(inputs['input_ids'])
            option_logits.append(logits[:, -1, :].mean().item())

        prediction = torch.tensor(option_logits).argmax().item()

        if prediction == answer:
            arc_score += 1

    return arc_score / total

# Run evaluation
arc_score = evaluate_arc_challenge(model, dataset)
print(f"ARC Challenge Score: {arc_score}")


Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/547.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m204.8/547.8 kB[0m [31m6.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m542.7/547.8 kB[0m [31m8.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m41.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/9.00k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/331k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/346k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/86.1k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2251 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2376 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/570 [00:00<?, ? examples/s]

Dataset loaded successfully.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Error loading model: [Errno 2] No such file or directory: '/content/drive/MyDrive/model_and_optimizer (1).pth'
ARC Challenge Score: 0.24705387205387205
