In [1]:
!pip install torch



In [3]:
!pip install torch

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [2]:
!pip install datasets transformers



In [3]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import torch
import torch.nn as nn

# Define the model classes (as provided earlier)
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)

        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(torch.sqrt(torch.tensor(2.0 / torch.pi)) * (x + 0.044715 * torch.pow(x, 3))))

class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg['emb_dim'], 4 * cfg['emb_dim']),
            GELU(),
            nn.Linear(4 * cfg['emb_dim'], cfg['emb_dim']),
            nn.Dropout(cfg['drop_rate'])
        )

    def forward(self, x):
        return self.layers(x)

class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, block_size, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)

        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer(
            'mask',
            torch.triu(torch.ones(block_size, block_size), diagonal=1)
        )

    def forward(self, x):
        b, num_tokens, d_in = x.shape

        queries = self.W_query(x)
        keys = self.W_key(x)
        values = self.W_value(x)

        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)

        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        attn_scores = queries @ keys.transpose(2, 3)

        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
        mask_unsqueezed = mask_bool.unsqueeze(0).unsqueeze(0)
        attn_scores.masked_fill_(mask_unsqueezed, -torch.inf)

        attn_weights = torch.softmax(attn_scores / keys.shape[-1] ** 0.5, dim=-1)

        attn_weights = self.dropout(attn_weights)

        context_vec = (attn_weights @ values).transpose(1, 2)
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec)

        return context_vec

class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()

        self.att = MultiHeadAttention(
            d_in=cfg['emb_dim'],
            d_out=cfg['emb_dim'],
            block_size=cfg['ctx_len'],
            num_heads=cfg['n_heads'],
            dropout=cfg['drop_rate'],
            qkv_bias=cfg['qkv_bias'],
        )

        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg['emb_dim'])
        self.norm2 = LayerNorm(cfg['emb_dim'])
        self.drop_resid = nn.Dropout(cfg['drop_rate'])

    def forward(self, x):
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)
        x = self.drop_resid(x)
        x = x + shortcut # Add the original input back

        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_resid(x)
        x = x + shortcut

        return x

class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()

        self.tok_emb = nn.Embedding(cfg['vocab_size'], cfg['emb_dim'])
        self.pos_emb = nn.Embedding(cfg['ctx_len'], cfg['emb_dim'])
        self.drop_emb = nn.Dropout(cfg['drop_rate'])

        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg['n_layers'])]
        )                                                                                   #A

        self.final_norm = LayerNorm(cfg['emb_dim'])                                        #B
        self.out_head = nn.Linear(cfg['emb_dim'], cfg['vocab_size'], bias=False)

    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds

        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)

        return logits

# Define the configuration
cfg = {
    'emb_dim': 768,
    'ctx_len': 1024,  # Updated to match the saved model
    'n_heads': 12,
    'drop_rate': 0.1,
    'qkv_bias': False,
    'vocab_size': 50257,
    'n_layers': 12,
}

# Instantiate the model
model = GPTModel(cfg)

# Path to the model file in Google Drive
file_path = '/content/drive/MyDrive/model_and_optimizer (1).pth'

# Load the model state
try:
    model_state = torch.load(file_path)
    model.load_state_dict(model_state['model_state_dict'])
    print("Model loaded successfully.")
except Exception as e:
    print(f"Error loading model: {e}")

# Test the model with dummy data
dummy_input = torch.randint(0, cfg['vocab_size'], (1, cfg['ctx_len']))
output = model(dummy_input)
print(output.shape)  # Should print torch.Size([1, ctx_len, vocab_size])


Model loaded successfully.
torch.Size([1, 1024, 50257])


In [5]:
model.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
          (3): Dropout(p=0.1, inplace=False)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_resid): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAtten

In [6]:
from datasets import load_dataset
import torch
import torch.nn as nn
from transformers import GPT2Tokenizer

# Load the ARC-Easy dataset from Hugging Face
dataset = load_dataset("allenai/ai2_arc", "ARC-Easy")
print("Dataset loaded successfully.")

# Initialize the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/9.00k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/331k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/346k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/86.1k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2251 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2376 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/570 [00:00<?, ? examples/s]

Dataset loaded successfully.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [7]:
def evaluate_arc_challenge(model, dataset):
    arc_score = 0
    total = len(dataset['test'])
    for example in dataset['test']:
        question = example['question']
        options = example['choices']['text']
        answer = example['choices']['label'].index(example['answerKey'])

        inputs = tokenizer(question, return_tensors="pt")
        with torch.no_grad():
            logits = model(inputs['input_ids'])
        prediction = logits.argmax(dim=-1).item()

        if prediction == answer:
            arc_score += 1

    return arc_score / total

# Run evaluation
arc_score = evaluate_arc_challenge(model, dataset)
print(f"ARC Challenge Score: {arc_score}")


RuntimeError: a Tensor with 15 elements cannot be converted to Scalar

In [8]:
# Load the dataset
import pandas as pd
data_file_path = '/content/53739ced-07c1-4223-8496-742745cd4d73-processed_data_file.parquet'
dataset = pd.read_parquet(data_file_path)
print("Dataset loaded successfully.")
print(dataset.head())


Dataset loaded successfully.
                                      extracted_text
0  than if I'd never touched a brush." And his to...
1  to hide her nervousness; and I followed her be...
2  But she couldn't bear not to have all the draw...
3  to have my hand on such a 'subject.' Then his ...
4  glory of my painting him! Of course I meant to...


In [11]:
!pip install datasets transformers


Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m40.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl 

In [10]:
# Define the Benchmark Evaluation Functions
def evaluate_arc_challenge(model, dataset):
    # Implement the ARC Challenge evaluation logic here
    # This is a placeholder function
    # Example: Assuming dataset contains 'questions' and 'answers' columns for ARC Challenge
    arc_score = 0
    for index, row in dataset.iterrows():
        question = row['question']
        answer = row['answer']
        # Generate prediction using the model
        inputs = torch.tensor([question])  # Convert to appropriate tensor format
        logits = model(inputs)
        prediction = logits.argmax(dim=-1).item()
        if prediction == answer:
            arc_score += 1
    return arc_score / len(dataset)

def evaluate_mmlu(model, dataset):
    # Implement the MMLU evaluation logic here
    # This is a placeholder function
    # Example: Assuming dataset contains 'questions' and 'answers' columns for MMLU
    mmlu_score = 0
    for index, row in dataset.iterrows():
        question = row['question']
        answer = row['answer']
        # Generate prediction using the model
        inputs = torch.tensor([question])  # Convert to appropriate tensor format
        logits = model(inputs)
        prediction = logits.argmax(dim=-1).item()
        if prediction == answer:
            mmlu_score += 1
    return mmlu_score / len(dataset)

def evaluate_gsm8k(model, dataset):
    # Implement the GSM8K evaluation logic here
    # This is a placeholder function
    # Example: Assuming dataset contains 'questions' and 'answers' columns for GSM8K
    gsm8k_score = 0
    for index, row in dataset.iterrows():
        question = row['question']
        answer = row['answer']
        # Generate prediction using the model
        inputs = torch.tensor([question])  # Convert to appropriate tensor format
        logits = model(inputs)
        prediction = logits.argmax(dim=-1).item()
        if prediction == answer:
            gsm8k_score += 1
    return gsm8k_score / len(dataset)

def evaluate_humaneval(model, dataset):
    # Implement the HumanEval evaluation logic here
    # This is a placeholder function
    # Example: Assuming dataset contains 'questions' and 'answers' columns for HumanEval
    humaneval_score = 0
    for index, row in dataset.iterrows():
        question = row['question']
        answer = row['answer']
        # Generate prediction using the model
        inputs = torch.tensor([question])  # Convert to appropriate tensor format
        logits = model(inputs)
        prediction = logits.argmax(dim=-1).item()
        if prediction == answer:
            humaneval_score += 1
    return humaneval_score / len(dataset)

# Run evaluations
arc_score = evaluate_arc_challenge(model, dataset)
mmlu_score = evaluate_mmlu(model, dataset)
gsm8k_score = evaluate_gsm8k(model, dataset)
humaneval_score = evaluate_humaneval(model, dataset)

# Print scores
print(f"ARC Challenge Score: {arc_score}")
print(f"MMLU Score: {mmlu_score}")
print(f"GSM8K Score: {gsm8k_score}")
print(f"HumanEval Score: {humaneval_score}")

KeyError: 'question'

In [8]:
# Step 1: Install the Necessary Libraries
!pip install datasets transformers

# Step 2: Import Libraries and Load the Dataset
from datasets import load_dataset
import torch
import torch.nn as nn
from transformers import GPT2Tokenizer

# Load the ARC-Easy dataset from Hugging Face
dataset = load_dataset("allenai/ai2_arc", "ARC-Easy")
print("Dataset loaded successfully.")

# Initialize the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Step 3: Define the Model Classes and Load the Model
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)

        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(torch.sqrt(torch.tensor(2.0 / torch.pi)) * (x + 0.044715 * torch.pow(x, 3))))

class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg['emb_dim'], 4 * cfg['emb_dim']),
            GELU(),
            nn.Linear(4 * cfg['emb_dim'], cfg['emb_dim']),
            nn.Dropout(cfg['drop_rate'])
        )

    def forward(self, x):
        return self.layers(x)

class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, block_size, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)

        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer(
            'mask',
            torch.triu(torch.ones(block_size, block_size), diagonal=1)
        )

    def forward(self, x):
        b, num_tokens, d_in = x.shape

        queries = self.W_query(x)
        keys = self.W_key(x)
        values = self.W_value(x)

        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)

        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        attn_scores = queries @ keys.transpose(2, 3)

        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
        mask_unsqueezed = mask_bool.unsqueeze(0).unsqueeze(0)
        attn_scores.masked_fill_(mask_unsqueezed, -torch.inf)

        attn_weights = torch.softmax(attn_scores / keys.shape[-1] ** 0.5, dim=-1)

        attn_weights = self.dropout(attn_weights)

        context_vec = (attn_weights @ values).transpose(1, 2)
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec)

        return context_vec

class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()

        self.att = MultiHeadAttention(
            d_in=cfg['emb_dim'],
            d_out=cfg['emb_dim'],
            block_size=cfg['ctx_len'],
            num_heads=cfg['n_heads'],
            dropout=cfg['drop_rate'],
            qkv_bias=cfg['qkv_bias'],
        )

        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg['emb_dim'])
        self.norm2 = LayerNorm(cfg['emb_dim'])
        self.drop_resid = nn.Dropout(cfg['drop_rate'])

    def forward(self, x):
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)
        x = self.drop_resid(x)
        x = x + shortcut

        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_resid(x)
        x = x + shortcut

        return x

class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()

        self.tok_emb = nn.Embedding(cfg['vocab_size'], cfg['emb_dim'])
        self.pos_emb = nn.Embedding(cfg['ctx_len'], cfg['emb_dim'])
        self.drop_emb = nn.Dropout(cfg['drop_rate'])

        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg['n_layers'])]
        )

        self.final_norm = LayerNorm(cfg['emb_dim'])
        self.out_head = nn.Linear(cfg['emb_dim'], cfg['vocab_size'], bias=False)

    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds

        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)

        return logits

# Define the configuration
cfg = {
    'emb_dim': 768,
    'ctx_len': 1024,
    'n_heads': 12,
    'drop_rate': 0.1,
    'qkv_bias': False,
    'vocab_size': 50257,
    'n_layers': 12,
}

# Instantiate the model
model = GPTModel(cfg)

# Path to the model file in Google Drive
file_path = '/content/drive/MyDrive/model_and_optimizer (1).pth'

# Load the model state
try:
    model_state = torch.load(file_path)
    model.load_state_dict(model_state['model_state_dict'])
    print("Model loaded successfully.")
except Exception as e:
    print(f"Error loading model: {e}")

# Step 4: Define the Benchmark Evaluation Function for ARC Challenge
def evaluate_arc_challenge(model, dataset):
    arc_score = 0
    total = len(dataset['test'])
    for example in dataset['test']:
        question = example['question']
        options = example['choices']['text']
        answer = example['choices']['label'].index(example['answerKey'])

        # Create inputs for each option and compute logits
        option_logits = []
        for option in options:
            input_text = question + " " + option
            inputs = tokenizer(input_text, return_tensors="pt")
            with torch.no_grad():
                logits = model(inputs['input_ids'])
            option_logits.append(logits[:, -1, :].mean().item())

        prediction = torch.tensor(option_logits).argmax().item()

        if prediction == answer:
            arc_score += 1

    return arc_score / total

# Run evaluation
arc_score = evaluate_arc_challenge(model, dataset)
print(f"ARC Challenge Score: {arc_score}")


Dataset loaded successfully.
Model loaded successfully.
ARC Challenge Score: 0.2478956228956229
