#### The code for this experiment is adapted from [`hellaswag.py` in the karpathy/build-nanogpt repository](https://github.com/karpathy/build-nanogpt/blob/master/hellaswag.py)

In [1]:
import os
import json
import requests
import tiktoken
from tqdm import tqdm
import torch
from torch.nn import functional as F
from transformers import GPT2LMHeadModel

# GPT-2 BPE tokenizer from tiktoken; render_example uses it to encode text.
enc = tiktoken.get_encoding("gpt2")
# Directory (relative to the working directory) where download() stores the
# HellaSwag JSONL files.
DATA_CACHE_DIR = "hellaswag"

def download_file(url: str, fname: str, chunk_size=1024):
    """Stream ``url`` into the local file ``fname``, showing a tqdm progress bar.

    The body is first written to ``fname + ".part"`` and only moved to
    ``fname`` once the transfer has finished, so an interrupted download
    never leaves a truncated file at the final path.

    Args:
        url: Address to fetch with an HTTP GET.
        fname: Destination path of the downloaded file.
        chunk_size: Number of bytes requested per streamed chunk.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    partial_name = fname + ".part"
    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True, timeout=30) as resp:
        # Fail loudly instead of saving an HTTP error page as the data file.
        resp.raise_for_status()
        # A missing Content-Length yields 0, i.e. a bar without a known total.
        total = int(resp.headers.get("content-length", 0))
        with open(partial_name, "wb") as file, tqdm(
            desc=fname,
            total=total,
            unit="iB",
            unit_scale=True,
            unit_divisor=1024,
        ) as bar:
            for data in resp.iter_content(chunk_size=chunk_size):
                size = file.write(data)
                bar.update(size)
    # Publish the file under its final name only after a complete transfer.
    os.replace(partial_name, fname)

def download(split):
    """Make sure the HellaSwag JSONL file for ``split`` is in DATA_CACHE_DIR.

    Creates the cache directory if needed and downloads the file only when
    no file with the expected name is present yet.
    """
    os.makedirs(DATA_CACHE_DIR, exist_ok=True)
    target_path = os.path.join(DATA_CACHE_DIR, f"hellaswag_{split}.jsonl")
    if os.path.exists(target_path):
        # Already cached; nothing to fetch.
        return
    source_url = f"https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_{split}.jsonl"
    download_file(source_url, target_path)

def render_example(example):
    """Tokenize one HellaSwag example into padded candidate rows.

    Each candidate row is the encoded context followed by the encoded
    ending (prefixed with a space). Rows are right-padded with 0 to the
    length of the longest candidate.

    Args:
        example: Mapping with the keys ``"ctx"`` (context string),
            ``"label"`` and ``"endings"`` (sequence of ending strings).

    Returns:
        A ``(tokens, mask, label)`` tuple where ``tokens`` and ``mask`` are
        ``torch.long`` tensors of shape ``(len(endings), max_len)``.
        ``mask`` is 1 on ending-token positions and 0 on context and
        padding positions. ``label`` is ``example["label"]`` unchanged.
    """
    ctx = example["ctx"]
    label = example["label"]
    endings = example["endings"]

    ctx_tokens = enc.encode(ctx)
    tok_rows = []
    mask_rows = []
    for end in endings:
        # The leading space makes the ending tokenize as a continuation
        # of the context rather than as the start of a new text.
        end_tokens = enc.encode(" " + end)
        tok_rows.append(ctx_tokens + end_tokens)
        mask_rows.append([0] * len(ctx_tokens) + [1] * len(end_tokens))

    # Size the batch from the data instead of assuming exactly four endings.
    num_rows = len(tok_rows)
    max_len = max(len(row) for row in tok_rows)
    tokens = torch.zeros((num_rows, max_len), dtype=torch.long)
    mask = torch.zeros((num_rows, max_len), dtype=torch.long)
    for i, (tok_row, mask_row) in enumerate(zip(tok_rows, mask_rows)):
        tokens[i, :len(tok_row)] = torch.tensor(tok_row)
        mask[i, :len(mask_row)] = torch.tensor(mask_row)

    return tokens, mask, label

def iterate_examples(split):
    """Yield the examples of a HellaSwag split, one parsed JSON object per line.

    The split file is downloaded into DATA_CACHE_DIR first if it is not
    already there.

    Args:
        split: Split name used in the file name ``hellaswag_{split}.jsonl``.

    Yields:
        The object decoded from each non-blank line of the JSONL file.
    """
    download(split)
    path = os.path.join(DATA_CACHE_DIR, f"hellaswag_{split}.jsonl")
    # Decode explicitly as UTF-8 so parsing does not depend on the
    # platform's default text encoding.
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which json.loads would otherwise reject.
            if not line.strip():
                continue
            yield json.loads(line)

@torch.no_grad()
def evaluate(device):
    """Evaluate pretrained GPT-2 ("gpt2") on the HellaSwag "val" split.

    For every example, each candidate ending is scored by the average
    per-token cross-entropy over its ending tokens only; the candidate with
    the lowest average loss is the prediction. The resulting accuracy is
    printed as ``acc_norm``.

    Args:
        device: Torch device (e.g. ``"cuda"`` or ``"cpu"``) on which the
            model and the inputs are placed.
    """
    model = GPT2LMHeadModel.from_pretrained("gpt2")
    model.to(device)
    # Make inference mode explicit so dropout can never affect the scores.
    model.eval()

    num_correct_norm = 0
    num_total = 0

    for example in tqdm(iterate_examples("val"), desc="Evaluating"):
        tokens, mask, label = render_example(example)
        tokens = tokens.to(device)
        mask = mask.to(device)

        logits = model(tokens).logits
        # Position t predicts token t+1: drop the last logit and the first
        # token so predictions and targets line up.
        shift_logits = logits[..., :-1, :].contiguous()
        shift_tokens = tokens[..., 1:].contiguous()
        flat_shift_logits = shift_logits.view(-1, shift_logits.size(-1))
        flat_shift_tokens = shift_tokens.view(-1)
        shift_losses = F.cross_entropy(flat_shift_logits, flat_shift_tokens, reduction='none')
        shift_losses = shift_losses.view(tokens.size(0), -1)

        # Shift the mask the same way so only ending tokens contribute.
        shift_mask = mask[..., 1:].contiguous()
        masked_shift_losses = shift_losses * shift_mask
        sum_loss = masked_shift_losses.sum(dim=1)
        # Normalize by ending length so longer endings are not penalized.
        avg_loss = sum_loss / shift_mask.sum(dim=1)

        pred_norm = avg_loss.argmin().item()
        num_total += 1
        # Compare as integers so a string-encoded label cannot silently
        # count every prediction as wrong.
        num_correct_norm += int(pred_norm == int(label))

    if num_total == 0:
        # Avoid dividing by zero when the split yielded no examples.
        print("acc_norm: 0/0 (no examples evaluated)")
        return

    print(f"acc_norm: {num_correct_norm}/{num_total}={num_correct_norm/num_total:.4f}")

# Use the GPU when one is available; otherwise fall back to the CPU so the
# evaluation still runs (more slowly) instead of failing.
evaluate("cuda" if torch.cuda.is_available() else "cpu")

Evaluating: 10042it [02:47, 59.91it/s]

acc_norm: 2967/10042=0.2955





**GPT-2 (small) HellaSwag validation accuracy (acc_norm) is 0.2955**