In [1]:
!pip install -q transformers huggingface_hub accelerate bitsandbytes pandas sentencepiece


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m68.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m36.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m47.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from google.colab import drive
drive.mount('/content/drive')

import os
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
os.environ['HF_HOME'] = '/content/drive/MyDrive/hf_cache'
os.environ['HUGGINGFACE_HUB_CACHE'] = '/content/drive/MyDrive/huggingface_cache'

!rm -rf /content/sample_data  # free space
!df -h | grep drive


Mounted at /content/drive
drive           113G   45G   68G  40% /content/drive


In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
import torch
import gc

# bitsandbytes quantization settings. Only used by load_model_safely when it is
# called with quantize=True; otherwise models are loaded unquantized.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,  # request 8-bit weight loading
    llm_int8_threshold=6.0,  # NOTE(review): presumably the LLM.int8 outlier threshold; confirm against BitsAndBytesConfig docs
    llm_int8_enable_fp32_cpu_offload=True,  # NOTE(review): presumably lets CPU-offloaded modules stay in fp32; confirm
)

def load_model_safely(model_name, quantize=False):
    """Build a text-generation pipeline for ``model_name``, or return ``None``.

    The tokenizer and model are fetched through the cache directory named by
    the ``HUGGINGFACE_HUB_CACHE`` environment variable. The model is loaded
    with ``device_map="auto"`` and, when ``quantize`` is true, with the
    module-level ``bnb_config`` quantization settings.

    Args:
        model_name: Hub id or local path understood by ``from_pretrained``.
        quantize: When true, pass ``bnb_config`` as ``quantization_config``.

    Returns:
        A ``text-generation`` pipeline configured with ``max_new_tokens=128``
        and ``do_sample=False``, or ``None`` if any step raised. Failures are
        reported with ``print`` rather than propagated (best-effort loading).
    """
    tok = model = None
    try:
        print(f"🔍 Checking for local copy of: {model_name}")
        cache_dir = os.environ['HUGGINGFACE_HUB_CACHE']
        tok = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            cache_dir=cache_dir,
            device_map="auto",
            quantization_config=(bnb_config if quantize else None),
        )
        # The model instance is already placed by device_map="auto" above, so
        # the pipeline is not given a second (redundant) device_map.
        return pipeline(
            "text-generation",
            model=model,
            tokenizer=tok,
            max_new_tokens=128,
            do_sample=False,
        )
    except Exception as e:
        print(f"❌ Failed to load {model_name}: {e}")
        # Drop any partially loaded objects first; otherwise the collector and
        # the CUDA cache flush below cannot reclaim their memory.
        tok = model = None
        gc.collect()
        torch.cuda.empty_cache()
        return None


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Judge model setup: tokenizer + causal LM pulled through the shared hub cache,
# then wrapped in a non-sampling text-generation pipeline.
JUDGE_NAME = "mistralai/Mistral-7B-Instruct-v0.1"

judge_tok = AutoTokenizer.from_pretrained(
    JUDGE_NAME,
    cache_dir=os.environ['HUGGINGFACE_HUB_CACHE'],
)
judge_model = AutoModelForCausalLM.from_pretrained(
    JUDGE_NAME,
    cache_dir=os.environ['HUGGINGFACE_HUB_CACHE'],
)
# Explicitly keep the judge's weights on the CPU.
judge_model = judge_model.cpu()

judge_gen = pipeline(
    task="text-generation",
    model=judge_model,
    tokenizer=judge_tok,
    device=-1,  # -1 is the pipeline's CPU device index
    max_new_tokens=256,
    do_sample=False,
)
print("✅ Judge loaded (on CPU)")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
import json

def safe_load_json(filepath, max_bytes=None):
    """Load a JSON array from ``filepath``, repairing a truncated tail if needed.

    The file is first parsed as-is. If that fails, it is treated as a JSON
    array that was cut off after a complete element: trailing commas and
    whitespace are stripped and a closing ``]`` is appended before parsing
    again.

    Args:
        filepath: Path of the JSON file to read (decoded as UTF-8).
        max_bytes: Optional cap on how much of the file to read. The file is
            opened in text mode, so this counts characters, not raw bytes.
            A falsy value (``None`` or ``0``) reads the whole file.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: If the text is still invalid after the repair,
            e.g. when the cut falls in the middle of an element.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        raw = f.read(max_bytes) if max_bytes else f.read()

    try:
        # Well-formed input must not get an extra "]" appended.
        return json.loads(raw)
    except json.JSONDecodeError:
        pass

    # Truncated array: drop dangling separators/whitespace and close the list.
    safe_json = raw.rstrip(",\n\r\t ") + "]"
    return json.loads(safe_json)

# Smoke-test subset: keep only the first Alpaca example and its first 10 prompts.
alpaca = safe_load_json("alpaca_prompts.json")[:1]
# mmlu = safe_load_json("mmlu_prompts.json", max_bytes=140_000_000)  # slice before corrupted area

alpaca[0].update(prompts=alpaca[0]["prompts"][:10])
# mmlu = mmlu[:1]

# Registry of datasets to evaluate, keyed by dataset name.
datasets = dict(alpaca=alpaca)
print({name: len(examples) for name, examples in datasets.items()})


In [None]:
# Prompt shown to the judge for a pairwise comparison; filled in with
# `.format(A=..., B=...)`.
# NOTE(review): the template refers to "the question" but has no placeholder
# for the question text, so the judge only ever sees the two answers.
compare_template = "\n".join([
    "Below are two answers to the same question.",
    "Answer A:",
    "{A}",
    "Answer B:",
    "{B}",
    "Which one is better, A or B, at addressing the question? Respond with exactly one token: A, B, or Tie.",
])

# Memo of completions, keyed by (model_name, prompt).
generation_cache = {}

def get_cached_response(generator, prompt, model_name):
    """Return the completion for ``prompt``, generating it at most once per model.

    ``generator`` is called with the prompt and must return a list whose first
    item has a ``"generated_text"`` entry that starts with the prompt itself.
    The prompt-length prefix is cut off and the remainder is whitespace-stripped
    before being stored in ``generation_cache``.
    """
    cache_key = (model_name, prompt)
    if cache_key in generation_cache:
        return generation_cache[cache_key]

    full_text = generator(prompt)[0]["generated_text"]
    completion = full_text[len(prompt):].strip()
    generation_cache[cache_key] = completion
    return completion


In [None]:
import time
import numpy as np
# One summary row per (dataset, example, model) is appended here.
results = []

# Display name -> model id passed to load_model_safely.
TARGET_MODELS = {
    "gpt2": "gpt2",
    "distilgpt2": "distilgpt2",
    "tiny-gpt2": "sshleifer/tiny-gpt2"
}

# The judge is asked for a single-token verdict, so a short generation budget
# is enough; decoding is non-sampling, so the leading token is unaffected.
JUDGE_VOTE_MAX_NEW_TOKENS = 16


def _parse_vote(judge_text):
    """Map the judge's reply to 'A', 'B' or 'Tie'; return None if unrecognised.

    Only the first whitespace-separated token is considered, compared
    case-insensitively after stripping surrounding punctuation.
    """
    tokens = judge_text.strip().split()
    if not tokens:
        return None
    first = tokens[0].strip(".,:;!?\"'()[]*`").lower()
    return {"a": "A", "b": "B", "tie": "Tie"}.get(first)


for ds_name, data in datasets.items():
    for ex in data:
        for model_name, model_id in TARGET_MODELS.items():
            print(f"🚀 Evaluating {model_name}")
            gen = None
            try:
                gc.collect()
                torch.cuda.empty_cache()
                time.sleep(2)

                gen = load_model_safely(model_id, quantize=False)
                if gen is None:
                    continue

                scores = []
                for prompt in ex["prompts"]:
                    model_output = get_cached_response(gen, prompt, model_name)
                    gold_output = get_cached_response(judge_gen, prompt, "judge")
                    cmp_prompt = compare_template.format(A=model_output, B=gold_output)
                    # Ask only for the continuation: by default the pipeline
                    # echoes the prompt, which would make the first token of
                    # the "vote" the first word of the template every time.
                    judge_reply = judge_gen(
                        cmp_prompt,
                        return_full_text=False,
                        max_new_tokens=JUDGE_VOTE_MAX_NEW_TOKENS,
                    )[0]["generated_text"]
                    vote = _parse_vote(judge_reply)
                    # A win or a tie for the target model counts as 1;
                    # a loss or an unparseable reply counts as 0.
                    scores.append(1 if vote in ("A", "Tie") else 0)

                if not scores:
                    print(f"⚠️ No prompts to score for {model_name}; skipping")
                    continue

                arr = np.array(scores)
                results.append({
                    "dataset": ds_name,
                    "model": model_name,
                    "id": ex["id"],
                    # Score of the first prompt in the example's prompt list.
                    "original_score": float(arr[0]),
                    "worst_score": float(arr.min()),
                    "best_score": float(arr.max()),
                    "average_score": float(arr.mean()),
                    "std_score": float(arr.std())
                })

            except Exception as e:
                print(f"💥 Skipped {model_name} due to error: {e}")
                time.sleep(2)
            finally:
                # Release the target model on every path (success, skip, error)
                # before the next one is loaded.
                gen = None
                gc.collect()
                torch.cuda.empty_cache()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Tabulate the per-(dataset, example, model) summaries collected in `results`.
df = pd.DataFrame(results)
print(df)

if df.empty:
    # Every model was skipped or nothing was evaluated; there is nothing to pivot.
    print("No results to plot.")
else:
    # pivot_table averages over examples, so several examples per dataset do
    # not trip DataFrame.pivot's "duplicate entries" error. With one row per
    # (model, dataset) the mean is just that row's value.
    pivot = df.pivot_table(
        index="model",
        columns="dataset",
        values="average_score",
        aggfunc="mean",
    )
    fig, ax = plt.subplots(figsize=(8, 4))
    pivot.plot.barh(ax=ax)
    ax.set_title("Average Score (model ≥ gold)")
    ax.set_xlabel("Score")
    plt.tight_layout()
    plt.show()
