In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: read).
The token `colab2` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `colab2`


In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from datasets import load_dataset
import numpy as np
from tqdm import tqdm
from collections import defaultdict

# Checkpoint used for every forward pass below. An earlier cell runs
# `huggingface-cli login`, so Hub credentials are expected to be in place
# before this download starts.
# NOTE(review): the saved output of this cell shows vocab.json / merges.txt and
# a ~548 MB model.safetensors being downloaded, which does not look like a 3B
# checkpoint -- the recorded output (and the reported accuracy) may come from a
# run with a different model_id; confirm before citing the number.
model_id = "meta-llama/Llama-3.2-3B-Instruct"
# `tokenizer` and `model` are module-level globals read by get_label_probs().
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # presumably delegates device placement to the library -- TODO confirm the
    # resulting device/dtype on the target runtime
    device_map="auto"
)

# Layout of a single solved few-shot demonstration: the question, the five
# lettered options (A-E), the gold answer letter, then a blank line that
# separates it from whatever follows in the prompt.
CSQA_TEMPLATE = (
    "Question: {question}\n"
    "A. {choice0}\n"
    "B. {choice1}\n"
    "C. {choice2}\n"
    "D. {choice3}\n"
    "E. {choice4}\n"
    "Answer: {answer}\n\n"
)

def format_example(example):
    """Render one record as a solved few-shot demonstration.

    Args:
        example: Mapping with a "question" string, a "choices" mapping whose
            "text" entry holds at least five option strings, and an
            "answerKey" value (the gold label).

    Returns:
        CSQA_TEMPLATE filled in with the question, the first five options and
        the gold answer.

    Raises:
        KeyError / IndexError: If a required field or option is missing.
    """
    option_texts = example["choices"]["text"]

    fields = {"question": example["question"]}
    # The template names its option slots choice0 .. choice4.
    for position in range(5):
        fields[f"choice{position}"] = option_texts[position]
    fields["answer"] = example["answerKey"]

    return CSQA_TEMPLATE.format(**fields)

def get_label_probs(prompt, question, choices, max_length=2048):
    """Score the five answer letters for one question given a few-shot prompt.

    The few-shot `prompt` is followed by the target question laid out like the
    demonstrations but ending in a bare "Answer:", and the model's next-token
    logits are read at the last position.

    Args:
        prompt: Text of the few-shot demonstrations (may be empty).
        question: The question to answer.
        choices: Sequence with at least five option strings (A-E order).
        max_length: Maximum number of tokens fed to the model.

    Returns:
        A NumPy array of five probabilities (softmax restricted to the tokens
        for " A" .. " E"), in A-E order.

    Uses the module-level `tokenizer` and `model`.
    """
    labels = ["A", "B", "C", "D", "E"]
    option_lines = "\n".join(
        f"{label}. {choices[position]}" for position, label in enumerate(labels)
    )
    full_input = f"{prompt}Question: {question}\n{option_lines}\nAnswer:"

    encoded = tokenizer(full_input, return_tensors="pt")
    # If the prompt is too long, keep the *tail*: right-side truncation would
    # drop the target question and the "Answer:" cue, so the logits read below
    # would no longer be a prediction for this question at all.
    if encoded["input_ids"].shape[1] > max_length:
        encoded = {name: tensor[:, -max_length:] for name, tensor in encoded.items()}
    inputs = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    with torch.no_grad():
        logits = model(**inputs).logits

    # The last id of the encoded " X" is taken as the label token, so any
    # special tokens the tokenizer prepends are skipped.
    label_tokens = [tokenizer(f" {label}")["input_ids"][-1] for label in labels]
    last_token_logits = logits[0, -1, label_tokens]
    # Cast to float32 so half-precision models still convert to NumPy.
    return torch.softmax(last_token_logits.float(), dim=-1).cpu().numpy()

def evaluate_subset(Si, val_set):
    """Measure accuracy on `val_set` when `Si` is used as the few-shot prompt.

    Args:
        Si: Iterable of training records; each is rendered with
            format_example() and the results are concatenated in order.
        val_set: Sized iterable of records with "question", "choices"
            (containing "text") and "answerKey".

    Returns:
        Fraction of `val_set` records whose highest-probability letter equals
        the record's "answerKey".

    Raises:
        ZeroDivisionError: If `val_set` is empty.
    """
    letters = ["A", "B", "C", "D", "E"]
    few_shot_prompt = "".join(format_example(demo) for demo in Si)

    def is_hit(item):
        # Predicted letter = argmax over the five label probabilities.
        scores = get_label_probs(few_shot_prompt, item["question"], item["choices"]["text"])
        return letters[np.argmax(scores)] == item["answerKey"]

    hits = sum(1 for item in val_set if is_hit(item))
    return hits / len(val_set)

def compute_influences(train_set, val_set, k=5, M=100, seed=None):
    """Estimate how much each training example helps as a few-shot demonstration.

    M random subsets of k training examples are drawn (label-balanced), each
    subset is scored with evaluate_subset() on `val_set`, and an example's
    influence is the mean accuracy of the subsets that contain it minus the
    mean accuracy of the subsets that do not.

    Args:
        train_set: Indexable, iterable collection of records with "answerKey".
        val_set: Validation records passed through to evaluate_subset().
        k: Number of demonstrations per subset.
        M: Number of subsets to draw and evaluate.
        seed: Optional integer seed for the subset sampling. With the default
            (None) NumPy's global random state is used.

    Returns:
        A dict mapping each training index to its influence score. Examples
        that appear in no subset (or in every subset) get 0.0.

    Raises:
        ValueError: If fewer than k examples can be drawn for a subset.
    """
    # Group training *indices* by label once, instead of rescanning the whole
    # training set for every label of every subset.
    grouped = defaultdict(list)
    for idx, ex in enumerate(train_set):
        grouped[ex["answerKey"]].append(idx)
    if not grouped:
        return {}
    indices_by_label = [np.asarray(members) for members in grouped.values()]
    n_train = sum(len(members) for members in indices_by_label)

    # Ceiling division: the balanced pool must hold at least k items, otherwise
    # drawing k of them without replacement fails whenever k is not a multiple
    # of the number of classes.
    k_per_class = max(1, -(-k // len(indices_by_label)))
    takes = [min(k_per_class, len(members)) for members in indices_by_label]
    if sum(takes) < k:
        raise ValueError(
            f"Cannot draw {k} examples per subset: only {sum(takes)} available "
            f"after balancing across {len(indices_by_label)} classes"
        )

    rng = np.random if seed is None else np.random.RandomState(seed)

    # Balanced subset sampling (subsets are stored as lists of train indices)
    subsets = []
    for _ in range(M):
        pool = []
        for members, take in zip(indices_by_label, takes):
            pool.extend(rng.choice(members, take, replace=False))
        chosen = rng.choice(pool, k, replace=False)
        subsets.append([int(i) for i in chosen])

    # Sequential evaluation
    accuracies = []
    for subset in tqdm(subsets, desc="Evaluating subsets"):
        demos = [train_set[i] for i in subset]
        accuracies.append(evaluate_subset(demos, val_set))

    # Accumulate per-example "included" statistics in one pass over the
    # subsets. Membership is by index, so two rows with identical content are
    # not conflated and no deep dict comparison is needed.
    included_sum = defaultdict(float)
    included_count = defaultdict(int)
    for subset, acc in zip(subsets, accuracies):
        for idx in subset:
            included_sum[idx] += acc
            included_count[idx] += 1

    # Calculate influences
    total_acc = sum(accuracies)
    influence_scores = {}
    for idx in range(n_train):
        n_in = included_count.get(idx, 0)
        n_out = len(accuracies) - n_in
        if n_in > 0 and n_out > 0:
            in_sum = included_sum[idx]
            influence_scores[idx] = (in_sum / n_in) - ((total_acc - in_sum) / n_out)
        else:
            influence_scores[idx] = 0.0

    return influence_scores

def load_csqa(split="train", num_samples=1000):
    """Load a shuffled (seed 42) sample of the "commonsense_qa" dataset.

    Args:
        split: "train" returns a single training sample. Any other value
            returns a (validation, test) pair, both carved out of the
            "validation" split.
        num_samples: For "train", the maximum number of rows returned. For the
            held-out case, the total number of rows shared equally between the
            two returned sets (the default 1000 gives 500 + 500).

    Returns:
        A dataset for split == "train", otherwise a tuple (val_set, test_set)
        of two disjoint datasets.
    """
    if split == "train":
        train = load_dataset("commonsense_qa", split="train").shuffle(seed=42)
        return train.select(range(min(num_samples, len(train))))

    # Both held-out sets come from the shuffled "validation" split; returning
    # early for "train" above avoids loading and sampling a dataset that would
    # then be thrown away.
    held_out = load_dataset("commonsense_qa", split="validation").shuffle(seed=42)
    half = min(num_samples, len(held_out)) // 2
    val_set = held_out.select(range(0, half))  # First half for validation
    test_set = held_out.select(range(half, 2 * half))  # Second half for testing
    return val_set, test_set

def run_experiment(num_train=5000, num_heldout=1000, k=5, M=100):
    """Pick the k most influential demonstrations and report test accuracy.

    Steps: load the data, score training examples with compute_influences()
    on the validation set, build a prompt from the k highest-scoring examples,
    and evaluate that prompt on the test set.

    Args:
        num_train: Number of training rows requested from load_csqa().
        num_heldout: Sample size requested from load_csqa() for the held-out
            (validation + test) data.
        k: Demonstrations per sampled subset and in the final prompt.
        M: Number of random subsets evaluated when estimating influences.

    Returns:
        The test accuracy as a float (also printed).
    """
    # Load CommonsenseQA
    train_set = load_csqa("train", num_train)
    val_set, test_set = load_csqa("validation", num_heldout)

    # Compute influences
    influence_scores = compute_influences(train_set, val_set, k=k, M=M)

    # Select top examples (highest influence first)
    ranked = sorted(influence_scores, key=influence_scores.get, reverse=True)
    top_examples = [train_set[i] for i in ranked[:k]]

    # Final evaluation
    labels = ["A", "B", "C", "D", "E"]
    prompt = "".join([format_example(ex) for ex in top_examples])
    correct = 0
    for ex in tqdm(test_set, desc="Testing"):
        probs = get_label_probs(prompt, ex["question"], ex["choices"]["text"])
        if labels[np.argmax(probs)] == ex["answerKey"]:
            correct += 1

    accuracy = correct / len(test_set)
    print(f"Final Test Accuracy: {accuracy:.2%}")
    return accuracy

# Run the full experiment (data loading, influence estimation, final test
# evaluation) when this code is executed as the main module.
if __name__ == "__main__":
    run_experiment()


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Evaluating subsets: 100%|██████████| 100/100 [09:53<00:00,  5.94s/it]
Testing: 100%|██████████| 500/500 [00:05<00:00, 84.00it/s]

Final Test Accuracy: 18.40%



