In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [6]:
splits = {'train': 'data/train-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}
df_train_full = pd.read_parquet("hf://datasets/wangrongsheng/ag_news/" + splits["train"])

df_sampled = df_train_full.groupby('label', group_keys=False).apply(lambda x: x.sample(n=1500, random_state=42)).reset_index(drop=True)

df_train, df_val = train_test_split(
    df_sampled,
    test_size=1000,
    stratify=df_sampled['label'],
    random_state=42
)

df_test_full = pd.read_parquet("hf://datasets/wangrongsheng/ag_news/" + splits["test"])
df_test = df_test_full.sample(n=1000, random_state=42)

  df_sampled = df_train_full.groupby('label', group_keys=False).apply(lambda x: x.sample(n=1500, random_state=42)).reset_index(drop=True)


In [7]:
df_train.shape, df_test.shape, df_val.shape

((5000, 2), (1000, 2), (1000, 2))

In [11]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import tqdm
from collections import defaultdict
from datasets import Dataset

model_id = "google/gemma-2-2b-it"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto"
)

# AG News template
AG_NEWS_TEMPLATE = "Headline: {text}\nCategory: {label}\n\n"

def format_example(example):
    return AG_NEWS_TEMPLATE.format(
        text=example["text"],
        label=example["label"]
    )

def get_label_probs(prompt, text):
    full_input = prompt + f"Headline: {text}\nCategory:"
    inputs = tokenizer(full_input, return_tensors="pt", truncation=True, max_length=2048).to(model.device)
    
    with torch.no_grad():
        outputs = model(**inputs).logits
    
    # Label tokens for 4-class AG News
    label_tokens = [tokenizer(f" {i}")["input_ids"][-1] for i in range(4)]
    last_token_logits = outputs[0, -1, label_tokens]
    return torch.softmax(last_token_logits, dim=-1).cpu().numpy()

def evaluate_subset(Si, val_set):
    prompt = "".join([format_example(ex) for ex in Si])
    correct = 0
    for ex in val_set:
        probs = get_label_probs(prompt, ex["text"])
        if np.argmax(probs) == ex["label"]:
            correct += 1
    return correct / len(val_set)

def load_agnews():
    splits = {'train': 'data/train-00000-of-00001.parquet', 
             'test': 'data/test-00000-of-00001.parquet'}
    
    # Load and sample data
    df_train_full = pd.read_parquet("hf://datasets/wangrongsheng/ag_news/" + splits["train"])
    df_sampled = df_train_full.groupby('label', group_keys=False).apply(
        lambda x: x.sample(n=1500, random_state=42)).reset_index(drop=True)
    
    # Train/val split
    df_train, df_val = train_test_split(
        df_sampled,
        test_size=1000,
        stratify=df_sampled['label'],
        random_state=42
    )
    
    # Test set
    df_test_full = pd.read_parquet("hf://datasets/wangrongsheng/ag_news/" + splits["test"])
    df_test = df_test_full.sample(n=1000, random_state=42)
    
    # Convert to HuggingFace datasets
    train_set = Dataset.from_pandas(df_train[['text', 'label']])
    val_set = Dataset.from_pandas(df_val[['text', 'label']])
    test_set = Dataset.from_pandas(df_test[['text', 'label']])
    
    return train_set, val_set, test_set

def compute_influences(train_set, val_set, k=5, M=100):
    # Balanced subset sampling
    class_counts = defaultdict(int)
    for ex in train_set:
        class_counts[ex["label"]] += 1
    
    k_per_class = max(1, k // len(class_counts))
    subsets = []
    for _ in range(M):
        Si = []
        for label in class_counts:
            candidates = [ex for ex in train_set if ex["label"] == label]
            Si.extend(np.random.choice(candidates, k_per_class, replace=False))
        Si = list(np.random.choice(Si, k))
        subsets.append(Si)
    
    # Sequential evaluation
    D = []
    for Si in tqdm(subsets, desc="Evaluating subsets"):
        D.append(evaluate_subset(Si, val_set))
    
    # Calculate influences
    influence_scores = defaultdict(list)
    for idx, ex in enumerate(train_set):
        included = []
        excluded = []
        for Si, acc in zip(subsets, D):
            if ex in Si:
                included.append(acc)
            else:
                excluded.append(acc)
        
        Nj = len(included)
        Mj = len(D) - Nj
        if Nj > 0 and Mj > 0:
            influence = (sum(included)/Nj) - (sum(excluded)/Mj)
        else:
            influence = 0
        influence_scores[idx] = influence
    
    return influence_scores

def run_experiment():
    # Load AG News
    train_set, val_set, test_set = load_agnews()
    
    # Compute influences
    influence_scores = compute_influences(train_set, val_set, k=5, M=100)
    
    # Select top examples
    sorted_indices = sorted(influence_scores, key=influence_scores.get, reverse=True)[:5]
    top_examples = [train_set[i] for i in sorted_indices]
    
    # Final evaluation
    prompt = "".join([format_example(ex) for ex in top_examples])
    correct = 0
    for ex in tqdm(test_set, desc="Testing"):
        probs = get_label_probs(prompt, ex["text"])
        if np.argmax(probs) == ex["label"]:
            correct += 1
    
    print(f"Final Test Accuracy: {correct/len(test_set):.2%}")

if __name__ == "__main__":
    run_experiment()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  df_sampled = df_train_full.groupby('label', group_keys=False).apply(
Evaluating subsets: 100%|██████████| 100/100 [4:33:08<00:00, 163.88s/it] 
Testing: 100%|██████████| 1000/1000 [02:58<00:00,  5.61it/s]


Final Test Accuracy: 52.40%


In [2]:
import os
os.environ['HF_HOME'] = '/scratch/user/vp190545/huggingface_models'

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import tqdm
from collections import defaultdict
from datasets import Dataset

model_id = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto"
)

# AG News template
AG_NEWS_TEMPLATE = "Headline: {text}\nCategory: {label}\n\n"

def format_example(example):
    return AG_NEWS_TEMPLATE.format(
        text=example["text"],
        label=example["label"]
    )

def get_label_probs(prompt, text):
    full_input = prompt + f"Headline: {text}\nCategory:"
    inputs = tokenizer(full_input, return_tensors="pt", truncation=True, max_length=2048).to(model.device)
    
    with torch.no_grad():
        outputs = model(**inputs).logits
    
    # Label tokens for 4-class AG News
    label_tokens = [tokenizer(f" {i}")["input_ids"][-1] for i in range(4)]
    last_token_logits = outputs[0, -1, label_tokens]
    return torch.softmax(last_token_logits, dim=-1).cpu().numpy()

def evaluate_subset(Si, val_set):
    prompt = "".join([format_example(ex) for ex in Si])
    correct = 0
    for ex in val_set:
        probs = get_label_probs(prompt, ex["text"])
        if np.argmax(probs) == ex["label"]:
            correct += 1
    return correct / len(val_set)

def load_agnews():
    splits = {'train': 'data/train-00000-of-00001.parquet', 
             'test': 'data/test-00000-of-00001.parquet'}
    
    # Load and sample data
    df_train_full = pd.read_parquet("hf://datasets/wangrongsheng/ag_news/" + splits["train"])
    df_sampled = df_train_full.groupby('label', group_keys=False).apply(
        lambda x: x.sample(n=1500, random_state=42)).reset_index(drop=True)
    
    # Train/val split
    df_train, df_val = train_test_split(
        df_sampled,
        test_size=1000,
        stratify=df_sampled['label'],
        random_state=42
    )
    
    # Test set
    df_test_full = pd.read_parquet("hf://datasets/wangrongsheng/ag_news/" + splits["test"])
    df_test = df_test_full.sample(n=1000, random_state=42)
    
    # Convert to HuggingFace datasets
    train_set = Dataset.from_pandas(df_train[['text', 'label']])
    val_set = Dataset.from_pandas(df_val[['text', 'label']])
    test_set = Dataset.from_pandas(df_test[['text', 'label']])
    
    return train_set, val_set, test_set

def compute_influences(train_set, val_set, k=5, M=100):
    # Balanced subset sampling
    class_counts = defaultdict(int)
    for ex in train_set:
        class_counts[ex["label"]] += 1
    
    k_per_class = max(1, k // len(class_counts))
    subsets = []
    for _ in range(M):
        Si = []
        for label in class_counts:
            candidates = [ex for ex in train_set if ex["label"] == label]
            Si.extend(np.random.choice(candidates, k_per_class, replace=False))
        Si = list(np.random.choice(Si, k))
        subsets.append(Si)
    
    # Sequential evaluation
    D = []
    for Si in tqdm(subsets, desc="Evaluating subsets"):
        D.append(evaluate_subset(Si, val_set))
    
    # Calculate influences
    influence_scores = defaultdict(list)
    for idx, ex in enumerate(train_set):
        included = []
        excluded = []
        for Si, acc in zip(subsets, D):
            if ex in Si:
                included.append(acc)
            else:
                excluded.append(acc)
        
        Nj = len(included)
        Mj = len(D) - Nj
        if Nj > 0 and Mj > 0:
            influence = (sum(included)/Nj) - (sum(excluded)/Mj)
        else:
            influence = 0
        influence_scores[idx] = influence
    
    return influence_scores

def run_experiment():
    # Load AG News
    train_set, val_set, test_set = load_agnews()
    
    # Compute influences
    influence_scores = compute_influences(train_set, val_set, k=5, M=100)
    
    # Select top examples
    sorted_indices = sorted(influence_scores, key=influence_scores.get, reverse=True)[:5]
    top_examples = [train_set[i] for i in sorted_indices]
    
    # Final evaluation
    prompt = "".join([format_example(ex) for ex in top_examples])
    correct = 0
    for ex in tqdm(test_set, desc="Testing"):
        probs = get_label_probs(prompt, ex["text"])
        if np.argmax(probs) == ex["label"]:
            correct += 1
    
    print(f"Final Test Accuracy: {correct/len(test_set):.2%}")

if __name__ == "__main__":
    run_experiment()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  df_sampled = df_train_full.groupby('label', group_keys=False).apply(
Evaluating subsets: 100%|██████████| 100/100 [4:35:27<00:00, 165.27s/it] 
Testing: 100%|██████████| 1000/1000 [02:44<00:00,  6.07it/s]


Final Test Accuracy: 24.80%
