# Data Selection for LLM Training

**Authors:** Samhith Kakarla, Brenton Law, Issac To, Peter Lee

We explore data selection techniques (forgettability, GreeDi, CRAIG, CREST) on a tiny transformer model for basic NLP classification tasks (SST2, IMDb).

## Benchmarks
- [stanfordnlp/imdb](https://huggingface.co/datasets/stanfordnlp/imdb)
- [stanfordnlp/sst2](https://huggingface.co/datasets/stanfordnlp/sst2)

In [2]:
!git clone https://github.com/SamhithKakarla/Data-Selection-for-LLM-Training.git

Cloning into 'Data-Selection-for-LLM-Training'...
remote: Enumerating objects: 34, done.
remote: Counting objects: 100% (34/34), done.
remote: Compressing objects: 100% (25/25), done.
remote: Total 34 (delta 15), reused 25 (delta 7), pack-reused 0 (from 0)
Receiving objects: 100% (34/34), 11.51 KiB | 5.76 MiB/s, done.
Resolving deltas: 100% (15/15), done.


In [3]:
import sys
# Put the repository checkout on the import path so its modules can be imported
# by later cells. NOTE(review): the absolute `/content/...` prefix presumably
# matches the Colab working directory the clone landed in — adjust when running
# elsewhere.
sys.path.append('/content/Data-Selection-for-LLM-Training')

In [None]:
from datasets import load_dataset

# Fetch both sentiment benchmarks from the Hugging Face Hub. Each call returns
# the dataset's splits keyed by split name.
sst2_dataset = load_dataset(path="stanfordnlp/sst2")
imdb_dataset = load_dataset(path="stanfordnlp/imdb")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [8]:
import os
import argparse
import math
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from datasets import load_dataset

# Import model and train utils
from model import TinyGPT
from train import make_dataset, collate_fn, compute_accuracy

## Train TinyGPT on the Full SST-2 Training Set

In [9]:
# Hyperparameters for the full-data baseline run. `num_classes` is informational
# only: the classifier head below is sized from the labels actually present.
config = {
    'tokenizer': 'gpt2',
    'output_dir': './tiny_gpt_runs',
    'max_len': 64,
    'batch_size': 32,
    'd_model': 128,
    'n_layers': 4,
    'n_heads': 4,
    'epochs': 3,
    'lr': 3e-4,
    'num_classes': 2,
}

os.makedirs(config['output_dir'], exist_ok=True)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device:', device)

tokenizer = AutoTokenizer.from_pretrained(config['tokenizer'])
if tokenizer.pad_token is None:
    # Reuse EOS as the padding token when the tokenizer does not define one.
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

# Read whole columns instead of iterating the dataset row by row: column access
# avoids building one dict per example (twice per split).
train_split = sst2_dataset['train']
val_split = sst2_dataset['validation']
train_texts = list(train_split['sentence'])
train_labels = list(train_split['label'])
val_texts = list(val_split['sentence'])
val_labels = list(val_split['label'])

print(f"Training samples: {len(train_texts)}")
print(f"Validation samples: {len(val_texts)}")
print(f"Label distribution (train): {set(train_labels)}")

train_data = make_dataset(tokenizer, train_texts, train_labels, config['max_len'])
val_data = make_dataset(tokenizer, val_texts, val_labels, config['max_len'])
train_loader = DataLoader(train_data, batch_size=config['batch_size'], shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_data, batch_size=config['batch_size'], shuffle=False, collate_fn=collate_fn)

vocab_size = tokenizer.vocab_size
# Size the classifier head from the largest label id seen in either split.
actual_num_classes = max(max(train_labels), max(val_labels)) + 1
print(f"Number of classes detected: {actual_num_classes}")

model = TinyGPT(
    vocab_size=vocab_size,
    max_len=config['max_len'],
    d_model=config['d_model'],
    n_layers=config['n_layers'],
    n_heads=config['n_heads'],
    num_classes=actual_num_classes
).to(device)

print('Param count:', sum(p.numel() for p in model.parameters()))

optimizer = torch.optim.AdamW(model.parameters(), lr=config['lr'])
# Default reduction is 'mean', so each batch loss is a per-sample average.
criterion = nn.CrossEntropyLoss()

for epoch in range(1, config['epochs'] + 1):
    model.train()
    # Epoch aggregates are sample-weighted so that the final, smaller batch does
    # not count as much as a full batch in the reported averages.
    total_loss = 0.0      # sum of per-sample losses
    total_correct = 0     # number of correct predictions
    total_seen = 0        # number of samples processed
    steps = 0
    loop = tqdm(train_loader, desc=f'Epoch {epoch}')

    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        logits = model(input_ids, attention_mask=attention_mask)
        loss = criterion(logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        preds = torch.argmax(logits, dim=-1)
        # Per-batch accuracy, used only for the progress-bar display.
        acc = compute_accuracy(preds, labels)

        batch_n = labels.size(0)
        batch_loss = loss.item()
        total_loss += batch_loss * batch_n  # mean loss -> summed loss
        total_correct += (preds == labels).sum().item()
        total_seen += batch_n
        steps += 1

        if steps % 50 == 0:
            loop.set_postfix({
                'loss': f'{batch_loss:.4f}',
                'avg_loss': f'{total_loss/total_seen:.4f}',
                'acc': f'{acc:.4f}',
                'avg_acc': f'{total_correct/total_seen:.4f}'
            })

    avg_train_loss = total_loss / max(1, total_seen)
    avg_train_acc = total_correct / max(1, total_seen)

    # Validation
    model.eval()
    val_loss = 0.0        # sum of per-sample losses
    val_correct = 0
    val_seen = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids, attention_mask=attention_mask)
            loss = criterion(logits, labels)

            preds = torch.argmax(logits, dim=-1)

            batch_n = labels.size(0)
            val_loss += loss.item() * batch_n
            val_correct += (preds == labels).sum().item()
            val_seen += batch_n

    avg_val_loss = val_loss / max(1, val_seen)
    avg_val_acc = val_correct / max(1, val_seen)

    print(f'Epoch {epoch} — Train loss: {avg_train_loss:.4f}, Train acc: {avg_train_acc:.4f}, '
          f'Val loss: {avg_val_loss:.4f}, Val acc: {avg_val_acc:.4f}')

    # One checkpoint per epoch; the file holds only the state_dict.
    torch.save(model.state_dict(), os.path.join(config['output_dir'], f'model_epoch{epoch}.pt'))

print('Done. Models saved to', config['output_dir'])

Device: cpu
Training samples: 67349
Validation samples: 872
Label distribution (train): {0, 1}
Number of classes detected: 2
Param count: 7234690


Epoch 1:   1%|          | 24/2105 [00:11<17:18,  2.00it/s]


KeyboardInterrupt: 

# Interactive Demo

Let's see how our trained model performs on a few live examples from our classmates.

In [7]:


# Config
# The architecture constants below are passed straight to TinyGPT; they have to
# describe the same model that produced the checkpoint, otherwise
# load_state_dict() will reject the weights.
CKPT = './tiny_gpt_runs/model_epoch3.pt'
TOKENIZER = 'gpt2'
MAX_LEN = 64
D_MODEL = 128
N_LAYERS = 4
N_HEADS = 4
NUM_CLASSES = 2
LABELS = {0: 'Negative', 1: 'Positive'}

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
if tokenizer.pad_token is None:
    # Reuse EOS as the padding token when the tokenizer does not define one.
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

# Model
model = TinyGPT(
    vocab_size=tokenizer.vocab_size,
    max_len=MAX_LEN,
    d_model=D_MODEL,
    n_layers=N_LAYERS,
    n_heads=N_HEADS,
    num_classes=NUM_CLASSES
).to(device)

# The checkpoint is a plain state_dict, so restrict unpickling to tensors and
# primitive containers instead of allowing arbitrary pickled objects.
model.load_state_dict(torch.load(CKPT, map_location=device, weights_only=True))
model.eval()

# Prediction Utils
def _encode(texts):
    """Tokenize ``texts`` and place the resulting tensors on ``device``.

    Every sequence is padded or truncated to exactly ``MAX_LEN`` tokens.

    Returns:
        A ``(input_ids, attention_mask)`` pair of PyTorch tensors.
    """
    batch = tokenizer(
        texts,
        return_tensors='pt',
        max_length=MAX_LEN,
        truncation=True,
        padding='max_length',
    )
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    return input_ids, attention_mask

def predict_sentiment(text):
    """Classify a single string with the loaded model.

    Returns:
        A ``(pred, confidence, probs)`` triple: the index of the most probable
        class, that class's softmax probability as a Python float, and the
        per-class probabilities as a NumPy array.
    """
    input_ids, attention_mask = _encode([text])
    with torch.no_grad():
        # The batch holds exactly one example; keep only its row.
        class_probs = torch.softmax(
            model(input_ids, attention_mask=attention_mask), dim=-1
        )[0]
    top_class = class_probs.argmax().item()
    confidence = class_probs[top_class].item()
    return top_class, confidence, class_probs.cpu().numpy()

def predict_batch(texts):
    """Classify a list of strings in one forward pass.

    Returns:
        A ``(preds, confs, probs)`` triple of NumPy arrays: the most probable
        class index per text, the softmax probability of that class, and the
        full per-class probability matrix.
    """
    input_ids, attention_mask = _encode(texts)
    with torch.no_grad():
        class_probs = torch.softmax(
            model(input_ids, attention_mask=attention_mask), dim=-1
        )
    top_classes = class_probs.argmax(dim=-1)
    # Pick, for every row, the probability sitting at that row's argmax column.
    row_index = torch.arange(class_probs.size(0), device=class_probs.device)
    confidences = class_probs[row_index, top_classes]
    return (
        top_classes.cpu().numpy(),
        confidences.cpu().numpy(),
        class_probs.cpu().numpy(),
    )

# Single-sentence demo: print the predicted label with its confidence, followed
# by the probability assigned to every class.
text = "This movie is absolutely fantastic!"
pred, conf, probs = predict_sentiment(text)
print(f'"{text}" → {LABELS[pred]} ({conf:.4f})')
for class_id, class_prob in enumerate(probs):
    print(f'  {LABELS[class_id]}: {class_prob:.4f}')

# Batch demo: classify several sentences in one call and print one line each.
texts = [
    "This movie is great!",
    "This could be worse",
    "It was okay, nothing special.",
    "Absolutely loved it!",
    "Worst product ever."
]

preds, confs, _ = predict_batch(texts)
for sentence, label_id, confidence in zip(texts, preds, confs):
    print(f'"{sentence}" → {LABELS[label_id]} ({confidence:.4f})')

"This movie is absolutely fantastic!" → Negative (0.5332)
  Negative: 0.5332
  Positive: 0.4668
"This movie is great!" → Negative (0.6205)
"This could be worse" → Positive (0.5138)
"It was okay, nothing special." → Positive (0.6053)
"Absolutely loved it!" → Positive (0.5562)
"Worst product ever." → Positive (0.6274)
