# Data Preparation

In [None]:
from datasets import load_dataset, concatenate_datasets

dataset = load_dataset("yelp_review_full")

samples_per_class = 30_000

# Build a class-balanced training subset: for each of the five label values
# (0-4) keep a seeded random sample of `samples_per_class` reviews.
subsets = []
for label in range(5):
    ds_label = (
        dataset["train"]
        .filter(lambda x: x["label"] == label)
        .shuffle(seed=42)
        .select(range(samples_per_class))
    )
    subsets.append(ds_label)

# Concatenation leaves the rows grouped by label, so shuffle once more.
train_subset = concatenate_datasets(subsets).shuffle(seed=42)

print("Final training dataset size:", len(train_subset))


Final training dataset size: 150000


In [None]:
# Persist the balanced subset to Drive so later sections can reload it
# without re-filtering the full dataset.
train_subset.to_csv(path_or_buf="/content/drive/MyDrive/yelp_subset_150k.csv")


Creating CSV from Arrow format:   0%|          | 0/150 [00:00<?, ?ba/s]

110492659

In [None]:
from datasets import load_dataset

csv_path = "/content/drive/MyDrive/yelp_subset_150k.csv"

# Loading a single CSV through `data_files` already yields a DatasetDict whose
# only split is named "train" (see the printed summary), so no renaming is
# needed. The former `dataset["train"] = dataset.pop("train")` was a no-op.
dataset = load_dataset("csv", data_files=csv_path)

print(dataset)


DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 150000
    })
})


In [None]:
# Seeded 90/10 hold-out split of the 150k subset:
# 135k rows for training, 15k rows for validation.
train_val = dataset["train"].train_test_split(test_size=0.1, seed=42)
train_ds, val_ds = train_val["train"], train_val["test"]

print(train_ds, val_ds)


Dataset({
    features: ['label', 'text'],
    num_rows: 135000
}) Dataset({
    features: ['label', 'text'],
    num_rows: 15000
})


In [None]:
from datasets import load_dataset

# Test split of the same Hub dataset the training subset was drawn from.
test_set = load_dataset(path="yelp_review_full", split="test")


# Teacher Model Training

In [None]:
from transformers import AutoTokenizer

model_name = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    """Tokenize a batch of reviews, truncating each one to at most 200 tokens.

    Args:
        batch: mapping with a "text" entry holding the raw review strings
            (this is what `Dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer output for the batch, without padding.

    Padding is intentionally NOT applied here. The Trainer cell uses
    `DataCollatorWithPadding`, which pads each mini-batch to its own longest
    sequence. Padding every row to `max_length` up front made that collator
    a no-op and forced every batch to the full 200 tokens.
    """
    return tokenizer(
        batch["text"],
        truncation=True,
        max_length=200
    )

train_enc = train_ds.map(tokenize, batched=True)
val_enc   = val_ds.map(tokenize, batched=True)
test_enc  = test_set.map(tokenize, batched=True)


Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for a Trainer evaluation pass.

    Args:
        eval_pred: pair `(logits, labels)`; the predicted class of each row
            is the arg-max of its logits along axis 1.

    Returns:
        dict with the keys "accuracy" and "f1_macro".
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)

    metrics = {}
    metrics["accuracy"] = accuracy_score(gold, predicted)
    metrics["f1_macro"] = f1_score(gold, predicted, average="macro")
    return metrics


In [None]:
from transformers import AutoModelForSequenceClassification

# Pretrained encoder named by `model_name` with a 5-class sequence
# classification head on top.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import os
# Set both Weights & Biases environment switches in one call.
os.environ.update({
    "WANDB_DISABLED": "true",
    "WANDB_MODE": "offline",
})


In [None]:
# `TrainingArguments` was never imported anywhere in the notebook, so this
# cell raised NameError on a fresh kernel.
from transformers import TrainingArguments

# Teacher fine-tuning configuration.
args = TrainingArguments(
    output_dir="/content/drive/MyDrive/teacher_checkpoints",

    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for `load_best_model_at_end` to work.
    eval_strategy="epoch",
    save_strategy="epoch",

    # After training, reload the checkpoint with the highest validation
    # accuracy (the "accuracy" key returned by `compute_metrics`).
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,

    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,

    fp16=True,

    logging_steps=100,

    save_total_limit=2,
    report_to="none",
)


In [None]:
from transformers import Trainer, DataCollatorWithPadding

# Pads each mini-batch to the length of its longest sequence.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Build Trainer
# `processing_class` replaces the deprecated `tokenizer` argument, which
# emitted the FutureWarning seen in this cell's output.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_enc,
    eval_dataset=val_enc,
    processing_class=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics
)

trainer.train()


# Score the model on the held-out test split. With
# `load_best_model_at_end=True` this is the best validation checkpoint.
test_results = trainer.evaluate(test_enc)
print("Teacher Test Results:", test_results)


best_model_path = "/content/drive/MyDrive/teacher_model_final"
trainer.save_model(best_model_path)

print(f"Teacher model saved to: {best_model_path}")


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.8356,0.807854,0.648533,0.648997
2,0.7262,0.792163,0.6548,0.65567
3,0.6497,0.818505,0.654733,0.654736


Teacher Test Results: {'eval_loss': 0.8036631941795349, 'eval_accuracy': 0.64788, 'eval_f1_macro': 0.6488189948129556, 'eval_runtime': 45.646, 'eval_samples_per_second': 1095.387, 'eval_steps_per_second': 34.242, 'epoch': 3.0}
Teacher model saved to: /content/drive/MyDrive/teacher_model_final


# Evaluation of Teacher Model

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

model_path = "/content/drive/MyDrive/teacher_model_final"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
# Use the GPU when one is present and fall back to CPU otherwise, matching
# the device selection used by the student-training cells (a bare
# `model.cuda()` crashes on CPU-only runtimes).
model.to("cuda" if torch.cuda.is_available() else "cpu")
# Inference mode: disables dropout. Kept as the last expression so the cell
# still displays the module summary.
model.eval()


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [None]:
import pandas as pd

# Reload the 150k subset, shuffle all rows with a fixed seed and rebuild a
# clean 0..n-1 index.
df = (
    pd.read_csv("/content/drive/MyDrive/yelp_subset_150k.csv")
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
df.head()


Unnamed: 0,label,text
0,2,So i took it in for an oil change and mentione...
1,3,I absolutely love this place. Just to give you...
2,2,I've studied and had my share of great and aut...
3,0,http://www.buzzfeed.com/emleschh/21-reasons-yo...
4,4,This place is hands down awesome! Food is grea...


In [None]:
# Keep only the first 50,000 rows of the shuffled subset for student training.
# This mimics a setting where teacher labels come from a limited number of API
# calls, i.e. far fewer examples than the full dataset contains.
N = 50000
df = df.iloc[:N]

In [None]:
import torch.nn.functional as F
from tqdm import tqdm

texts = df["text"].tolist()
labels = df["label"].tolist()

# Per-example teacher outputs, accumulated batch by batch.
teacher_logits = []
teacher_probs = []
teacher_preds = []

batch_size = 32

for i in tqdm(range(0, len(texts), batch_size)):
    batch = texts[i : i + batch_size]

    # Dynamic padding: each batch is padded only to its own longest review.
    enc = tokenizer(batch, padding=True, truncation=True, max_length=200, return_tensors="pt")
    # Move inputs to whatever device the model lives on instead of assuming
    # CUDA, so the loop also runs on CPU-only runtimes.
    enc = {k: v.to(model.device) for k, v in enc.items()}

    with torch.no_grad():
        outputs = model(**enc)
        logits = outputs.logits
        probs = F.softmax(logits, dim=-1)
        preds = torch.argmax(probs, dim=-1)

    teacher_logits.extend(logits.cpu().tolist())
    teacher_probs.extend(probs.cpu().tolist())
    teacher_preds.extend(preds.cpu().tolist())


100%|██████████| 1563/1563 [02:11<00:00, 11.89it/s]


In [None]:
# One row per review: the text, its gold label and the teacher's outputs.
result_df = pd.DataFrame(
    data=dict(
        text=texts,
        true_label=labels,
        teacher_pred=teacher_preds,
        teacher_logits=teacher_logits,
        teacher_probs=teacher_probs,
    )
)


In [None]:
# Preview the first five rows.
result_df.head(5)

Unnamed: 0,text,true_label,teacher_pred,teacher_logits,teacher_probs
0,So i took it in for an oil change and mentione...,2,2,"[-1.5976415872573853, 1.8925485610961914, 3.02...","[0.007287892978638411, 0.23898588120937347, 0...."
1,I absolutely love this place. Just to give you...,3,4,"[-4.64385986328125, -4.064105987548828, -1.389...","[0.00017741607734933496, 0.0003167939430568367..."
2,I've studied and had my share of great and aut...,2,2,"[-3.567349672317505, -0.29614511132240295, 2.2...","[0.001959405839443207, 0.051616739481687546, 0..."
3,http://www.buzzfeed.com/emleschh/21-reasons-yo...,0,0,"[0.7219317555427551, -0.16742348670959473, -0....","[0.4184754490852356, 0.17196018993854523, 0.11..."
4,This place is hands down awesome! Food is grea...,4,4,"[-3.5193538665771484, -3.677042007446289, -1.9...","[0.0002963162842206657, 0.0002530885103624314,..."


In [None]:
# Write the teacher outputs to Drive. The list-valued columns are stored as
# their string form and are parsed back with ast.literal_eval on reload.
save_path = "/content/drive/MyDrive/teacher_predictions_subset.csv"
result_df.to_csv(path_or_buf=save_path, index=False)
print("Saved CSV to:", save_path)


Saved CSV to: /content/drive/MyDrive/teacher_predictions_subset.csv


In [None]:
import pandas as pd

# Round-trip check: reload the saved teacher predictions and preview them.
df = pd.read_csv(filepath_or_buffer="/content/drive/MyDrive/teacher_predictions_subset.csv")
df.head(5)


Unnamed: 0,text,true_label,teacher_pred,teacher_logits,teacher_probs
0,So i took it in for an oil change and mentione...,2,2,"[-1.5976415872573853, 1.8925485610961914, 3.02...","[0.007287892978638411, 0.23898588120937347, 0...."
1,I absolutely love this place. Just to give you...,3,4,"[-4.64385986328125, -4.064105987548828, -1.389...","[0.00017741607734933496, 0.0003167939430568367..."
2,I've studied and had my share of great and aut...,2,2,"[-3.567349672317505, -0.29614511132240295, 2.2...","[0.001959405839443207, 0.051616739481687546, 0..."
3,http://www.buzzfeed.com/emleschh/21-reasons-yo...,0,0,"[0.7219317555427551, -0.16742348670959473, -0....","[0.4184754490852356, 0.17196018993854523, 0.11..."
4,This place is hands down awesome! Food is grea...,4,4,"[-3.5193538665771484, -3.677042007446289, -1.9...","[0.0002963162842206657, 0.0002530885103624314,..."


In [None]:
import ast

# The CSV stores each list as its string form; parse both list-valued columns
# back into Python lists.
for col in ("teacher_probs", "teacher_logits"):
    df[col] = df[col].apply(ast.literal_eval)


In [None]:
# Sanity check: the first entry should now be a real list, not a string.
print(type(df.loc[0, "teacher_probs"]))
print(df.loc[0, "teacher_probs"])


<class 'list'>
[0.007287892978638411, 0.23898588120937347, 0.7436054348945618, 0.00954984501004219, 0.000570858595892787]


# Student Model Training

In [None]:
import pandas as pd
import ast
import numpy as np

# Load the saved teacher outputs, which are the training data for every
# student model below.
df = pd.read_csv("/content/drive/MyDrive/teacher_predictions_subset.csv")

# List-valued columns come back as strings; parse them into Python lists.
for col in ("teacher_probs", "teacher_logits"):
    df[col] = df[col].apply(ast.literal_eval)

df["teacher_pred"] = df["teacher_pred"].astype(int)


In [None]:
from datasets import Dataset

# Hugging Face Dataset view of the same DataFrame.
ds = Dataset.from_pandas(df=df)


In [None]:
# Preview the first five rows.
df.head(5)

Unnamed: 0,text,true_label,teacher_pred,teacher_logits,teacher_probs
0,So i took it in for an oil change and mentione...,2,2,"[-1.5976415872573853, 1.8925485610961914, 3.02...","[0.007287892978638411, 0.23898588120937347, 0...."
1,I absolutely love this place. Just to give you...,3,4,"[-4.64385986328125, -4.064105987548828, -1.389...","[0.00017741607734933496, 0.0003167939430568367..."
2,I've studied and had my share of great and aut...,2,2,"[-3.567349672317505, -0.29614511132240295, 2.2...","[0.001959405839443207, 0.051616739481687546, 0..."
3,http://www.buzzfeed.com/emleschh/21-reasons-yo...,0,0,"[0.7219317555427551, -0.16742348670959473, -0....","[0.4184754490852356, 0.17196018993854523, 0.11..."
4,This place is hands down awesome! Food is grea...,4,4,"[-3.5193538665771484, -3.677042007446289, -1.9...","[0.0002963162842206657, 0.0002530885103624314,..."


In [None]:
# Confirm the soft labels were parsed into lists.
print(type(df.loc[0, "teacher_probs"]))


<class 'list'>


# Using Soft Label

### MultiOutputRegressor + Tf-idf 10000-dim

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split

# Convert probs to numpy arrays
df["teacher_probs"] = df["teacher_probs"].apply(np.array)

# Train/val split, stratified on the teacher's predicted class
X_train, X_val, y_train, y_val = train_test_split(
    df["text"],
    df["teacher_probs"],
    test_size=0.2,
    random_state=42,
    stratify=df["teacher_pred"]
)

# TF-IDF (vocabulary and IDF weights are fitted on the training texts only)
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
X_train_vec = tfidf.fit_transform(X_train)
X_val_vec   = tfidf.transform(X_val)

# Student Model (KD Regression): one Ridge regressor per class probability
student = MultiOutputRegressor(Ridge(alpha=1.0))
student.fit(X_train_vec, list(y_train))

# Predictions: Ridge outputs are unconstrained, so clip them to be positive
# and renormalise each row to sum to 1
pred_probs = student.predict(X_val_vec)
pred_probs = np.clip(pred_probs, 1e-12, None)
pred_probs /= pred_probs.sum(axis=1, keepdims=True)

pred_classes = np.argmax(pred_probs, axis=1)

# `y_val.index` holds the index *labels* of the validation rows, so they must
# be looked up with `.loc`. `.iloc` is positional and only gave the right rows
# because `df` happens to carry a default 0..n-1 index.

# Accuracy vs teacher
teacher_val = df["teacher_pred"].loc[y_val.index].values
acc_vs_teacher = (pred_classes == teacher_val).mean()

# Accuracy vs true
true_val = df["true_label"].loc[y_val.index].values
acc_vs_true = (pred_classes == true_val).mean()

print("Soft-label TF-IDF Student Results")
print("Acc vs teacher:", acc_vs_teacher)
print("Acc vs true_label:", acc_vs_true)


Soft-label TF-IDF Student Results
Acc vs teacher: 0.6725
Acc vs true_label: 0.5865


### MultiOutputRegressor + Tf-idf 50000-dim

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split

# Convert probs to numpy arrays
df["teacher_probs"] = df["teacher_probs"].apply(np.array)

# Train/val split, stratified on the teacher's predicted class
X_train, X_val, y_train, y_val = train_test_split(
    df["text"],
    df["teacher_probs"],
    test_size=0.2,
    random_state=42,
    stratify=df["teacher_pred"]
)

# TF-IDF (vocabulary and IDF weights are fitted on the training texts only)
tfidf = TfidfVectorizer(max_features=50000, ngram_range=(1,2))
X_train_vec = tfidf.fit_transform(X_train)
X_val_vec   = tfidf.transform(X_val)

# Student Model (KD Regression): one Ridge regressor per class probability
student = MultiOutputRegressor(Ridge(alpha=1.0))
student.fit(X_train_vec, list(y_train))

# Predictions: Ridge outputs are unconstrained, so clip them to be positive
# and renormalise each row to sum to 1
pred_probs = student.predict(X_val_vec)
pred_probs = np.clip(pred_probs, 1e-12, None)
pred_probs /= pred_probs.sum(axis=1, keepdims=True)

pred_classes = np.argmax(pred_probs, axis=1)

# `y_val.index` holds the index *labels* of the validation rows, so they must
# be looked up with `.loc`. `.iloc` is positional and only gave the right rows
# because `df` happens to carry a default 0..n-1 index.

# Accuracy vs teacher
teacher_val = df["teacher_pred"].loc[y_val.index].values
acc_vs_teacher = (pred_classes == teacher_val).mean()

# Accuracy vs true
true_val = df["true_label"].loc[y_val.index].values
acc_vs_true = (pred_classes == true_val).mean()

print("Soft-label TF-IDF Student Results")
print("Acc vs teacher:", acc_vs_teacher)
print("Acc vs true_label:", acc_vs_true)


Soft-label TF-IDF Student Results
Acc vs teacher: 0.6762
Acc vs true_label: 0.5897


### MultiOutputRegressor + Tf-idf 100000-dim

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split

# Convert probs to numpy arrays
df["teacher_probs"] = df["teacher_probs"].apply(np.array)

# Train/val split, stratified on the teacher's predicted class
X_train, X_val, y_train, y_val = train_test_split(
    df["text"],
    df["teacher_probs"],
    test_size=0.2,
    random_state=42,
    stratify=df["teacher_pred"]
)

# TF-IDF (vocabulary and IDF weights are fitted on the training texts only)
tfidf = TfidfVectorizer(max_features=100_000, ngram_range=(1,2))
X_train_vec = tfidf.fit_transform(X_train)
X_val_vec   = tfidf.transform(X_val)

# Student Model (KD Regression): one Ridge regressor per class probability
student = MultiOutputRegressor(Ridge(alpha=1.0))
student.fit(X_train_vec, list(y_train))

# Predictions: Ridge outputs are unconstrained, so clip them to be positive
# and renormalise each row to sum to 1
pred_probs = student.predict(X_val_vec)
pred_probs = np.clip(pred_probs, 1e-12, None)
pred_probs /= pred_probs.sum(axis=1, keepdims=True)

pred_classes = np.argmax(pred_probs, axis=1)

# `y_val.index` holds the index *labels* of the validation rows, so they must
# be looked up with `.loc`. `.iloc` is positional and only gave the right rows
# because `df` happens to carry a default 0..n-1 index.

# Accuracy vs teacher
teacher_val = df["teacher_pred"].loc[y_val.index].values
acc_vs_teacher = (pred_classes == teacher_val).mean()

# Accuracy vs true
true_val = df["true_label"].loc[y_val.index].values
acc_vs_true = (pred_classes == true_val).mean()

print("Soft-label TF-IDF Student Results")
print("Acc vs teacher:", acc_vs_teacher)
print("Acc vs true_label:", acc_vs_true)


Soft-label TF-IDF Student Results
Acc vs teacher: 0.68
Acc vs true_label: 0.5914


### MultiOutputRegressor + Tf-idf 200000-dim

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split

# Convert probs to numpy arrays
df["teacher_probs"] = df["teacher_probs"].apply(np.array)

# Train/val split, stratified on the teacher's predicted class
X_train, X_val, y_train, y_val = train_test_split(
    df["text"],
    df["teacher_probs"],
    test_size=0.2,
    random_state=42,
    stratify=df["teacher_pred"]
)

# TF-IDF (vocabulary and IDF weights are fitted on the training texts only)
tfidf = TfidfVectorizer(max_features=200_000, ngram_range=(1,2))
X_train_vec = tfidf.fit_transform(X_train)
X_val_vec   = tfidf.transform(X_val)

# Student Model (KD Regression): one Ridge regressor per class probability
student = MultiOutputRegressor(Ridge(alpha=1.0))
student.fit(X_train_vec, list(y_train))

# Predictions: Ridge outputs are unconstrained, so clip them to be positive
# and renormalise each row to sum to 1
pred_probs = student.predict(X_val_vec)
pred_probs = np.clip(pred_probs, 1e-12, None)
pred_probs /= pred_probs.sum(axis=1, keepdims=True)

pred_classes = np.argmax(pred_probs, axis=1)

# `y_val.index` holds the index *labels* of the validation rows, so they must
# be looked up with `.loc`. `.iloc` is positional and only gave the right rows
# because `df` happens to carry a default 0..n-1 index.

# Accuracy vs teacher
teacher_val = df["teacher_pred"].loc[y_val.index].values
acc_vs_teacher = (pred_classes == teacher_val).mean()

# Accuracy vs true
true_val = df["true_label"].loc[y_val.index].values
acc_vs_true = (pred_classes == true_val).mean()

print("Soft-label TF-IDF Student Results")
print("Acc vs teacher:", acc_vs_teacher)
print("Acc vs true_label:", acc_vs_true)


Soft-label TF-IDF Student Results
Acc vs teacher: 0.6823
Acc vs true_label: 0.5949


In [None]:
# hard label 10000 dim tfidf
from sklearn.linear_model import LogisticRegression

# Same stratified 80/20 split as the soft-label cells, but the regression
# target is the teacher's arg-max class instead of its probability vector.
X_train, X_val, y_train, y_val = train_test_split(
    df["text"],
    df["teacher_pred"],     # Hard labels
    test_size=0.2,
    stratify=df["teacher_pred"],
    random_state=42,
)

# TF-IDF fitted on the training texts only
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
X_train_vec = tfidf.fit_transform(X_train)
X_val_vec   = tfidf.transform(X_val)

student_hard = LogisticRegression(max_iter=2000)
student_hard.fit(X_train_vec, y_train)

pred_hard = student_hard.predict(X_val_vec)

# `y_val.index` holds index *labels*, so select the validation rows with
# `.loc` (not the positional `.iloc`). Compare plain arrays so the result is
# an element-by-element match in validation order.
acc_vs_teacher = (pred_hard == df["teacher_pred"].loc[y_val.index].to_numpy()).mean()
acc_vs_true = (pred_hard == df["true_label"].loc[y_val.index].to_numpy()).mean()

print("Hard-label TF-IDF Student Results")
print("Acc vs teacher:", acc_vs_teacher)
print("Acc vs true_label:", acc_vs_true)


Hard-label TF-IDF Student Results
Acc vs teacher: 0.6814
Acc vs true_label: 0.5884


In [None]:
# hard label 50000 dim tfidf
from sklearn.linear_model import LogisticRegression

# Same stratified 80/20 split as the soft-label cells, but the target is the
# teacher's arg-max class instead of its probability vector.
X_train, X_val, y_train, y_val = train_test_split(
    df["text"],
    df["teacher_pred"],
    test_size=0.2,
    stratify=df["teacher_pred"],
    random_state=42,
)

# TF-IDF fitted on the training texts only
tfidf = TfidfVectorizer(max_features=50000, ngram_range=(1,2))
X_train_vec = tfidf.fit_transform(X_train)
X_val_vec   = tfidf.transform(X_val)

student_hard = LogisticRegression(max_iter=4000)
student_hard.fit(X_train_vec, y_train)

pred_hard = student_hard.predict(X_val_vec)

# `y_val.index` holds index *labels*, so select the validation rows with
# `.loc` (not the positional `.iloc`). Compare plain arrays so the result is
# an element-by-element match in validation order.
acc_vs_teacher = (pred_hard == df["teacher_pred"].loc[y_val.index].to_numpy()).mean()
acc_vs_true = (pred_hard == df["true_label"].loc[y_val.index].to_numpy()).mean()

print("Hard-label TF-IDF Student Results")
print("Acc vs teacher:", acc_vs_teacher)
print("Acc vs true_label:", acc_vs_true)


Hard-label TF-IDF Student Results
Acc vs teacher: 0.6837
Acc vs true_label: 0.5893


In [None]:
# hard label 200000 dim tfidf
# compare this with the result of soft label 200000-dim tfidf
from sklearn.linear_model import LogisticRegression

# Same stratified 80/20 split as the soft-label cells, but the target is the
# teacher's arg-max class instead of its probability vector.
X_train, X_val, y_train, y_val = train_test_split(
    df["text"],
    df["teacher_pred"],
    test_size=0.2,
    stratify=df["teacher_pred"],
    random_state=42,
)

# TF-IDF fitted on the training texts only
tfidf = TfidfVectorizer(max_features=200_000, ngram_range=(1,2))
X_train_vec = tfidf.fit_transform(X_train)
X_val_vec   = tfidf.transform(X_val)

student_hard = LogisticRegression(max_iter=2000)
student_hard.fit(X_train_vec, y_train)

pred_hard = student_hard.predict(X_val_vec)

# `y_val.index` holds index *labels*, so select the validation rows with
# `.loc` (not the positional `.iloc`). Compare plain arrays so the result is
# an element-by-element match in validation order.
acc_vs_teacher = (pred_hard == df["teacher_pred"].loc[y_val.index].to_numpy()).mean()
acc_vs_true = (pred_hard == df["true_label"].loc[y_val.index].to_numpy()).mean()

print("Hard-label TF-IDF Student Results")
print("Acc vs teacher:", acc_vs_teacher)
print("Acc vs true_label:", acc_vs_true)


Hard-label TF-IDF Student Results
Acc vs teacher: 0.678
Acc vs true_label: 0.5842


# 1 layer NN with soft labels

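A single linear layer followed by a softmax is equivalent to multinomial logistic regression; here it is trained on the teacher's soft labels instead of hard class labels.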

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset


# TF-IDF
# Fit the vocabulary and IDF weights on the training rows only. Fitting on
# all of df["text"] leaks validation documents into the features, unlike the
# scikit-learn student cells above, which fit on X_train only.
#
# The training rows are obtained with the same test_size / random_state /
# stratify arguments as the split in the next cell. For a fixed number of
# rows those arguments fully determine which rows land in each side, so the
# rows used for fitting here are exactly the rows that become X_train there.
# NOTE: keep the two calls in sync if either is changed.
from sklearn.model_selection import train_test_split

tfidf_fit_rows = train_test_split(
    np.arange(len(df)),
    test_size=0.2,
    random_state=42,
    stratify=df["teacher_pred"],
)[0]

tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
tfidf.fit(df["text"].iloc[tfidf_fit_rows])

# Transform every row (train and validation) with the train-fitted vectorizer.
X = tfidf.transform(df["text"]).astype("float32")
Y = np.stack(df["teacher_probs"].values).astype("float32")


In [None]:
from sklearn.model_selection import train_test_split

import numpy as np

indices = np.arange(len(df))

# Arrays split in lock-step: TF-IDF features, teacher soft labels, teacher
# hard labels, true labels, and the row positions themselves.
split_inputs = (
    X,
    Y,
    df["teacher_pred"].values,
    df["true_label"].values,
    indices,
)

# Stratified 80/20 split; each input yields a (train, val) pair in order.
(
    X_train, X_val,
    y_train, y_val,
    teacher_train, teacher_val,
    true_train, true_val,
    idx_train, idx_val,
) = train_test_split(
    *split_inputs,
    test_size=0.2,
    random_state=42,
    stratify=df["teacher_pred"],
)



In [None]:
# Prefer the GPU when available.
device = "cpu" if not torch.cuda.is_available() else "cuda"

# Densify the sparse TF-IDF matrices and place features and soft labels on
# the chosen device.
X_train_t = torch.tensor(data=X_train.toarray(), device=device)
y_train_t = torch.tensor(data=y_train, device=device)

X_val_t = torch.tensor(data=X_val.toarray(), device=device)
y_val_t = torch.tensor(data=y_val, device=device)

# Shuffled mini-batches of 256 (features, soft-label) pairs.
train_ds = TensorDataset(X_train_t, y_train_t)
train_dl = DataLoader(dataset=train_ds, batch_size=256, shuffle=True)


In [None]:
class StudentNN(nn.Module):
    """Single linear layer followed by a softmax over the output dimension.

    Args:
        input_dim: number of input features per example.
        output_dim: number of classes.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.fc = nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        """Return class probabilities; each row of the result sums to 1."""
        scores = self.fc(x)
        return scores.softmax(dim=1)

# Model dimensions come from the data: TF-IDF width in, class count out.
model = StudentNN(
    input_dim=X_train_t.shape[1],
    output_dim=y_train_t.shape[1],
).to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
# KL divergence against the teacher distribution; expects log-probabilities
# as its first argument.
loss_fn = nn.KLDivLoss(reduction="batchmean")


In [None]:
for epoch in range(10):
    model.train()
    running_loss = 0.0

    for xb, yb in train_dl:
        optimizer.zero_grad()
        # KLDivLoss expects log-probabilities. Apply log_softmax directly to
        # the linear layer's raw scores instead of torch.log(softmax(...)):
        # same value mathematically, but it cannot underflow to
        # log(0) = -inf and yield inf/NaN losses.
        out = torch.log_softmax(model.fc(xb), dim=1)
        loss = loss_fn(out, yb)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Report the mean loss over all mini-batches of the epoch, not just the
    # (noisy) loss of whichever batch happened to come last.
    print(f"Epoch {epoch+1} loss = {running_loss/len(train_dl):.4f}")


Epoch 1 loss = 0.7182
Epoch 2 loss = 0.6268
Epoch 3 loss = 0.6193
Epoch 4 loss = 0.5542
Epoch 5 loss = 0.4671
Epoch 6 loss = 0.4289
Epoch 7 loss = 0.4147
Epoch 8 loss = 0.4081
Epoch 9 loss = 0.3519
Epoch 10 loss = 0.3606


In [None]:
# Score the whole validation set in one forward pass, without gradients.
model.eval()
with torch.no_grad():
    pred_probs = model(X_val_t).cpu().numpy()

# Predicted class = most probable class per row.
pred_classes = pred_probs.argmax(axis=1)

# Agreement with the teacher's labels and with the true labels.
acc_vs_teacher = np.mean(pred_classes == teacher_val)
acc_vs_true    = np.mean(pred_classes == true_val)

print("Acc vs teacher:", acc_vs_teacher)
print("Acc vs true:", acc_vs_true)


Acc vs teacher: 0.6712
Acc vs true: 0.5824


# 2 layer NN, 20000-dim tf-idf



In [None]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
import numpy as np


# -----------------------------
# 1. Train/Val Split
# -----------------------------
# Split the row positions FIRST so that the TF-IDF vectorizer can be fitted
# on training text only. Fitting on all of df["text"] lets validation
# documents shape the vocabulary and IDF weights (data leakage). The split is
# the same one as before: it depends only on the number of rows, the
# stratification labels and the seed.
indices = np.arange(len(df))

idx_train, idx_val = train_test_split(
    indices,
    test_size=0.2,
    random_state=42,
    stratify=df["teacher_pred"]
)

# TF-IDF: fit on the training rows, then transform every row
tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1,2))
tfidf.fit(df["text"].iloc[idx_train])
X = tfidf.transform(df["text"]).astype("float32")
Y = np.stack(df["teacher_probs"].values).astype("float32")

teacher_pred_all = df["teacher_pred"].values     # teacher hard labels
true_label_all   = df["true_label"].values       # ground truth labels

X_train, X_val             = X[idx_train], X[idx_val]                              # TF-IDF features
y_train, y_val             = Y[idx_train], Y[idx_val]                              # teacher_probs (soft labels)
teacher_train, teacher_val = teacher_pred_all[idx_train], teacher_pred_all[idx_val]
true_train, true_val       = true_label_all[idx_train], true_label_all[idx_val]

# -----------------------------
# 2. Move dense TF-IDF to GPU
# -----------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

X_train_t = torch.tensor(X_train.toarray(), dtype=torch.float32, device=device)
y_train_t = torch.tensor(y_train,           dtype=torch.float32, device=device)

X_val_t   = torch.tensor(X_val.toarray(),   dtype=torch.float32, device=device)
y_val_t   = torch.tensor(y_val,             dtype=torch.float32, device=device)

train_ds = TensorDataset(X_train_t, y_train_t)
train_dl = DataLoader(train_ds, batch_size=256, shuffle=True)

# -----------------------------
# 3. Define Student Model (Medium)
# -----------------------------
class StudentNN_Medium(nn.Module):
    """One-hidden-layer MLP that outputs class probabilities.

    Architecture: Linear -> ReLU -> Dropout -> Linear, followed by a softmax
    over dimension 1.

    Args:
        input_dim: number of input features per example.
        output_dim: number of classes.
        hidden_dim: width of the hidden layer.
        dropout: dropout probability applied after the ReLU.
    """

    def __init__(self, input_dim, output_dim, hidden_dim=1024, dropout=0.1):
        super().__init__()
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, output_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return class probabilities; each row of the result sums to 1."""
        return self.net(x).softmax(dim=1)

# Initialize student
model = StudentNN_Medium(
    input_dim=X_train_t.shape[1],
    output_dim=y_train_t.shape[1],
    hidden_dim=1024
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.KLDivLoss(reduction="batchmean")

# -----------------------------
# 4. Training Loop
# -----------------------------
for epoch in range(10):
    model.train()
    running_loss = 0.0

    for xb, yb in train_dl:
        optimizer.zero_grad()

        # KLDivLoss expects log-probabilities. Apply log_softmax to the raw
        # scores of `model.net` instead of torch.log(softmax(...)): same
        # value mathematically, but it cannot underflow to log(0) = -inf.
        log_probs = torch.log_softmax(model.net(xb), dim=1)
        loss = loss_fn(log_probs, yb)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Mean loss over the epoch's mini-batches (previously only the last
    # batch's loss was printed, which is noisy).
    print(f"Epoch {epoch+1} | Loss = {running_loss/len(train_dl):.4f}")

# -----------------------------
# 5. Evaluation
# -----------------------------
model.eval()
with torch.no_grad():
    pred_probs = model(X_val_t).cpu().numpy()

pred_classes = np.argmax(pred_probs, axis=1)

teacher_val_lbls = teacher_val
true_val_lbls    = true_val

# Agreement with the teacher's labels and with the ground-truth labels.
acc_vs_teacher = (pred_classes == teacher_val_lbls).mean()
acc_vs_true    = (pred_classes == true_val_lbls).mean()

print("\n----------------------------------")
print("Evaluation Results")
print("----------------------------------")
print("Acc vs Teacher:", acc_vs_teacher)
print("Acc vs True:   ", acc_vs_true)


Using device: cuda
Epoch 1 | Loss = 0.2437
Epoch 2 | Loss = 0.1804
Epoch 3 | Loss = 0.1271
Epoch 4 | Loss = 0.0994
Epoch 5 | Loss = 0.0611
Epoch 6 | Loss = 0.0360
Epoch 7 | Loss = 0.0229
Epoch 8 | Loss = 0.0191
Epoch 9 | Loss = 0.0155
Epoch 10 | Loss = 0.0157

----------------------------------
Evaluation Results
----------------------------------
Acc vs Teacher: 0.6666
Acc vs True:    0.5806


# 2 layer NN, 10000-dim tf-idf


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
import numpy as np


# -----------------------------
# 1. Train/Val Split
# -----------------------------
# Split the row positions FIRST so that the TF-IDF vectorizer can be fitted
# on training text only. Fitting on all of df["text"] lets validation
# documents shape the vocabulary and IDF weights (data leakage). The split is
# the same one as before: it depends only on the number of rows, the
# stratification labels and the seed.
indices = np.arange(len(df))

idx_train, idx_val = train_test_split(
    indices,
    test_size=0.2,
    random_state=42,
    stratify=df["teacher_pred"]
)

# TF-IDF: fit on the training rows, then transform every row
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
tfidf.fit(df["text"].iloc[idx_train])
X = tfidf.transform(df["text"]).astype("float32")
Y = np.stack(df["teacher_probs"].values).astype("float32")

teacher_pred_all = df["teacher_pred"].values     # teacher hard labels
true_label_all   = df["true_label"].values       # ground truth labels

X_train, X_val             = X[idx_train], X[idx_val]                              # TF-IDF features
y_train, y_val             = Y[idx_train], Y[idx_val]                              # teacher_probs (soft labels)
teacher_train, teacher_val = teacher_pred_all[idx_train], teacher_pred_all[idx_val]
true_train, true_val       = true_label_all[idx_train], true_label_all[idx_val]

# -----------------------------
# 2. Move dense TF-IDF to GPU
# -----------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

X_train_t = torch.tensor(X_train.toarray(), dtype=torch.float32, device=device)
y_train_t = torch.tensor(y_train,           dtype=torch.float32, device=device)

X_val_t   = torch.tensor(X_val.toarray(),   dtype=torch.float32, device=device)
y_val_t   = torch.tensor(y_val,             dtype=torch.float32, device=device)

train_ds = TensorDataset(X_train_t, y_train_t)
train_dl = DataLoader(train_ds, batch_size=256, shuffle=True)

# -----------------------------
# 3. Define Student Model (Medium)
# -----------------------------
class StudentNN_Medium(nn.Module):
    """One-hidden-layer MLP that outputs class probabilities.

    Architecture: Linear -> ReLU -> Dropout -> Linear, followed by a softmax
    over dimension 1.

    Args:
        input_dim: number of input features per example.
        output_dim: number of classes.
        hidden_dim: width of the hidden layer.
        dropout: dropout probability applied after the ReLU.
    """

    def __init__(self, input_dim, output_dim, hidden_dim=1024, dropout=0.1):
        super().__init__()
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, output_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return class probabilities; each row of the result sums to 1."""
        return self.net(x).softmax(dim=1)

# Initialize student
model = StudentNN_Medium(
    input_dim=X_train_t.shape[1],
    output_dim=y_train_t.shape[1],
    hidden_dim=1024
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.KLDivLoss(reduction="batchmean")

# -----------------------------
# 4. Training Loop
# -----------------------------
for epoch in range(10):
    model.train()
    running_loss = 0.0

    for xb, yb in train_dl:
        optimizer.zero_grad()

        # KLDivLoss expects log-probabilities. Apply log_softmax to the raw
        # scores of `model.net` instead of torch.log(softmax(...)): same
        # value mathematically, but it cannot underflow to log(0) = -inf.
        log_probs = torch.log_softmax(model.net(xb), dim=1)
        loss = loss_fn(log_probs, yb)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Mean loss over the epoch's mini-batches (previously only the last
    # batch's loss was printed, which is noisy).
    print(f"Epoch {epoch+1} | Loss = {running_loss/len(train_dl):.4f}")

# -----------------------------
# 5. Evaluation
# -----------------------------
model.eval()
with torch.no_grad():
    pred_probs = model(X_val_t).cpu().numpy()

pred_classes = np.argmax(pred_probs, axis=1)

teacher_val_lbls = teacher_val
true_val_lbls    = true_val

# Agreement with the teacher's labels and with the ground-truth labels.
acc_vs_teacher = (pred_classes == teacher_val_lbls).mean()
acc_vs_true    = (pred_classes == true_val_lbls).mean()

print("\n----------------------------------")
print("Evaluation Results")
print("----------------------------------")
print("Acc vs Teacher:", acc_vs_teacher)
print("Acc vs True:   ", acc_vs_true)


Using device: cuda
Epoch 1 | Loss = 0.2533
Epoch 2 | Loss = 0.1738
Epoch 3 | Loss = 0.1725
Epoch 4 | Loss = 0.1964
Epoch 5 | Loss = 0.1072
Epoch 6 | Loss = 0.0936
Epoch 7 | Loss = 0.0626
Epoch 8 | Loss = 0.0589
Epoch 9 | Loss = 0.0480
Epoch 10 | Loss = 0.0207

----------------------------------
Evaluation Results
----------------------------------
Acc vs Teacher: 0.6754
Acc vs True:    0.5894


# 3-layer NN, 10000-dim tfidf

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
import numpy as np

# ---------------------------------------------------
# 1. Train/Val Split
# ---------------------------------------------------
# Split the row positions FIRST so that the TF-IDF vectorizer can be fitted
# on training text only. Fitting on all of df["text"] lets validation
# documents shape the vocabulary and IDF weights (data leakage). The split is
# the same one as before: it depends only on the number of rows, the
# stratification labels and the seed.
indices = np.arange(len(df))

idx_train, idx_val = train_test_split(
    indices,
    test_size=0.2,
    random_state=42,
    stratify=df["teacher_pred"]
)

# TF-IDF: fit on the training rows, then transform every row
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
tfidf.fit(df["text"].iloc[idx_train])
X = tfidf.transform(df["text"]).astype("float32")
Y = np.stack(df["teacher_probs"].values).astype("float32")

teacher_pred_all = df["teacher_pred"].values     # teacher predicted labels (hard labels)
true_label_all   = df["true_label"].values       # ground truth labels

X_train, X_val             = X[idx_train], X[idx_val]                              # TF-IDF matrix
y_train, y_val             = Y[idx_train], Y[idx_val]                              # teacher_probs (soft labels)
teacher_train, teacher_val = teacher_pred_all[idx_train], teacher_pred_all[idx_val]
true_train, true_val       = true_label_all[idx_train], true_label_all[idx_val]

# ---------------------------------------------------
# 2. Convert TF-IDF to dense GPU tensors
# ---------------------------------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

X_train_t = torch.tensor(X_train.toarray(), dtype=torch.float32, device=device)
y_train_t = torch.tensor(y_train,           dtype=torch.float32, device=device)

X_val_t   = torch.tensor(X_val.toarray(),   dtype=torch.float32, device=device)
y_val_t   = torch.tensor(y_val,             dtype=torch.float32, device=device)

train_ds = TensorDataset(X_train_t, y_train_t)
train_dl = DataLoader(train_ds, batch_size=256, shuffle=True)

# ---------------------------------------------------
# 3. Define 3-Layer Neural Network Student
# ---------------------------------------------------
class StudentNN_Large(nn.Module):
    """MLP student with two hidden layers that outputs class probabilities.

    Layout: Linear -> ReLU -> Dropout, twice, followed by a final Linear
    projection to ``output_dim``. ``forward`` applies a softmax over dim 1,
    so callers receive probabilities rather than logits.

    Args:
        input_dim: number of input features.
        output_dim: number of output classes.
        h1: width of the first hidden layer.
        h2: width of the second hidden layer.
        dropout: dropout probability used after each hidden activation.
    """

    def __init__(self, input_dim, output_dim, h1=2048, h2=1024, dropout=0.15):
        super().__init__()
        widths = [input_dim, h1, h2]
        stack = []
        # One (Linear, ReLU, Dropout) triple per hidden layer, in order.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.extend([nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(dropout)])
        stack.append(nn.Linear(widths[-1], output_dim))
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Return softmax probabilities (over dim 1) for the batch ``x``."""
        return torch.softmax(self.net(x), dim=1)

# Instantiate model
model = StudentNN_Large(
    input_dim=X_train_t.shape[1],
    output_dim=y_train_t.shape[1],
    h1=2048,
    h2=1024
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# KLDivLoss takes log-probabilities as input and probabilities as target.
loss_fn = nn.KLDivLoss(reduction="batchmean")

# ---------------------------------------------------
# 4. Training Loop
# ---------------------------------------------------
for epoch in range(10):
    model.train()
    running_loss = 0

    for xb, yb in train_dl:
        optimizer.zero_grad()

        # log_softmax on the raw logits (model.net) instead of
        # torch.log(model(xb)): if a softmax probability underflows to 0,
        # log() yields -inf and the loss/gradients become inf/NaN.
        log_preds = torch.log_softmax(model.net(xb), dim=1)
        loss = loss_fn(log_preds, yb)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch {epoch+1} | Loss = {running_loss/len(train_dl):.4f}")

# ---------------------------------------------------
# 5. Evaluation
# ---------------------------------------------------
model.eval()   # disables dropout for inference
with torch.no_grad():
    pred_probs = model(X_val_t).cpu().numpy()

pred_classes = np.argmax(pred_probs, axis=1)

teacher_val_lbls = teacher_val
true_val_lbls    = true_val

# Agreement with the teacher's hard predictions vs. accuracy on ground truth.
acc_vs_teacher = (pred_classes == teacher_val_lbls).mean()
acc_vs_true    = (pred_classes == true_val_lbls).mean()

print("\n===============================")
print(" Evaluation Results")
print("===============================")
print("Acc vs Teacher:", acc_vs_teacher)
print("Acc vs True:   ", acc_vs_true)


Using device: cuda
Epoch 1 | Loss = 0.3361
Epoch 2 | Loss = 0.1766
Epoch 3 | Loss = 0.1094
Epoch 4 | Loss = 0.0538
Epoch 5 | Loss = 0.0334
Epoch 6 | Loss = 0.0251
Epoch 7 | Loss = 0.0201
Epoch 8 | Loss = 0.0170
Epoch 9 | Loss = 0.0152
Epoch 10 | Loss = 0.0136

 Evaluation Results
Acc vs Teacher: 0.6956
Acc vs True:    0.6023


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# ================================
# 1. TRAIN / VALIDATION SPLIT
# ================================
# Rows are split by position first so that the TF-IDF vocabulary and IDF
# weights can be learned from the training rows only. Fitting the vectorizer
# on the whole frame would let validation text shape the features (leakage)
# and inflate the validation scores.
indices = np.arange(len(df))
Y = np.stack(df["teacher_probs"].values).astype("float32")   # teacher soft labels

(y_train, y_val,
 teacher_train, teacher_val,
 true_train, true_val,
 idx_train, idx_val) = train_test_split(
    Y,                              # teacher soft labels
    df["teacher_pred"].values,      # teacher hard labels
    df["true_label"].values,        # ground truth labels
    indices,
    test_size=0.2,
    random_state=42,
    stratify=df["teacher_pred"]
)

# ================================
# 0. TF-IDF FEATURE EXTRACTION
# ================================
# Fit on training rows only, then project every row into that feature space.
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
tfidf.fit(df["text"].iloc[idx_train])
X = tfidf.transform(df["text"]).astype("float32")

# Row order follows idx_train / idx_val, i.e. it is aligned with y_train / y_val.
X_train = X[idx_train]
X_val   = X[idx_val]

# ================================
# 2. MOVE DATA TO GPU (dense form)
# ================================
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# .toarray() densifies the sparse matrix before it is copied to `device`.
X_train_t = torch.tensor(X_train.toarray(), dtype=torch.float32, device=device)
y_train_t = torch.tensor(y_train,           dtype=torch.float32, device=device)

X_val_t   = torch.tensor(X_val.toarray(),   dtype=torch.float32, device=device)
y_val_t   = torch.tensor(y_val,             dtype=torch.float32, device=device)

train_ds = TensorDataset(X_train_t, y_train_t)
train_dl = DataLoader(train_ds, batch_size=256, shuffle=True)

# ================================
# 3. DEFINE 4-LAYER XL STUDENT NN
# ================================
class StudentNN_XL(nn.Module):
    """MLP student with three hidden layers that outputs class probabilities.

    Layout: Linear -> ReLU -> Dropout, three times, followed by a final
    Linear projection to ``output_dim``. ``forward`` applies a softmax over
    dim 1, so callers receive probabilities rather than logits.

    Args:
        input_dim: number of input features.
        output_dim: number of output classes.
        h1, h2, h3: widths of the three hidden layers, in order.
        dropout: dropout probability used after each hidden activation.
    """

    def __init__(self, input_dim, output_dim,
                 h1=2048, h2=1536, h3=1024, dropout=0.20):
        super().__init__()
        widths = [input_dim, h1, h2, h3]
        stack = []
        # One (Linear, ReLU, Dropout) triple per hidden layer, in order.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.extend([nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(dropout)])
        stack.append(nn.Linear(widths[-1], output_dim))
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Return softmax probabilities (over dim 1) for the batch ``x``."""
        return torch.softmax(self.net(x), dim=1)

# Instantiate the XL student
model = StudentNN_XL(
    input_dim=X_train_t.shape[1],
    output_dim=y_train_t.shape[1],
    h1=2048, h2=1536, h3=1024,
    dropout=0.20
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# KLDivLoss takes log-probabilities as input and probabilities as target.
loss_fn = nn.KLDivLoss(reduction="batchmean")

# ================================
# 4. TRAINING LOOP
# ================================
for epoch in range(20):
    model.train()
    running_loss = 0

    for xb, yb in train_dl:
        optimizer.zero_grad()

        # log_softmax on the raw logits (model.net) instead of
        # torch.log(model(xb)): if a softmax probability underflows to 0,
        # log() yields -inf and the loss/gradients become inf/NaN.
        log_preds = torch.log_softmax(model.net(xb), dim=1)
        loss = loss_fn(log_preds, yb)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch {epoch+1} | Loss = {running_loss/len(train_dl):.4f}")

# ================================
# 5. EVALUATION
# ================================
model.eval()   # disables dropout for inference
with torch.no_grad():
    pred_probs = model(X_val_t).cpu().numpy()

pred_classes = np.argmax(pred_probs, axis=1)

teacher_val_lbls = teacher_val
true_val_lbls    = true_val

# Agreement with the teacher's hard predictions vs. accuracy on ground truth.
acc_vs_teacher = (pred_classes == teacher_val_lbls).mean()
acc_vs_true    = (pred_classes == true_val_lbls).mean()

print("\n===============================")
print(" Evaluation Results")
print("===============================")
print("Acc vs Teacher:", acc_vs_teacher)
print("Acc vs True:   ", acc_vs_true)


Using device: cuda
Epoch 1 | Loss = 0.3404
Epoch 2 | Loss = 0.1752
Epoch 3 | Loss = 0.0955
Epoch 4 | Loss = 0.0564
Epoch 5 | Loss = 0.0381
Epoch 6 | Loss = 0.0278
Epoch 7 | Loss = 0.0216
Epoch 8 | Loss = 0.0182
Epoch 9 | Loss = 0.0162
Epoch 10 | Loss = 0.0145
Epoch 11 | Loss = 0.0132
Epoch 12 | Loss = 0.0127
Epoch 13 | Loss = 0.0120
Epoch 14 | Loss = 0.0115
Epoch 15 | Loss = 0.0112
Epoch 16 | Loss = 0.0108
Epoch 17 | Loss = 0.0104
Epoch 18 | Loss = 0.0102
Epoch 19 | Loss = 0.0099
Epoch 20 | Loss = 0.0096

 Evaluation Results
Acc vs Teacher: 0.7037
Acc vs True:    0.6044


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    get_linear_schedule_with_warmup
)

from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm

# ======================================================
# 1. Train/Val Split
# ======================================================
# All per-row arrays go through one train_test_split call so they stay
# aligned; the split is stratified on the teacher's hard predictions.
indices = np.arange(len(df))

(
    X_train, X_val,                 # raw review text
    y_train, y_val,                 # teacher soft labels
    teacher_train, teacher_val,     # teacher hard labels
    true_train, true_val,           # ground-truth labels
    idx_train, idx_val,             # row positions in df
) = train_test_split(
    df["text"].values,
    np.stack(df["teacher_probs"].values),
    df["teacher_pred"].values,
    df["true_label"].values,
    indices,
    stratify=df["teacher_pred"],
    test_size=0.2,
    random_state=42,
)

# ======================================================
# 2. Tokenizer and Dataset Class
# ======================================================
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

class KDTextDataset(Dataset):
    """Map-style dataset pairing raw texts with teacher soft labels.

    Each item is tokenized on access and returned as a dict with keys
    ``input_ids``, ``attention_mask`` and ``teacher_probs``.

    Args:
        texts: indexable sequence of strings.
        teacher_probs: indexable sequence of per-class probability vectors,
            aligned with ``texts``.
        tokenizer: callable used to encode one text. When ``None`` (default)
            the module-level ``tokenizer`` is looked up at access time, which
            keeps ``KDTextDataset(texts, probs)`` working unchanged.
        max_length: ``max_length`` passed to the tokenizer (with
            ``padding="max_length"`` and ``truncation=True``).

    Raises:
        ValueError: if ``texts`` and ``teacher_probs`` differ in length.
    """

    def __init__(self, texts, teacher_probs, tokenizer=None, max_length=128):
        if len(texts) != len(teacher_probs):
            raise ValueError(
                f"texts and teacher_probs must have the same length, "
                f"got {len(texts)} and {len(teacher_probs)}"
            )
        self.texts = texts
        self.teacher_probs = teacher_probs  # soft labels from teacher
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Use the injected tokenizer if given; otherwise fall back to the
        # notebook-level `tokenizer` global.
        encode = self.tokenizer if self.tokenizer is not None else tokenizer
        enc = encode(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            # return_tensors="pt" adds a leading batch dim of 1; drop it so
            # the DataLoader can stack items itself.
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "teacher_probs": torch.tensor(self.teacher_probs[idx], dtype=torch.float32)
        }

train_ds = KDTextDataset(X_train, y_train)
val_ds   = KDTextDataset(X_val, y_val)

train_dl = DataLoader(train_ds, batch_size=16, shuffle=True)
val_dl   = DataLoader(val_ds, batch_size=16)

# ======================================================
# 3. Define DistilBERT Student Model for KD
# ======================================================
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=5
).to(device)

# nn.KLDivLoss(input=log q, target=p) evaluates KL(p || q). With the teacher
# distribution passed as the target this is KL(teacher || student).
kd_loss_fn = nn.KLDivLoss(reduction="batchmean")

optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 2
total_steps = len(train_dl) * num_epochs

# Linear warmup over the first 10% of steps, then linear decay.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps
)

# ======================================================
# 4. Training Loop with KD Loss
# ======================================================
for epoch in range(num_epochs):
    model.train()
    running_loss = 0

    pbar = tqdm(train_dl, desc=f"Epoch {epoch+1}")

    for batch in pbar:
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        teacher_probs = batch["teacher_probs"].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # KD loss: KL(teacher || student). log_softmax is computed directly
        # from the logits, which is numerically stable and needs no epsilon
        # (unlike log(softmax(...) + 1e-12)).
        student_log_probs = torch.log_softmax(logits, dim=-1)
        loss = kd_loss_fn(student_log_probs, teacher_probs)

        loss.backward()
        optimizer.step()
        scheduler.step()   # schedule is defined per optimizer step, not per epoch

        running_loss += loss.item()
        pbar.set_postfix({"loss": loss.item()})

    print(f"Epoch {epoch+1} | Mean Loss = {running_loss/len(train_dl):.4f}")

# ======================================================
# 5. Evaluation
# ======================================================
model.eval()
all_preds = []

with torch.no_grad():
    for batch in val_dl:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        probs = torch.softmax(logits, dim=-1).cpu().numpy()
        all_preds.append(probs)

# val_dl is not shuffled, so stacked rows line up with teacher_val / true_val.
pred_probs = np.vstack(all_preds)
pred_classes = np.argmax(pred_probs, axis=1)

teacher_val_lbls = teacher_val
true_val_lbls = true_val

acc_vs_teacher = (pred_classes == teacher_val_lbls).mean()
acc_vs_true    = (pred_classes == true_val_lbls).mean()

print("\n===============================")
print(" DistilBERT KD Evaluation")
print("===============================")
print("Acc vs Teacher:", acc_vs_teacher)
print("Acc vs True:   ", acc_vs_true)


Using device: cuda


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1:   0%|          | 0/2500 [00:00<?, ?it/s]

Epoch 1 | Mean Loss = 0.2722


Epoch 2:   0%|          | 0/2500 [00:00<?, ?it/s]

Epoch 2 | Mean Loss = 0.1050

 DistilBERT KD Evaluation
Acc vs Teacher: 0.8063
Acc vs True:    0.6193


# Hard Label

In [None]:
# hard label 2000 dim tfidf
# compare this with the result of soft label 200000-dim tfidf
# NOTE(review): "200000-dim" may be a typo for the soft-label run this is
# meant to be compared against — confirm.
from sklearn.linear_model import LogisticRegression

# Split raw text (not features) so the vectorizer can be fit on train only.
X_train, X_val, y_train, y_val = train_test_split(
    df["text"],
    df["teacher_pred"],
    test_size=0.2,
    stratify=df["teacher_pred"],
    random_state=42,
)

tfidf = TfidfVectorizer(max_features=2000, ngram_range=(1,2))
X_train_vec = tfidf.fit_transform(X_train)
X_val_vec   = tfidf.transform(X_val)

# Student is trained on the teacher's hard predictions.
student_hard = LogisticRegression(max_iter=2000)
student_hard.fit(X_train_vec, y_train)

pred_hard = student_hard.predict(X_val_vec)

# y_val already holds the teacher labels for the validation rows. True labels
# are looked up with .loc because y_val.index contains index *labels*;
# .iloc would treat them as positions, which is only right for a default
# RangeIndex.
acc_vs_teacher = (pred_hard == y_val.values).mean()
acc_vs_true = (pred_hard == df.loc[y_val.index, "true_label"].values).mean()

print("Hard-label TF-IDF Student Results")
print("Acc vs teacher:", acc_vs_teacher)
print("Acc vs true_label:", acc_vs_true)


Hard-label TF-IDF Student Results
Acc vs teacher: 0.6616
Acc vs true_label: 0.5702


# 3-layer NN + 10000-dim TF-IDF

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import numpy as np

# ============================================================
# 1. LABEL ARRAYS
# ============================================================
Y = np.stack(df["teacher_probs"].values).astype("float32")   # soft labels (not used for training here)
teacher_hard = df["teacher_pred"].values.astype("int64")
true_labels  = df["true_label"].values.astype("int64")

# ============================================================
# 2. TRAIN/VAL SPLIT
# ============================================================
# Rows are split by position first so that the TF-IDF vocabulary and IDF
# weights can be learned from the training rows only. Fitting the vectorizer
# on the whole frame would let validation text shape the features (leakage)
# and inflate the validation scores.
indices = np.arange(len(df))

(y_train_hard, y_val_hard,
 true_train, true_val,
 idx_train, idx_val) = train_test_split(
    teacher_hard,     # TRAIN ON HARD LABELS ONLY
    true_labels,
    indices,
    test_size=0.2,
    random_state=42,
    stratify=teacher_hard
)

# ============================================================
# 2b. TF-IDF VECTORIZATION (fit on training rows only)
# ============================================================
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
tfidf.fit(df["text"].iloc[idx_train])
X = tfidf.transform(df["text"]).astype("float32")

# Row order follows idx_train / idx_val, i.e. it is aligned with the labels.
X_train = X[idx_train]
X_val   = X[idx_val]

# ============================================================
# 3. CONVERT TO TORCH TENSORS
# ============================================================
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# .toarray() densifies the sparse matrix before it is copied to `device`.
X_train_t = torch.tensor(X_train.toarray(), dtype=torch.float32, device=device)
X_val_t   = torch.tensor(X_val.toarray(),   dtype=torch.float32, device=device)

# CrossEntropyLoss takes class indices as integer (long) targets.
y_train_t = torch.tensor(y_train_hard, dtype=torch.long, device=device)
y_val_t   = torch.tensor(y_val_hard,   dtype=torch.long, device=device)

train_ds = TensorDataset(X_train_t, y_train_t)
train_dl = DataLoader(train_ds, batch_size=256, shuffle=True)

# ============================================================
# 4. DEFINE 3-LAYER HARD-LABEL NN STUDENT
# ============================================================
class StudentNN_3Layer(nn.Module):
    """MLP student with two hidden layers that outputs raw logits.

    Layout: Linear -> ReLU -> Dropout, twice, followed by a final Linear
    projection to ``output_dim``. No softmax is applied in ``forward``.

    Args:
        input_dim: number of input features.
        output_dim: number of output classes.
        h1: width of the first hidden layer.
        h2: width of the second hidden layer.
        dropout: dropout probability used after each hidden activation.
    """

    def __init__(self, input_dim, output_dim, h1=2048, h2=1024, dropout=0.1):
        super().__init__()
        widths = [input_dim, h1, h2]
        stack = []
        # One (Linear, ReLU, Dropout) triple per hidden layer, in order.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.extend([nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(dropout)])
        stack.append(nn.Linear(widths[-1], output_dim))   # logits
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Return unnormalized class scores (logits) for the batch ``x``."""
        logits = self.net(x)
        return logits

# One output unit per distinct teacher label.
model = StudentNN_3Layer(
    input_dim=X_train_t.shape[1],
    output_dim=np.unique(teacher_hard).size,
    h1=2048,
    h2=1024,
    dropout=0.15
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()   # takes raw logits and integer class targets

# ============================================================
# 5. TRAINING LOOP
# ============================================================
for epoch in range(20):
    model.train()
    running_loss = 0

    for batch_x, batch_y in train_dl:
        optimizer.zero_grad()
        loss = loss_fn(model(batch_x), batch_y)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    mean_loss = running_loss / len(train_dl)   # average over mini-batches
    print(f"Epoch {epoch+1} | Loss = {mean_loss:.4f}")

# ============================================================
# 6. EVALUATION
# ============================================================
model.eval()   # disables dropout for inference
with torch.no_grad():
    logits_val = model(X_val_t)
    pred_classes = logits_val.argmax(dim=1).cpu().numpy()

# teacher & true label arrays
teacher_val_lbls = y_val_hard
true_val_lbls    = true_val

# Agreement with the teacher's hard predictions vs. accuracy on ground truth.
acc_vs_teacher = np.mean(pred_classes == teacher_val_lbls)
acc_vs_true    = np.mean(pred_classes == true_val_lbls)

print(
    "\n===============================",
    "   HARD LABEL – 3 Layer NN",
    "===============================",
    sep="\n",
)
print("Acc vs Teacher:", acc_vs_teacher)
print("Acc vs True:   ", acc_vs_true)


Using device: cuda
Epoch 1 | Loss = 0.8737
Epoch 2 | Loss = 0.5539
Epoch 3 | Loss = 0.3250
Epoch 4 | Loss = 0.0944
Epoch 5 | Loss = 0.0191
Epoch 6 | Loss = 0.0064
Epoch 7 | Loss = 0.0039
Epoch 8 | Loss = 0.0028
Epoch 9 | Loss = 0.0025
Epoch 10 | Loss = 0.0016
Epoch 11 | Loss = 0.0015
Epoch 12 | Loss = 0.0013
Epoch 13 | Loss = 0.0013
Epoch 14 | Loss = 0.0012
Epoch 15 | Loss = 0.0012
Epoch 16 | Loss = 0.0008
Epoch 17 | Loss = 0.0008
Epoch 18 | Loss = 0.0006
Epoch 19 | Loss = 0.0008
Epoch 20 | Loss = 0.0007

   HARD LABEL – 3 Layer NN
Acc vs Teacher: 0.6533
Acc vs True:    0.5746


# 4-layer NN + 10000-dim TF-IDF

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import numpy as np

# ============================================================
# 1. LABEL ARRAYS
# ============================================================
teacher_probs = np.stack(df["teacher_probs"].values).astype("float32")   # soft labels (not used for training here)
teacher_hard  = df["teacher_pred"].values.astype("int64")
true_labels   = df["true_label"].values.astype("int64")

# ============================================================
# 2. TRAIN/VAL SPLIT
# ============================================================
# Rows are split by position first so that the TF-IDF vocabulary and IDF
# weights can be learned from the training rows only. Fitting the vectorizer
# on the whole frame would let validation text shape the features (leakage)
# and inflate the validation scores.
indices = np.arange(len(df))

(y_train_hard, y_val_hard,
 true_train, true_val,
 idx_train, idx_val) = train_test_split(
    teacher_hard,
    true_labels,
    indices,
    test_size=0.2,
    random_state=42,
    stratify=teacher_hard
)

# ============================================================
# 2b. TF-IDF VECTORIZATION (fit on training rows only)
# ============================================================
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
tfidf.fit(df["text"].iloc[idx_train])
X = tfidf.transform(df["text"]).astype("float32")

# Row order follows idx_train / idx_val, i.e. it is aligned with the labels.
X_train = X[idx_train]
X_val   = X[idx_val]

# ============================================================
# 3. CONVERT TO TORCH TENSORS
# ============================================================
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# .toarray() densifies the sparse matrix before it is copied to `device`.
X_train_t = torch.tensor(X_train.toarray(), dtype=torch.float32, device=device)
X_val_t   = torch.tensor(X_val.toarray(),   dtype=torch.float32, device=device)

# CrossEntropyLoss takes class indices as integer (long) targets.
y_train_t = torch.tensor(y_train_hard, dtype=torch.long, device=device)
y_val_t   = torch.tensor(y_val_hard,   dtype=torch.long, device=device)

train_ds = TensorDataset(X_train_t, y_train_t)
train_dl = DataLoader(train_ds, batch_size=256, shuffle=True)

# ============================================================
# 4. DEFINE 4-LAYER HARD-LABEL NN STUDENT
# ============================================================
class StudentNN_4Layer(nn.Module):
    """MLP student with three hidden layers that outputs raw logits.

    Layout: Linear -> ReLU -> Dropout, three times, followed by a final
    Linear projection to ``output_dim``. No softmax is applied in ``forward``.

    Args:
        input_dim: number of input features.
        output_dim: number of output classes.
        h1, h2, h3: widths of the three hidden layers, in order.
        dropout: dropout probability used after each hidden activation.
    """

    def __init__(self, input_dim, output_dim, h1=4096, h2=2048, h3=1024, dropout=0.15):
        super().__init__()
        widths = [input_dim, h1, h2, h3]
        stack = []
        # One (Linear, ReLU, Dropout) triple per hidden layer, in order.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.extend([nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(dropout)])
        stack.append(nn.Linear(widths[-1], output_dim))   # logits
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Return unnormalized class scores (logits) for the batch ``x``."""
        logits = self.net(x)
        return logits


# One output unit per distinct teacher label.
model = StudentNN_4Layer(
    input_dim=X_train_t.shape[1],
    output_dim=np.unique(teacher_hard).size,
    h1=4096,
    h2=2048,
    h3=1024,
    dropout=0.15
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()   # takes raw logits and integer class targets

# ============================================================
# 5. TRAINING LOOP
# ============================================================
for epoch in range(20):
    model.train()
    running_loss = 0

    for batch_x, batch_y in train_dl:
        optimizer.zero_grad()
        loss = loss_fn(model(batch_x), batch_y)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    mean_loss = running_loss / len(train_dl)   # average over mini-batches
    print(f"Epoch {epoch+1} | Loss = {mean_loss:.4f}")

# ============================================================
# 6. EVALUATION
# ============================================================
model.eval()   # disables dropout for inference
with torch.no_grad():
    logits_val = model(X_val_t)
    pred_classes = torch.argmax(logits_val, dim=1).cpu().numpy()

teacher_val_lbls = y_val_hard
true_val_lbls    = true_val

# Agreement with the teacher's hard predictions vs. accuracy on ground truth.
acc_vs_teacher = np.mean(pred_classes == teacher_val_lbls)
acc_vs_true    = np.mean(pred_classes == true_val_lbls)

print(
    "\n===============================",
    "       HARD LABEL – 4 Layer NN",
    "===============================",
    sep="\n",
)
print("Acc vs Teacher:", acc_vs_teacher)
print("Acc vs True:   ", acc_vs_true)


Using device: cuda
Epoch 1 | Loss = 0.8775
Epoch 2 | Loss = 0.5426
Epoch 3 | Loss = 0.2245
Epoch 4 | Loss = 0.0798
Epoch 5 | Loss = 0.0284
Epoch 6 | Loss = 0.0186
Epoch 7 | Loss = 0.0164
Epoch 8 | Loss = 0.0171
Epoch 9 | Loss = 0.0158
Epoch 10 | Loss = 0.0176
Epoch 11 | Loss = 0.0152
Epoch 12 | Loss = 0.0122
Epoch 13 | Loss = 0.0126
Epoch 14 | Loss = 0.0180
Epoch 15 | Loss = 0.0166
Epoch 16 | Loss = 0.0112
Epoch 17 | Loss = 0.0080
Epoch 18 | Loss = 0.0095
Epoch 19 | Loss = 0.0085
Epoch 20 | Loss = 0.0100

       HARD LABEL – 4 Layer NN
Acc vs Teacher: 0.6552
Acc vs True:    0.5744


In [None]:
!pip install evaluate


Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    get_linear_schedule_with_warmup
)
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm

# ======================================================
# 1. Train/Val Split (HARD LABELS)
# ======================================================
# All per-row arrays go through one train_test_split call so they stay
# aligned; the split is stratified on the teacher's hard predictions.
indices = np.arange(len(df))

# Here y_train / y_val are teacher hard labels, NOT probs
(
    X_train, X_val,             # raw text
    y_train, y_val,             # teacher hard labels (ints 0–4)
    true_train, true_val,       # ground truth labels
    idx_train, idx_val,         # row positions in df
) = train_test_split(
    df["text"].values,
    df["teacher_pred"].values,
    df["true_label"].values,
    indices,
    stratify=df["teacher_pred"],
    test_size=0.2,
    random_state=42,
)

# Cast every label array to int64.
y_train, y_val = (labels.astype("int64") for labels in (y_train, y_val))
true_train, true_val = (labels.astype("int64") for labels in (true_train, true_val))

# ======================================================
# 2. Tokenizer and Dataset Class (HARD LABELS)
# ======================================================
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

class HardLabelTextDataset(Dataset):
    """Map-style dataset pairing raw texts with integer class labels.

    Each item is tokenized on access and returned as a dict with keys
    ``input_ids``, ``attention_mask`` and ``labels``.

    Args:
        texts: indexable sequence of strings.
        labels: indexable sequence of integer class labels, aligned with
            ``texts``.
        tokenizer: callable used to encode one text. When ``None`` (default)
            the module-level ``tokenizer`` is looked up at access time, which
            keeps ``HardLabelTextDataset(texts, labels)`` working unchanged.
        max_length: ``max_length`` passed to the tokenizer (with
            ``padding="max_length"`` and ``truncation=True``).

    Raises:
        ValueError: if ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer=None, max_length=128):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels  # hard labels from teacher
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Use the injected tokenizer if given; otherwise fall back to the
        # notebook-level `tokenizer` global.
        encode = self.tokenizer if self.tokenizer is not None else tokenizer
        enc = encode(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            # return_tensors="pt" adds a leading batch dim of 1; drop it so
            # the DataLoader can stack items itself.
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }

train_ds = HardLabelTextDataset(X_train, y_train)
val_ds   = HardLabelTextDataset(X_val,   y_val)

# Only the training loader shuffles; validation order must match y_val / true_val.
train_dl = DataLoader(train_ds, batch_size=16, shuffle=True)
val_dl   = DataLoader(val_ds,   batch_size=16)

# ======================================================
# 3. Define DistilBERT Student Model for HARD LABELS
# ======================================================
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=5
).to(device)

ce_loss_fn = nn.CrossEntropyLoss()

optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 2   # you can increase to 3 if time allows
total_steps = len(train_dl) * num_epochs

# Linear warmup over the first 10% of steps, then linear decay.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps
)

# ======================================================
# 4. Training Loop with HARD-LABEL CE Loss
# ======================================================
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    pbar = tqdm(train_dl, desc=f"[HARD] Epoch {epoch+1}")

    for batch in pbar:
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Forward pass -> logits of shape [B, num_labels]
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

        # Hard-label loss: CrossEntropy between logits and teacher hard labels
        loss = ce_loss_fn(logits, labels)

        loss.backward()
        optimizer.step()
        scheduler.step()   # schedule is defined per optimizer step, not per epoch

        step_loss = loss.item()
        running_loss += step_loss
        pbar.set_postfix({"loss": step_loss})

    mean_loss = running_loss / len(train_dl)
    print(f"[HARD] Epoch {epoch+1} | Mean Loss = {mean_loss:.4f}")

# ======================================================
# 5. Evaluation
# ======================================================
model.eval()
all_preds = []

with torch.no_grad():
    for batch in val_dl:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        probs = torch.softmax(logits, dim=-1).cpu().numpy()
        all_preds.append(probs)

# Batches are stacked in loader order, so rows line up with y_val / true_val.
pred_probs = np.vstack(all_preds)
pred_classes = pred_probs.argmax(axis=1)

teacher_val_lbls = y_val          # teacher hard labels on val split
true_val_lbls    = true_val       # true labels on val split

acc_vs_teacher = np.mean(pred_classes == teacher_val_lbls)
acc_vs_true    = np.mean(pred_classes == true_val_lbls)

print(
    "\n===============================",
    " DistilBERT Hard-Label Evaluation",
    "===============================",
    sep="\n",
)
print("Acc vs Teacher:", acc_vs_teacher)
print("Acc vs True:   ", acc_vs_true)


Using device: cuda


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[HARD] Epoch 1:   0%|          | 0/2500 [00:00<?, ?it/s]

[HARD] Epoch 1 | Mean Loss = 0.7849


[HARD] Epoch 2:   0%|          | 0/2500 [00:00<?, ?it/s]

[HARD] Epoch 2 | Mean Loss = 0.4210

 DistilBERT Hard-Label Evaluation
Acc vs Teacher: 0.7947
Acc vs True:    0.6111
