## Packages & Imports

In [1]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.4-py3-none-any.whl (247 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m247.0/247.0 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.4 colorlog-6.9.0 optuna-4.5.0


In [3]:
!pip install transformers seaborn optuna wandb



In [11]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5


In [2]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn, optim
from sklearn.model_selection import train_test_split
from pathlib import Path
from sklearn.metrics import (
    precision_score, recall_score, f1_score, accuracy_score,
)

import optuna
import wandb

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [None]:
os.environ["WANDB_PROJECT"]   = "covid19-tweets-sentiment-roberta-ex5-EDA_new"
os.environ["WANDB_WATCH"]     = "gradients"
os.environ["WANDB_LOG_MODEL"] = "end"
wandb.login(key="60786c7e57091e6727aabddb0ba2af1cd90b58be")

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msaarshyovitz[0m ([33msaarshyovitz-tel-aviv-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## Data & Model initialize

In [4]:
csv_path_train = "train_cleaned.csv"

# Use the Python engine and skip any malformed lines
df_train = pd.read_csv(
    csv_path_train,
    encoding="latin-1",
    engine="python",
    on_bad_lines="skip"    # drop lines with unmatched quotes or other parse errors
)

# Show the first 5 rows
print(df_train.head())

   UserName  ScreenName   Location     TweetAt  \
0      3799       48751     London  16-03-2020   
1      3800       48752         UK  16-03-2020   
2      3801       48753  Vagabonds  16-03-2020   
3      3802       48754        NaN  16-03-2020   
4      3803       48755        NaN  16-03-2020   

                                       OriginalTweet           Sentiment  \
0  @MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...             Neutral   
1  advice Talk to your neighbours family to excha...            Positive   
2  Coronavirus Australia: Woolworths to give elde...            Positive   
3  My food stock is not the only one which is emp...            Positive   
4  Me, ready to go at supermarket during the #COV...  Extremely Negative   

                                          clean_text language  
0           @user @user @user http and http and http       en  
1  advice Talk to your neighbours family to excha...       en  
2  Coronavirus Australia: Woolworths to give elde.

In [5]:
# Use the Python engine and skip any malformed lines
csv_path_test = "test_cleaned.csv"
df_test = pd.read_csv(
    csv_path_test,
    encoding="latin-1",
    engine="python",
    on_bad_lines="skip"    # drop lines with unmatched quotes or other parse errors
)

# Show the first 5 rows
print(df_test.head())

   UserName  ScreenName             Location     TweetAt  \
0         1       44953                  NYC  02-03-2020   
1         2       44954          Seattle, WA  02-03-2020   
2         3       44955                  NaN  02-03-2020   
3         4       44956          Chicagoland  02-03-2020   
4         5       44957  Melbourne, Victoria  03-03-2020   

                                       OriginalTweet           Sentiment  \
0  TRENDING: New Yorkers encounter empty supermar...  Extremely Negative   
1  When I couldn't find hand sanitizer at Fred Me...            Positive   
2  Find out how you can protect yourself and love...  Extremely Positive   
3  #Panic buying hits #NewYork City as anxious sh...            Negative   
4  #toiletpaper #dunnypaper #coronavirus #coronav...             Neutral   

                                          clean_text language  
0  TRENDING: New Yorkers encounter empty supermar...       en  
1  When I couldn't find hand sanitizer at Fred Me...  

In [6]:
from sklearn.model_selection import train_test_split

# Split 10% of df_train into validation set
df_train, df_val = train_test_split(
    df_train,
    test_size=0.1,               # 10% of df_train goes to validation
    random_state=42,             # For reproducibility
    stratify=df_train['Sentiment']   # Preserve class distribution
)

# Save to CSV
df_train.to_csv('train_data.csv', index=False)
df_val.to_csv('val_data.csv', index=False)
df_test.to_csv('test_data.csv', index=False)

print(len(df_train))
print(len(df_val))
print(len(df_test))

29384
3265
3788


In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-large")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [8]:
import pandas as pd
from datasets import Dataset, DatasetDict, Features, ClassLabel, Value


# --- Robust CSV read (tweets often have stray quotes/newlines) ---
def read_csv_robust(path):
    return pd.read_csv(
        path,
        encoding="latin-1",
        engine="python",
        on_bad_lines="skip"  # if you prefer to keep all rows, we can repair instead of skipping
    )

train_df = df_train
test_df  = df_test

# --- Keep only needed columns and rename to the desired schema ---
# Text column in this dataset is 'clean_text'; label column is 'Sentiment'
train_df = train_df.rename(columns={"clean_text": "text", "Sentiment": "label"})[["text", "label"]]
test_df  = test_df.rename(columns={"clean_text": "text", "Sentiment": "label"})[["text", "label"]]

# --- Build label names directly from the CSV (union of train+test) ---
def label_names_from_frames(*dfs):
    seen = []
    canonical = ["Extremely Negative","Negative", "Neutral", "Positive", "Extremely Positive"]
    # collect in canonical order if present, then append any unexpected labels
    present = []
    for df in dfs:
        for s in df["label"].dropna().astype(str).unique().tolist():
            if s not in present:
                present.append(s)
    ordered = [x for x in canonical if x in present] + [x for x in present if x not in canonical]
    return ordered

label_names = label_names_from_frames(train_df, test_df)
label2id = {name: i for i, name in enumerate(label_names)}

# Map string labels -> integer ids using the names we just derived
train_df["label"] = train_df["label"].map(label2id).astype("int64")
test_df["label"]  = test_df["label"].map(label2id).astype("int64")

# --- Define features so 'label' is a proper ClassLabel with the right names ---
features = Features({
    "text":  Value("string"),
    "label": ClassLabel(names=label_names)
})

# --- Build DatasetDict ---
train_ds = Dataset.from_pandas(train_df, preserve_index=False, features=features)
test_ds  = Dataset.from_pandas(test_df,  preserve_index=False, features=features)
dataset  = DatasetDict({"train": train_ds, "test": test_ds})

print(dataset)
print("Label names:", dataset["train"].features["label"].names)
print("id2label:", {i: n for i, n in enumerate(dataset["train"].features["label"].names)})


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 29384
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 3788
    })
})
Label names: ['Extremely Negative', 'Negative', 'Neutral', 'Positive', 'Extremely Positive']
id2label: {0: 'Extremely Negative', 1: 'Negative', 2: 'Neutral', 3: 'Positive', 4: 'Extremely Positive'}


In [9]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True,max_length=64)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/29384 [00:00<?, ? examples/s]

Map:   0%|          | 0/3788 [00:00<?, ? examples/s]

In [10]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("roberta-large", num_labels=5)

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

Downloading builder script: 0.00B [00:00, ?B/s]

In [13]:
def compute_metrics(eval_pred):
    metric = evaluate.load("accuracy")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [14]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="covid19-tweets-sentiment-roberta-ex5-EDA_new" , eval_strategy="epoch")


## Train model

In [None]:
# פרמטרי אימון
training_args = TrainingArguments(
    output_dir="covid19-tweets-sentiment-roberta-ex5-EDA_new",
    eval_strategy="epoch",        # הערכה בסוף כל אפוק
    save_strategy="epoch",              # שמירה בסוף כל אפוק
    num_train_epochs=14,                # עד 20 אפוקים
    per_device_train_batch_size=8,     # batch size לאימון
    per_device_eval_batch_size=8,      # batch size להערכה
    learning_rate=0.00015,                 # LR אופטימלי
    weight_decay=0.00014,                 # regularization
    logging_strategy="steps",
    logging_steps=100,                  # לוג כל 100 באצ’ים
    load_best_model_at_end=True,        # שומר את המודל עם ה־accuracy הכי גבוה
    metric_for_best_model="accuracy",
    save_total_limit=6,                 # ישמור רק שש checkpoints
    report_to="wandb",
    run_name="roberta_best_run",
)


In [None]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

In [None]:
# יצירת Trainer עם Early Stopping
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.5935,1.580876,0.249736
2,1.5973,1.652218,0.249736
3,1.5781,1.598545,0.249736
4,1.6,1.582158,0.274551
5,1.5871,1.594915,0.249736
6,1.59,1.584698,0.249736
7,1.5822,1.595956,0.249736


TrainOutput(global_step=25711, training_loss=1.5923759977809469, metrics={'train_runtime': 3048.7329, 'train_samples_per_second': 134.933, 'train_steps_per_second': 16.867, 'total_flos': 2.396113015891661e+16, 'train_loss': 1.5923759977809469, 'epoch': 7.0})

## Test the model with checkpoint

In [15]:
from io import BytesIO
import torch
from torch import nn
from torch.quantization import quantize_dynamic
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

torch.backends.quantized.engine = "fbgemm"

# --- Load model ---
CKPT_DIR = "/checkpoint_Roberta-14692"
tokenizer = AutoTokenizer.from_pretrained("roberta-large")
model = AutoModelForSequenceClassification.from_pretrained(CKPT_DIR)
model.eval()


# --- Metrics: Accuracy + Precision/Recall/F1 (macro) ---
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def compute_metrics(eval_pred):
    # Supports both EvalPrediction and (predictions, labels) tuple
    try:
        predictions = eval_pred.predictions
        labels = eval_pred.label_ids
    except AttributeError:
        predictions, labels = eval_pred
        if isinstance(predictions, tuple):
            predictions = predictions[0]
    preds = predictions.argmax(axis=-1)
    acc = accuracy_score(labels, preds)
    p, r, f1, _ = precision_recall_fscore_support(labels, preds, average="macro", zero_division=0)
    return {"accuracy": acc, "precision_macro": p, "recall_macro": r, "f1_macro": f1}

# --- Evaluation arguments ---
eval_args = TrainingArguments(
    output_dir="/tmp_eval",
    per_device_eval_batch_size=16,
    report_to="none",
    fp16=False,
    bf16=False,
    dataloader_pin_memory=False,
)

# --- Evaluation — full model (Test only) ---
trainer_orig = Trainer(
    model=model,
    args=eval_args,
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)
test_full = trainer_orig.evaluate()


# --- Pretty print ---
def show_test(m, name):
    print(f"{name} — Test: "
          f"Acc={m['eval_accuracy']:.4f} | "
          f"P={m['eval_precision_macro']:.4f} | "
          f"R={m['eval_recall_macro']:.4f} | "
          f"F1={m['eval_f1_macro']:.4f}")

show_test(test_full,  "FULL ")


FULL  — Test: Acc=0.2497 | P=0.0499 | R=0.2000 | F1=0.0799


## Squeeze

quantize_dynamic

Dynamic quantization in PyTorch converts layer weights (e.g., Linear/LSTM) to int8, while activations are quantized on-the-fly at inference.
This reduces model size and speeds up CPU inference without retraining or calibration, usually with minimal accuracy loss.



In [None]:
from io import BytesIO
import torch
from torch import nn
from torch.quantization import quantize_dynamic
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

torch.backends.quantized.engine = "fbgemm"

# --- Load model ---
CKPT_DIR = "/content/covid19-tweets-sentiment-roberta-ex5-EDA_new/checkpoint-14692"
tokenizer = AutoTokenizer.from_pretrained("roberta-large")
model = AutoModelForSequenceClassification.from_pretrained(CKPT_DIR)
model.eval().to("cpu")  # stay on CPU

# --- Dynamic quantization (Linear → int8) on CPU ---
quantized_model = quantize_dynamic(model, {nn.Linear}, dtype=torch.qint8).eval()

# --- Metrics: Accuracy + Precision/Recall/F1 (macro) ---
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def compute_metrics(eval_pred):
    # Supports both EvalPrediction and (predictions, labels) tuple
    try:
        predictions = eval_pred.predictions
        labels = eval_pred.label_ids
    except AttributeError:
        predictions, labels = eval_pred
        if isinstance(predictions, tuple):
            predictions = predictions[0]
    preds = predictions.argmax(axis=-1)
    acc = accuracy_score(labels, preds)
    p, r, f1, _ = precision_recall_fscore_support(labels, preds, average="macro", zero_division=0)
    return {"accuracy": acc, "precision_macro": p, "recall_macro": r, "f1_macro": f1}

# --- Evaluation arguments (CPU) ---
eval_args = TrainingArguments(
    output_dir="/content/tmp_eval",
    per_device_eval_batch_size=16,
    report_to="none",
    no_cuda=True,     # run on CPU
    fp16=False,
    bf16=False,
    dataloader_pin_memory=False,
)

# --- Helper functions for size/parameter count ---
def state_dict_size_mb(m) -> float:
    buf = BytesIO()
    torch.save(m.state_dict(), buf)  # save to memory only
    return len(buf.getvalue()) / (1024 * 1024)

def count_params(m) -> int:
    return sum(p.numel() for p in m.parameters())

print(f"Model size (orig, in-memory):  {state_dict_size_mb(model):.2f} MB")
print(f"Model size (quant, in-memory): {state_dict_size_mb(quantized_model):.2f} MB")
print(f"Model parameters (orig):  {count_params(model):,}")
print(f"Model parameters (quant): {count_params(quantized_model):,}")

# --- Ensure both models are on CPU (just to be safe) ---
model.to("cpu")
quantized_model.to("cpu")

# --- Evaluation — full model (Test only) ---
trainer_orig = Trainer(
    model=model,
    args=eval_args,
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)
test_full = trainer_orig.evaluate()

# --- Evaluation — quantized model (Test only) ---
trainer_quant = Trainer(
    model=quantized_model,
    args=eval_args,
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)
test_quant = trainer_quant.evaluate()

# --- Pretty print ---
def show_test(m, name):
    print(f"{name} — Test: "
          f"Acc={m['eval_accuracy']:.4f} | "
          f"P={m['eval_precision_macro']:.4f} | "
          f"R={m['eval_recall_macro']:.4f} | "
          f"F1={m['eval_f1_macro']:.4f}")

show_test(test_full,  "FULL ")
show_test(test_quant, "QNTZD")
print(f"Δ Acc (quant - full): {test_quant['eval_accuracy'] - test_full['eval_accuracy']:+.4f}")



Model size (orig, in-memory):  1355.76 MB
Model size (quant, in-memory): 488.85 MB
Model parameters (orig):  355,364,869
Model parameters (quant): 52,099,072


FULL  — Test: Acc=0.2746 | P=0.0549 | R=0.2000 | F1=0.0862
QNTZD — Test: Acc=0.2746 | P=0.1216 | R=0.2000 | F1=0.0866
Δ Acc (quant - full): +0.0000


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

puring

The process of pruning in neural networks involves removing redundant or less important connections (weights) from the model. This makes the model smaller and faster, while maintaining its performance.

In [None]:
# ===== Safe PRUNING (layer-by-layer) + eval =====
import gc, torch
from torch import nn
from torch.nn.utils import prune
from transformers import Trainer, TrainingArguments

# If not defined earlier:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def compute_metrics(eval_pred):
    try:
        predictions = eval_pred.predictions
        labels = eval_pred.label_ids
    except AttributeError:
        predictions, labels = eval_pred
        if isinstance(predictions, tuple):
            predictions = predictions[0]
    preds = predictions.argmax(axis=-1)
    acc = accuracy_score(labels, preds)
    p, r, f1, _ = precision_recall_fscore_support(labels, preds, average="macro", zero_division=0)
    return {"accuracy": acc, "precision_macro": p, "recall_macro": r, "f1_macro": f1}

# 0) Move to GPU if available (we don't touch eval since you already called it earlier)
USE_GPU = torch.cuda.is_available()
DEVICE = "cuda" if USE_GPU else "cpu"
model.to(DEVICE)
if USE_GPU:
    torch.cuda.empty_cache()
gc.collect()

# 1) Layer-by-layer pruning (L1) with immediate remove — saves RAM
AMOUNT = 0.40  # 40%
for module in model.modules():
    if isinstance(module, nn.Linear):
        prune.l1_unstructured(module, name="weight", amount=AMOUNT)
        prune.remove(module, "weight")      # fixes the mask → no weight_orig/weight_mask tensors
        if USE_GPU: torch.cuda.empty_cache()
gc.collect()
print(f"Pruning done: {int(AMOUNT*100)}% on all Linear layers (with immediate remove)")

# Pretty printing for metrics
def show_test(name, m):
    print(f"{name} — Test: "
          f"Acc={m.get('eval_accuracy', float('nan')):.4f} | "
          f"P={m.get('eval_precision_macro', float('nan')):.4f} | "
          f"R={m.get('eval_recall_macro', float('nan')):.4f} | "
          f"F1={m.get('eval_f1_macro', float('nan')):.4f}")

# 2) Evaluation — try GPU first; if OOM, fall back to CPU with smaller batch
def eval_on(device="cuda", batch_size=16):
    args = TrainingArguments(
        output_dir="/content/tmp_eval_pruned",
        per_device_eval_batch_size=batch_size,
        report_to="none",
        no_cuda=(device != "cuda"),
        fp16=(device == "cuda"),
        dataloader_pin_memory=(device == "cuda"),
    )
    trainer = Trainer(
        model=model,
        args=args,
        eval_dataset=tokenized_datasets["test"],   # Test only
        compute_metrics=compute_metrics,           # Important: computes P/R/F1 macro
    )
    with torch.inference_mode():
        return trainer.evaluate()                  # Returns a dict with eval_accuracy and more

try:
    pruned_metrics = eval_on("cuda" if USE_GPU else "cpu", batch_size=16)
except RuntimeError:
    print("GPU OOM during eval → switching to CPU with smaller batch.")
    model.to("cpu"); gc.collect()
    if USE_GPU: torch.cuda.empty_cache()
    pruned_metrics = eval_on("cpu", batch_size=8)

show_test("PRUNED", pruned_metrics)

# Optional: delta vs full model if you have test_full from the previous step
try:
    print(f"Δ Acc (pruned - full): {pruned_metrics['eval_accuracy'] - test_full['eval_accuracy']:+.4f}")
except Exception:
    pass

Pruning done: 40% on all Linear layers (with immediate remove)


PRUNED — Test: Acc=0.2746 | P=0.0549 | R=0.2000 | F1=0.0862
Δ Acc (pruned - full): +0.0000


In [None]:
print("Pruned model non-zero parameters:",
      sum((p != 0).sum().item() for p in model.parameters()))

Pruned model non-zero parameters: 234147452


In [None]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Model size (parameters): {total_params:,}")

Model size (parameters): 355,364,869


distillation

Knowledge distillation is a model compression technique where a smaller, simpler model ("student") learns from a larger, more complex model ("teacher"). The student mimics the teacher's behavior, often by learning from its soft labels (probability distributions), allowing it to achieve similar performance with greater efficiency.

In [None]:
import torch
import torch.nn.functional as F
from transformers import AutoModelForSequenceClassification, Trainer, EarlyStoppingCallback
# If you already have compute_metrics defined, you can keep it; here we just use it.

# --- 1) Define the teacher checkpoint (after training) ---
TEACHER_CKPT = "/content/covid19-tweets-sentiment-roberta-ex5-EDA_new/checkpoint-14692"  # change if needed

def load_teacher(ckpt_path: str, num_labels: int):
    try:
        print(f"Loading teacher from: {ckpt_path}")
        return AutoModelForSequenceClassification.from_pretrained(ckpt_path, num_labels=num_labels)
    except Exception as e:
        print(f"Could not load teacher from '{ckpt_path}'. Falling back to 'roberta-large'. Reason: {e}")
        return AutoModelForSequenceClassification.from_pretrained("roberta-large", num_labels=num_labels)

NUM_LABELS = 5

teacher = load_teacher(TEACHER_CKPT, num_labels=NUM_LABELS)
student = AutoModelForSequenceClassification.from_pretrained("distilroberta-base", num_labels=NUM_LABELS)

# --- 2) Make sure the teacher is in eval mode and has no gradients ---
teacher.eval()
for p in teacher.parameters():
    p.requires_grad = False

# --- 3) Distillation-adapted Trainer ---
class DistillationTrainer(Trainer):
    def __init__(self, *args, teacher_model=None, temperature=2.0, alpha=0.5, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher = teacher_model
        self.temperature = temperature
        self.alpha = alpha
        if self.teacher is not None:
            self.teacher.to(self.model.device)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        # The Trainer already prepares inputs on the correct device
        inputs = self._prepare_inputs(inputs)

        # Support both "labels" and "label"
        labels = inputs.get("labels", inputs.get("label", None))
        if labels is None:
            raise ValueError("No labels found in inputs (expected 'labels' or 'label').")

        # Student
        outputs_student = model(**{k: v for k, v in inputs.items() if k not in ["label"]})
        student_logits = outputs_student.logits

        # Teacher
        with torch.no_grad():
            outputs_teacher = self.teacher(**{k: v for k, v in inputs.items() if k not in ["label"]})
            teacher_logits = outputs_teacher.logits

        # Align dtype in mixed precision (fp16/bf16)
        if student_logits.dtype != teacher_logits.dtype:
            teacher_logits = teacher_logits.to(student_logits.dtype)

        # Standard cross-entropy against hard labels
        loss_ce = F.cross_entropy(student_logits, labels.long())

        # KL against teacher's soft distribution
        T = self.temperature
        loss_kl = F.kl_div(
            F.log_softmax(student_logits / T, dim=-1),
            F.softmax(teacher_logits / T, dim=-1),
            reduction="batchmean"
        ) * (T ** 2)

        loss = self.alpha * loss_ce + (1.0 - self.alpha) * loss_kl
        return (loss, outputs_student) if return_outputs else loss

# --- 4) Create the Trainer and run ---
trainer_distill = DistillationTrainer(
    model=student,
    teacher_model=teacher,
    args=training_args,                                # << you already have training_args in your code
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets.get("validation",  # if there is no validation, it will take the Test
                                        tokenized_datasets.get("test")),
    compute_metrics=compute_metrics,                   # << we use your version
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Important: ensure teacher and student are on the same device (just to be safe)
trainer_distill.model.to(trainer_distill.args.device)
if trainer_distill.teacher is not None:
    trainer_distill.teacher.to(trainer_distill.args.device)

train_output = trainer_distill.train()
print("\nDistillation complete. Student model trained.")
print("Student model size (params):", sum(p.numel() for p in student.parameters()))

# --- 5) ⭐ Evaluate the student on the Test set only + print Acc / P / R / F1 (macro) ---
student.eval()
student.to(trainer_distill.args.device)  # just to be safe

test_metrics = Trainer(
    model=student,
    args=trainer_distill.args,                 # use the same args
    eval_dataset=tokenized_datasets["test"],   # **Test only**
    compute_metrics=compute_metrics,           # returns Accuracy + Precision/Recall/F1-macro
).evaluate()

print(
    "STUDENT — Test: "
    f"Acc={test_metrics['eval_accuracy']:.4f} | "
    f"P={test_metrics['eval_precision_macro']:.4f} | "
    f"R={test_metrics['eval_recall_macro']:.4f} | "
    f"F1={test_metrics['eval_f1_macro']:.4f}"
)

Loading teacher from: /content/covid19-tweets-sentiment-roberta-ex5-EDA_new/checkpoint-14692


config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro
1,0.7889,0.792448,0.249736,0.049947,0.2,0.079932
2,0.7913,0.795977,0.249736,0.049947,0.2,0.079932
3,0.7864,0.791965,0.249736,0.049947,0.2,0.079932
4,0.7949,0.79159,0.249736,0.049947,0.2,0.079932



Distillation complete. Student model trained.
Student model size (params): 82122245


STUDENT — Test: Acc=0.2497 | P=0.0499 | R=0.2000 | F1=0.0799


In [None]:
# Estimate in-memory size
# If the model is fp32 → 4 bytes per parameter; if fp16 → 2 bytes
dtype_size = 4 if student.dtype == torch.float32 else 2
size_mb = total_params * dtype_size / (1024**2)
print(f"Approx. model size in memory: {size_mb:.2f} MB")

Approx. model size in memory: 1355.61 MB


In [None]:
from google.colab import files
import shutil

folder = "/content/covid19-tweets-sentiment-roberta-ex5-EDA_new/checkpoint-7346"          # <-- change this
zip_base = "/content/covid19-tweets-sentiment-roberta-ex5-EDA_new/checkpoint-7346"        # path without .zip
zip_path = zip_base + ".zip"

shutil.make_archive(zip_base, "zip", folder)  # creates /content/myfolder.zip
files.download(zip_path)                      # downloads to your computer

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Additional

experiance

In [None]:
# ==== HEAD IMPORTANCE HEATMAP + RATIO SLIDER + DOWNLOAD JSON ====
import json, torch, numpy as np

# ------- Helper: access encoder and self-attention -------
def get_encoder_and_kind(m):
    if hasattr(m, "roberta"):
        return m.roberta.encoder, "roberta"
    if hasattr(m, "bert"):
        return m.bert.encoder, "bert"
    raise ValueError("Model is not BERT/Roberta-like (no .roberta/.bert).")

def get_self_attention_layer(layer):
    if hasattr(layer, "attention") and hasattr(layer.attention, "self"):
        return layer.attention.self
    raise ValueError("Layer has no attention.self")

# ------- Head importance scores (average L2 norms of Q/K/V) -------
@torch.no_grad()
def head_scores_for_layer(attn_self):
    Wq = attn_self.query.weight.detach()
    Wk = attn_self.key.weight.detach()
    Wv = attn_self.value.weight.detach()
    hidden = Wq.shape[0]
    num_heads = attn_self.num_attention_heads
    head_dim = hidden // num_heads

    def split_rows(W):
        # Split into row blocks (heads) — each head is head_dim x in_features
        return [W[i*head_dim:(i+1)*head_dim, :] for i in range(num_heads)]

    Q_parts = split_rows(Wq)
    K_parts = split_rows(Wk)
    V_parts = split_rows(Wv)

    scores = []
    for h in range(num_heads):
        s = (Q_parts[h].float().pow(2).sum().sqrt()
           + K_parts[h].float().pow(2).sum().sqrt()
           + V_parts[h].float().pow(2).sum().sqrt()) / 3.0
        scores.append(float(s.item()))
    return np.array(scores)  # [num_heads]

# ------- Compute for all layers -------
encoder, kind = get_encoder_and_kind(model)   # Assumes you have `model` loaded (BERT/Roberta)
n_layers = len(encoder.layer)

scores_by_layer = {}
heads_per_layer = {}
for li in range(n_layers):
    attn_self = get_self_attention_layer(encoder.layer[li])
    s = head_scores_for_layer(attn_self)
    scores_by_layer[li] = s.tolist()
    heads_per_layer[li] = int(attn_self.num_attention_heads)

meta = {
    "model_type": kind,
    "n_layers": n_layers,
    "heads_per_layer": heads_per_layer,
    "note": "Scores are avg L2 of Q/K/V per head (higher = more important)."
}

# ------- Interactive HTML -------
html = f"""
<!doctype html>
<html lang="en" dir="ltr">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width,initial-scale=1" />
<title>Head Importance Heatmap</title>
<style>
  body {{ font-family: system-ui, Segoe UI, Roboto, Arial; margin: 24px; }}
  .wrap {{ display:grid; gap:16px; max-width: 1000px; }}
  .row {{ display:flex; gap:12px; align-items:center; flex-wrap:wrap; }}
  .card {{ border:1px solid #eee; border-radius:12px; padding:16px; box-shadow:0 2px 8px rgba(0,0,0,0.04); }}
  .legend {{ display:flex; gap:8px; align-items:center; }}
  .box {{ width:16px; height:16px; border-radius:3px; border:1px solid #ddd; }}
  .hm {{ overflow:auto; border:1px solid #eee; border-radius:12px; }}
  table.hm {{ border-collapse: collapse; }}
  table.hm td {{ width:18px; height:18px; padding:0; border: 1px solid #fff; position:relative; }}
  td.mark::after {{ content:''; position:absolute; inset:0; border:3px solid #000; border-radius:3px; }} /* marked for removal */
  .muted {{ color:#666; }}
  .chip {{ padding: 4px 8px; border-radius: 999px; background:#f6f6f6; }}
  button {{ padding:8px 12px; border:1px solid #ddd; background:#fafafa; border-radius:8px; cursor:pointer; }}
</style>
</head>
<body>
<div class="wrap">
  <h1>Head Importance Heatmap</h1>
  <div class="row">
    <div class="chip">Layers: {n_layers}</div>
    <div class="chip">Model: {meta["model_type"]}</div>
    <div class="legend">
      <div class="box" style="background:hsl(0, 80%, 60%)"></div><span class="muted">Low</span>
      <div class="box" style="background:hsl(60, 80%, 60%)"></div><span class="muted">Medium</span>
      <div class="box" style="background:hsl(120, 80%, 40%)"></div><span class="muted">High</span>
    </div>
  </div>

  <div class="card">
    <div class="row">
      <label>Pruning ratio per layer:
        <input id="ratio" type="range" min="0" max="0.6" step="0.05" value="0.25" style="vertical-align:middle;">
        <span id="ratioVal">25%</span>
      </label>
      <button id="downloadBtn">Download heads_to_prune.json</button>
      <span class="muted">Always mark for removal the heads with the lowest scores in each layer.</span>
    </div>
  </div>

  <div class="card">
    <div class="row">
      <div class="chip" id="summary"></div>
    </div>
    <div class="hm">
      <table id="heat" class="hm"></table>
    </div>
    <small>Rows = layers (0 at top), columns = heads. Color = importance (green high, red low). Black border = will be pruned.</small>
  </div>
</div>

<script>
const SCORES = {json.dumps(scores_by_layer)};
const META   = {json.dumps(meta)};

const ratio    = document.getElementById('ratio');
const ratioVal = document.getElementById('ratioVal');
const heat     = document.getElementById('heat');
const summary  = document.getElementById('summary');
const dlBtn    = document.getElementById('downloadBtn');

function colorFor(v, vmin, vmax) {{
  // Map red→green without template literals (to avoid conflict with Python f-strings)
  var t = (v - vmin) / Math.max(1e-9, (vmax - vmin));
  var hue = 120 * t; // 0=red, 120=green
  return 'hsl(' + hue + ', 80%, ' + (40 + 20*(1 - t)) + '%)';
}}

function build() {{
  heat.innerHTML = '';
  var nl = META.n_layers;
  var totalHeads = 0, totalPruned = 0;

  // Global range for all cells (could also be per-layer)
  var globalMin = Infinity, globalMax = -Infinity;
  for (const li in SCORES) {{
    for (const s of SCORES[li]) {{
      if (s < globalMin) globalMin = s;
      if (s > globalMax) globalMax = s;
    }}
  }}

  var r = parseFloat(ratio.value);
  ratioVal.textContent = Math.round(r*100) + '%';

  for (var li=0; li<nl; li++) {{
    var row = document.createElement('tr');
    var scores = SCORES[li];
    if (!scores) continue;

    var H = scores.length;
    var k = Math.max(0, Math.min(H, Math.round(H * r)));
    totalHeads += H;

    // Indices by ascending value
    var order = scores.map(function(v,idx){{return [v,idx];}}).sort(function(a,b){{return a[0]-b[0];}}).map(function(x){{return x[1];}});
    var toPrune = new Set(order.slice(0, k));
    totalPruned += k;

    for (var h=0; h<H; h++) {{
      var td = document.createElement('td');
      td.style.background = colorFor(scores[h], globalMin, globalMax);
      if (toPrune.has(h)) td.classList.add('mark');
      td.title = 'layer ' + li + ', head ' + h + ', score ' + scores[h].toFixed(4) + (toPrune.has(h) ? ' (PRUNE)' : ' (KEEP)');
      row.appendChild(td);
    }}
    heat.appendChild(row);
  }}

  summary.textContent = 'Total heads: ' + totalHeads + ' | To prune by ratio: ' + totalPruned;
}}

function download() {{
  var nl = META.n_layers;
  var r = parseFloat(ratio.value);
  var out = {{}};
  for (var li=0; li<nl; li++) {{
    var scores = SCORES[li];
    if (!scores) continue;
    var H = scores.length;
    var k = Math.max(0, Math.min(H, Math.round(H * r)));
    if (k === 0) continue;
    var order = scores.map(function(v,idx){{return [v,idx];}}).sort(function(a,b){{return a[0]-b[0];}}).map(function(x){{return x[1];}});
    out[li] = Array.from(order.slice(0, k));
  }}
  var blob = new Blob([JSON.stringify(out, null, 2)], {{type:'application/json'}});
  var a = document.createElement('a');
  a.href = URL.createObjectURL(blob);
  a.download = 'heads_to_prune.json';
  document.body.appendChild(a);
  a.click();
  a.remove();
}}

ratio.addEventListener('input', build);
dlBtn.addEventListener('click', download);
build();
</script>
</body>
</html>
"""

with open("head_importance_heatmap.html","w",encoding="utf-8") as f:
    f.write(html)

print("✅ wrote head_importance_heatmap.html")

# Quick open inside notebook (Colab/Jupyter):
try:
    from IPython.display import HTML, display
    display(HTML(open("head_importance_heatmap.html", encoding="utf-8").read()))
except Exception as e:
    print("Open manually from the file browser, or run:")
    print("from IPython.display import HTML, display; display(HTML(open('head_importance_heatmap.html', encoding='utf-8').read()))")

✅ wrote head_importance_heatmap.html
