In [None]:
import pandas as pd
import numpy as np
import re
import warnings
import time
import os

from sklearn.metrics import f1_score, precision_score, recall_score, jaccard_score
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from tqdm import tqdm
import gc

from codecarbon import EmissionsTracker
from dataclasses import dataclass
from typing import Any, Dict, List

In [None]:
warnings.filterwarnings('ignore')

In [None]:
print(f"‚úÖ PyTorch: {torch.__version__}")
print(f"‚úÖ CUDA: {torch.cuda.is_available()}")
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"‚úÖ Device: {device}\n")

‚úÖ PyTorch: 2.5.1+cu121
‚úÖ CUDA: True
‚úÖ Device: cuda



In [None]:
# ======================
# LOAD DATA
# ======================
print("üìÇ Loading data...")
df = pd.read_excel("dataset.xlsx")
df = df.dropna(subset=["text_raw"]).rename(columns={"text_raw": "text"})

# ======================
# PREPROCESSING
# ======================
def advanced_clean(text):
    if pd.isna(text): return ""
    text = str(text)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^\w\s\-\.\,\!\?\%]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower().strip()
    if len(text.split()) < 3: return ""
    return text


LABEL_COLUMNS = [
    "acne", "eye_contour", "homogeneity", "lack_firmness", "lack_radiance",
    "pores", "fine_lines", "wrinkles_fine-lines", "eye-wrinkles", "undereye-bags",
    "generic", "18-34", "35-54", "55-99", "dark_pigmentation", "dry", "normal",
    "oily", "combination", "sensitivity-high", "sensitivity-low", "no_sensitivity",
    "male", "female", "cleanse", "prepare", "treat", "targeted", "care",
    "moisturize", "protect", "day", "night"
]

print("üßπ Preprocessing...")

# ======================
# DATA PRUNING
# ======================
MAX_LABELS_PER_SAMPLE = 20  # more than 60% of labels

# Ensure labels are numeric and NaNs are treated as 0
label_matrix = df[LABEL_COLUMNS].fillna(0).astype(int)

# Count active labels per sample
labels_per_sample = label_matrix.sum(axis=1)

before_n = len(df)
removed_mask = labels_per_sample > MAX_LABELS_PER_SAMPLE
removed_n = int(removed_mask.sum())
removed_pct = (100.0 * removed_n / before_n) if before_n > 0 else 0.0

# Prune
df = df.loc[~removed_mask].copy().reset_index(drop=True)
label_matrix = label_matrix.loc[df.index]

print(f"Pruning rule: keep samples with ‚â§ {MAX_LABELS_PER_SAMPLE} labels")
print(f"Before: {before_n} | Removed: {removed_n} ({removed_pct:.2f}%) | After: {len(df)}")
print("Labels/sample (after) - mean:", round(label_matrix.sum(axis=1).mean(), 3))
print("Labels/sample (after) - max:", int(label_matrix.sum(axis=1).max()))

df["cleaned_text"] = df["text"].apply(advanced_clean)
df = df[df["cleaned_text"].str.len() > 0]
df["labels"] = df[LABEL_COLUMNS].apply(lambda row: row.tolist(), axis=1)
print(f"   Samples: {len(df)}\n")

üìÇ Loading data...
üßπ Preprocessing...
Pruning rule: keep samples with ‚â§ 20 labels
Before: 6240 | Removed: 16 (0.26%) | After: 6224
Labels/sample (after) - mean: 3.965
Labels/sample (after) - max: 33
   Samples: 6224



In [None]:
# ======================
# OVERSAMPLE RARE AGE & GENDER LABELS
# ======================
print("üîÑ Oversampling rare age & gender labels...")

# Define age and gender labels
AGE_LABELS = ["18-34", "35-54", "55-99"]
GENDER_LABELS = ["male", "female"]
RARE_LABELS = AGE_LABELS + GENDER_LABELS

# Calculate support for each label
label_counts = df[RARE_LABELS].sum()
print("\nBefore oversampling:")
print(label_counts)

# Define oversampling strategy
TARGET_MIN_SAMPLES = 500  # Target minimum samples per label
oversample_factor = {}

for label in RARE_LABELS:
    count = label_counts[label]
    if count < TARGET_MIN_SAMPLES and count > 0:
        oversample_factor[label] = int(TARGET_MIN_SAMPLES / count)
    else:
        oversample_factor[label] = 0

print(f"\nOversampling factors: {oversample_factor}")

# Perform oversampling
df_original = df.copy()
samples_to_add = []

for label, factor in oversample_factor.items():
    if factor > 1:
        # Get samples with this label
        label_samples = df[df[label] == 1]

        # Replicate (factor - 1) times (we already have original)
        for _ in range(factor - 1):
            samples_to_add.append(label_samples)

        print(f"   {label}: {len(label_samples)} ‚Üí {len(label_samples) * factor}")

# Concatenate all samples
if samples_to_add:
    df = pd.concat([df] + samples_to_add, ignore_index=True)

    # Shuffle to avoid clustering
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"\nDataset size: {len(df_original)} ‚Üí {len(df)} (+{len(df) - len(df_original)})")
print("\nAfter oversampling:")
print(df[RARE_LABELS].sum())
print()

üîÑ Oversampling rare age & gender labels...

Before oversampling:
18-34     182
35-54     800
55-99     262
male      335
female    113
dtype: int64

Oversampling factors: {'18-34': 2, '35-54': 0, '55-99': 1, 'male': 1, 'female': 4}
   18-34: 182 ‚Üí 364
   female: 113 ‚Üí 452

Dataset size: 6224 ‚Üí 6745 (+521)

After oversampling:
18-34      430
35-54     1038
55-99      424
male       350
female     474
dtype: int64



In [None]:
# ======================
# CLASS WEIGHTS
# ======================
y = np.array(df[LABEL_COLUMNS].values, dtype=np.float32)
pos = y.sum(axis=0)
weights = []
for i in range(len(LABEL_COLUMNS)):
    if pos[i] < 50: w = min(20.0, (len(df) - pos[i]) / (pos[i] + 1))  # Increased from 15
    elif pos[i] < 100: w = min(12.0, (len(df) - pos[i]) / (pos[i] + 1))  # NEW bracket
    elif pos[i] < 200: w = min(8.0, (len(df) - pos[i]) / (pos[i] + 1))
    else: w = min(3.0, (len(df) - pos[i]) / (pos[i] + 1))
    weights.append(w)

class_weights = torch.tensor(weights, dtype=torch.float32).to(device)
print(f"‚öñÔ∏è  Weights: {min(weights):.2f} - {max(weights):.2f}\n")

‚öñÔ∏è  Weights: 1.60 - 12.00



In [None]:
# ======================
# SPLIT
# ======================
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df[LABEL_COLUMNS[0]])
print(f"‚úÇÔ∏è  Train: {len(train_df)} | Test: {len(test_df)}\n")

train_ds = Dataset.from_pandas(train_df[["cleaned_text", "labels"]].reset_index(drop=True))
test_ds = Dataset.from_pandas(test_df[["cleaned_text", "labels"]].reset_index(drop=True))

‚úÇÔ∏è  Train: 5396 | Test: 1349



In [None]:
# ======================
# DEFINE EMISSIONS TRACKER
# ======================
os.makedirs("/kaggle/working/EmissionsTracker", exist_ok=True)

tracker = EmissionsTracker(
    project_name="roberta_large_eval",
    output_dir="/kaggle/working/EmissionsTracker",
    log_level="error"
)

In [None]:
# ======================
# TEACHER MODEL CONFIGURATION
# ======================
MODEL_NAME = "roberta-large"
MAX_LENGTH = 256
BATCH_SIZE = 4
GRAD_ACCUM = 8
EPOCHS = 25
LEARNING_RATE = 3e-5

In [None]:
# 1. Path to the folder containing your uploaded files
#    (e.g., if you uploaded them to a folder named 'roberta_skin_model')
MODEL_PATH = "models/RoBERTa_final"

print(f"Loading model from {MODEL_PATH}...")

# 2. Load the Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# 3. Load the Model
#    Note: We don't need to specify num_labels here; it reads it from config.json
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    use_safetensors=True  # Set to False if you only have 'pytorch_model.bin'
)

# 4. Move to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval() # Set to evaluation mode

print("‚úÖ Model and Tokenizer loaded successfully!")

Loading model from models/RoBERTa_final...
‚úÖ Model and Tokenizer loaded successfully!


In [None]:
# Assuming 'model' is your loaded Teacher (RoBERTa)
model.eval()
for param in model.parameters():
    param.requires_grad = False

print("‚ùÑÔ∏è Teacher model frozen and ready for distillation.")

‚ùÑÔ∏è Teacher model frozen and ready for distillation.


In [None]:
# Helper function to generate logits in batches (to avoid OOM errors)
def generate_logits(model, tokenizer, texts, batch_size=32, device='cuda'):
    model.to(device)
    all_logits = []

    # Simple batching loop
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i : i + batch_size]

        # Tokenize (Use the TEACHER'S tokenizer here!)
        inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=128,
            return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            all_logits.append(outputs.logits.cpu()) # Move to CPU immediately to save GPU

    return torch.cat(all_logits)

print("üîÆ Generating Teacher Logits for Training Data...")
# Get the raw text list from your dataframe
train_texts = train_df['cleaned_text'].tolist()
test_texts = test_df['cleaned_text'].tolist()

# Generate
teacher_train_logits = generate_logits(model, tokenizer, train_texts)
teacher_test_logits = generate_logits(model, tokenizer, test_texts)

print(f"‚úÖ Logits Ready. Train shape: {teacher_train_logits.shape}")

üîÆ Generating Teacher Logits for Training Data...
‚úÖ Logits Ready. Train shape: torch.Size([5396, 33])


In [None]:
# Delete the teacher model to free up VRAM
del model
torch.cuda.empty_cache()
gc.collect()

print("üóëÔ∏è Teacher model removed from GPU to make room for the Student.")

üóëÔ∏è Teacher model removed from GPU to make room for the Student.


In [None]:
# ======================================================================================
# üéì KNOWLEDGE DISTILLATION: ROBERTA-LARGE ‚Üí DISTILROBERTA
# ======================================================================================
print("="*80)
print("üéì KNOWLEDGE DISTILLATION - SUSTAINABILITY-OPTIMIZED")
print("="*80)

# ======================
# STUDENT MODEL SETUP
# ======================
STUDENT_MODEL_NAME = "distilroberta-base"
STUDENT_MAX_LENGTH = 128
STUDENT_EPOCHS = 20
STUDENT_BATCH_SIZE = 16
STUDENT_GRAD_ACCUM = 4
STUDENT_LR = 4e-5
TEMPERATURE = 3.0
DISTILLATION_ALPHA = 0.9


print(f"\nüìö Teacher: {MODEL_NAME}")
print(f"üéì Student: {STUDENT_MODEL_NAME}")
print(f"üìè Student max length: {STUDENT_MAX_LENGTH}")
print(f"üî• Temperature: {TEMPERATURE}")
print(f"‚öñÔ∏è  Alpha (KD weight): {DISTILLATION_ALPHA}\n")

üéì KNOWLEDGE DISTILLATION - SUSTAINABILITY-OPTIMIZED

üìö Teacher: roberta-large
üéì Student: distilroberta-base
üìè Student max length: 128
üî• Temperature: 3.0
‚öñÔ∏è  Alpha (KD weight): 0.9



In [None]:
# ======================
# STEP 1: FORMATTING LABELS COLUMN
# ======================
print("‚öôÔ∏è formatting labels column...")

train_df['labels'] = train_df[LABEL_COLUMNS].values.tolist()
test_df['labels'] = test_df[LABEL_COLUMNS].values.tolist()

print("‚úÖ 'labels' column created successfully.")

‚öôÔ∏è formatting labels column...
‚úÖ 'labels' column created successfully.


In [None]:
# ======================
# STEP 2: CREATE STUDENT DATASETS (DICT-BASED)
# ======================
print("="*80)
print("STEP 2: Creating student datasets")
print("="*80)

# Create datasets as dictionaries for better compatibility
train_data_dict = {
    "cleaned_text": train_df["cleaned_text"].reset_index(drop=True).tolist(),
    "labels": train_df["labels"].reset_index(drop=True).tolist(),
    "teacher_logits": teacher_train_logits.tolist()
}

test_data_dict = {
    "cleaned_text": test_df["cleaned_text"].reset_index(drop=True).tolist(),
    "labels": test_df["labels"].reset_index(drop=True).tolist(),
    "teacher_logits": teacher_test_logits.tolist()
}

# Create HuggingFace datasets from dictionaries
train_ds_student = Dataset.from_dict(train_data_dict)
test_ds_student = Dataset.from_dict(test_data_dict)

print(f"‚úÖ Student datasets created:")
print(f"   Train: {len(train_ds_student)}")
print(f"   Test: {len(test_ds_student)}")
print(f"   Train columns: {train_ds_student.column_names}")

# Verify data structure
print(f"\n‚úÖ Sample verification:")
sample = train_ds_student[0]
print(f"   Text type: {type(sample['cleaned_text'])}")
print(f"   Labels type: {type(sample['labels'])}, len: {len(sample['labels'])}")
print(f"   Teacher logits type: {type(sample['teacher_logits'])}, len: {len(sample['teacher_logits'])}")
print()

STEP 2: Creating student datasets
‚úÖ Student datasets created:
   Train: 5396
   Test: 1349
   Train columns: ['cleaned_text', 'labels', 'teacher_logits']

‚úÖ Sample verification:
   Text type: <class 'str'>
   Labels type: <class 'list'>, len: 33
   Teacher logits type: <class 'list'>, len: 33



In [None]:
# ======================
# STEP 3: CREATE CUSTOM DATA COLLATOR
# ======================
print("="*80)
print("STEP 3: Creating custom data collator")
print("="*80)

student_tokenizer = AutoTokenizer.from_pretrained(STUDENT_MODEL_NAME)

# Create a custom collator that handles teacher_logits

@dataclass
class DistillationDataCollator:
    tokenizer: Any
    max_length: int = 128

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        # Extract teacher logits and labels (they're already lists/arrays in the dataset)
        teacher_logits = torch.tensor(
            [f["teacher_logits"] for f in features],
            dtype=torch.float32
        )
        labels = torch.tensor(
            [f["labels"] for f in features],
            dtype=torch.float32
        )

        # Extract texts
        texts = [f["cleaned_text"] for f in features]

        # Tokenize
        batch = self.tokenizer(
            texts,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        # Add teacher logits and labels to batch
        batch["teacher_logits"] = teacher_logits
        batch["labels"] = labels

        return batch

data_collator = DistillationDataCollator(
    tokenizer=student_tokenizer,
    max_length=STUDENT_MAX_LENGTH
)

print("‚úÖ Data collator created\n")

STEP 3: Creating custom data collator
‚úÖ Data collator created



In [None]:
# ======================
# STEP 4: DISTILLATION LOSS & TRAINER
# ======================
print("="*80)
print("STEP 4: Defining distillation loss")
print("="*80)

class DistillationLoss(nn.Module):
    def __init__(self, alpha=0.7, temperature=2.0, pos_weight=None):
        super().__init__()
        self.alpha = alpha
        self.temperature = temperature
        self.pos_weight = pos_weight

    def forward(self, student_logits, teacher_logits, labels):
        # Knowledge Distillation Loss (MSE on soft targets)
        kd_loss = nn.functional.mse_loss(
            student_logits / self.temperature,
            teacher_logits / self.temperature
        )

        # Supervised Loss (BCE on hard labels)
        bce_loss = nn.functional.binary_cross_entropy_with_logits(
            student_logits,
            labels,
            pos_weight=self.pos_weight
        )

        # Combined loss
        total_loss = self.alpha * kd_loss + (1 - self.alpha) * bce_loss
        return total_loss

class DistillationTrainer(Trainer):
    def __init__(self, *args, distillation_loss_fn=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.distillation_loss_fn = distillation_loss_fn

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels").float()
        teacher_logits = inputs.pop("teacher_logits").float()

        outputs = model(**inputs)
        student_logits = outputs.logits

        loss = self.distillation_loss_fn(student_logits, teacher_logits, labels)

        return (loss, outputs) if return_outputs else loss

print("‚úÖ Distillation loss defined\n")

STEP 4: Defining distillation loss
‚úÖ Distillation loss defined



In [None]:
# ======================
# STEP 5: LOAD STUDENT MODEL
# ======================
print("="*80)
print("STEP 5: Loading student model")
print("="*80)

student_model = AutoModelForSequenceClassification.from_pretrained(
    STUDENT_MODEL_NAME,
    num_labels=33,
    problem_type="multi_label_classification",
    ignore_mismatched_sizes=True
)

teacher_params = 355393569
student_params = sum(p.numel() for p in student_model.parameters())
compression_ratio = (1 - student_params / teacher_params) * 100

print(f"üë®‚Äçüè´ Teacher parameters: {teacher_params:,}")
print(f"üéì Student parameters: {student_params:,}")
print(f"üìâ Compression: {compression_ratio:.1f}% size reduction\n")

STEP 5: Loading student model


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


üë®‚Äçüè´ Teacher parameters: 355,393,569
üéì Student parameters: 82,143,777
üìâ Compression: 76.9% size reduction



In [None]:
# ======================
# STEP 6: TRAINING CONFIGURATION
# ======================
print("="*80)
print("STEP 6: Configuring student training")
print("="*80)

distillation_loss_fn = DistillationLoss(
    alpha=DISTILLATION_ALPHA,
    temperature=TEMPERATURE,
    pos_weight=class_weights
)

student_args = TrainingArguments(
    output_dir="./DistilRoBERTa_distilled",

    # --- CRITICAL CHANGES FOR LEARNING RATE ---
    learning_rate=4e-5,          # Student needs to learn faster
    num_train_epochs=20,         # Increased from 5 (Distillation needs time to converge)
    warmup_ratio=0.1,            # Standard warm-up
    # ------------------------------------------

    per_device_train_batch_size=STUDENT_BATCH_SIZE, # Keep at 16 or 32
    gradient_accumulation_steps=STUDENT_GRAD_ACCUM,

    eval_strategy="epoch",
    save_strategy="epoch",
    weight_decay=0.01,
    max_grad_norm=1.0,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    logging_steps=50,
    save_total_limit=1,
    fp16=True,
    report_to="none",
    seed=42,
    remove_unused_columns=False,
    lr_scheduler_type="cosine"
)

STEP 6: Configuring student training


In [None]:
# ======================
# STEP 7: TRAIN STUDENT WITH EMISSION TRACKING
# ======================
print("="*80)
print("STEP 7: Training student model (TRACKED)")
print("="*80)

def compute_metrics(p):
    pred, labels = p
    probs = 1 / (1 + np.exp(-pred))
    binary = (probs > 0.5).astype(float)
    return {
        'f1': f1_score(labels, binary, average='weighted', zero_division=0),
        'f1_macro': f1_score(labels, binary, average='macro', zero_division=0),
        'f1_micro': f1_score(labels, binary, average='micro', zero_division=0),
        "jaccard_samples": jaccard_score(labels, binary, average="samples", zero_division=0)
    }

# Create new tracker for student
student_tracker = EmissionsTracker(
    project_name="DistilRoBERTa_distillation",
    output_dir="SustainabilityTracker/DistilRoBERTa",
    log_level="error"
)

student_trainer = DistillationTrainer(
    model=student_model,
    args=student_args,
    train_dataset=train_ds_student,
    eval_dataset=test_ds_student,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    distillation_loss_fn=distillation_loss_fn
)

print("üöÄ Starting student training with emission tracking...\n")
student_start_time = time.time()
student_tracker.start()

student_result = student_trainer.train()

student_training_emissions = student_tracker.stop()
student_train_time = time.time() - student_start_time

print(f"\n‚úÖ Student training completed in {student_train_time/60:.1f} min")
print(f"üå± Training emissions: {student_training_emissions:.6f} kg CO‚ÇÇ\n")

STEP 7: Training student model (TRACKED)
üöÄ Starting student training with emission tracking...



Epoch,Training Loss,Validation Loss,F1,F1 Macro,F1 Micro,Jaccard Samples
1,0.2485,0.307931,0.713733,0.684342,0.717902,0.470843
2,0.2476,0.309121,0.714536,0.688455,0.721132,0.482274
3,0.2401,0.301949,0.725383,0.696482,0.72563,0.496055
4,0.2328,0.291802,0.728531,0.702502,0.734148,0.491977
5,0.2196,0.293441,0.729901,0.703989,0.727358,0.496923
6,0.2096,0.286404,0.734814,0.711123,0.740458,0.497571
7,0.1958,0.285606,0.736218,0.711846,0.73691,0.497894
8,0.1874,0.279428,0.740451,0.715982,0.744809,0.504788
9,0.1745,0.281267,0.736903,0.716027,0.740494,0.499207
10,0.1685,0.277723,0.744269,0.721504,0.747718,0.507763



‚úÖ Student training completed in 10.9 min
üå± Training emissions: 0.001097 kg CO‚ÇÇ



In [None]:
# ==============================================================================
# STEP 8: Student inference and threshold optimization (TRACKED)
# ==============================================================================
print("="*80)
print("STEP 8: Student inference and threshold optimization (TRACKED)")
print("="*80)

# FIX 1: Explicitly move the model to the GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
student_trainer.model.to(device)
print(f"‚úÖ Model moved to: {device}")

student_tracker.start()

# FIX 2: Use the existing 'test_ds_student' dataset object
# (It's faster and ensures 'teacher_logits' are present)
student_preds = student_trainer.predict(test_ds_student)

student_probs = 1 / (1 + np.exp(-student_preds.predictions))
student_y_true = student_preds.label_ids

# Use same threshold optimization strategy as teacher
student_best_thresholds = []

print("üéØ Optimizing student thresholds...\n")
print("="*80)
print(f"{'Label':<25} {'Thresh':>8} {'F1':>8} {'Supp':>6}")
print("-"*80)

for i in range(33):
    label_name = LABEL_COLUMNS[i]
    support = int(student_y_true[:, i].sum())
    weight = weights[i]

    # Use weight-aware strategy
    if weight > 10.0:
        search_range = np.arange(0.1, 0.8, 0.03)
    elif weight > 5.0:
        search_range = np.arange(0.15, 0.85, 0.05)
    else:
        search_range = np.arange(0.25, 0.75, 0.05)

    best_f1, best_t = 0, 0.5
    for t in search_range:
        preds_binary = (student_probs[:, i] > t).astype(float)
        f1 = f1_score(student_y_true[:, i], preds_binary, zero_division=0)
        if f1 > best_f1:
            best_f1, best_t = f1, t

    student_best_thresholds.append(best_t)
    print(f"{label_name:<25} {best_t:>8.2f} {best_f1:>8.3f} {support:>6d}")

student_inference_emissions = student_tracker.stop()

print("="*80 + "\n")

STEP 8: Student inference and threshold optimization (TRACKED)
‚úÖ Model moved to: cuda


üéØ Optimizing student thresholds...

Label                       Thresh       F1   Supp
--------------------------------------------------------------------------------
acne                          0.70    0.776    215
eye_contour                   0.70    0.873    101
homogeneity                   0.40    0.573     91
lack_firmness                 0.45    0.812    193
lack_radiance                 0.65    0.767    233
pores                         0.55    0.774    207
fine_lines                    0.65    0.878    335
wrinkles_fine-lines           0.70    0.842    282
eye-wrinkles                  0.55    0.848    252
undereye-bags                 0.60    0.750     68
generic                       0.65    0.570    253
18-34                         0.50    0.780     92
35-54                         0.65    0.795    210
55-99                         0.70    0.737     93
dark_pigmentation             0.65    0.748    111
dry                           0.45    0.760    197
normal       

In [None]:
# ======================
# STEP 9: STUDENT EVALUATION
# ======================
print("="*80)
print("STEP 9: Final student evaluation")
print("="*80)

student_final_preds = np.array([
    (student_probs[:, i] > student_best_thresholds[i]).astype(float)
    for i in range(33)
]).T

student_f1_w = f1_score(student_y_true, student_final_preds, average='weighted', zero_division=0)
student_f1_ma = f1_score(student_y_true, student_final_preds, average='macro', zero_division=0)
student_f1_mi = f1_score(student_y_true, student_final_preds, average='micro', zero_division=0)
student_jac = float(jaccard_score(student_y_true, student_final_preds, average="samples"))

print("\nüéâ STUDENT MODEL RESULTS:")
print("="*80)
print(f"üéØ Weighted F1: {student_f1_w*100:.2f}%")
print(f"üìä Macro F1:    {student_f1_ma*100:.2f}%")
print(f"üìà Micro F1:    {student_f1_mi*100:.2f}%")
print(f"üìà Jaccard:     {student_jac*100:.2f}%")
print("="*80 + "\n")

STEP 9: Final student evaluation

üéâ STUDENT MODEL RESULTS:
üéØ Weighted F1: 76.26%
üìä Macro F1:    74.52%
üìà Micro F1:    76.36%
üìà Jaccard:     52.63%



In [None]:
# ======================
# STEP 10: COMPARISON ANALYSIS
# ======================
print("="*80)
print("STEP 10: Teacher vs Student comparison")
print("="*80)

# Teacher Metrics
f1_w = 0.7817           # Teacher Weighted F1
f1_ma = 0.7698          # Teacher Macro F1
f1_mi = 0.7821          # Teacher Micro F1
test_jac = 0.5579       # Teacher Jaccard Score

training_emissions = 0.031781  # Teacher Emissions (kg)
teacher_train_time = 214 * 30   # Teacher Time in seconds (e.g., 30 mins)

f1_retention = (student_f1_ma / f1_ma) * 100 if f1_ma > 0 else 0
emissions_reduction = ((training_emissions - student_training_emissions) / training_emissions) * 100 if training_emissions > 0 else 0
time_reduction = ((teacher_train_time - student_result.metrics['train_runtime']) / teacher_train_time) * 100

print("\nüìä PERFORMANCE COMPARISON:")
print("-"*80)
print(f"{'Metric':<25} {'Teacher':>12} {'Student':>12} {'Retention':>12}")
print("-"*80)
print(f"{'Weighted F1':<25} {f1_w*100:>11.2f}% {student_f1_w*100:>11.2f}% {(student_f1_w/f1_w)*100:>11.1f}%")
print(f"{'Macro F1':<25} {f1_ma*100:>11.2f}% {student_f1_ma*100:>11.2f}% {f1_retention:>11.1f}%")
print(f"{'Micro F1':<25} {f1_mi*100:>11.2f}% {student_f1_mi*100:>11.2f}% {(student_f1_mi/f1_mi)*100:>11.1f}%")
print(f"{'Jaccard':<25} {test_jac*100:>11.2f}% {student_jac*100:>11.2f}% {(student_jac/test_jac)*100:>11.1f}%")
print("-"*80 + "\n")

print("üå± SUSTAINABILITY COMPARISON:")
print("-"*80)
print(f"{'Metric':<25} {'Teacher':>15} {'Student':>15} {'Reduction':>12}")
print("-"*80)
print(f"{'Training emissions':<25} {training_emissions:>14.6f} {student_training_emissions:>14.6f} {emissions_reduction:>11.1f}%")
print(f"{'Training time (min)':<25} {teacher_train_time/60:>14.1f} {student_result.metrics['train_runtime']/60:>14.1f} {time_reduction:>11.1f}%")
print(f"{'Model parameters':<25} {teacher_params:>15,} {student_params:>15,} {compression_ratio:>11.1f}%")
print("-"*80 + "\n")

STEP 10: Teacher vs Student comparison

üìä PERFORMANCE COMPARISON:
--------------------------------------------------------------------------------
Metric                         Teacher      Student    Retention
--------------------------------------------------------------------------------
Weighted F1                     78.17%       76.26%        97.6%
Macro F1                        76.98%       74.52%        96.8%
Micro F1                        78.21%       76.36%        97.6%
Jaccard                         55.79%       52.63%        94.3%
--------------------------------------------------------------------------------

üå± SUSTAINABILITY COMPARISON:
--------------------------------------------------------------------------------
Metric                            Teacher         Student    Reduction
--------------------------------------------------------------------------------
Training emissions              0.031781       0.001097        96.5%
Training time (min)         

In [None]:
# ======================
# STEP 11: MODEL QUANTIZATION
# ======================
print("="*80)
print("STEP 11: Quantizing student model for production")
print("="*80)

# Dynamic INT8 quantization
student_model_cpu = student_model.cpu()
quantized_model = torch.quantization.quantize_dynamic(
    student_model_cpu,
    {nn.Linear},
    dtype=torch.qint8
)

print("‚úÖ Model quantized to INT8\n")

STEP 11: Quantizing student model for production
‚úÖ Model quantized to INT8



In [None]:
# ======================
# STEP 12: SAVE EVERYTHING
# ======================
print("="*80)
print("STEP 12: Saving student model artifacts")
print("="*80)

# Save quantized model
os.makedirs("./DistilRoBERTa_final", exist_ok=True)
torch.save(quantized_model.state_dict(), "./DistilRoBERTa_final/quantized_model.pth")
student_tokenizer.save_pretrained("./DistilRoBERTa_final")
np.save("./DistilRoBERTa_final/student_thresholds.npy", student_best_thresholds)

# Save comparison results
comparison_results = pd.DataFrame([{
    'model': 'Teacher (RoBERTa-large)',
    'parameters': teacher_params,
    'weighted_f1': f1_w,
    'macro_f1': f1_ma,
    'micro_f1': f1_mi,
    'jaccard': test_jac,
    'training_emissions_kg': training_emissions,
    'training_time_min': teacher_train_time/60
}, {
    'model': 'Student (MiniLM-L12)',
    'parameters': student_params,
    'weighted_f1': student_f1_w,
    'macro_f1': student_f1_ma,
    'micro_f1': student_f1_mi,
    'jaccard': student_jac,
    'training_emissions_kg': student_training_emissions,
    'training_time_min': student_result.metrics['train_runtime']/60
}])

comparison_results.to_csv("./DistilRoBERTa_final/teacher_student_comparison.csv", index=False)

# Model size comparison
try:
    teacher_size = sum(os.path.getsize(os.path.join("./models/RoBERTa_final", f))
                       for f in os.listdir("./models/RoBERTa_final")
                       if os.path.isfile(os.path.join("./models/RoBERTa_final", f)))
    student_size = os.path.getsize("./DistilRoBERTa_final/quantized_model.pth")

    print(f"üíæ Model sizes:")
    print(f"   Teacher (unquantized): {teacher_size / (1024**2):.1f} MB")
    print(f"   Student (quantized): {student_size / (1024**2):.1f} MB")
    print(f"   Size reduction: {(1 - student_size/teacher_size)*100:.1f}%\n")
except Exception as e:
    print(f"‚ö†Ô∏è  Could not calculate model sizes: {e}\n")

print("‚úÖ All artifacts saved to ./DistilRoBERTa_final/\n")

STEP 12: Saving student model artifacts
üíæ Model sizes:
   Teacher (unquantized): 1360.4 MB
   Student (quantized): 190.2 MB
   Size reduction: 86.0%

‚úÖ All artifacts saved to ./DistilRoBERTa_final/



In [None]:
# ======================
# FINAL SUMMARY
# ======================
print("="*80)
print("üéâ KNOWLEDGE DISTILLATION COMPLETE")
print("="*80)
print(f"""
‚ú® ACHIEVEMENTS:
   ‚Ä¢ F1 Score retention: {f1_retention:.1f}%
   ‚Ä¢ Emissions reduction: {emissions_reduction:.1f}%
   ‚Ä¢ Model size reduction: {compression_ratio:.1f}%
   ‚Ä¢ Training time reduction: {time_reduction:.1f}%

üå± SUSTAINABILITY IMPACT:
   ‚Ä¢ Student emissions: {student_training_emissions:.6f} kg CO‚ÇÇ
   ‚Ä¢ Suitable for production deployment
   ‚Ä¢ Fast inference on CPU
""")

üéâ KNOWLEDGE DISTILLATION COMPLETE

‚ú® ACHIEVEMENTS:
   ‚Ä¢ F1 Score retention: 96.8%
   ‚Ä¢ Emissions reduction: 96.5%
   ‚Ä¢ Model size reduction: 76.9%
   ‚Ä¢ Training time reduction: 89.8%

üå± SUSTAINABILITY IMPACT:
   ‚Ä¢ Student emissions: 0.001097 kg CO‚ÇÇ
   ‚Ä¢ Suitable for production deployment
   ‚Ä¢ Fast inference on CPU

