In [None]:
import pandas as pd
import numpy as np
import re
import warnings
import time
import os

from sklearn.metrics import f1_score, precision_score, recall_score, jaccard_score
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from codecarbon import EmissionsTracker

In [None]:
warnings.filterwarnings('ignore')

In [None]:
print(f"‚úÖ PyTorch: {torch.__version__}")
print(f"‚úÖ CUDA: {torch.cuda.is_available()}")
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"‚úÖ Device: {device}\n")

‚úÖ PyTorch: 2.8.0+cu126
‚úÖ CUDA: True
‚úÖ Device: cuda



In [None]:
# ======================
# LOAD DATA
# ======================
print("üìÇ Loading data...")
df = pd.read_excel("/kaggle/input/dataset-v1/dataset.xlsx")
df = df.dropna(subset=["text_raw"]).rename(columns={"text_raw": "text"})

# ======================
# PREPROCESSING
# ======================
def advanced_clean(text):
    if pd.isna(text): return ""
    text = str(text)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^\w\s\-\.\,\!\?\%]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower().strip()
    if len(text.split()) < 3: return ""
    return text


LABEL_COLUMNS = [
    "acne", "eye_contour", "homogeneity", "lack_firmness", "lack_radiance",
    "pores", "fine_lines", "wrinkles_fine-lines", "eye-wrinkles", "undereye-bags",
    "generic", "18-34", "35-54", "55-99", "dark_pigmentation", "dry", "normal",
    "oily", "combination", "sensitivity-high", "sensitivity-low", "no_sensitivity",
    "male", "female", "cleanse", "prepare", "treat", "targeted", "care",
    "moisturize", "protect", "day", "night"
]

print("üßπ Preprocessing...")

# ======================
# DATA PRUNING
# ======================
MAX_LABELS_PER_SAMPLE = 20  # more than 60% of labels

# Ensure labels are numeric and NaNs are treated as 0
label_matrix = df[LABEL_COLUMNS].fillna(0).astype(int)

# Count active labels per sample
labels_per_sample = label_matrix.sum(axis=1)

before_n = len(df)
removed_mask = labels_per_sample > MAX_LABELS_PER_SAMPLE
removed_n = int(removed_mask.sum())
removed_pct = (100.0 * removed_n / before_n) if before_n > 0 else 0.0

# Prune
df = df.loc[~removed_mask].copy().reset_index(drop=True)
label_matrix = label_matrix.loc[df.index]

print(f"Pruning rule: keep samples with ‚â§ {MAX_LABELS_PER_SAMPLE} labels")
print(f"Before: {before_n} | Removed: {removed_n} ({removed_pct:.2f}%) | After: {len(df)}")
print("Labels/sample (after) - mean:", round(label_matrix.sum(axis=1).mean(), 3))
print("Labels/sample (after) - max:", int(label_matrix.sum(axis=1).max()))

df["cleaned_text"] = df["text"].apply(advanced_clean)
df = df[df["cleaned_text"].str.len() > 0]
df["labels"] = df[LABEL_COLUMNS].apply(lambda row: row.tolist(), axis=1)
print(f"   Samples: {len(df)}\n")

üìÇ Loading data...
üßπ Preprocessing...
Pruning rule: keep samples with ‚â§ 20 labels
Before: 6240 | Removed: 16 (0.26%) | After: 6224
Labels/sample (after) - mean: 3.965
Labels/sample (after) - max: 33
   Samples: 6224



In [None]:
# ======================
# OVERSAMPLE RARE AGE & GENDER LABELS
# ======================
print("üîÑ Oversampling rare age & gender labels...")

# Define age and gender labels
AGE_LABELS = ["18-34", "35-54", "55-99"]
GENDER_LABELS = ["male", "female"]
RARE_LABELS = AGE_LABELS + GENDER_LABELS

# Calculate support for each label
label_counts = df[RARE_LABELS].sum()
print("\nBefore oversampling:")
print(label_counts)

# Define oversampling strategy
TARGET_MIN_SAMPLES = 500  # Target minimum samples per label
oversample_factor = {}

for label in RARE_LABELS:
    count = label_counts[label]
    if count < TARGET_MIN_SAMPLES and count > 0:
        oversample_factor[label] = int(TARGET_MIN_SAMPLES / count)
    else:
        oversample_factor[label] = 0

print(f"\nOversampling factors: {oversample_factor}")

# Perform oversampling
df_original = df.copy()
samples_to_add = []

for label, factor in oversample_factor.items():
    if factor > 1:
        # Get samples with this label
        label_samples = df[df[label] == 1]

        # Replicate (factor - 1) times (we already have original)
        for _ in range(factor - 1):
            samples_to_add.append(label_samples)

        print(f"   {label}: {len(label_samples)} ‚Üí {len(label_samples) * factor}")

# Concatenate all samples
if samples_to_add:
    df = pd.concat([df] + samples_to_add, ignore_index=True)

    # Shuffle to avoid clustering
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"\nDataset size: {len(df_original)} ‚Üí {len(df)} (+{len(df) - len(df_original)})")
print("\nAfter oversampling:")
print(df[RARE_LABELS].sum())
print()

üîÑ Oversampling rare age & gender labels...

Before oversampling:
18-34     182
35-54     800
55-99     262
male      335
female    113
dtype: int64

Oversampling factors: {'18-34': 2, '35-54': 0, '55-99': 1, 'male': 1, 'female': 4}
   18-34: 182 ‚Üí 364
   female: 113 ‚Üí 452

Dataset size: 6224 ‚Üí 6745 (+521)

After oversampling:
18-34      430
35-54     1038
55-99      424
male       350
female     474
dtype: int64



In [None]:
# ======================
# CLASS WEIGHTS
# ======================
y = np.array(df[LABEL_COLUMNS].values, dtype=np.float32)
pos = y.sum(axis=0)
weights = []
for i in range(len(LABEL_COLUMNS)):
    if pos[i] < 50: w = min(20.0, (len(df) - pos[i]) / (pos[i] + 1))  # Increased from 15
    elif pos[i] < 100: w = min(12.0, (len(df) - pos[i]) / (pos[i] + 1))  # NEW bracket
    elif pos[i] < 200: w = min(8.0, (len(df) - pos[i]) / (pos[i] + 1))
    else: w = min(3.0, (len(df) - pos[i]) / (pos[i] + 1))
    weights.append(w)

class_weights = torch.tensor(weights, dtype=torch.float32).to(device)
print(f"‚öñÔ∏è  Weights: {min(weights):.2f} - {max(weights):.2f}\n")

‚öñÔ∏è  Weights: 1.60 - 12.00



In [None]:
# ======================
# SPLIT
# ======================
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df[LABEL_COLUMNS[0]])
print(f"‚úÇÔ∏è  Train: {len(train_df)} | Test: {len(test_df)}\n")

train_ds = Dataset.from_pandas(train_df[["cleaned_text", "labels"]].reset_index(drop=True))
test_ds = Dataset.from_pandas(test_df[["cleaned_text", "labels"]].reset_index(drop=True))

‚úÇÔ∏è  Train: 5396 | Test: 1349



In [None]:
# ======================
# DEFINE EMISSIONS TRACKER
# ======================
os.makedirs("/kaggle/working/EmissionsTracker", exist_ok=True)

tracker = EmissionsTracker(
    project_name="roberta_large_eval",
    output_dir="/kaggle/working/EmissionsTracker",
    log_level="error"
)

In [None]:
# ======================
# MODEL CONFIGURATION
# ======================
MODEL_NAME = "roberta-large"
MAX_LENGTH = 256
BATCH_SIZE = 4
GRAD_ACCUM = 8
EPOCHS = 25
LEARNING_RATE = 3e-5

print(f"üéØ MODEL CONFIGURATION:")
print(f"   Model: {MODEL_NAME}")
print(f"   Epochs: {EPOCHS}")
print(f"   Effective batch: {BATCH_SIZE * GRAD_ACCUM}")
print(f"   Learning rate: {LEARNING_RATE}")
print(f"   Max length: {MAX_LENGTH}\n")

üéØ MODEL CONFIGURATION:
   Model: roberta-large
   Epochs: 25
   Effective batch: 32
   Learning rate: 3e-05
   Max length: 256



In [None]:
# ======================
# TOKENIZE
# ======================
print("üî§ Tokenizing...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(x):
    return tokenizer(x["cleaned_text"], padding="max_length", truncation=True, max_length=MAX_LENGTH)

train_tok = train_ds.map(tokenize, batched=True, remove_columns=["cleaned_text"])
test_tok = test_ds.map(tokenize, batched=True, remove_columns=["cleaned_text"])
train_tok.set_format("torch")
test_tok.set_format("torch")
print("‚úÖ Done\n")

üî§ Tokenizing...


Map:   0%|          | 0/5396 [00:00<?, ? examples/s]

Map:   0%|          | 0/1349 [00:00<?, ? examples/s]

   ‚úÖ Done



In [None]:
# ======================
# FOCAL LOSS
# ======================
class FocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=2.0, pos_weight=None):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.pos_weight = pos_weight

    def forward(self, inputs, targets):
        bce = nn.functional.binary_cross_entropy_with_logits(
            inputs, targets, reduction='none', pos_weight=self.pos_weight
        )
        pt = torch.exp(-bce)
        return (self.alpha * (1 - pt) ** self.gamma * bce).mean()

class AdvancedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels").float()
        outputs = model(**inputs)
        loss = FocalLoss(alpha=0.25, gamma=2.0, pos_weight=class_weights)(outputs.logits, labels)
        return (loss, outputs) if return_outputs else loss

def compute_metrics(p):
    pred, labels = p
    probs = 1 / (1 + np.exp(-pred))
    binary = (probs > 0.5).astype(float)
    return {
        'f1': f1_score(labels, binary, average='weighted', zero_division=0),
        'f1_macro': f1_score(labels, binary, average='macro', zero_division=0),
        'f1_micro': f1_score(labels, binary, average='micro', zero_division=0),
        "jaccard_samples": jaccard_score(labels, binary, average="samples", zero_division=0)
    }

In [None]:
# ======================
# LOAD MODEL
# ======================
print("ü§ñ Loading RoBERTa-large...")
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=33, problem_type="multi_label_classification", ignore_mismatched_sizes=True
)
print(f"   Parameters: {sum(p.numel() for p in model.parameters()):,}\n")

# ======================
# TRAINING ARGS
# ======================
args = TrainingArguments(
    output_dir="./roberta_final",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=GRAD_ACCUM,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    warmup_ratio=0.2,              # NEW: More warmup
    max_grad_norm=1.0,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_steps=50,
    save_total_limit=1,
    fp16=True,
    report_to="none",
    seed=42,
)

ü§ñ Loading RoBERTa-large...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


   Parameters: 355,393,569



In [None]:
# ======================
# TRAIN THE MODEL AND SUSTAINIBILITY TRACKING (TRAINING)
# ======================
print("="*80)
print("üöÄ STARTING MODEL TRAINING")
print("="*80)

trainer = AdvancedTrainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=test_tok,
    compute_metrics=compute_metrics,
)

# START sustainability tracking (training)
train_start_time = time.time()
tracker.start()

result = trainer.train()

training_emissions = tracker.stop()
train_end_time = time.time()

print(f"\n‚úÖ Completed {EPOCHS} epochs in {result.metrics['train_runtime']/60:.1f} min\n")

üöÄ STARTING MODEL TRAINING


Epoch,Training Loss,Validation Loss,F1,F1 Macro,F1 Micro,Jaccard Samples
1,0.0799,0.065602,0.440556,0.351467,0.447444,0.2529
2,0.0533,0.04571,0.632664,0.602616,0.628894,0.406132
3,0.0414,0.03934,0.667863,0.638685,0.653598,0.431751
4,0.0373,0.037679,0.670165,0.637283,0.655429,0.439792
5,0.0339,0.036461,0.699559,0.677153,0.692503,0.473465
6,0.0288,0.036456,0.669061,0.637134,0.649327,0.446402
7,0.0243,0.034552,0.729194,0.707707,0.72312,0.506547
8,0.0206,0.035498,0.737938,0.720626,0.733348,0.51075
9,0.0165,0.035487,0.73124,0.713749,0.726313,0.504944
10,0.0143,0.035831,0.743977,0.729096,0.740997,0.520806



‚úÖ Completed 25 epochs in 214.0 min



In [None]:
# ======================
# STORE WEIGHTS FOR THRESHOLD TUNING
# ======================
# Save weights for use during threshold optimization
weights_dict = {label: weight for label, weight in zip(LABEL_COLUMNS, weights)}
print("\nüìä High-weight labels (will get special threshold tuning):")
high_weight_labels = [(label, w) for label, w in weights_dict.items() if w > 5.0]
for label, w in sorted(high_weight_labels, key=lambda x: x[1], reverse=True):
    print(f"   {label}: {w:.2f}")
print()


üìä High-weight labels (will get special threshold tuning):
   no_sensitivity: 12.00



In [None]:
# ======================
# WEIGHT-AWARE THRESHOLD OPTIMIZATION
# ======================
print("üéØ Weight-aware threshold optimization...\n")

inf_end_time = time.time()
tracker.start()
preds = trainer.predict(test_tok)
probs = 1 / (1 + np.exp(-preds.predictions))
y_true = preds.label_ids

best_thresholds = []
print("="*80)
print(f"{'Label':<25} {'Thresh':>8} {'F1':>8} {'Prec':>8} {'Rec':>8} {'Supp':>6} {'Wt':>6}")
print("-"*80)

for i in range(33):
    label_name = LABEL_COLUMNS[i]
    support = int(y_true[:, i].sum())
    weight = weights[i]

    # Strategy based on weight (which reflects training difficulty)
    if weight > 10.0:
        # Very rare/difficult: aggressive search, favor recall
        search_range = np.arange(0.1, 0.8, 0.03)
        metric_bias = 'recall'  # Accept lower precision for better recall
    elif weight > 5.0:
        # Rare: wide search
        search_range = np.arange(0.15, 0.85, 0.05)
        metric_bias = 'balanced'
    else:
        # Common: standard search
        search_range = np.arange(0.25, 0.75, 0.05)
        metric_bias = 'balanced'

    best_f1, best_t = 0, 0.5
    best_prec, best_rec = 0, 0

    for t in search_range:
        preds_binary = (probs[:, i] > t).astype(float)
        f1 = f1_score(y_true[:, i], preds_binary, zero_division=0)

        # For very rare labels, boost recall importance
        if metric_bias == 'recall' and support < 30:
            rec = recall_score(y_true[:, i], preds_binary, zero_division=0)
            # Weighted metric: 70% recall, 30% precision
            prec = precision_score(y_true[:, i], preds_binary, zero_division=0)
            weighted_f1 = 0.7 * rec + 0.3 * prec
            if weighted_f1 > best_f1:
                best_f1, best_t = f1, t
                best_prec, best_rec = prec, rec
        else:
            if f1 > best_f1:
                best_f1, best_t = f1, t
                best_prec = precision_score(y_true[:, i], preds_binary, zero_division=0)
                best_rec = recall_score(y_true[:, i], preds_binary, zero_division=0)

    best_thresholds.append(best_t)
    print(f"{label_name:<25} {best_t:>8.2f} {best_f1:>8.3f} {best_prec:>8.3f} {best_rec:>8.3f} {support:>6d} {weight:>6.2f}")

inference_emissions = tracker.stop()
inf_end_time = time.time()

print("="*80 + "\n")

üéØ Weight-aware threshold optimization...



Label                       Thresh       F1     Prec      Rec   Supp     Wt
--------------------------------------------------------------------------------
acne                          0.70    0.774    0.854    0.707    215   3.00
eye_contour                   0.65    0.889    0.907    0.871    101   3.00
homogeneity                   0.65    0.632    0.663    0.604     91   3.00
lack_firmness                 0.60    0.841    0.832    0.850    193   3.00
lack_radiance                 0.50    0.784    0.782    0.785    233   3.00
pores                         0.55    0.833    0.861    0.807    207   3.00
fine_lines                    0.65    0.882    0.861    0.904    335   3.00
wrinkles_fine-lines           0.70    0.870    0.901    0.840    282   3.00
eye-wrinkles                  0.65    0.835    0.848    0.821    252   3.00
undereye-bags                 0.60    0.835    0.817    0.853     68   3.00
generic                       0.65    0.575    0.672    0.502    253   3.00
18-34  

In [None]:
# ======================
# WEIGHT-THRESHOLD CORRELATION ANALYSIS
# ======================
print("üìä WEIGHT-THRESHOLD ANALYSIS")
print("="*80)

analysis_df = pd.DataFrame({
    'label': LABEL_COLUMNS,
    'weight': weights,
    'threshold': best_thresholds,
    'support': [int(y_true[:, i].sum()) for i in range(33)]
})

# Group by weight ranges
print("\nAverage thresholds by weight range:")
print("-"*80)
analysis_df['weight_range'] = pd.cut(analysis_df['weight'],
                                      bins=[0, 3, 5, 10, 20],
                                      labels=['Low (0-3)', 'Med (3-5)', 'High (5-10)', 'Very High (10+)'])

summary = analysis_df.groupby('weight_range', observed=True)[['threshold', 'support']].agg({
    'threshold': ['mean', 'min', 'max'],
    'support': 'mean'
}).round(3)

print(summary)

print("\n" + "="*80)
print("üí° INSIGHTS:")
print(f"   ‚Ä¢ High-weight labels (>5) avg threshold: {analysis_df[analysis_df['weight'] > 5]['threshold'].mean():.3f}")
print(f"   ‚Ä¢ Low-weight labels (‚â§5) avg threshold: {analysis_df[analysis_df['weight'] <= 5]['threshold'].mean():.3f}")
print(f"   ‚Ä¢ Labels with threshold < 0.3: {len(analysis_df[analysis_df['threshold'] < 0.3])}")
print(f"   ‚Ä¢ Labels with threshold > 0.7: {len(analysis_df[analysis_df['threshold'] > 0.7])}")
print(f"   ‚Ä¢ Threshold range: {analysis_df['threshold'].min():.2f} - {analysis_df['threshold'].max():.2f}")
print("="*80 + "\n")

üìä WEIGHT-THRESHOLD ANALYSIS

Average thresholds by weight range:
--------------------------------------------------------------------------------
                threshold              support
                     mean   min   max     mean
weight_range                                  
Low (0-3)           0.553  0.25  0.70  184.844
Very High (10+)     0.340  0.34  0.34   14.000

üí° INSIGHTS:
   ‚Ä¢ High-weight labels (>5) avg threshold: 0.340
   ‚Ä¢ Low-weight labels (‚â§5) avg threshold: 0.553
   ‚Ä¢ Labels with threshold < 0.3: 1
   ‚Ä¢ Labels with threshold > 0.7: 0
   ‚Ä¢ Threshold range: 0.25 - 0.70



In [None]:
# ======================
# FINAL PREDICTIONS
# ======================
final_preds = np.array([(probs[:, i] > best_thresholds[i]).astype(float) for i in range(33)]).T

f1_w = f1_score(y_true, final_preds, average='weighted', zero_division=0)
f1_ma = f1_score(y_true, final_preds, average='macro', zero_division=0)
f1_mi = f1_score(y_true, final_preds, average='micro', zero_division=0)
test_jac = float(jaccard_score(y_true, final_preds, average="samples"))

# ======================
# MODEL RESULTS
# ======================
print("="*80)
print("üéâ FINAL RESULTS - MODEL")
print("="*80)
print(f"üéØ Weighted F1: {f1_w*100:.2f}%")
print(f"üìä Macro F1:    {f1_ma*100:.2f}%")
print(f"üìà Micro F1:    {f1_mi*100:.2f}%")
print(f"üìà Jaccard Score:    {test_jac*100:.2f}%")
print(f"‚è±Ô∏è  Time: {result.metrics['train_runtime']/60:.1f} min")
print(f"üì¶ Epochs: {EPOCHS}")

üéâ FINAL RESULTS - MODEL
üéØ Weighted F1: 78.17%
üìä Macro F1:    76.98%
üìà Micro F1:    78.21%
üìà Jaccard Score:    55.79%
‚è±Ô∏è  Time: 214.0 min
üì¶ Epochs: 25


In [None]:
# ======================
# SUSTAINABILITY METRICS
# ======================
co2_per_epoch = training_emissions / EPOCHS
co2_per_1k_train = training_emissions / (len(train_df) / 1000)
co2_per_f1 = training_emissions / f1_w

num_params = sum(p.numel() for p in model.parameters())
model_size_mb = num_params * 4 / (1024 ** 2)  # fp32

kg_co2_per_min = training_emissions / (result.metrics["train_runtime"] / 60)

sustainability_score = (
    training_emissions * 0.6 +
    inference_emissions * 0.25 +
    (1 - f1_w) * 0.15
)

# ======================
# SUSTAINABILITY RESULTS
# ======================
print("=" * 80)
print("SUSTAINABILITY REPORT ‚Äî ROBERTA-LARGE")
print("=" * 80)

print(f"Sustainability Score: {sustainability_score:.6f}")
print(f"Training CO2 (kg): {training_emissions:.6f}")
print(f"Inference CO2 (kg): {inference_emissions:.6f}")
print(f"CO2 per epoch (kg): {co2_per_epoch:.6f}")
print(f"CO2 per 1k training samples (kg): {co2_per_1k_train:.6f}")
print(f"CO2 per F1 point: {co2_per_f1:.6f}")
print(f"Emissions per minute (kg): {kg_co2_per_min:.6f}")

print(f"Model parameters: {num_params:,}")
print(f"Model size (MB): {model_size_mb:.2f}")
print("=" * 80)

SUSTAINABILITY REPORT ‚Äî ROBERTA-LARGE
Sustainability Score: 0.059806
Training CO2 (kg): 0.031781
Inference CO2 (kg): 0.031937
CO2 per epoch (kg): 0.001271
CO2 per 1k training samples (kg): 0.005890
CO2 per F1 point: 0.040659
Emissions per minute (kg): 0.000148
Model parameters: 355,393,569
Model size (MB): 1355.72


In [None]:
# ======================
# SAVE
# ======================
print("üíæ Saving...")
trainer.save_model("./ML_256_EP_25")
tokenizer.save_pretrained("./ML_256_EP_25")
np.save("best_thresholds_ML_256_EP_25.npy", best_thresholds)

results = pd.DataFrame([{
    'weighted_f1': f1_w,
    'macro_f1': f1_ma,
    'micro_f1': f1_mi,
    'epochs': EPOCHS,
    'learning_rate': LEARNING_RATE,
    'training_minutes': result.metrics['train_runtime']/60
}])
results.to_csv("final_results_ML_256_EP_25.csv", index=False)
print("   ‚úÖ All outputs saved\n")

üíæ Saving...
   ‚úÖ All outputs saved



In [None]:
# ======================
# DETAILED REPORT
# ======================
print("="*80)
print("PER-LABEL BREAKDOWN")
print("="*80)
print(f"{'Label':<25} {'F1':>8} {'Prec':>8} {'Recall':>8} {'Thresh':>8} {'Supp':>6}")
print("-"*80)

for i in range(33):
    f1 = f1_score(y_true[:, i], final_preds[:, i], zero_division=0)
    prec = precision_score(y_true[:, i], final_preds[:, i], zero_division=0)
    rec = recall_score(y_true[:, i], final_preds[:, i], zero_division=0)
    print(f"{LABEL_COLUMNS[i]:<25} {f1:>8.3f} {prec:>8.3f} {rec:>8.3f} {best_thresholds[i]:>8.2f} {int(y_true[:, i].sum()):>6d}")

print("="*80 + "\n")

print(f"""
TRAINING SUMMARY:
- Full {EPOCHS} epochs completed
- Lower learning rate ({LEARNING_RATE}) for fine-tuning
- Larger effective batch ({BATCH_SIZE * GRAD_ACCUM})
- Advanced threshold optimization (0.5 step size)

FINAL F1: {f1_w*100:.2f}%
""")

PER-LABEL BREAKDOWN
Label                           F1     Prec   Recall   Thresh   Supp
--------------------------------------------------------------------------------
acne                         0.774    0.854    0.707     0.70    215
eye_contour                  0.889    0.907    0.871     0.65    101
homogeneity                  0.632    0.663    0.604     0.65     91
lack_firmness                0.841    0.832    0.850     0.60    193
lack_radiance                0.784    0.782    0.785     0.50    233
pores                        0.833    0.861    0.807     0.55    207
fine_lines                   0.882    0.861    0.904     0.65    335
wrinkles_fine-lines          0.870    0.901    0.840     0.70    282
eye-wrinkles                 0.835    0.848    0.821     0.65    252
undereye-bags                0.835    0.817    0.853     0.60     68
generic                      0.575    0.672    0.502     0.65    253
18-34                        0.842    0.911    0.783     0.65     92
35