In [None]:
!pip install transformers accelerate datasets peft -U

In [None]:
import os
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (
    DistilBertForSequenceClassification,
    DistilBertTokenizerFast,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, get_peft_model, TaskType
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import EarlyStoppingCallback

In [None]:
# Switch off the Weights & Biases integration through its environment flag.
os.environ.update({"WANDB_DISABLED": "true"})

# Checkpoint identifier used for both the tokenizer and the classifier.
MODEL_NAME = "bhadresh-savani/distilbert-base-uncased-emotion"
# Path of the labelled CSV read by the data-loading cell.
FILE_NAME = '/combined_emotion.csv'
# NOTE(review): no cell visible in this notebook reads OUTPUT_DIR — confirm it is still needed.
OUTPUT_DIR = "./peft_finetuned_emotion_model"
# Number of classes passed to the sequence-classification head.
NUM_LABELS = 6

In [None]:
def load_and_tokenize_data(file_name, model_name, sample_size=30000, test_size=0.1,
                           max_length=128, seed=42):
    """Load the labelled emotion CSV, clean it, downsample it and tokenize it.

    The CSV must provide an ``emotion`` column (class name) and a ``sentence``
    column (text). Emotion names are lower-cased, stripped and mapped to
    integer class ids; rows whose emotion is not one of the six known classes,
    or whose sentence is missing, are dropped. When more than ``sample_size``
    rows remain, a per-class (stratified) random sample is drawn. The result is
    split into train/eval sets and tokenized to fixed-length inputs.

    Args:
        file_name: Path of the CSV file to read.
        model_name: Checkpoint name or path handed to
            ``DistilBertTokenizerFast.from_pretrained``.
        sample_size: Upper bound on the number of rows kept after cleaning;
            ``None`` disables downsampling.
        test_size: Forwarded to ``Dataset.train_test_split`` as ``test_size``.
        max_length: Every sequence is truncated/padded to this many tokens.
        seed: Seed used for the per-class sampling and for the split.

    Returns:
        ``(tokenized_train_dataset, tokenized_eval_dataset, tokenizer)``; both
        datasets keep only ``input_ids``, ``attention_mask`` and ``label``.
    """
    print("Loading and tokenizing full dataset...")
    df = pd.read_csv(file_name)
    print(len(df))
    df['emotion'] = df['emotion'].str.lower().str.strip()

    # Fold alternate/misspelled class names into the canonical ones.
    label_mapping = {'sad': 'sadness', 'suprise': 'surprise'}
    df['emotion'] = df['emotion'].replace(label_mapping)

    # NOTE(review): this order presumably mirrors the checkpoint's own label
    # ids — verify against the model config before relying on it.
    model_emotions = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
    label_to_id = {label: i for i, label in enumerate(model_emotions)}
    # Inverse mapping; the original code printed it without ever defining it,
    # which raised NameError.
    id_to_label = {i: label for label, i in label_to_id.items()}

    df['label'] = df['emotion'].map(label_to_id)

    # Unknown emotions map to NaN; rows with a missing sentence cannot be
    # tokenized. Drop both without mutating the frame in place.
    df = df.dropna(subset=['label', 'sentence']).reset_index(drop=True)
    df['label'] = df['label'].astype(int)

    print("\n--- Model Class ID Mappings ---")
    print("Label (Class Name) to ID:")
    print(label_to_id)
    print("\nID to Label (Class Name):")
    print(id_to_label)

    df_full_size = len(df)
    print(f"Cleaned training data size before sampling: {df_full_size}")

    if sample_size is not None and df_full_size > sample_size:
        sample_fraction = sample_size / df_full_size
        # Sample each class separately and concatenate. Iterating the groups
        # (instead of groupby.apply) keeps the 'label' column in every piece
        # regardless of how pandas treats grouping columns inside apply.
        per_class_samples = [
            class_rows.sample(int(max(1, len(class_rows) * sample_fraction)), random_state=seed)
            for _, class_rows in df.groupby('label')
        ]
        df = pd.concat(per_class_samples).reset_index(drop=True)

        # The max(1, ...) floor can push the total slightly above the target.
        if len(df) > sample_size:
            df = df.sample(n=sample_size, random_state=seed).reset_index(drop=True)

        print(f"**SUCCESS: Stratified data reduced to EXACTLY {len(df)} samples for CPU stability and quality.**")

    dataset = Dataset.from_pandas(df)
    splits = dataset.train_test_split(test_size=test_size, seed=seed)

    tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

    def tokenize_function(examples):
        """Tokenize a batch of sentences to fixed-length model inputs."""
        return tokenizer(examples['sentence'], truncation=True, padding='max_length', max_length=max_length)

    tokenized_train_dataset = splits['train'].map(tokenize_function, batched=True)
    tokenized_eval_dataset = splits['test'].map(tokenize_function, batched=True)

    # Drop the raw text and helper columns; only these are returned.
    columns_to_keep = ["input_ids", "attention_mask", "label"]
    tokenized_train_dataset = tokenized_train_dataset.select_columns(columns_to_keep)
    tokenized_eval_dataset = tokenized_eval_dataset.select_columns(columns_to_keep)

    return tokenized_train_dataset, tokenized_eval_dataset, tokenizer

# Build the train/eval datasets and the tokenizer from the configured CSV and checkpoint.
(
    tokenized_train_dataset,
    tokenized_eval_dataset,
    tokenizer,
) = load_and_tokenize_data(file_name=FILE_NAME, model_name=MODEL_NAME)
print(f"Full training data size: {len(tokenized_train_dataset)}")

In [None]:
# Load the pretrained classifier; NUM_LABELS sizes its classification head.
model = DistilBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
)

# LoRA adapter configuration for sequence classification.
# NOTE(review): "q_lin"/"v_lin" are presumably DistilBERT's attention
# query/value projections — confirm against the model's module names.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    target_modules=["q_lin", "v_lin"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# Wrap the base model with the adapters and report how many parameters train.
peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()

In [None]:
def compute_metrics(p):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores; the predicted
            class is the argmax over axis 1) and ``label_ids`` (reference
            class ids).

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    y_true = p.label_ids
    y_pred = np.argmax(p.predictions, axis=1)
    # The fourth element of the result is not used here.
    weighted_precision, weighted_recall, weighted_f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted'
    )
    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        f1=weighted_f1,
        precision=weighted_precision,
        recall=weighted_recall,
    )


training_args = TrainingArguments(
    output_dir='./results_best_quality2',
    # Optimisation schedule.
    num_train_epochs=5,
    learning_rate=5e-5,
    weight_decay=0.1,
    # Batching: 8 samples per device, gradients accumulated over 4 steps.
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=64,
    fp16=False,
    # Evaluate and checkpoint once per epoch; reload the checkpoint with the
    # best eval_loss when training ends (also what early stopping monitors).
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    # Logging / reporting.
    logging_steps=100,
    report_to="none",
    disable_tqdm=True,
)

# Early stopping is configured with a patience of one evaluation.
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    compute_metrics=compute_metrics,
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=1),
    ],
)

print("\nStarting PEFT Fine-Tuning (Training only <1% of parameters)...")
trainer.train()

In [None]:
# Folder that receives both the trained model files and the tokenizer files.
SAVE_DIRECTORY = "./emotions_model_assets"

# trainer.model is the PEFT-wrapped model handed to the Trainer.
# NOTE(review): save_pretrained on a PEFT model presumably writes the adapter
# weights rather than a full standalone model — confirm this matches how the
# saved folder will be loaded later.
trainer.model.save_pretrained(save_directory=SAVE_DIRECTORY)
tokenizer.save_pretrained(save_directory=SAVE_DIRECTORY)
print(f"DistilBERT Model and Tokenizer saved to the folder: {SAVE_DIRECTORY}")

In [None]:
FOLDER_NAME = "emotions_model_assets"
ZIP_NAME = "distilbert_emotions_model.zip"
!zip -r $ZIP_NAME $FOLDER_NAME
print(f"Folder successfully zipped as {ZIP_NAME}")

In [None]:
# Colab-only step: google.colab is imported here rather than with the other
# imports, so the earlier cells do not depend on it.
from google.colab import files
# Hand the zip archive named by ZIP_NAME to Colab's file-download helper.
files.download(ZIP_NAME)