In [16]:
import pandas as pd
import os

# --- 1. Define paths to BOTH datasets ---

# Directory holding the metadata CSVs (train_sent_emo.csv, etc.)
metadata_path = '/kaggle/input/meld-emotion-recognition/MELD.Raw/MELD.Raw'

# Directory holding the per-split audio folders (audio_train/, audio_dev/, etc.)
audio_root_path = '/kaggle/input/meld-audio'

# --- 2. Load data from the CSV files ---

all_df = []
total_rows_loaded = 0

print(f"Loading metadata from CSVs in: {metadata_path}")

for split in ['train', 'dev', 'test']:

    # The 'train' CSV sits in its own sub-folder; 'dev' and 'test' are in the base metadata path.
    if split == 'train':
        csv_file_path = os.path.join(metadata_path, 'train', f"{split}_sent_emo.csv")
    else:
        csv_file_path = os.path.join(metadata_path, f"{split}_sent_emo.csv")

    print(f"Attempting to load: {csv_file_path}")

    try:
        df_split = pd.read_csv(csv_file_path)

        # Remember which split each row came from; it selects the audio_<split>/ folder below.
        df_split['split'] = split

        all_df.append(df_split)
        # Print the filename, not the full path, for clarity
        print(f"✓ Loaded {len(df_split)} rows from {split}_sent_emo.csv")
        total_rows_loaded += len(df_split)

    except FileNotFoundError:
        # Best effort: a missing split is reported and the remaining splits are still loaded.
        print(f"Error: Could not find file {csv_file_path}. Please double-check path.")

# --- 3. Combine, Filter, and Save ---

if not all_df:
    print("\nError: No CSV data was loaded. Cannot proceed.")
else:
    # Combine all DataFrames (train, dev, test) into one
    df = pd.concat(all_df, ignore_index=True)
    print(f"\nTotal rows loaded from all CSVs: {len(df)}")

    # Both ID columns are required to derive the .wav file names.
    if 'Dialogue_ID' not in df.columns or 'Utterance_ID' not in df.columns:
        print("\nError: The loaded CSVs are missing 'Dialogue_ID' or 'Utterance_ID'.")
        print(f"Available columns are: {list(df.columns)}")
    else:
        # Build <audio_root>/audio_<split>/dia<D>_utt<U>.wav for every row.
        # A plain comprehension over the three columns avoids the per-row Series
        # construction of DataFrame.apply(axis=1) and yields the same strings.
        df['audio_path'] = [
            os.path.join(
                audio_root_path,
                f"audio_{split_name}",
                f"dia{int(dialogue_id)}_utt{int(utterance_id)}.wav"
            )
            for split_name, dialogue_id, utterance_id in zip(
                df['split'], df['Dialogue_ID'], df['Utterance_ID']
            )
        ]

        # Keep only rows whose .wav file actually exists on disk.
        print(f"Checking for .wav files in: {audio_root_path} (this may take a minute)...")
        audio_exists = df['audio_path'].apply(os.path.exists).astype(bool)
        missing_audio_paths = df.loc[~audio_exists, 'audio_path'].tolist()
        # reset_index keeps the in-memory frame positionally aligned with the saved CSV
        # (and with any embeddings later generated row-by-row from it).
        df = df[audio_exists].reset_index(drop=True)
        print("File check complete.")

        # Make the dropped rows visible instead of losing them silently.
        if missing_audio_paths:
            print(f"Dropped {len(missing_audio_paths)} rows with no matching .wav file, e.g.:")
            for missing_path in missing_audio_paths[:10]:
                print(f"  - {missing_path}")

        # Save the final, complete dataset
        df.to_csv('MELD_complete_dataset.csv', index=False)

        print(f"\n✓ CSV created: {len(df)} samples")
        if len(df) == 0:
            print("\nWarning: 0 audio samples were found.")
            print(f"This means the file names in the CSVs do not match the .wav files.")
        else:
            print(df.head())

Loading metadata from CSVs in: /kaggle/input/meld-emotion-recognition/MELD.Raw/MELD.Raw
Attempting to load: /kaggle/input/meld-emotion-recognition/MELD.Raw/MELD.Raw/train/train_sent_emo.csv
✓ Loaded 9989 rows from train_sent_emo.csv
Attempting to load: /kaggle/input/meld-emotion-recognition/MELD.Raw/MELD.Raw/dev_sent_emo.csv
✓ Loaded 1109 rows from dev_sent_emo.csv
Attempting to load: /kaggle/input/meld-emotion-recognition/MELD.Raw/MELD.Raw/test_sent_emo.csv
✓ Loaded 2610 rows from test_sent_emo.csv

Total rows loaded from all CSVs: 13708
Checking for .wav files in: /kaggle/input/meld-audio (this may take a minute)...
File check complete.

✓ CSV created: 13706 samples
   Sr No.                                          Utterance          Speaker  \
0       1  also I was the point person on my companys tr...         Chandler   
1       2                   You mustve had your hands full.  The Interviewer   
2       3                            That I did. That I did.         Chandler   

In [3]:
!pip install transformers huggingface_hub --upgrade -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m908.2 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m101.2 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m566.1/566.1 kB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m75.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datasets 4.1.1 requires pyarrow>=21.0.0, but you have pyarrow 19.0.1 which is incompatible.
gradio 5.38.1 requires pydantic<2.12,>=2.0, but you have pydantic 2.12.0a1 which is incompatible.[0m[31m
[0m

In [1]:
import pandas as pd
import numpy as np
import re
import torch
# --- MODIFICATION ---
# Using Auto* classes is the most robust way.
# They read the checkpoint's config.json and load the matching classes (here DistilBERT's tokenizer and model) automatically.
from transformers import AutoTokenizer, AutoModel
# --- END MODIFICATION ---
from tqdm.notebook import tqdm  # For a nice progress bar
import os # Import os to check the path

# --- 1. Configuration ---
# Inputs: the utterance table and the local checkpoint directory.
CSV_PATH = "/kaggle/input/meld-data/MELD_complete_dataset.csv"
MODEL_NAME = "/kaggle/input/distilbertdistilbert-base-uncased/transformers/default/1"

# Output: one embedding row per CSV row, written to the working directory.
OUTPUT_FILE = "meld-text-embeddings.npy"

# Number of utterances encoded per forward pass; lower it (e.g. 64 or 32) on memory errors.
BATCH_SIZE = 128

# --- 2. Text Cleaning Function ---
def clean_text(text):
    """Normalise one utterance before tokenisation.

    The text is lower-cased, "smart" quote characters are replaced by a plain
    apostrophe, and every run of whitespace is collapsed to a single space
    (leading/trailing whitespace is stripped).

    Two families of quote characters are handled:
      * the C1 code points U+0091 to U+0094, which is what Windows-1252 curly
        quotes turn into when the bytes are decoded as Latin-1;
      * the real Unicode curly quotes U+2018, U+2019, U+201C and U+201D.

    Args:
        text: The raw utterance. Anything that is not a ``str`` (e.g. NaN from
            a missing CSV cell) is treated as empty.

    Returns:
        The cleaned string, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # The quote characters are spelled as escapes so the pattern cannot be
    # corrupted by a re-encoding of the notebook file. Ordinary accented
    # letters such as "â" are left untouched.
    text = re.sub(r"[\x91\x92\x93\x94\u2018\u2019\u201c\u201d]", "'", text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# --- 3. Mean Pooling Function ---
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: Object exposing ``last_hidden_state``; the mask is
            broadcast to that tensor's full size.
        attention_mask: Per-token mask; positions with value 0 contribute
            nothing to the average.

    Returns:
        The masked sum over dimension 1 divided by the mask count, where the
        count is clamped to at least 1e-9 so fully masked rows do not divide
        by zero.
    """
    hidden_states = model_output.last_hidden_state
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = (hidden_states * mask).sum(dim=1)
    token_count = mask.sum(dim=1).clamp(min=1e-9)
    return masked_total / token_count

# --- 4. Main Embedding Generation ---

print("Script started.")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Fail fast when the local checkpoint directory is not attached to the notebook.
if not os.path.exists(MODEL_NAME):
    print(f"CRITICAL ERROR: MODEL DATASET NOT FOUND at {MODEL_NAME}")
    raise FileNotFoundError("Model dataset not added to notebook.")
print(f"Found local model files at: {MODEL_NAME}. Proceeding...")

# Read the utterance table; a missing file is reported and then re-raised.
try:
    df = pd.read_csv(CSV_PATH)
except FileNotFoundError:
    print(f"Error: Could not find file at {CSV_PATH}")
    raise
else:
    print(f"Successfully loaded CSV with {len(df)} rows.")

print("Cleaning text...")
df['cleaned_utterance'] = df['Utterance'].map(clean_text)
texts = list(df['cleaned_utterance'])

# Tokenizer and encoder both come from the same local checkpoint directory.
print("Loading tokenizer from local path...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

print("Loading model from local path...")
model = AutoModel.from_pretrained(MODEL_NAME)

model.to(device)
model.eval()  # inference only

all_embeddings = []
print(f"Starting embedding generation in batches of {BATCH_SIZE}...")

batch_starts = range(0, len(texts), BATCH_SIZE)
for start in tqdm(batch_starts):
    chunk = texts[start:start + BATCH_SIZE]

    # Pad to the longest utterance in the chunk and cut anything beyond 128 tokens.
    encoded = tokenizer(
        chunk,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        model_out = model(**encoded)

    # One vector per utterance: mask-aware mean over its tokens.
    pooled = mean_pooling(model_out, encoded['attention_mask'])
    all_embeddings.append(pooled.cpu().numpy())

print("Embedding generation complete. Concatenating results...")
embeddings = np.vstack(all_embeddings)
np.save(OUTPUT_FILE, embeddings)

print("-" * 30)
print(f"✓ Success! Embeddings saved to {OUTPUT_FILE}")
print(f"Embedding shape: {embeddings.shape}")
print(f"\nYour '{OUTPUT_FILE}' file is now in the output directory '/kaggle/working/'.")



Script started.
Using device: cuda
Found local model files at: /kaggle/input/distilbertdistilbert-base-uncased/transformers/default/1. Proceeding...
Successfully loaded CSV with 13706 rows.
Cleaning text...
Loading tokenizer from local path...
Loading model from local path...


2025-10-23 19:49:11.724895: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1761248951.900504      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1761248951.951854      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Starting embedding generation in batches of 128...


  0%|          | 0/108 [00:00<?, ?it/s]

Embedding generation complete. Concatenating results...
------------------------------
✓ Success! Embeddings saved to meld-text-embeddings.npy
Embedding shape: (13706, 768)

Your 'meld-text-embeddings.npy' file is now in the output directory '/kaggle/working/'.


In [2]:
!pip install librosa -q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [6]:
import pandas as pd
import numpy as np
import torch
import librosa  # Audio loading library
from transformers import AutoProcessor, Wav2Vec2Model
from tqdm.notebook import tqdm
import os
import warnings

# --- 1. Configuration ---
# Inputs: the utterance table (with an 'audio_path' column) and the local model folder.
CSV_PATH = "/kaggle/input/meld-data/MELD_complete_dataset.csv"
MODEL_PATH = "/kaggle/input/meld-data/wave2vec-model"

# Output: one embedding row per CSV row, written to the working directory.
OUTPUT_FILE = "meld-audio-embeddings.npy"

# Every clip is resampled to this rate (Hz) on load; clips are processed BATCH_SIZE at a time.
TARGET_SAMPLE_RATE = 16000
BATCH_SIZE = 32

# Silence UserWarnings raised from the librosa module.
warnings.filterwarnings('ignore', category=UserWarning, module='librosa')

# --- 2. Mean Pooling Function ---
# --- REMOVED --- (No longer needed with simpler pooling)

# --- 3. Audio Loading Function ---
def load_and_resample_audio(file_path):
    """Load one audio file as a mono waveform at TARGET_SAMPLE_RATE.

    Args:
        file_path: Path of the audio file to read.

    Returns:
        The waveform returned by ``librosa.load``. If the file decodes to an
        empty array, or loading raises any exception, a message is printed and
        an all-zero array of TARGET_SAMPLE_RATE samples (one second) is
        returned instead, so every input path yields exactly one array.
    """
    try:
        waveform, _ = librosa.load(file_path, sr=TARGET_SAMPLE_RATE, mono=True)
        if waveform.size > 0:
            return waveform
        # Decoded fine but contains no samples: substitute one second of zeros.
        print(f"Warning: Loaded empty audio from {file_path}. Returning zeros.")
        return np.zeros(TARGET_SAMPLE_RATE)
    except Exception as err:
        # Best effort: an unreadable file must not abort the whole run.
        print(f"Error loading {file_path}: {err}")
        return np.zeros(TARGET_SAMPLE_RATE)

# --- 4. Main Embedding Generation ---

print("Script started.")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Check model path: the folder must exist and contain a config.json.
if not os.path.exists(MODEL_PATH):
    print(f"CRITICAL ERROR: MODEL FOLDER NOT FOUND at {MODEL_PATH}")
    raise FileNotFoundError("Model folder not found in dataset.")
else:
    config_path = os.path.join(MODEL_PATH, 'config.json')
    if not os.path.exists(config_path):
         print(f"CRITICAL ERROR: config.json not found in {MODEL_PATH}")
         raise FileNotFoundError("config.json missing")
    print(f"Found local model files at: {MODEL_PATH}. Proceeding...")

# Load CSV; a missing file is reported and then re-raised.
try:
    df = pd.read_csv(CSV_PATH)
    print(f"Successfully loaded CSV with {len(df)} rows.")
except FileNotFoundError:
    print(f"Error: Could not find file at {CSV_PATH}")
    raise

audio_paths = df['audio_path'].tolist()

# Load model and processor from the local folder; any failure is reported and re-raised.
print("Loading processor from local path...")
try:
    processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
    print("Loading model from local path...")
    model = Wav2Vec2Model.from_pretrained(MODEL_PATH, trust_remote_code=True)
    model.to(device)
    model.eval()
    print("Model and processor loaded successfully.")
except Exception as e:
    print(f"CRITICAL ERROR: FAILED TO LOAD MODEL FROM LOCAL PATH - {e}")
    raise

# Longest clip passed to the model, in samples (~15 seconds at 16 kHz); longer clips are truncated.
MAX_AUDIO_SAMPLES = 240000

all_embeddings = []

print(f"Starting audio embedding generation in batches of {BATCH_SIZE}...")
print("This will take 30-60+ minutes. Please be patient.")

# Process in batches
for i in tqdm(range(0, len(audio_paths), BATCH_SIZE)):
    batch_paths = audio_paths[i:i + BATCH_SIZE]
    # load_and_resample_audio returns an array for every path (zeros on failure), so
    # batch_audio has exactly one entry per path. Nothing is filtered out here, because
    # dropping an entry would shift every later embedding off its CSV row.
    batch_audio = [load_and_resample_audio(path) for path in batch_paths]

    try:
        inputs = processor(
            batch_audio,
            sampling_rate=TARGET_SAMPLE_RATE,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=MAX_AUDIO_SAMPLES
        )
    except Exception as e:
        print(f"Error during processor call for batch starting at index {i}: {e}")
        # Keep row alignment with one placeholder per path. float32 matches the model
        # output; a float64 placeholder would make np.concatenate upcast the entire
        # result and double the size of the saved file.
        zeros = np.zeros((len(batch_paths), model.config.hidden_size), dtype=np.float32)
        all_embeddings.append(zeros)
        continue # Skip this batch

    inputs = {key: val.to(device) for key, val in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # Simple mean pooling over the sequence length dimension (dim=1).
    # NOTE(review): padding=True pads each clip to the longest one in its batch and this
    # mean runs over every output frame, padded ones included, so a short clip's vector
    # can depend on its batch neighbours. Consider length-aware pooling - TODO confirm.
    batch_embeddings = outputs.last_hidden_state.mean(dim=1)

    all_embeddings.append(batch_embeddings.cpu().numpy())

print("Embedding generation complete. Concatenating results...")
if not all_embeddings:
     print("Error: No embeddings were generated. Cannot save.")
else:
    try:
        embeddings = np.concatenate(all_embeddings, axis=0)
        np.save(OUTPUT_FILE, embeddings)

        print("-" * 30)
        print(f"✓ Success! Embeddings saved to {OUTPUT_FILE}")
        print(f"Embedding shape: {embeddings.shape}")
        # The downstream classifier pairs embeddings with CSV rows by position.
        if embeddings.shape[0] != len(audio_paths):
            print(f"Warning: Number of embeddings ({embeddings.shape[0]}) does not match number of audio paths ({len(audio_paths)}). Some might have failed.")
        print(f"\nYour '{OUTPUT_FILE}' file is now in the output directory '/kaggle/working/'.")
    except ValueError as e:
        print(f"Error during concatenation: {e}")
        print("This might happen if embedding shapes within batches are inconsistent.")


Script started.
Using device: cuda
Found local model files at: /kaggle/input/meld-data/wave2vec-model. Proceeding...
Successfully loaded CSV with 13706 rows.
Loading processor from local path...
Loading model from local path...


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at /kaggle/input/meld-data/wave2vec-model and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model and processor loaded successfully.
Starting audio embedding generation in batches of 32...
This will take 30-60+ minutes. Please be patient.


  0%|          | 0/429 [00:00<?, ?it/s]

Embedding generation complete. Concatenating results...
------------------------------
✓ Success! Embeddings saved to meld-audio-embeddings.npy
Embedding shape: (13706, 768)

Your 'meld-audio-embeddings.npy' file is now in the output directory '/kaggle/working/'.


In [7]:
# Sanity check: recursively list the files in the attached 'meld-data' dataset.
!ls -lR /kaggle/input/meld-data/

/kaggle/input/meld-data/:
total 84504
-rw-r--r-- 1 nobody nogroup 42104960 Oct 23 21:11 meld-audio-embeddings.npy
-rw-r--r-- 1 nobody nogroup  2315511 Oct 23 21:11 MELD_complete_dataset.csv
-rw-r--r-- 1 nobody nogroup 42104960 Oct 23 21:11 meld-text-embeddings.npy
drwxr-xr-x 2 nobody nogroup        0 Oct 23 21:11 wave2vec-model

/kaggle/input/meld-data/wave2vec-model:
total 368832
-rw-r--r-- 1 nobody nogroup      1596 Oct 23 21:11 config.json
-rw-r--r-- 1 nobody nogroup       159 Oct 23 21:11 preprocessor_config.json
-rw-r--r-- 1 nobody nogroup 377667514 Oct 23 21:11 pytorch_model.bin
-rw-r--r-- 1 nobody nogroup       163 Oct 23 21:11 tokenizer_config.json
-rw-r--r-- 1 nobody nogroup       291 Oct 23 21:11 vocab.json


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [16]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
import os
import time # To time training

# --- 1. Configuration ---
# All inputs live in the attached 'meld-data' dataset.
DATASET_PATH = "/kaggle/input/meld-data/"
CSV_FILE = os.path.join(DATASET_PATH, "MELD_complete_dataset.csv")
TEXT_EMBEDDINGS_FILE = os.path.join(DATASET_PATH, "meld-text-embeddings.npy")
AUDIO_EMBEDDINGS_FILE = os.path.join(DATASET_PATH, "meld-audio-embeddings.npy")
# MODEL_SAVE_PATH is derived per learning rate inside the training loop.

# Model hyperparameters
HIDDEN_UNITS_1 = 256
HIDDEN_UNITS_2 = 128
DROPOUT_RATE = 0.3
# One full training run is performed per learning rate in this list.
LEARNING_RATES = [0.001, 0.0001, 1e-5]
EPOCHS = 30
BATCH_SIZE = 64
PATIENCE = 5  # early stopping: epochs without validation improvement

# --- 2. Load Data ---
print("Loading data...")
try:
    df = pd.read_csv(CSV_FILE)
    print(f"✓ Loaded CSV: {len(df)} rows")

    X_text = np.load(TEXT_EMBEDDINGS_FILE)
    print(f"✓ Loaded Text Embeddings: Shape {X_text.shape}")

    X_audio = np.load(AUDIO_EMBEDDINGS_FILE)
    print(f"✓ Loaded Audio Embeddings: Shape {X_audio.shape}")

    # Rows of the CSV and of both embedding matrices are paired by position.
    if not (X_text.shape[0] == len(df) == X_audio.shape[0]):
        raise ValueError(f"Mismatch in samples! CSV:{len(df)}, Text:{X_text.shape[0]}, Audio:{X_audio.shape[0]}")

except FileNotFoundError as e:
    print(f"Error loading files: {e}")
    print("Please ensure 'meld-data' dataset is added.")
    raise

# --- 3. Prepare Data ---
print("\nPreparing data...")
# Feature vector = text embedding followed by audio embedding.
X_combined = np.hstack((X_text, X_audio))
print(f"Combined Embeddings Shape: {X_combined.shape}")
input_dim = X_combined.shape[1]

# Map the emotion names to integer class ids.
y_text_labels = df['Emotion']
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y_text_labels)
num_classes = len(encoder.classes_)
print(f"Found {num_classes} emotion classes: {encoder.classes_}")

X_combined_tensor = torch.tensor(X_combined, dtype=torch.float32)
y_encoded_tensor = torch.tensor(y_encoded, dtype=torch.long)

# Row positions of each split, taken from the CSV's 'split' column.
split_col = df['split'].values
train_indices, dev_indices, test_indices = (
    np.flatnonzero(split_col == split_name) for split_name in ('train', 'dev', 'test')
)

X_train = X_combined_tensor[train_indices]
y_train = y_encoded_tensor[train_indices]
X_dev = X_combined_tensor[dev_indices]
y_dev = y_encoded_tensor[dev_indices]
X_test = X_combined_tensor[test_indices]
y_test = y_encoded_tensor[test_indices]

print("\nData Split:")
print(f"Train samples: {len(X_train)}")
print(f"Dev samples:   {len(X_dev)}")
print(f"Test samples:  {len(X_test)}")

train_dataset = TensorDataset(X_train, y_train)
dev_dataset = TensorDataset(X_dev, y_dev)
test_dataset = TensorDataset(X_test, y_test)

# Only the training data is shuffled; dev/test keep file order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
dev_loader = DataLoader(dev_dataset, shuffle=False, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)

# Device setup (do once)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"\nUsing device: {device}")

# --- 4. Define the Dense Network (PyTorch) ---
# This class definition is used in the loop
class DenseClassifier(nn.Module):
    """Feed-forward classifier with two hidden layers.

    Layout: Linear(input_dim, h1) -> ReLU -> Dropout -> Linear(h1, h2) -> ReLU
    -> Dropout -> Linear(h2, num_classes). The output is raw scores, one per
    class; no softmax is applied.

    Args:
        input_dim: Size of each input feature vector.
        num_classes: Number of output scores.
        h1: Width of the first hidden layer.
        h2: Width of the second hidden layer.
        dropout_rate: Dropout probability applied after each hidden ReLU.
    """

    def __init__(self, input_dim, num_classes, h1=256, h2=128, dropout_rate=0.3):
        super().__init__()
        # The layers stay in a Sequential stored as `network`, so state-dict keys
        # keep the form "network.<position>.weight" / "network.<position>.bias".
        stack = [
            nn.Linear(input_dim, h1),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(h1, h2),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(h2, num_classes),
        ]
        self.network = nn.Sequential(*stack)

    def forward(self, x):
        """Return the class scores for a batch of feature vectors."""
        scores = self.network(x)
        return scores

# --- Main Training & Evaluation Loop ---
# Iterate over each learning rate
# For every learning rate: train a fresh DenseClassifier with early stopping on
# validation accuracy, save the best weights, then report test-set metrics.
for lr in LEARNING_RATES:
    print(f"\n=======================================================")
    print(f"=== STARTING TRAINING RUN: LEARNING RATE = {lr} ===")
    print(f"=======================================================")

    # --- 4. Define the Dense Network (PyTorch) ---
    print("\nBuilding model...")
    model = DenseClassifier(input_dim, num_classes, HIDDEN_UNITS_1, HIDDEN_UNITS_2, DROPOUT_RATE)
    print(model)

    # --- 5. Setup Training ---
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr) # Use current LR

    # One weights file per learning rate.
    MODEL_SAVE_PATH = f"meld_emotion_classifier_lr_{lr}.pth"

    best_val_accuracy = 0.0
    epochs_no_improve = 0
    best_model_state = None

    # --- 6. Training Loop ---
    print("\nStarting training...")
    start_time = time.time()

    for epoch in range(EPOCHS):
        # ---- one pass over the training data ----
        model.train()
        running_loss = 0.0
        train_preds, train_targets = [], []
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            train_preds.extend(predicted.cpu().numpy())
            train_targets.extend(labels.cpu().numpy())

        # Loss is averaged over batches, accuracy over samples.
        epoch_loss = running_loss / len(train_loader)
        epoch_acc = accuracy_score(train_targets, train_preds)

        # ---- validation on the dev split ----
        model.eval()
        val_loss = 0.0
        val_preds, val_targets = [], []
        with torch.no_grad():
            for inputs, labels in dev_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                val_preds.extend(predicted.cpu().numpy())
                val_targets.extend(labels.cpu().numpy())

        val_loss /= len(dev_loader)
        val_accuracy = accuracy_score(val_targets, val_preds)

        print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {epoch_loss:.4f} | Train Acc: {epoch_acc*100:.2f}% | Val Loss: {val_loss:.4f} | Val Acc: {val_accuracy*100:.2f}%")

        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            epochs_no_improve = 0
            # state_dict() hands back references to the live parameter tensors, and
            # dict.copy() is shallow, so a plain copy would keep changing with every
            # later optimizer step. Clone each tensor to freeze a real snapshot.
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
            print(f"  New best validation accuracy: {best_val_accuracy*100:.2f}%. Saving model state...")
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= PATIENCE:
                print(f"\nEarly stopping triggered after {epoch+1} epochs.")
                break

    end_time = time.time()
    print(f"\nTraining complete. Total time: {end_time - start_time:.2f} seconds.")

    # --- 7. Save the Best Model Weights ---
    if best_model_state is not None:
        print(f"\nSaving best model weights to {MODEL_SAVE_PATH}...")
        torch.save(best_model_state, MODEL_SAVE_PATH) # <-- Save the state dictionary
        print("✓ Model saved.")
        # Restore the best snapshot so the test metrics describe the saved weights.
        model.load_state_dict(best_model_state)
    else:
         print("\nWarning: No best model state saved. Evaluating with final model state.")

    # --- 8. Evaluate on Test Set ---
    model.eval()
    test_preds, test_targets = [], []
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            test_preds.extend(predicted.cpu().numpy())
            test_targets.extend(labels.cpu().numpy())

    test_accuracy = accuracy_score(test_targets, test_preds)
    print(f"\nEvaluating on Test Set (for LR={lr})...")
    print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

    print("\nClassification Report (Test Set):")
    # Targets and predictions are integer class ids; target_names supplies the emotion labels.
    print(classification_report(test_targets, test_preds, target_names=encoder.classes_, zero_division=0))

print("\nAll training runs finished.")

Loading data...
✓ Loaded CSV: 13706 rows
✓ Loaded Text Embeddings: Shape (13706, 768)
✓ Loaded Audio Embeddings: Shape (13706, 768)

Preparing data...
Combined Embeddings Shape: (13706, 1536)
Found 7 emotion classes: ['anger' 'disgust' 'fear' 'joy' 'neutral' 'sadness' 'surprise']

Data Split:
Train samples: 9988
Dev samples:   1108
Test samples:  2610

Using device: cuda

=== STARTING TRAINING RUN: LEARNING RATE = 0.001 ===

Building model...
DenseClassifier(
  (network): Sequential(
    (0): Linear(in_features=1536, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=256, out_features=128, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.3, inplace=False)
    (6): Linear(in_features=128, out_features=7, bias=True)
  )
)

Starting training...
Epoch 1/30 | Train Loss: 1.4732 | Train Acc: 49.14% | Val Loss: 1.4971 | Val Acc: 48.10%
  New best validation accuracy: 48.10%. Saving model state...
Epoch 2/30 | Train Loss: 1.4007 | Tra

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
import os
import time # To time training
import math # For sqrt in class weights

# --- 1. Configuration ---
# All inputs live in the attached 'meld-data' dataset.
DATASET_PATH = "/kaggle/input/meld-data/"
CSV_FILE = os.path.join(DATASET_PATH, "MELD_complete_dataset.csv")
TEXT_EMBEDDINGS_FILE = os.path.join(DATASET_PATH, "meld-text-embeddings.npy")
AUDIO_EMBEDDINGS_FILE = os.path.join(DATASET_PATH, "meld-audio-embeddings.npy")
# MODEL_SAVE_PATH is derived per learning rate inside the training loop.

# Model hyperparameters
HIDDEN_UNITS_1 = 256
HIDDEN_UNITS_2 = 128
DROPOUT_RATE = 0.3
LEARNING_RATES = [0.001, 0.0001, 1e-5]
EPOCHS = 40    # slightly more epochs to give the weighted loss time
BATCH_SIZE = 64
PATIENCE = 7   # early stopping: epochs without validation improvement

# --- 2. Load Data ---
print("Loading data...")
try:
    df = pd.read_csv(CSV_FILE)
    print(f"✓ Loaded CSV: {len(df)} rows")
    X_text = np.load(TEXT_EMBEDDINGS_FILE)
    print(f"✓ Loaded Text Embeddings: Shape {X_text.shape}")
    X_audio = np.load(AUDIO_EMBEDDINGS_FILE)
    print(f"✓ Loaded Audio Embeddings: Shape {X_audio.shape}")
    # Rows of the CSV and of both embedding matrices are paired by position.
    if not (X_text.shape[0] == len(df) == X_audio.shape[0]):
        raise ValueError("Mismatch in samples!")
except FileNotFoundError as e:
    print(f"Error loading files: {e}")
    raise

# --- 3. Prepare Data ---
print("\nPreparing data...")
# Feature vector = text embedding followed by audio embedding.
X_combined = np.hstack((X_text, X_audio))
print(f"Combined Embeddings Shape: {X_combined.shape}")
input_dim = X_combined.shape[1]

# Map the emotion names to integer class ids.
y_text_labels = df['Emotion']
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y_text_labels)
num_classes = len(encoder.classes_)
print(f"Found {num_classes} emotion classes: {encoder.classes_}")

X_combined_tensor = torch.tensor(X_combined, dtype=torch.float32)
y_encoded_tensor = torch.tensor(y_encoded, dtype=torch.long)

# Row positions of each split, taken from the CSV's 'split' column.
split_col = df['split'].values
train_indices, dev_indices, test_indices = (
    np.flatnonzero(split_col == split_name) for split_name in ('train', 'dev', 'test')
)

X_train = X_combined_tensor[train_indices]
y_train = y_encoded_tensor[train_indices]
X_dev = X_combined_tensor[dev_indices]
y_dev = y_encoded_tensor[dev_indices]
X_test = X_combined_tensor[test_indices]
y_test = y_encoded_tensor[test_indices]

print("\nData Split:")
print(f"Train samples: {len(X_train)}")
print(f"Dev samples:   {len(X_dev)}")
print(f"Test samples:  {len(X_test)}")

train_dataset = TensorDataset(X_train, y_train)
dev_dataset = TensorDataset(X_dev, y_dev)
test_dataset = TensorDataset(X_test, y_test)

# Only the training data is shuffled; all three loaders use two worker
# processes and pinned host memory.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE, num_workers=2, pin_memory=True)
dev_loader = DataLoader(dev_dataset, shuffle=False, batch_size=BATCH_SIZE, num_workers=2, pin_memory=True)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE, num_workers=2, pin_memory=True)

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"\nUsing device: {device}")

# --- 4. Define Dense Network (Unchanged) ---
class DenseClassifier(nn.Module):
    """Two-hidden-layer MLP that maps a feature vector to per-class scores.

    Each hidden Linear layer is followed by ReLU and Dropout; the final Linear
    layer produces ``num_classes`` raw scores (no softmax is applied).

    Args:
        input_dim: Size of each input feature vector.
        num_classes: Number of output scores.
        h1: Width of the first hidden layer.
        h2: Width of the second hidden layer.
        dropout_rate: Dropout probability applied after each hidden ReLU.
    """

    def __init__(self, input_dim, num_classes, h1=256, h2=128, dropout_rate=0.3):
        super().__init__()
        # Assembled in order so the Sequential positions (0, 3 and 6 for the
        # Linear layers) and therefore the state-dict keys are unchanged.
        first_hidden = [nn.Linear(input_dim, h1), nn.ReLU(), nn.Dropout(dropout_rate)]
        second_hidden = [nn.Linear(h1, h2), nn.ReLU(), nn.Dropout(dropout_rate)]
        output_layer = [nn.Linear(h2, num_classes)]
        self.network = nn.Sequential(*first_hidden, *second_hidden, *output_layer)

    def forward(self, x):
        """Return the class scores for a batch of feature vectors."""
        scores = self.network(x)
        return scores

# --- 5. Class Weight Calculation Function (NEW) ---
def calculate_class_weights(labels, num_classes, device):
    """Compute normalized class weights from the square root of inverse class frequency.

    For each class c that occurs in ``labels`` the raw weight is
    ``sqrt(total / count_c)``; classes that never occur get weight 0. The raw
    weights are then rescaled so they sum to ``num_classes``.

    Args:
        labels: 1-D integer class ids, as a torch tensor or anything
            ``np.asarray`` accepts.
        num_classes: Total number of classes; valid ids are ``0 .. num_classes - 1``.
        device: Torch device (or device string) on which to place the result.

    Returns:
        A 1-D tensor of length ``num_classes`` on ``device``. If ``labels`` is
        empty, uniform weights (all ones) are returned instead of NaNs.

    Raises:
        IndexError: If any label lies outside ``[0, num_classes)``.
    """
    if isinstance(labels, torch.Tensor):
        labels_np = labels.detach().cpu().numpy()
    else:
        labels_np = np.asarray(labels)
    labels_np = labels_np.reshape(-1)

    total = labels_np.size
    if total == 0:
        # No labels would mean 0 / 0 during normalization; fall back to
        # uniform weights, which is equivalent to an unweighted loss.
        weights = torch.ones(num_classes, device=device)
        print(f"Calculated Class Weights (sqrt inverse freq): {weights}")
        return weights

    # Reject out-of-range ids explicitly: a negative id would otherwise wrap
    # around and silently corrupt another class's weight.
    if labels_np.min() < 0 or labels_np.max() >= num_classes:
        raise IndexError(
            f"labels must lie in [0, {num_classes}); "
            f"got min={labels_np.min()}, max={labels_np.max()}"
        )

    counts = np.bincount(labels_np.astype(np.int64, copy=False), minlength=num_classes)

    # Compute on the host in one vectorized step, then transfer once, instead
    # of writing each class weight individually into a device tensor.
    raw = np.zeros(num_classes, dtype=np.float64)
    present = counts > 0
    raw[present] = np.sqrt(total / counts[present])
    weights = torch.as_tensor(raw, dtype=torch.get_default_dtype(), device=device)

    # Normalize so the weights sum to num_classes (mean weight of 1).
    weights = weights / weights.sum() * num_classes
    print(f"Calculated Class Weights (sqrt inverse freq): {weights}")
    return weights

# --- Calculate weights ONCE using the training data ---
print("\nCalculating class weights for the loss function...")
# Only the training labels are passed in; the function handles tensor -> numpy conversion.
class_weights = calculate_class_weights(y_train, num_classes, device)

# --- Main Training & Evaluation Loop ---
# One independent run per learning rate: build a fresh model, train with early
# stopping on dev accuracy, restore the best snapshot, then report test metrics.
for lr in LEARNING_RATES:
    print(f"\n=======================================================")
    print(f"=== STARTING TRAINING RUN: LEARNING RATE = {lr} ===")
    print(f"=======================================================")

    print("\nBuilding model...")
    model = DenseClassifier(input_dim, num_classes, HIDDEN_UNITS_1, HIDDEN_UNITS_2, DROPOUT_RATE)
    print(model)
    model.to(device)

    # --- Setup Training with WEIGHTED LOSS ---
    criterion = nn.CrossEntropyLoss(weight=class_weights)  # per-class weights applied here
    optimizer = optim.Adam(model.parameters(), lr=lr)
    MODEL_SAVE_PATH = f"meld_emotion_classifier_weighted_lr_{lr}.pth"

    best_val_accuracy = 0.0
    epochs_no_improve = 0
    best_model_state = None

    print("\nStarting training...")
    start_time = time.time()
    for epoch in range(EPOCHS):
        # --- Training pass ---
        model.train()
        running_loss = 0.0
        train_preds, train_targets = [], []
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            # Predicted class = index of the largest logit.
            _, predicted = torch.max(outputs, 1)
            train_preds.extend(predicted.cpu().numpy())
            train_targets.extend(labels.cpu().numpy())

        # Mean of the per-batch losses (not a per-sample mean).
        epoch_loss = running_loss / len(train_loader)
        epoch_acc = accuracy_score(train_targets, train_preds)

        # --- Validation ---
        model.eval()
        val_loss = 0.0
        val_preds, val_targets = [], []
        with torch.no_grad():
            for inputs, labels in dev_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)  # same weighted loss as training
                val_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                val_preds.extend(predicted.cpu().numpy())
                val_targets.extend(labels.cpu().numpy())

        val_loss /= len(dev_loader)
        val_accuracy = accuracy_score(val_targets, val_preds)
        print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {epoch_loss:.4f} | Train Acc: {epoch_acc*100:.2f}% | Val Loss: {val_loss:.4f} | Val Acc: {val_accuracy*100:.2f}%")

        # --- Early Stopping Check ---
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            epochs_no_improve = 0
            # state_dict() returns tensors that share storage with the live
            # parameters, and dict.copy() is shallow. Clone every tensor so
            # later optimizer steps cannot overwrite this snapshot.
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
            print(f"  New best validation accuracy: {best_val_accuracy*100:.2f}%. Saving model state...")
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= PATIENCE:
                print(f"\nEarly stopping triggered after {epoch+1} epochs.")
                break

    end_time = time.time()
    print(f"\nTraining complete for LR={lr}. Total time: {end_time - start_time:.2f} seconds.")

    # --- Save and Evaluate Best Model for this LR ---
    if best_model_state is not None:
        print(f"\nSaving best model weights to {MODEL_SAVE_PATH}...")
        torch.save(best_model_state, MODEL_SAVE_PATH)
        print("✓ Model saved.")
        model.load_state_dict(best_model_state)  # restore best snapshot before testing
    else:
        # No epoch ever beat the initial 0.0 accuracy; the test pass below
        # then evaluates the model as it stands after the last epoch.
        print("\nWarning: No best model state saved.")

    # --- Test pass ---
    model.eval()
    test_preds, test_targets = [], []
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            test_preds.extend(predicted.cpu().numpy())
            test_targets.extend(labels.cpu().numpy())

    test_accuracy = accuracy_score(test_targets, test_preds)
    print(f"\nEvaluating on Test Set (for LR={lr})...")
    print(f"Test Accuracy: {test_accuracy * 100:.2f}%")
    print("\nClassification Report (Test Set):")
    print(classification_report(test_targets, test_preds, target_names=encoder.classes_, zero_division=0))

print("\nAll training runs finished.")


Loading data...
✓ Loaded CSV: 13706 rows
✓ Loaded Text Embeddings: Shape (13706, 768)
✓ Loaded Audio Embeddings: Shape (13706, 768)

Preparing data...
Combined Embeddings Shape: (13706, 1536)
Found 7 emotion classes: ['anger' 'disgust' 'fear' 'joy' 'neutral' 'sadness' 'surprise']

Data Split:
Train samples: 9988
Dev samples:   1108
Test samples:  2610

Using device: cuda

Calculating class weights for the loss function...
Calculated Class Weights (sqrt inverse freq): tensor([0.8165, 1.6516, 1.6608, 0.6512, 0.3962, 1.0404, 0.7833],
       device='cuda:0')

=== STARTING TRAINING RUN: LEARNING RATE = 0.001 ===

Building model...
DenseClassifier(
  (network): Sequential(
    (0): Linear(in_features=1536, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=256, out_features=128, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.3, inplace=False)
    (6): Linear(in_features=128, out_features=7, bias=True)
  )
)

Starting training...
E