In [1]:
import pandas as pd
import numpy as np

# Load text features
text_data = pd.read_csv('datasets/processed/meld_features_updated.csv')

# Load audio features
audio_data = pd.read_csv('datasets/processed/meld_audio_features.csv')

# Merge text and audio features
merged_data = pd.merge(text_data, audio_data, on=['Dialogue_ID', 'Utterance_ID'], suffixes=('_text', '_audio'))
print(f"Merged dataset shape: {merged_data.shape}")

Merged dataset shape: (11839, 35)


In [2]:
from sklearn.model_selection import train_test_split

# Define features and labels
X_text = merged_data['Clean_Utterance']
X_audio = merged_data[[col for col in merged_data.columns if col.startswith('MFCC') or col in ['Average_Pitch', 'Average_Energy']]]
y = merged_data['Emotion_text']  # Use the correct column for labels

# Split data
X_text_train, X_text_temp, X_audio_train, X_audio_temp, y_train, y_temp = train_test_split(
    X_text, X_audio, y, test_size=0.3, stratify=y, random_state=42
)
X_text_val, X_text_test, X_audio_val, X_audio_test, y_val, y_test = train_test_split(
    X_text_temp, X_audio_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

print(f"Training set size: {len(X_text_train)}")
print(f"Validation set size: {len(X_text_val)}")
print(f"Test set size: {len(X_text_test)}")

Training set size: 8287
Validation set size: 1776
Test set size: 1776


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fill NaN values in 'Clean_Utterance' with an empty string
X_text_train = X_text_train.fillna('')
X_text_val = X_text_val.fillna('')
X_text_test = X_text_test.fillna('')

# Text: TF-IDF vectorization
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_text_train_tfidf = tfidf.fit_transform(X_text_train)
X_text_val_tfidf = tfidf.transform(X_text_val)
X_text_test_tfidf = tfidf.transform(X_text_test)

print(f"Text TF-IDF features shape: {X_text_train_tfidf.shape}")

Text TF-IDF features shape: (8287, 5000)


In [4]:
from sklearn.preprocessing import StandardScaler
import numpy as np
import ast

# Function to flatten and handle audio features
def flatten_audio_features(df, columns):
    flat_features = []
    for _, row in df.iterrows():
        combined_features = []
        for col in columns:
            value = row[col]
            if isinstance(value, str):  # Convert string to array
                features = np.array(ast.literal_eval(value))
            elif isinstance(value, (list, np.ndarray)):  # Already a list or array
                features = np.array(value)
            else:  # Handle missing or invalid data
                features = np.zeros(13)  # Default size of MFCC features
            combined_features.extend(features)  # Flatten into a single list
        flat_features.append(combined_features)
    return np.array(flat_features)

# Define MFCC and other audio columns
audio_columns = [col for col in X_audio_train.columns if col.startswith('MFCC')] + ['Average_Pitch', 'Average_Energy']

# Flatten audio features for train, validation, and test sets
X_audio_train_flat = flatten_audio_features(X_audio_train, audio_columns)
X_audio_val_flat = flatten_audio_features(X_audio_val, audio_columns)
X_audio_test_flat = flatten_audio_features(X_audio_test, audio_columns)

# Normalize flattened features
scaler = StandardScaler()
X_audio_train_scaled = scaler.fit_transform(X_audio_train_flat)
X_audio_val_scaled = scaler.transform(X_audio_val_flat)
X_audio_test_scaled = scaler.transform(X_audio_test_flat)

print(f"Audio features normalized. Shape: {X_audio_train_scaled.shape}")

Audio features normalized. Shape: (8287, 208)


In [5]:
from scipy.sparse import hstack

# Combine text and audio features
X_train_combined = hstack([X_text_train_tfidf, X_audio_train_scaled])
X_val_combined = hstack([X_text_val_tfidf, X_audio_val_scaled])
X_test_combined = hstack([X_text_test_tfidf, X_audio_test_scaled])

print(f"Combined features shape: {X_train_combined.shape}")

Combined features shape: (8287, 5208)


In [6]:
from sklearn.linear_model import LogisticRegression

# Initialize and train model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_combined, y_train)
print("Logistic Regression model trained on multimodal features!")

Logistic Regression model trained on multimodal features!


In [7]:
from sklearn.metrics import classification_report, accuracy_score

# Validate model
y_val_pred = model.predict(X_val_combined)
print("\nMultimodal Logistic Regression Validation Results:")
print(classification_report(y_val, y_val_pred))
print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred):.4f}")


Multimodal Logistic Regression Validation Results:
              precision    recall  f1-score   support

       anger       0.29      0.09      0.14       205
     disgust       0.00      0.00      0.00        49
        fear       0.00      0.00      0.00        48
         joy       0.42      0.19      0.27       308
     neutral       0.52      0.91      0.66       840
     sadness       0.47      0.06      0.11       130
    surprise       0.47      0.19      0.27       196

    accuracy                           0.50      1776
   macro avg       0.31      0.21      0.21      1776
weighted avg       0.44      0.50      0.41      1776

Validation Accuracy: 0.5023


In [8]:
from sklearn.ensemble import RandomForestClassifier

# Train a Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_combined, y_train)

# Validate the model
y_val_pred_rf = rf_model.predict(X_val_combined)
print("\nRandom Forest Validation Results:")
print(classification_report(y_val, y_val_pred_rf))
print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred_rf):.4f}")


Random Forest Validation Results:
              precision    recall  f1-score   support

       anger       0.14      0.03      0.05       205
     disgust       0.00      0.00      0.00        49
        fear       0.00      0.00      0.00        48
         joy       0.28      0.10      0.14       308
     neutral       0.49      0.86      0.62       840
     sadness       0.14      0.04      0.06       130
    surprise       0.31      0.14      0.19       196

    accuracy                           0.45      1776
   macro avg       0.19      0.17      0.15      1776
weighted avg       0.34      0.45      0.35      1776

Validation Accuracy: 0.4465


In [9]:
from sklearn.preprocessing import LabelEncoder

# Encode the labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)
y_test_encoded = label_encoder.transform(y_test)

# Display encoded classes
print("Encoded classes:", label_encoder.classes_)

Encoded classes: ['anger' 'disgust' 'fear' 'joy' 'neutral' 'sadness' 'surprise']


In [10]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score

# Train an XGBoost model
xgb_model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')
xgb_model.fit(X_train_combined, y_train_encoded)

# Validate the model
y_val_pred_xgb = xgb_model.predict(X_val_combined)

# Decode predictions back to original labels
y_val_pred_decoded = label_encoder.inverse_transform(y_val_pred_xgb)

# Validation results
print("\nXGBoost Validation Results:")
print(classification_report(y_val, y_val_pred_decoded))
print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred_decoded):.4f}")

Parameters: { "use_label_encoder" } are not used.




XGBoost Validation Results:
              precision    recall  f1-score   support

       anger       0.19      0.06      0.09       205
     disgust       0.00      0.00      0.00        49
        fear       0.00      0.00      0.00        48
         joy       0.38      0.13      0.20       308
     neutral       0.50      0.89      0.64       840
     sadness       0.15      0.03      0.05       130
    surprise       0.46      0.19      0.27       196

    accuracy                           0.47      1776
   macro avg       0.24      0.19      0.18      1776
weighted avg       0.39      0.47      0.38      1776

Validation Accuracy: 0.4735


In [11]:
# Test set evaluation
y_test_pred_xgb = xgb_model.predict(X_test_combined)

# Decode predictions back to original labels
y_test_pred_decoded = label_encoder.inverse_transform(y_test_pred_xgb)

print("\nXGBoost Test Results:")
print(classification_report(y_test, y_test_pred_decoded))
print(f"Test Accuracy: {accuracy_score(y_test, y_test_pred_decoded):.4f}")


XGBoost Test Results:
              precision    recall  f1-score   support

       anger       0.32      0.12      0.18       205
     disgust       0.20      0.02      0.04        49
        fear       0.00      0.00      0.00        48
         joy       0.40      0.17      0.23       309
     neutral       0.51      0.87      0.65       839
     sadness       0.44      0.11      0.17       131
    surprise       0.49      0.22      0.30       195

    accuracy                           0.49      1776
   macro avg       0.34      0.22      0.22      1776
weighted avg       0.44      0.49      0.41      1776

Test Accuracy: 0.4882


In [12]:
from imblearn.over_sampling import SMOTE
import numpy as np

# Combine text and audio features for SMOTE
X_train_dense = np.hstack([X_text_train_tfidf.toarray(), X_audio_train_scaled])
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_dense, y_train)

print(f"Balanced training set size: {X_train_balanced.shape[0]}")

Balanced training set size: 27426


In [13]:
# Train Logistic Regression on balanced data
balanced_lr_model = LogisticRegression(max_iter=1000, random_state=42)
balanced_lr_model.fit(X_train_balanced, y_train_balanced)

# Validate the model
X_val_dense = np.hstack([X_text_val_tfidf.toarray(), X_audio_val_scaled])
y_val_pred_balanced_lr = balanced_lr_model.predict(X_val_dense)

print("\nBalanced Logistic Regression Validation Results:")
print(classification_report(y_val, y_val_pred_balanced_lr))
print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred_balanced_lr):.4f}")


Balanced Logistic Regression Validation Results:
              precision    recall  f1-score   support

       anger       0.20      0.23      0.22       205
     disgust       0.06      0.12      0.08        49
        fear       0.06      0.12      0.09        48
         joy       0.32      0.34      0.33       308
     neutral       0.62      0.44      0.51       840
     sadness       0.19      0.25      0.21       130
    surprise       0.25      0.32      0.28       196

    accuracy                           0.35      1776
   macro avg       0.24      0.26      0.25      1776
weighted avg       0.42      0.35      0.38      1776

Validation Accuracy: 0.3536


In [14]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid
param_grid = {
    'C': [0.1, 1, 10],
    'solver': ['lbfgs', 'saga']
}

# Grid search
grid = GridSearchCV(LogisticRegression(max_iter=1000, random_state=42), param_grid, cv=3, scoring='accuracy')
grid.fit(X_train_combined, y_train)

print(f"Best parameters: {grid.best_params_}")
print(f"Best validation score: {grid.best_score_}")



Best parameters: {'C': 1, 'solver': 'lbfgs'}
Best validation score: 0.4988547326665624


In [15]:
from transformers import BertTokenizer, BertModel
import torch

# Load pre-trained BERT model and tokenizer
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

print("BERT model and tokenizer loaded!")

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


BERT model and tokenizer loaded!


In [16]:
def get_bert_embeddings(texts):
    """
    Generate BERT embeddings for a list of texts.
    :param texts: List of text strings.
    :return: BERT embeddings as a NumPy array.
    """
    embeddings = []
    for text in texts:
        # Tokenize text and convert to input IDs
        inputs = bert_tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        
        # Generate embeddings using BERT
        with torch.no_grad():
            outputs = bert_model(**inputs)
            # Use the [CLS] token embedding as the sentence embedding
            cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
        
        embeddings.append(cls_embedding)
    return np.array(embeddings)

# Generate BERT embeddings for train, validation, and test text
X_text_train_bert = get_bert_embeddings(X_text_train)
X_text_val_bert = get_bert_embeddings(X_text_val)
X_text_test_bert = get_bert_embeddings(X_text_test)

print("BERT embeddings generated!")

BERT embeddings generated!


In [17]:
# Combine BERT embeddings with audio features
X_train_combined_bert = np.hstack([X_text_train_bert, X_audio_train_scaled])
X_val_combined_bert = np.hstack([X_text_val_bert, X_audio_val_scaled])
X_test_combined_bert = np.hstack([X_text_test_bert, X_audio_test_scaled])

print(f"Combined features shape with BERT: {X_train_combined_bert.shape}")

Combined features shape with BERT: (8287, 976)


In [18]:
from sklearn.linear_model import LogisticRegression

# Train Logistic Regression
bert_lr_model = LogisticRegression(max_iter=5000, random_state=42)
bert_lr_model.fit(X_train_combined_bert, y_train)

# Validate the model
y_val_pred_bert = bert_lr_model.predict(X_val_combined_bert)
from sklearn.metrics import classification_report, accuracy_score

print("\nBERT + Audio Logistic Regression Validation Results:")
print(classification_report(y_val, y_val_pred_bert))
print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred_bert):.4f}")


BERT + Audio Logistic Regression Validation Results:
              precision    recall  f1-score   support

       anger       0.20      0.12      0.15       205
     disgust       0.10      0.04      0.06        49
        fear       0.06      0.02      0.03        48
         joy       0.36      0.23      0.28       308
     neutral       0.52      0.78      0.62       840
     sadness       0.13      0.06      0.08       130
    surprise       0.31      0.17      0.22       196

    accuracy                           0.45      1776
   macro avg       0.24      0.20      0.21      1776
weighted avg       0.38      0.45      0.39      1776

Validation Accuracy: 0.4459


In [20]:
import pandas as pd
import os
import librosa

# Load the processed dataset
data_path = 'datasets/processed/meld_features_updated.csv'
data = pd.read_csv(data_path)

print("Dataset loaded successfully!")

Dataset loaded successfully!


In [21]:
# Define the audio directory path
audio_dir = 'datasets/raw/MELD/train/audio/'

# Generate full paths for audio files
data['Audio_Path'] = data.apply(lambda row: os.path.join(audio_dir, f"dia{row['Dialogue_ID']}_utt{row['Utterance_ID']}.wav"), axis=1)

# Check for missing files
missing_files = data[~data['Audio_Path'].apply(os.path.exists)]
print(f"Missing audio files: {len(missing_files)}")
print(missing_files[['Dialogue_ID', 'Utterance_ID']].head())

Missing audio files: 0
Empty DataFrame
Columns: [Dialogue_ID, Utterance_ID]
Index: []


In [24]:
print("Available columns in the dataset:", data.columns)

Available columns in the dataset: Index(['Dialogue_ID', 'Utterance_ID', 'Emotion', 'Word_Count', 'Char_Count',
       'Sentiment_Polarity', 'Audio_Duration', 'MFCCs', 'Utterance',
       'Clean_Utterance', 'Audio_Path', 'Aligned_Audio'],
      dtype='object')


In [27]:
# Load the original dataset with timestamps
timestamps_df = pd.read_csv('datasets/raw/MELD/train/train_sent_emo.csv')

# Check available columns to confirm timestamp presence
print(timestamps_df.columns)

# Merge based on Dialogue_ID and Utterance_ID
data = data.merge(
    timestamps_df[['Dialogue_ID', 'Utterance_ID', 'StartTime', 'EndTime']],
    on=['Dialogue_ID', 'Utterance_ID'],
    how='left'
)

# Verify if timestamps are added
print("Updated columns:", data.columns)

Index(['Sr No.', 'Utterance', 'Speaker', 'Emotion', 'Sentiment', 'Dialogue_ID',
       'Utterance_ID', 'Season', 'Episode', 'StartTime', 'EndTime'],
      dtype='object')
Updated columns: Index(['Dialogue_ID', 'Utterance_ID', 'Emotion', 'Word_Count', 'Char_Count',
       'Sentiment_Polarity', 'Audio_Duration', 'MFCCs', 'Utterance',
       'Clean_Utterance', 'Audio_Path', 'Aligned_Audio', 'StartTime',
       'EndTime'],
      dtype='object')


In [28]:
# Add placeholder values (e.g., 0 and 2 seconds)
data['StartTime'] = 0.0  # Assuming start at 0 seconds
data['EndTime'] = data['Audio_Duration']  # Assuming full duration for now

In [29]:
def align_text_with_audio(row):
    try:
        audio_path = row['Audio_Path']
        start_time = row['StartTime']
        end_time = row['EndTime']

        # Load the audio file
        audio, sr = librosa.load(audio_path, sr=16000)

        # Convert timestamps to samples
        start_sample = int(float(start_time) * sr)
        end_sample = int(float(end_time) * sr)

        # Extract the aligned audio segment
        aligned_audio = audio[start_sample:end_sample]

        return aligned_audio
    except Exception as e:
        print(f"Error aligning audio for {audio_path}: {e}")
        return None

# Apply the alignment function
data['Aligned_Audio'] = data.apply(align_text_with_audio, axis=1)
print("Text-to-audio alignment complete.")

Text-to-audio alignment complete.


In [30]:
import librosa

def extract_prosodic_features(aligned_audio, sr=16000):
    try:
        # Extract pitch (F0) and energy
        pitches, magnitudes = librosa.piptrack(y=aligned_audio, sr=sr)

        # Get the mean pitch and energy values
        avg_pitch = np.mean(pitches[pitches > 0]) if np.any(pitches > 0) else 0
        avg_energy = np.mean(magnitudes) if np.any(magnitudes) else 0

        return avg_pitch, avg_energy
    except Exception as e:
        print(f"Error extracting prosodic features: {e}")
        return None, None

# Apply the prosodic feature extraction
data[['Average_Pitch', 'Average_Energy']] = data['Aligned_Audio'].apply(lambda x: pd.Series(extract_prosodic_features(x) if x is not None else (None, None)))

# Save updated dataset with prosodic features
data.to_csv('datasets/processed/meld_multimodal_features.csv', index=False)

print("Prosodic feature extraction complete and dataset saved!")



Prosodic feature extraction complete and dataset saved!


In [1]:
import pandas as pd

# Paths to the processed text and audio features
text_features_path = 'datasets/processed/meld_features_updated.csv'
audio_features_path = 'datasets/processed/meld_audio_features.csv'

# Load text and audio data
text_data = pd.read_csv(text_features_path)
audio_data = pd.read_csv(audio_features_path)

print("Text and Audio datasets loaded successfully!")
print(f"Text Data Shape: {text_data.shape}")
print(f"Audio Data Shape: {audio_data.shape}")

# Inspect columns to confirm compatibility
print("\nText Data Columns:", text_data.columns)
print("\nAudio Data Columns:", audio_data.columns)

Text and Audio datasets loaded successfully!
Text Data Shape: (11839, 10)
Audio Data Shape: (9989, 27)

Text Data Columns: Index(['Dialogue_ID', 'Utterance_ID', 'Emotion', 'Word_Count', 'Char_Count',
       'Sentiment_Polarity', 'Audio_Duration', 'MFCCs', 'Utterance',
       'Clean_Utterance'],
      dtype='object')

Audio Data Columns: Index(['Sr No.', 'Utterance', 'Speaker', 'Emotion', 'Sentiment', 'Dialogue_ID',
       'Utterance_ID', 'Season', 'Episode', 'StartTime', 'EndTime',
       'Audio_Path', 'MFCC_0', 'MFCC_1', 'MFCC_2', 'MFCC_3', 'MFCC_4',
       'MFCC_5', 'MFCC_6', 'MFCC_7', 'MFCC_8', 'MFCC_9', 'MFCC_10', 'MFCC_11',
       'MFCC_12', 'Average_Pitch', 'Average_Energy'],
      dtype='object')


In [6]:
# Select relevant columns from both datasets
text_data_selected = text_data[['Dialogue_ID', 'Utterance_ID', 'Emotion', 'Word_Count', 'Char_Count',
                                'Sentiment_Polarity', 'Utterance', 'Clean_Utterance']]
audio_data_selected = audio_data[['Dialogue_ID', 'Utterance_ID', 'Audio_Path', 'MFCC_0', 'MFCC_1', 'MFCC_2',
                                  'MFCC_3', 'MFCC_4', 'MFCC_5', 'MFCC_6', 'MFCC_7', 'MFCC_8', 'MFCC_9',
                                  'MFCC_10', 'MFCC_11', 'MFCC_12', 'Average_Pitch', 'Average_Energy']]

# Merge datasets on 'Dialogue_ID' and 'Utterance_ID'
multimodal_data = pd.merge(text_data_selected, audio_data_selected, on=['Dialogue_ID', 'Utterance_ID'], how='inner')

# Save the merged dataset
multimodal_data.to_csv('datasets/processed/multimodal_features.csv', index=False)

print("Multimodal feature dataset created successfully!")
print(f"Multimodal Data Shape: {multimodal_data.shape}")
print("Sample rows:")
print(multimodal_data.head())

Multimodal feature dataset created successfully!
Multimodal Data Shape: (11839, 24)
Sample rows:
   Dialogue_ID  Utterance_ID   Emotion  Word_Count  Char_Count  \
0            0             0   neutral           9          56   
1            0             0   sadness           7          30   
2            0             0  surprise           5          30   
3            0             1   neutral           4          19   
4            0             1     anger          13          64   

   Sentiment_Polarity                                          Utterance  \
0              0.0000  also I was the point person on my companys tr...   
1             -0.4201  also I was the point person on my companys tr...   
2              0.0000  also I was the point person on my companys tr...   
3              0.0000                   You mustve had your hands full.   
4              0.2244                   You mustve had your hands full.   

                                     Clean_Uttera

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
import numpy as np

# Replace missing values in the 'Clean_Utterance' column
multimodal_data['Clean_Utterance'].fillna("", inplace=True)

# Extract the text and audio features
X_text = multimodal_data['Clean_Utterance'].values
X_audio = multimodal_data[['MFCC_0', 'MFCC_1', 'MFCC_2', 'MFCC_3', 'MFCC_4', 
                          'MFCC_5', 'MFCC_6', 'MFCC_7', 'MFCC_8', 'MFCC_9', 
                          'MFCC_10', 'MFCC_11', 'MFCC_12', 'Average_Pitch', 'Average_Energy']].values

# Convert text data to TF-IDF features
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_text_tfidf = tfidf_vectorizer.fit_transform(X_text).toarray()

# Standardize audio features
scaler = StandardScaler()
X_audio_scaled = scaler.fit_transform(X_audio)

# Concatenate text and audio features
X_fused = np.concatenate((X_text_tfidf, X_audio_scaled), axis=1)

# Save the fused features for future use
np.save('datasets/processed/fused_features.npy', X_fused)

print("Feature fusion completed successfully!")
print(f"Fused feature matrix shape: {X_fused.shape}")

Feature fusion completed successfully!
Fused feature matrix shape: (11839, 5015)


In [14]:
import pandas as pd

# Load the saved fused dataset
fused_features = pd.read_csv('datasets/processed/meld_multimodal_features.csv')
print("Fused dataset loaded successfully!")

Fused dataset loaded successfully!


In [15]:
from sklearn.preprocessing import LabelEncoder

# Remove the target label from features
X_fused = fused_features.drop(columns=['Emotion'])  # Remove target column
y_fused = fused_features['Emotion']  # Target labels

# Encode target labels
label_encoder = LabelEncoder()
y_fused_encoded = label_encoder.fit_transform(y_fused)

print("Data prepared for model training!")

Data prepared for model training!


In [17]:
from sklearn.model_selection import train_test_split

# Split data into train, validation, and test sets
X_train_fused, X_temp, y_train_fused, y_temp = train_test_split(
    X_fused, y_fused, test_size=0.3, stratify=y_fused, random_state=42
)

X_val_fused, X_test_fused, y_val_fused, y_test_fused = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

print(f"Training set size: {X_train_fused.shape[0]}")
print(f"Validation set size: {X_val_fused.shape[0]}")
print(f"Test set size: {X_test_fused.shape[0]}")

Training set size: 8287
Validation set size: 1776
Test set size: 1776


In [22]:
print("Available columns:", X_fused.columns)

Available columns: Index(['Dialogue_ID', 'Utterance_ID', 'Word_Count', 'Char_Count',
       'Sentiment_Polarity', 'Audio_Duration', 'Utterance', 'Clean_Utterance',
       'Audio_Path', 'Aligned_Audio', 'StartTime', 'EndTime', 'Average_Pitch',
       'Average_Energy', 'MFCC_0', 'MFCC_1', 'MFCC_2', 'MFCC_3', 'MFCC_4',
       'MFCC_5', 'MFCC_6', 'MFCC_7', 'MFCC_8', 'MFCC_9', 'MFCC_10', 'MFCC_11',
       'MFCC_12'],
      dtype='object')


In [23]:
from sklearn.preprocessing import StandardScaler

# Selecting only numerical feature columns for scaling
numeric_columns = ['Word_Count', 'Char_Count', 'Sentiment_Polarity', 
                   'Audio_Duration', 'Average_Pitch', 'Average_Energy'] + \
                  [f'MFCC_{i}' for i in range(13)]

X_fused_numeric = X_fused[numeric_columns]

# Standardize features using StandardScaler
scaler = StandardScaler()
X_fused_scaled = scaler.fit_transform(X_fused_numeric)

# Split the dataset back into train, validation, and test sets
X_train_fused, X_val_fused, X_test_fused = X_fused_scaled[:len(y_train_fused)], \
                                          X_fused_scaled[len(y_train_fused):len(y_train_fused) + len(y_val_fused)], \
                                          X_fused_scaled[len(y_train_fused) + len(y_val_fused):]

print("Feature scaling completed successfully!")
print(f"Training data shape: {X_train_fused.shape}")
print(f"Validation data shape: {X_val_fused.shape}")
print(f"Test data shape: {X_test_fused.shape}")

Feature scaling completed successfully!
Training data shape: (8287, 19)
Validation data shape: (1776, 19)
Test data shape: (1776, 19)


In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Initialize the Logistic Regression model
logistic_model = LogisticRegression(max_iter=1000, random_state=42)

# Train the model on the training data
logistic_model.fit(X_train_fused, y_train_fused)

# Make predictions on the validation set
y_val_pred = logistic_model.predict(X_val_fused)

# Evaluate the model
accuracy = accuracy_score(y_val_fused, y_val_pred)
print(f"Validation Accuracy: {accuracy:.4f}")

# Classification report for detailed evaluation
print("Classification Report:")
print(classification_report(y_val_fused, y_val_pred))

Validation Accuracy: 0.4730
Classification Report:
              precision    recall  f1-score   support

       anger       0.00      0.00      0.00       205
     disgust       0.00      0.00      0.00        49
        fear       0.00      0.00      0.00        48
         joy       0.00      0.00      0.00       308
     neutral       0.47      1.00      0.64       840
     sadness       0.00      0.00      0.00       130
    surprise       0.00      0.00      0.00       196

    accuracy                           0.47      1776
   macro avg       0.07      0.14      0.09      1776
weighted avg       0.22      0.47      0.30      1776



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
from imblearn.over_sampling import SMOTE
from collections import Counter

# Initialize SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE to training data
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_fused, y_train_fused)

# Check class distribution after SMOTE
print("Class distribution after SMOTE:", Counter(y_train_balanced))

# Retrain the logistic regression model with balanced data
logistic_model_balanced = LogisticRegression(max_iter=1000, random_state=42)
logistic_model_balanced.fit(X_train_balanced, y_train_balanced)

# Predict on validation set
y_val_pred_balanced = logistic_model_balanced.predict(X_val_fused)

# Evaluate the model
print("\nLogistic Regression with SMOTE Validation Results:")
print(classification_report(y_val_fused, y_val_pred_balanced))
print(f"Validation Accuracy: {accuracy_score(y_val_fused, y_val_pred_balanced):.4f}")

Class distribution after SMOTE: Counter({'surprise': 3918, 'joy': 3918, 'neutral': 3918, 'anger': 3918, 'sadness': 3918, 'fear': 3918, 'disgust': 3918})

Logistic Regression with SMOTE Validation Results:
              precision    recall  f1-score   support

       anger       0.09      0.08      0.08       205
     disgust       0.03      0.24      0.05        49
        fear       0.03      0.23      0.05        48
         joy       0.19      0.08      0.12       308
     neutral       0.55      0.10      0.17       840
     sadness       0.07      0.17      0.10       130
    surprise       0.10      0.09      0.09       196

    accuracy                           0.11      1776
   macro avg       0.15      0.14      0.10      1776
weighted avg       0.32      0.11      0.13      1776

Validation Accuracy: 0.1070


In [29]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Encode target labels to numeric values
label_encoder = LabelEncoder()
y_train_fused_encoded = label_encoder.fit_transform(y_train_fused)
y_val_fused_encoded = label_encoder.transform(y_val_fused)

# Define the models and hyperparameters for tuning
models = {
    'SVM': SVC(class_weight='balanced', probability=True, random_state=42),
    'Random Forest': RandomForestClassifier(class_weight='balanced', random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
}

param_grid = {
    'SVM': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
    'Random Forest': {'n_estimators': [100, 200], 'max_depth': [None, 10, 20]},
    'XGBoost': {'learning_rate': [0.01, 0.1], 'max_depth': [3, 5, 7], 'n_estimators': [100, 200]}
}

best_models = {}

for model_name, model in models.items():
    print(f"Training {model_name}...")
    grid_search = GridSearchCV(model, param_grid[model_name], cv=3, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train_fused, y_train_fused_encoded)
    
    # Get best model and evaluate
    best_model = grid_search.best_estimator_
    best_models[model_name] = best_model
    
    y_val_pred = best_model.predict(X_val_fused)
    
    # Decode predicted labels back to original class names
    y_val_pred_decoded = label_encoder.inverse_transform(y_val_pred)
    
    print(f"\n{model_name} Best Parameters: {grid_search.best_params_}")
    print(f"{model_name} Validation Accuracy: {accuracy_score(y_val_fused, y_val_pred_decoded):.4f}")
    print(classification_report(y_val_fused, y_val_pred_decoded))

Training SVM...

SVM Best Parameters: {'C': 10, 'kernel': 'rbf'}
SVM Validation Accuracy: 0.2095
              precision    recall  f1-score   support

       anger       0.12      0.20      0.15       205
     disgust       0.03      0.06      0.04        49
        fear       0.02      0.04      0.03        48
         joy       0.17      0.17      0.17       308
     neutral       0.52      0.27      0.36       840
     sadness       0.07      0.12      0.08       130
    surprise       0.12      0.15      0.13       196

    accuracy                           0.21      1776
   macro avg       0.15      0.14      0.14      1776
weighted avg       0.31      0.21      0.24      1776

Training Random Forest...

Random Forest Best Parameters: {'max_depth': 20, 'n_estimators': 200}
Random Forest Validation Accuracy: 0.4324
              precision    recall  f1-score   support

       anger       0.08      0.01      0.02       205
     disgust       0.00      0.00      0.00        49
    

Parameters: { "use_label_encoder" } are not used.




XGBoost Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}
XGBoost Validation Accuracy: 0.4730
              precision    recall  f1-score   support

       anger       0.00      0.00      0.00       205
     disgust       0.00      0.00      0.00        49
        fear       0.00      0.00      0.00        48
         joy       0.00      0.00      0.00       308
     neutral       0.47      1.00      0.64       840
     sadness       0.00      0.00      0.00       130
    surprise       0.00      0.00      0.00       196

    accuracy                           0.47      1776
   macro avg       0.07      0.14      0.09      1776
weighted avg       0.22      0.47      0.30      1776



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
# Need to work more on it !!