In [2]:
# Read the processed MELD feature table from its CSV file.
data = pd.read_csv(
    r'datasets\processed\meld_features.csv',
)

# Print a heading followed by the column index of the loaded table.
print("Columns in the dataset:", data.columns, sep="\n")


Columns in the dataset:
Index(['Dialogue_ID', 'Utterance_ID', 'Emotion', 'Word_Count', 'Char_Count',
       'Sentiment_Polarity', 'Audio_Duration', 'MFCCs'],
      dtype='object')


In [49]:
# Inspect the schema of a raw MELD CSV. The path below points at the train
# split; swap it for the dev or test file to check those instead.
raw_data_path = r'datasets\raw\MELD\train\train_sent_emo.csv'
raw_data = pd.read_csv(
    raw_data_path,
)

# Print a heading followed by the column index of the raw table.
print("Columns in the raw dataset:", raw_data.columns, sep="\n")

Columns in the raw dataset:
Index(['Sr No.', 'Utterance', 'Speaker', 'Emotion', 'Sentiment', 'Dialogue_ID',
       'Utterance_ID', 'Season', 'Episode', 'StartTime', 'EndTime'],
      dtype='object')


In [50]:
# Load the processed feature table and the raw MELD train split.
processed_path = r'datasets\processed\meld_features.csv'
raw_train_path = r'datasets\raw\MELD\train\train_sent_emo.csv'

processed_data = pd.read_csv(processed_path)
raw_data = pd.read_csv(raw_train_path)

# Restore the 'Utterance' text by joining the raw rows onto the processed
# rows via the (Dialogue_ID, Utterance_ID) key. how='left' keeps every
# processed row. validate='many_to_one' makes pandas raise MergeError if the
# raw file contains the same key more than once, which would otherwise
# silently duplicate processed rows.
# NOTE(review): only the train split is loaded here. If meld_features.csv
# also holds dev/test rows and the ID columns restart per split, those rows
# would be paired with a train utterance — confirm against how the processed
# file was built.
merged_data = pd.merge(
    processed_data,
    raw_data[['Dialogue_ID', 'Utterance_ID', 'Utterance']],
    on=['Dialogue_ID', 'Utterance_ID'],
    how='left',
    validate='many_to_one',
)

# A left merge leaves NaN in 'Utterance' for processed rows without a raw
# match; report them instead of letting them pass unnoticed.
missing_utterances = merged_data['Utterance'].isna().sum()
if missing_utterances:
    print(f"Warning: {missing_utterances} rows have no 'Utterance' after merging with {raw_train_path}")

# Check for successful merge
print("Columns in merged dataset:")
print(merged_data.columns)

# Define text preprocessing function
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Fetch the NLTK resources used by the tokenizer and the stop-word filter.
# NOTE(review): newer NLTK releases may look up 'punkt_tab' instead of
# 'punkt' for word_tokenize — confirm against the installed version.
for _resource in ('punkt', 'stopwords'):
    nltk.download(_resource)

# Lookup sets used by preprocess_text: English stop words and the single
# characters listed in string.punctuation.
stop_words = {*stopwords.words('english')}
punctuation = {*string.punctuation}

def preprocess_text(text):
    """Lower-case, tokenize and filter a single utterance.

    Args:
        text: Utterance text. Values that are not ``str`` (for example the
            NaN a left merge leaves for unmatched rows) are treated as
            empty text instead of raising ``AttributeError``.

    Returns:
        The surviving tokens joined by single spaces, or ``''`` when no
        token survives or ``text`` is not a string.

    Note:
        A token is dropped only when the whole token is a member of
        ``stop_words`` or ``punctuation``. ``punctuation`` is built from
        ``string.punctuation`` (single characters), so multi-character
        tokens are not removed by this filter.
    """
    # Guard first: NaN/None have no .lower() and carry no text to clean.
    if not isinstance(text, str):
        return ''
    tokens = word_tokenize(text.lower())
    kept = [tok for tok in tokens if tok not in stop_words and tok not in punctuation]
    return ' '.join(kept)

# Destination for the feature table once the cleaned text has been added.
updated_path = r'datasets\processed\meld_features_updated.csv'

# Run every utterance through preprocess_text and store the result as a new
# 'Clean_Utterance' column.
# NOTE(review): utterances that clean down to '' are presumably read back as
# NaN by pd.read_csv — verify when this file is reloaded.
merged_data['Clean_Utterance'] = [
    preprocess_text(utterance) for utterance in merged_data['Utterance']
]

# Write the table without the index column, then confirm where it went.
merged_data.to_csv(updated_path, index=False)
print(f"Dataset updated with 'Clean_Utterance' column and saved to {updated_path}")


Columns in merged dataset:
Index(['Dialogue_ID', 'Utterance_ID', 'Emotion', 'Word_Count', 'Char_Count',
       'Sentiment_Polarity', 'Audio_Duration', 'MFCCs', 'Utterance'],
      dtype='object')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rajt8\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rajt8\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Dataset updated with 'Clean_Utterance' column and saved to datasets\processed\meld_features_updated.csv


In [51]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Reload the feature table that now carries the 'Clean_Utterance' column.
data = pd.read_csv(
    r'datasets\processed\meld_features_updated.csv',
)

# First cut: hold out 30% of the rows, stratified on the emotion label.
X_train_text, X_temp, y_train, y_temp = train_test_split(
    data['Clean_Utterance'],
    data['Emotion'],
    test_size=0.3,
    random_state=42,
    stratify=data['Emotion'],
)

# Second cut: divide the held-out rows evenly into validation and test,
# again stratified on the label.
X_val_text, X_test_text, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)

# Report how many rows ended up in each split.
print(f"Training set size: {len(X_train_text)}")
print(f"Validation set size: {len(X_val_text)}")
print(f"Test set size: {len(X_test_text)}")

Training set size: 8287
Validation set size: 1776
Test set size: 1776


In [52]:
# Report how many 'Clean_Utterance' entries are NaN.
print(f"Missing values in Clean_Utterance: {data['Clean_Utterance'].isna().sum()}")

# Replace the NaN entries with '' so every row holds a string.
data['Clean_Utterance'] = data['Clean_Utterance'].fillna('')

# Redo the 70/30 stratified split on the filled column.
X_train_text, X_temp, y_train, y_temp = train_test_split(
    data['Clean_Utterance'],
    data['Emotion'],
    test_size=0.3,
    random_state=42,
    stratify=data['Emotion'],
)

# Halve the held-out part into validation and test, stratified on the label.
X_val_text, X_test_text, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)

# Report the split sizes again.
print(f"Training set size: {len(X_train_text)}")
print(f"Validation set size: {len(X_val_text)}")
print(f"Test set size: {len(X_test_text)}")

Missing values in Clean_Utterance: 91
Training set size: 8287
Validation set size: 1776
Test set size: 1776


In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer capped at 5000 features.
tfidf = TfidfVectorizer(max_features=5000)

# The vocabulary is fitted on the training text only.
X_train_tfidf = tfidf.fit_transform(X_train_text)

# Validation and test text are transformed with the already-fitted vectorizer.
X_val_tfidf, X_test_tfidf = (
    tfidf.transform(texts) for texts in (X_val_text, X_test_text)
)

print(f"TF-IDF vectorization complete. Number of features: {X_train_tfidf.shape[1]}")

TF-IDF vectorization complete. Number of features: 4515


In [54]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Baseline: logistic regression trained on the TF-IDF training matrix.
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train_tfidf, y_train)

# Predict labels for the validation matrix.
y_val_pred = model.predict(X_val_tfidf)

# Evaluate the model.
# zero_division=0 states explicitly that precision/F1 are reported as 0 for
# a class the model never predicts. The numbers are the same as with the
# default setting, but the undefined-metric warnings are no longer emitted.
print("\nLogistic Regression Validation Results:")
print(classification_report(y_val, y_val_pred, zero_division=0))
print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred):.4f}")


Logistic Regression Validation Results:
              precision    recall  f1-score   support

       anger       0.29      0.05      0.08       205
     disgust       0.33      0.02      0.04        49
        fear       0.00      0.00      0.00        48
         joy       0.53      0.18      0.27       308
     neutral       0.51      0.94      0.66       840
     sadness       0.41      0.05      0.10       130
    surprise       0.55      0.16      0.25       196

    accuracy                           0.51      1776
   macro avg       0.37      0.20      0.20      1776
weighted avg       0.47      0.51      0.40      1776

Validation Accuracy: 0.5051


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [55]:
# Same logistic regression setup, but with class_weight='balanced'.
model_weighted = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
).fit(X_train_tfidf, y_train)

# Predict labels for the validation matrix.
y_val_pred_weighted = model_weighted.predict(X_val_tfidf)

# Per-class report followed by overall accuracy.
print(
    "\nLogistic Regression with Class Weights Validation Results:",
    classification_report(y_val, y_val_pred_weighted),
    sep="\n",
)
print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred_weighted):.4f}")


Logistic Regression with Class Weights Validation Results:
              precision    recall  f1-score   support

       anger       0.14      0.20      0.16       205
     disgust       0.06      0.16      0.08        49
        fear       0.07      0.29      0.12        48
         joy       0.33      0.35      0.34       308
     neutral       0.62      0.31      0.41       840
     sadness       0.14      0.23      0.17       130
    surprise       0.33      0.33      0.33       196

    accuracy                           0.30      1776
   macro avg       0.24      0.27      0.23      1776
weighted avg       0.42      0.30      0.33      1776

Validation Accuracy: 0.2956


In [56]:
!pip install imbalanced-learn --user
!pip show imbalanced-learn


Name: imbalanced-learn
Version: 0.13.0
Summary: Toolbox for imbalanced dataset in machine learning
Home-page: https://imbalanced-learn.org/
Author: 
Author-email: "G. Lemaitre" <g.lemaitre58@gmail.com>, "C. Aridas" <ichkoar@gmail.com>
License: 
Location: C:\Users\rajt8\AppData\Local\Programs\Python\Python311\Lib\site-packages
Requires: joblib, numpy, scikit-learn, scipy, sklearn-compat, threadpoolctl
Required-by: 


In [57]:
from imblearn.over_sampling import SMOTE

# SMOTE resampler with a fixed seed.
smote = SMOTE(random_state=42)

# Resample the training matrix and labels only; the validation and test
# matrices are not passed through SMOTE.
X_train_resampled, y_train_resampled = smote.fit_resample(
    X_train_tfidf,
    y_train,
)

print(f"Training data after SMOTE - Samples: {X_train_resampled.shape[0]}")

Training data after SMOTE - Samples: 27426


In [58]:
from sklearn.metrics import classification_report, accuracy_score

# Logistic regression fitted on the SMOTE-resampled training data.
model_smote = LogisticRegression(
    max_iter=1000,
    random_state=42,
).fit(X_train_resampled, y_train_resampled)

# Predict on the validation matrix, which was not resampled.
y_val_pred_smote = model_smote.predict(X_val_tfidf)

# Per-class report followed by overall accuracy.
print(
    "\nLogistic Regression with SMOTE Validation Results:",
    classification_report(y_val, y_val_pred_smote),
    sep="\n",
)
print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred_smote):.4f}")


Logistic Regression with SMOTE Validation Results:
              precision    recall  f1-score   support

       anger       0.17      0.20      0.18       205
     disgust       0.05      0.14      0.08        49
        fear       0.06      0.23      0.09        48
         joy       0.32      0.32      0.32       308
     neutral       0.57      0.34      0.43       840
     sadness       0.13      0.18      0.15       130
    surprise       0.31      0.32      0.31       196

    accuracy                           0.30      1776
   macro avg       0.23      0.25      0.22      1776
weighted avg       0.39      0.30      0.33      1776

Validation Accuracy: 0.2979


In [59]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Random forest of 100 trees fitted on the SMOTE-resampled training data.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train_resampled, y_train_resampled)

# Predict on the validation matrix, which was not resampled.
y_val_pred_rf = rf_model.predict(X_val_tfidf)

# Per-class report followed by overall accuracy.
print(
    "\nRandom Forest Validation Results:",
    classification_report(y_val, y_val_pred_rf),
    sep="\n",
)
print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred_rf):.4f}")


Random Forest Validation Results:
              precision    recall  f1-score   support

       anger       0.18      0.18      0.18       205
     disgust       0.05      0.06      0.05        49
        fear       0.06      0.23      0.10        48
         joy       0.29      0.25      0.27       308
     neutral       0.53      0.43      0.48       840
     sadness       0.12      0.14      0.13       130
    surprise       0.21      0.26      0.23       196

    accuracy                           0.32      1776
   macro avg       0.21      0.22      0.21      1776
weighted avg       0.36      0.32      0.33      1776

Validation Accuracy: 0.3153


In [60]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Linear-kernel SVM fitted on the SMOTE-resampled training data.
svm_model = SVC(
    kernel='linear',
    random_state=42,
).fit(X_train_resampled, y_train_resampled)

# Predict on the validation matrix, which was not resampled.
y_val_pred_svm = svm_model.predict(X_val_tfidf)

# Per-class report followed by overall accuracy.
print(
    "\nSVM Validation Results:",
    classification_report(y_val, y_val_pred_svm),
    sep="\n",
)
print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred_svm):.4f}")


SVM Validation Results:
              precision    recall  f1-score   support

       anger       0.15      0.20      0.18       205
     disgust       0.05      0.10      0.07        49
        fear       0.06      0.25      0.10        48
         joy       0.33      0.34      0.33       308
     neutral       0.56      0.35      0.44       840
     sadness       0.16      0.22      0.18       130
    surprise       0.32      0.28      0.30       196

    accuracy                           0.31      1776
   macro avg       0.23      0.25      0.23      1776
weighted avg       0.39      0.31      0.33      1776

Validation Accuracy: 0.3069


In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over 1- to 3-word n-grams, capped at 5000 features.
tfidf_ngram = TfidfVectorizer(
    ngram_range=(1, 3),
    max_features=5000,
)

# The vocabulary is fitted on the training text only.
X_train_tfidf_ngram = tfidf_ngram.fit_transform(X_train_text)

# Validation and test text are transformed with the already-fitted vectorizer.
X_val_tfidf_ngram, X_test_tfidf_ngram = (
    tfidf_ngram.transform(texts) for texts in (X_val_text, X_test_text)
)

print(f"TF-IDF with N-grams complete. Number of features: {X_train_tfidf_ngram.shape[1]}")

TF-IDF with N-grams complete. Number of features: 5000


In [62]:
from sklearn.metrics import classification_report, accuracy_score

# Random forest of 100 trees fitted on the n-gram TF-IDF features. It is
# trained on the original training labels (y_train), not on SMOTE-resampled
# data.
rf_model_ngram = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train_tfidf_ngram, y_train)

# Predict on the n-gram validation matrix.
y_val_pred_rf_ngram = rf_model_ngram.predict(X_val_tfidf_ngram)

# Per-class report followed by overall accuracy.
print(
    "\nRandom Forest with N-grams Validation Results:",
    classification_report(y_val, y_val_pred_rf_ngram),
    sep="\n",
)
print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred_rf_ngram):.4f}")


Random Forest with N-grams Validation Results:
              precision    recall  f1-score   support

       anger       0.11      0.05      0.07       205
     disgust       0.13      0.04      0.06        49
        fear       0.13      0.04      0.06        48
         joy       0.32      0.17      0.22       308
     neutral       0.51      0.78      0.62       840
     sadness       0.23      0.10      0.14       130
    surprise       0.33      0.23      0.27       196

    accuracy                           0.44      1776
   macro avg       0.25      0.20      0.21      1776
weighted avg       0.37      0.44      0.38      1776

Validation Accuracy: 0.4392


In [None]:
# TODO: This needs more work.