# Emotion Detection Model
This notebook demonstrates the process of building and evaluating an emotion detection model using machine learning techniques.

## 1. Importing Libraries

In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import neattext.functions as nfx
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold


from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer
nltk.download('punkt')
import re 
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

  from pandas.core import (
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\udit0\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\udit0\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\udit0\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Reading Data

In [2]:
# Read in the dataset
df_train = pd.read_csv("data/train.csv")
df_val = pd.read_csv("data/validation.csv")
df_test = pd.read_csv("data/test.csv")

# Check unique Emotions in each dataset
print("Unique training labels:", df_train['label'].unique())
print("Unique validation labels:", df_val['label'].unique())
print("Unique test labels:", df_test['label'].unique())

Unique training labels: [0 3 2 5 4 1]
Unique validation labels: [0 2 3 1 4 5]
Unique test labels: [0 1 4 3 2 5]


## Mapping Labels

In [3]:
label_mapping = {
    0: 'sadness',
    1: 'joy',
    2: 'love',
    3: 'anger',
    4: 'fear',
    5: 'surprise'
}

for df in [df_train, df_val, df_test]:
    df['emotion'] = df['label'].map(label_mapping)

In [4]:
#Checking for Missing values in training data
df_train.isnull().sum()

#Value counts of each Emotions
print(df_train['emotion'].value_counts())

dir(nfx)
df_train['Clean_Text'] = df_train['text'].apply(nfx.remove_userhandles)
df_train['Clean_Text'] = df_train['Clean_Text'].apply(nfx.remove_stopwords)

df_test['Clean_Text'] = df_test['text'].apply(nfx.remove_userhandles)
df_test['Clean_Text'] = df_test['Clean_Text'].apply(nfx.remove_stopwords)

df_val['Clean_Text'] = df_val['text'].apply(nfx.remove_userhandles)
df_val['Clean_Text'] = df_val['Clean_Text'].apply(nfx.remove_stopwords)

# def clean_text(text):
#     # Remove special characters and punctuation
#     text = re.sub(r'[^a-zA-Z\s]', '', text)
#     # Convert to lowercase
#     text = text.lower()
# # Apply data cleaning to the 'text' column in each dataset
# df_train['Clean_Text'] = df_train['Clean_Text'].apply(clean_text)
# df_val['Clean_Text'] = df_val['Clean_Text'].apply(clean_text)
# df_test['Clean_Text'] = df_test['Clean_Text'].apply(clean_text)


emotion
joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: count, dtype: int64


In [5]:
# plt.hist(df_train['emotion'], bins=30, color='skyblue', edgecolor='black')

## Text Preprocessing with Stemmer

In [6]:
# # Initialize the stemmer
# stemmer = PorterStemmer()

# # Define a function for tokenization and stemming
# def stemmed_tokenizer(text):
#     tokens = nltk.word_tokenize(text)
#     return [stemmer.stem(token) for token in tokens]


lemmatizer = WordNetLemmatizer()

# Function to get the Part of Speech (POS) for accurate lemmatization
def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)  # Default to noun if POS not found

# Define a function for tokenization and lemmatization
def lemmatized_tokenizer(text):
    tokens = nltk.word_tokenize(text)
    return [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]

## Vectorization

In [7]:
# Vectorize the training data
# vectorizer = CountVectorizer(tokenizer=lemmatized_tokenizer)
# Vectorization using TF-IDF
vectorizer = TfidfVectorizer(tokenizer=lemmatized_tokenizer, max_features=5000)

# tdm_train = vectorizer.fit_transform(df_train['text'])
tdm_train = vectorizer.fit_transform(df_train['Clean_Text'])
tdm_val = vectorizer.transform(df_val['Clean_Text'])
tdm_test = vectorizer.transform(df_test['Clean_Text'])



## Training the Logistic Regression Model

In [8]:
# # Train a Logistic Regression model
# log_model = LogisticRegression(max_iter=1000)
# log_model.fit(tdm_train, df_train['label'])

# # Validate the model
# # tdm_val = vectorizer.transform(df_val['text'])
# tdm_val = vectorizer.transform(df_val['Clean_Text'])
# y_val_pred_log = log_model.predict(tdm_val)

# # Calculate and print validation accuracy
# val_accuracy_log = accuracy_score(df_val['label'], y_val_pred_log)
# print("Logistic Regression Validation Accuracy:", val_accuracy_log)

log_model_ovr = OneVsRestClassifier(LogisticRegression(max_iter=1000))
log_model_ovr.fit(tdm_train, df_train['label'])
log_cv_scores = cross_val_score(log_model_ovr, tdm_train, df_train['label'], cv=5, scoring='accuracy')
print("Logistic Regression (OvR) Cross-Validation Accuracy:", log_cv_scores.mean())

# Validate the model
tdm_val = vectorizer.transform(df_val['Clean_Text'])
y_val_pred_log_ovr = log_model_ovr.predict(tdm_val)

# Calculate and print validation accuracy
val_accuracy_log_ovr = accuracy_score(df_val['label'], y_val_pred_log_ovr)
print("Logistic Regression (OvR) Validation Accuracy:", val_accuracy_log_ovr)

Logistic Regression (OvR) Cross-Validation Accuracy: 0.8477499999999999
Logistic Regression (OvR) Validation Accuracy: 0.8645


## Training the Random Forest model

In [9]:
# Train a Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(tdm_train, df_train['label'])

# Validate the Random Forest model
y_val_pred_rf = rf_model.predict(tdm_val)

# Calculate and print validation accuracy
val_accuracy_rf = accuracy_score(df_val['label'], y_val_pred_rf)
print("Random Forest Validation Accuracy:", val_accuracy_rf)

Random Forest Validation Accuracy: 0.876


## Training the SVM model

In [10]:
# #Train the SVM model
# svm_model = SVC(probability=True,random_state=42)
# svm_model.fit(tdm_train, df_train['label'])

# # Validate the SVM model
# y_val_pred_svm = svm_model.predict(tdm_val)

# # Calculate and print validation accuracy
# val_accuracy_svm = accuracy_score(df_val['label'], y_val_pred_svm)
# print("SVM Validation Accuracy:", val_accuracy_svm)

base_svm_model = LinearSVC(random_state=42)
svm_model = CalibratedClassifierCV(base_svm_model)  # This enables probability estimates
svm_model.fit(tdm_train, df_train['label'])

# Validate the calibrated model
y_val_pred_svm = svm_model.predict(tdm_val)

# Calculate and print validation accuracy
val_accuracy_svm = accuracy_score(df_val['label'], y_val_pred_svm)
print("Calibrated LinearSVC Validation Accuracy:", val_accuracy_svm)

Calibrated LinearSVC Validation Accuracy: 0.8855


In [11]:
nb_model = MultinomialNB()
nb_model.fit(tdm_train, df_train['label'])

# Validate the Naive Bayes model
y_val_pred_nb = nb_model.predict(tdm_val)

# Calculate and print validation accuracy
val_accuracy_nb = accuracy_score(df_val['label'], y_val_pred_nb)
print("Naive Bayes Validation Accuracy:", val_accuracy_nb)

Naive Bayes Validation Accuracy: 0.752


In [12]:
xgb_model = XGBClassifier(eval_metric='mlogloss', random_state=42)
xgb_model.fit(tdm_train, df_train['label'])

# Validate the XGBoost model
y_val_pred_xgb = xgb_model.predict(tdm_val)

# Calculate and print validation accuracy
val_accuracy_xgb = accuracy_score(df_val['label'], y_val_pred_xgb)
print("XGBoost Validation Accuracy:", val_accuracy_xgb)

XGBoost Validation Accuracy: 0.8795


In [13]:
# Train a Decision Tree model
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(tdm_train, df_train['label'])

# Validate the Decision Tree model
y_val_pred_dt = dt_model.predict(tdm_val)

# Calculate and print validation accuracy
val_accuracy_dt = accuracy_score(df_val['label'], y_val_pred_dt)
print("Decision Tree Validation Accuracy:", val_accuracy_dt)


Decision Tree Validation Accuracy: 0.838


In [14]:
combined_model = VotingClassifier(
    estimators=[
        ('log_reg', log_model_ovr),
        ('rf', rf_model),
        ('svm', svm_model),
        ('xgb', xgb_model),
        ('nb', nb_model),
    ],
    voting='soft' 
)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(combined_model, tdm_train, df_train['label'], cv=skf, scoring='accuracy')
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", np.mean(cv_scores))

# Train the Combined Model
combined_model.fit(tdm_train, df_train['label'])

# Validation
y_val_pred_combined = combined_model.predict(tdm_val)
val_accuracy_combined = accuracy_score(df_val['label'], y_val_pred_combined)
print("Combined Model Validation Accuracy:", val_accuracy_combined)

# Testing the Combined Model
y_test_pred_combined = combined_model.predict(tdm_test)
test_accuracy_combined = accuracy_score(df_test['label'], y_test_pred_combined)
print("Combined Model Test Accuracy:", test_accuracy_combined)


Cross-Validation Scores: [0.8746875 0.8796875 0.8853125 0.87625   0.8825   ]
Mean CV Accuracy: 0.8796875
Combined Model Validation Accuracy: 0.884
Combined Model Test Accuracy: 0.8885


## Comparing Models

In [15]:
# # Determine the best model based on validation accuracy
# if val_accuracy_log > val_accuracy_rf and val_accuracy_log > val_accuracy_svm:
#     best_model = log_model
#     best_model_name = "Logistic Regression"
#     best_accuracy = val_accuracy_log
# elif val_accuracy_rf > val_accuracy_log and val_accuracy_rf > val_accuracy_svm:
#     best_model = rf_model
#     best_model_name = "Random Forest"
#     best_accuracy = val_accuracy_rf
# else:
#     best_model = svm_model
#     best_model_name = "SVM"
#     best_accuracy = val_accuracy_svm

# print(f"The best model is {best_model_name} with accuracy: {best_accuracy}")

In [16]:
# Determine the best model based on validation accuracy
models = {
#     "Logistic Regression": (log_model, val_accuracy_log),
    "Logistic Regression": (log_model_ovr, val_accuracy_log_ovr),
    "Random Forest": (rf_model, val_accuracy_rf),
    "SVM": (svm_model, val_accuracy_svm),
    "Naive Bayes": (nb_model, val_accuracy_nb),
    "XGBoost": (xgb_model, val_accuracy_xgb),
}

# Find the best model
best_model_name, (best_model, best_accuracy) = max(models.items(), key=lambda item: item[1][1])

print(f"The best model is {best_model_name} with accuracy: {best_accuracy}")


The best model is SVM with accuracy: 0.8855


## Testing the Best Model

In [17]:
# Prepare the test data and make predictions
tdm_test = vectorizer.transform(df_test['text'])
y_test_pred = best_model.predict(tdm_test)

# Calculate and print test accuracy
test_accuracy = accuracy_score(df_test['label'], y_test_pred)
print("Test Accuracy of the best model:", test_accuracy)

Test Accuracy of the best model: 0.8765


## Performance Evaluation

In [18]:
# Print classification report and confusion matrix for the test dataset
print("\nClassification Report for the Test Dataset:")
print(classification_report(df_test['label'], y_test_pred))
print("\nConfusion Matrix for the Test Dataset:")
print(confusion_matrix(df_test['label'], y_test_pred))


Classification Report for the Test Dataset:
              precision    recall  f1-score   support

           0       0.90      0.91      0.90       581
           1       0.90      0.93      0.91       695
           2       0.81      0.71      0.76       159
           3       0.83      0.85      0.84       275
           4       0.88      0.81      0.84       224
           5       0.75      0.70      0.72        66

    accuracy                           0.88      2000
   macro avg       0.85      0.82      0.83      2000
weighted avg       0.88      0.88      0.88      2000


Confusion Matrix for the Test Dataset:
[[531  19   1  23   5   2]
 [  9 646  24   6   4   6]
 [  8  32 113   4   1   1]
 [ 22  10   1 235   6   1]
 [ 21   4   0  12 182   5]
 [  2   7   0   2   9  46]]


## Adding Predictions to the Test Dataset

In [19]:
# Add predicted emotions to the test dataframe
df_test['predicted_emotion'] = [
    label_mapping[label] if label in label_mapping else 'unknown'
    for label in y_test_pred
]

## Displaying Predictions

In [20]:
# Display the predictions in the test dataset
print("\nTest Dataset Predictions:")
print(df_test[['text', 'emotion', 'predicted_emotion']].head())


Test Dataset Predictions:
                                                text  emotion  \
0  im feeling rather rotten so im not very ambiti...  sadness   
1          im updating my blog because i feel shitty  sadness   
2  i never make her separate from me because i do...  sadness   
3  i left with my bouquet of red and yellow tulip...      joy   
4    i was feeling a little vain when i did this one  sadness   

  predicted_emotion  
0           sadness  
1           sadness  
2           sadness  
3               joy  
4           sadness  


## Predicting Emotion Function 

In [21]:
def predict_emotion(sentence, model, vectorizer, label_mapping):
    sentence_tdm = vectorizer.transform([sentence])
    probabilities = model.predict_proba(sentence_tdm)[0]
    emotions = [label_mapping[i] for i in range(len(probabilities))]
    
    # Plot the predicted probabilities
    plt.figure(figsize=(8, 4))
    sns.barplot(x=emotions, y=probabilities)
    plt.title('Emotion Prediction')
    plt.ylabel('Probability')
    plt.show()


## Using the Predict Emotion Function

In [None]:
# Predict emotion for a specific sentence
# predict_emotion("I heard strange noises outside last night, and I couldn't sleep at all. My heart was racing, and I kept imagining all sorts of dangers lurking in the dark.", best_model, vectorizer, label_mapping)

# Function for user input prediction
def user_input_prediction(model, vectorizer, label_mapping):
    try:
        sentence = input("Please enter a sentence to predict emotion: ")
        predict_emotion(sentence, model, vectorizer, label_mapping)
    except Exception as e:
        print(f"An error occurred: {e}")

# Call the user input function
user_input_prediction(best_model, vectorizer, label_mapping)
