# Emotion Detection Model
This notebook demonstrates the process of building and evaluating an emotion detection model using machine learning techniques.

## 1. Importing Libraries

In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import neattext.functions as nfx
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB

from itertools import combinations
from sklearn.metrics import precision_score, recall_score, f1_score

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold


from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer
nltk.download('punkt')
import re 
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

  from pandas.core import (
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\udit0\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\udit0\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\udit0\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Reading Data

In [2]:
# Load the pre-split train / validation / test CSV files in one pass
df_train, df_val, df_test = map(
    pd.read_csv,
    ("data/train.csv", "data/validation.csv", "data/test.csv"),
)

# Show which label codes occur in each split
for split_name, split_df in (
    ("training", df_train),
    ("validation", df_val),
    ("test", df_test),
):
    print(f"Unique {split_name} labels:", split_df['label'].unique())

Unique training labels: [0 3 2 5 4 1]
Unique validation labels: [0 2 3 1 4 5]
Unique test labels: [0 1 4 3 2 5]


## Mapping Labels

In [3]:
# Integer label code -> emotion name; each code is the name's position in the list
label_mapping = dict(enumerate(
    ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
))

# Attach a human-readable 'emotion' column to every split
for df in (df_train, df_val, df_test):
    df['emotion'] = df['label'].map(label_mapping)

In [4]:
# Check for missing values in the training data
print("Missing values in training set:", df_train.isnull().sum())

# Number of examples per emotion
print("\nTraining set emotion distribution:\n", df_train['emotion'].value_counts())

# Apply identical cleaning to every split: strip @user handles first, then stopwords.
# One loop replaces three copy-pasted pairs so the splits cannot drift apart.
for split_df in (df_train, df_val, df_test):
    split_df['Clean_Text'] = (
        split_df['text']
        .apply(nfx.remove_userhandles)
        .apply(nfx.remove_stopwords)
    )

Missing values in training set: text       0
label      0
emotion    0
dtype: int64

Training set emotion distribution:
 emotion
joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: count, dtype: int64


## Text Preprocessing with Lemmatization

In [5]:
# # Initialize the stemmer
# stemmer = PorterStemmer()

# # Define a function for tokenization and stemming
# def stemmed_tokenizer(text):
#     tokens = nltk.word_tokenize(text)
#     return [stemmer.stem(token) for token in tokens]


lemmatizer = WordNetLemmatizer()

# First letter of the tag returned by nltk.pos_tag -> WordNet POS constant
_TAG_TO_WORDNET = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}


def _to_wordnet_pos(tag):
    """Map a POS tag string to a WordNet POS constant, defaulting to noun."""
    return _TAG_TO_WORDNET.get(tag[:1].upper(), wordnet.NOUN)


def get_wordnet_pos(word):
    """Return the WordNet POS for a single word tagged on its own.

    Kept for callers that only have one word; `lemmatized_tokenizer` tags
    whole token lists instead so the tagger can see neighbouring words.
    """
    return _to_wordnet_pos(nltk.pos_tag([word])[0][1])


def lemmatized_tokenizer(text):
    """Tokenize `text` and lemmatize each token using its POS tag.

    Parameters
    ----------
    text : str
        Raw (already cleaned) document text.

    Returns
    -------
    list of str
        One lemma per token produced by `nltk.word_tokenize`.
    """
    tokens = nltk.word_tokenize(text)
    # Tag the whole token list in a single call: the tagger gets sentence
    # context, and we avoid one tagger invocation per token.
    tagged_tokens = nltk.pos_tag(tokens)
    return [lemmatizer.lemmatize(token, _to_wordnet_pos(tag)) for token, tag in tagged_tokens]

## Vectorization

In [6]:
# TF-IDF features over lemmatized tokens, capped at the 5000 most frequent terms.
# token_pattern=None: the default regex is ignored whenever a custom tokenizer is
# supplied, and leaving it set makes scikit-learn emit a warning on every fit.
vectorizer = TfidfVectorizer(
    tokenizer=lemmatized_tokenizer,
    token_pattern=None,
    max_features=5000,
)

# Learn the vocabulary and IDF weights from the training split only, then reuse
# them for validation and test so no information leaks from held-out data.
tdm_train = vectorizer.fit_transform(df_train['Clean_Text'])
tdm_val = vectorizer.transform(df_val['Clean_Text'])
tdm_test = vectorizer.transform(df_test['Clean_Text'])



## Training the Logistic Regression Model - OneVsRest

In [7]:
# # Train the model
# log_model_ovr = OneVsRestClassifier(LogisticRegression(max_iter=1000))
# log_model_ovr.fit(tdm_train, df_train['label'])

# # Validate the model
# y_val_pred_log_ovr = log_model_ovr.predict(tdm_val)

# # Calculate and print validation accuracy
# val_accuracy_log_ovr = accuracy_score(df_val['label'], y_val_pred_log_ovr)
# print("Logistic Regression (OvR) Validation Accuracy:", val_accuracy_log_ovr)


In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

class LogisticRegressionScratch:
    """Binary logistic regression trained by batch gradient descent with an L2 penalty.

    Parameters
    ----------
    learning_rate : float
        Step size of each gradient-descent update.
    num_iterations : int
        Number of full-batch updates performed by `fit`.
    regularization_strength : float
        L2 penalty coefficient applied to the weights (not the bias).
    random_state : int or None
        Seed for the weight initialisation. ``None`` (the default) draws from
        NumPy's global RNG, exactly as before this parameter existed.
    """

    def __init__(self, learning_rate=0.01, num_iterations=1000, regularization_strength=0.01,
                 random_state=None):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.regularization_strength = regularization_strength
        self.random_state = random_state
        self.weights = None
        self.bias = None

    def sigmoid(self, z):
        """Logistic function; `z` is clipped to [-500, 500] so np.exp cannot overflow."""
        z = np.clip(z, -500, 500)
        return 1 / (1 + np.exp(-z))

    def compute_loss(self, y, y_predicted):
        """Binary cross-entropy plus the L2 penalty on the current weights."""
        num_samples = y.shape[0]
        # 1e-15 keeps log() finite when a prediction is exactly 0 or 1
        loss = (-1 / num_samples) * np.sum(
            y * np.log(y_predicted + 1e-15) + (1 - y) * np.log(1 - y_predicted + 1e-15)
        )
        reg_loss = (self.regularization_strength / (2 * num_samples)) * np.sum(self.weights ** 2)
        return loss + reg_loss

    def fit(self, X, y):
        """Learn weights and bias from dense features `X` and 0/1 targets `y`."""
        num_samples, num_features = X.shape
        # Small random start; a dedicated RandomState makes the run repeatable
        # without touching the global RNG when a seed was requested.
        if self.random_state is None:
            self.weights = np.random.randn(num_features) * 0.01
        else:
            self.weights = np.random.RandomState(self.random_state).randn(num_features) * 0.01
        self.bias = 0

        for _ in range(self.num_iterations):
            error = self.sigmoid(np.dot(X, self.weights) + self.bias) - y

            # Gradients of the regularised cross-entropy
            dw = (1 / num_samples) * np.dot(X.T, error) + \
                 (self.regularization_strength / num_samples) * self.weights
            db = (1 / num_samples) * np.sum(error)

            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def decision_function(self, X):
        """Return the raw linear score ``X @ weights + bias`` for each row of `X`."""
        return np.dot(X, self.weights) + self.bias

    def predict_proba(self, X):
        """Return an (n_samples, 2) array with columns [P(y=0), P(y=1)]."""
        positive = self.sigmoid(self.decision_function(X))
        return np.column_stack((1 - positive, positive))

    def predict(self, X):
        """Return 1 where P(y=1) > 0.5 and 0 otherwise."""
        return (self.sigmoid(self.decision_function(X)) > 0.5).astype(int)

class OneVsRestClassifier:
    """One-vs-rest multiclass wrapper around a binary classifier.

    NOTE: this definition shadows the `sklearn.multiclass.OneVsRestClassifier`
    imported at the top of the notebook.

    Parameters
    ----------
    base_classifier : object
        Unfitted binary classifier used as a template. One independent deep
        copy is trained per class. Each copy must expose either
        `predict_proba(X)` or fitted `weights` / `bias` attributes.
    """

    def __init__(self, base_classifier):
        self.base_classifier = base_classifier
        self.models = []
        self.classes_ = None

    def fit(self, X, y):
        """Train one binary model per distinct label found in `y`."""
        from copy import deepcopy

        y = np.asarray(y)
        # Keep the actual label values so labels need not be exactly 0..K-1
        self.classes_ = np.unique(y)
        self.models = []

        for class_label in self.classes_:
            binary_y = np.where(y == class_label, 1, 0)
            model = deepcopy(self.base_classifier)
            model.fit(X, binary_y)
            self.models.append(model)

    def _positive_scores(self, X):
        """Return an (n_samples, n_classes) array of per-class P(class vs rest)."""
        columns = []
        for model in self.models:
            if hasattr(model, "predict_proba"):
                proba = np.asarray(model.predict_proba(X))
                # Accept both an (n, 2) layout and a 1-D positive-class vector
                columns.append(proba[:, -1] if proba.ndim == 2 else proba)
            else:
                # Fall back to a logistic score built from the linear parameters
                z = np.clip(np.dot(X, model.weights) + model.bias, -500, 500)
                columns.append(1 / (1 + np.exp(-z)))
        return np.column_stack(columns)

    def predict_proba(self, X):
        """Return per-class scores normalised so that each row sums to 1."""
        scores = self._positive_scores(X)
        return scores / scores.sum(axis=1, keepdims=True)

    def predict(self, X):
        """Return, for each row, the label whose binary model is most confident.

        Ranking by probability instead of by hard 0/1 votes avoids defaulting
        to the first class whenever no model (or more than one) votes 1.
        """
        return self.classes_[np.argmax(self._positive_scores(X), axis=1)]

# Densify the sparse TF-IDF matrix: the scratch model works on dense NumPy arrays
X_train = tdm_train.toarray()
y_train = df_train['label'].values

# Standardize features using statistics from the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

# LogisticRegressionScratch draws its initial weights from NumPy's global RNG;
# seed it so the reported accuracy is reproducible, like the other models
# in this notebook (which use random_state=42).
np.random.seed(42)

# Train the improved One-vs-Rest Logistic Regression model
ovr_model = OneVsRestClassifier(
    LogisticRegressionScratch(learning_rate=0.01, num_iterations=3000, regularization_strength=0.1)
)
ovr_model.fit(X_train, y_train)

# Validate the model on features transformed with the training-set scaler
X_val = scaler.transform(tdm_val.toarray())
y_val_pred = ovr_model.predict(X_val)

# Calculate and print validation accuracy
val_accuracy_ovr = accuracy_score(df_val['label'], y_val_pred)
print("Improved One-vs-Rest Logistic Regression Validation Accuracy:", val_accuracy_ovr)


## Training the Random Forest model

In [None]:
# Fit a 100-tree random forest directly on the sparse TF-IDF training matrix
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    tdm_train, df_train['label']
)

# Score it on the held-out validation split
y_val_pred_rf = rf_model.predict(tdm_val)
val_accuracy_rf = accuracy_score(df_val['label'], y_val_pred_rf)
print("Random Forest Validation Accuracy:", val_accuracy_rf)

## Training the SVM model

In [None]:
# #Train the SVM model
# svm_model = SVC(probability=True,random_state=42)
# svm_model.fit(tdm_train, df_train['label'])

# # Validate the SVM model
# y_val_pred_svm = svm_model.predict(tdm_val)

# # Calculate and print validation accuracy
# val_accuracy_svm = accuracy_score(df_val['label'], y_val_pred_svm)
# print("SVM Validation Accuracy:", val_accuracy_svm)

# Linear SVM wrapped in a probability calibrator, fitted on the training split
base_svm_model = LinearSVC(random_state=42)
svm_model = CalibratedClassifierCV(base_svm_model).fit(tdm_train, df_train['label'])

# Score the calibrated model on the validation split
y_val_pred_svm = svm_model.predict(tdm_val)
val_accuracy_svm = accuracy_score(df_val['label'], y_val_pred_svm)
print("Calibrated LinearSVC Validation Accuracy:", val_accuracy_svm)

## Training the Naive Bayes model

In [None]:
# Multinomial Naive Bayes on the TF-IDF training matrix
nb_model = MultinomialNB().fit(tdm_train, df_train['label'])

# Score it on the validation split
y_val_pred_nb = nb_model.predict(tdm_val)
val_accuracy_nb = accuracy_score(df_val['label'], y_val_pred_nb)
print("Naive Bayes Validation Accuracy:", val_accuracy_nb)

## Training the Decision Tree model

In [None]:
# Single decision tree with a fixed seed, fitted on the TF-IDF training matrix
dt_model = DecisionTreeClassifier(random_state=42).fit(tdm_train, df_train['label'])

# Score it on the validation split
y_val_pred_dt = dt_model.predict(tdm_val)
val_accuracy_dt = accuracy_score(df_val['label'], y_val_pred_dt)
print("Decision Tree Validation Accuracy:", val_accuracy_dt)


## Comparing Models and Testing the Best Model

In [None]:
# Validation accuracy of every candidate, keyed by display name
model_accuracies = {
    "Logistic Regression": val_accuracy_ovr,
    "Random Forest": val_accuracy_rf,
    "SVM": val_accuracy_svm,
    "Naive Bayes": val_accuracy_nb,
    "Decision Tree": val_accuracy_dt,
}

# Pick the model with the highest validation accuracy
best_model_name = max(model_accuracies, key=model_accuracies.get)
print(f"The best model is {best_model_name} with accuracy: {model_accuracies[best_model_name]}")

best_model = {
    "Logistic Regression": ovr_model,
    "Random Forest": rf_model,
    "SVM": svm_model,
    "Naive Bayes": nb_model,
    "Decision Tree": dt_model,
}[best_model_name]

# The scratch logistic regression was trained on dense, standardized features,
# so it must be evaluated on test features prepared the same way. Every other
# model was trained on the sparse TF-IDF matrix and takes it as-is.
if best_model_name == "Logistic Regression":
    X_test_best = scaler.transform(tdm_test.toarray())
else:
    X_test_best = tdm_test

y_test_pred = best_model.predict(X_test_best)
test_accuracy = accuracy_score(df_test['label'], y_test_pred)
print(f"Test Accuracy with {best_model_name}: {test_accuracy}")

## Performance Evaluation

In [None]:
# Per-class report first, then the confusion matrix, both for the test split
for heading, metric in (
    ("\nClassification Report for the Test Dataset:", classification_report),
    ("\nConfusion Matrix for the Test Dataset:", confusion_matrix),
):
    print(heading)
    print(metric(df_test['label'], y_test_pred))

## Adding Predictions to the Test Dataset

In [None]:
# Add predicted emotions to the test dataframe
df_test['predicted_emotion'] = [
    label_mapping[label] if label in label_mapping else 'unknown'
    for label in y_test_pred
]

## Displaying Predictions

In [None]:
# Display the predictions in the test dataset
print("\nTest Dataset Predictions:")
print(df_test[['text', 'emotion', 'predicted_emotion']].head())

## Predicting Emotion Function 

In [None]:
def predict_emotion(sentence, model, vectorizer, label_mapping):
    """Plot and return the predicted emotion probabilities for one sentence.

    Parameters
    ----------
    sentence : str
        Raw text to classify.
    model : object
        Fitted classifier exposing `predict_proba`; if it also exposes
        `classes_`, that order is used to label the probabilities.
    vectorizer : object
        Fitted vectorizer whose `transform` accepts a list of strings.
    label_mapping : dict
        Maps label codes to emotion names.

    Returns
    -------
    dict
        Emotion name -> predicted probability.
    """
    # Mirror the training-time cleaning (user handles, then stopwords) so the
    # features match what the model saw during training.
    cleaned = nfx.remove_stopwords(nfx.remove_userhandles(sentence))
    sentence_tdm = vectorizer.transform([cleaned])
    probabilities = model.predict_proba(sentence_tdm)[0]

    # Probability columns follow model.classes_ when available; only fall back
    # to positional codes for models that do not expose it.
    class_labels = getattr(model, "classes_", None)
    if class_labels is None:
        class_labels = range(len(probabilities))
    emotions = [label_mapping.get(label, str(label)) for label in class_labels]

    # Plot the predicted probabilities
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.barplot(x=emotions, y=probabilities, ax=ax)
    ax.set_title('Emotion Prediction')
    ax.set_ylabel('Probability')
    plt.show()

    return {emotion: float(p) for emotion, p in zip(emotions, probabilities)}


## Using the Predict Emotion Function

In [None]:
# Predict emotion for a specific sentence
# predict_emotion("I heard strange noises outside last night, and I couldn't sleep at all. My heart was racing, and I kept imagining all sorts of dangers lurking in the dark.", best_model, vectorizer, label_mapping)

# Function for user input prediction
def user_input_prediction(model, vectorizer, label_mapping):
    """Prompt for a sentence on stdin and plot its predicted emotion probabilities.

    Blank input is rejected up front instead of being sent through the
    vectorizer and model. Any error raised while reading input or predicting
    is reported with a message rather than propagated, so an interactive
    session keeps running.
    """
    try:
        sentence = input("Please enter a sentence to predict emotion: ").strip()
        if not sentence:
            print("No sentence entered; nothing to predict.")
            return
        predict_emotion(sentence, model, vectorizer, label_mapping)
    except Exception as e:
        print(f"An error occurred: {e}")

# Prompt for a sentence and plot its predicted emotion probabilities using the
# model that scored best on the validation set.
# NOTE: this call waits on input(), so an unattended "Run All" stalls at this cell.
user_input_prediction(best_model, vectorizer, label_mapping)
