**importing necessary libraries**

In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# **step 1 : load the dataset into a dataframe**

In [43]:
df = pd.read_csv('Emotion_final.csv')
df.head()

Unnamed: 0,Text,Emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,I am in love with you,love
4,i am ever feeling nostalgic about the fireplac...,love


# **step 2 : Perform the data cleaning**


In [45]:
# summary of dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21493 entries, 0 to 21492
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Text     21493 non-null  object
 1   Emotion  21493 non-null  object
dtypes: object(2)
memory usage: 336.0+ KB


In [46]:
df.describe()

Unnamed: 0,Text,Emotion
count,21493,21493
unique,21439,7
top,i often find myself feeling assaulted by a mul...,happy
freq,2,7033


**a . handle missing values**

In [48]:
# checking for missing values
df.isnull().sum()

Text       0
Emotion    0
dtype: int64

In [49]:
df.isnull().sum()/len(df)

Text       0.0
Emotion    0.0
dtype: float64

**b.remove duplicates**

In [51]:
# drop rows with missing values
df.dropna(subset=['Text', 'Emotion'], inplace = True)

In [52]:
# check duplicate rows
df.duplicated().sum()

3

In [53]:
# remove duplicate rows
df.drop_duplicates(inplace=True)

In [54]:
# Reset index after dropping rows
df.reset_index(drop=True, inplace=True)

# **step 3 : Label encoding the emotion column**

In [56]:
from sklearn.preprocessing import LabelEncoder
lbec = LabelEncoder()            # initialize label encoder

lbec.fit(df['Emotion'])
Emotion_encoded = lbec.transform(df['Emotion'])
Emotion_encoded

array([5, 5, 0, ..., 1, 1, 1])

In [57]:
Emotion_encoded[:5]

array([5, 5, 0, 4, 4])

In [58]:
label_mapping = dict(zip(lbec.classes_, lbec.transform(lbec.classes_)))
print("Label Mapping:")
print(label_mapping)             # mapping original labels to encoded values

Label Mapping:
{'anger': 0, 'confusion': 1, 'fear': 2, 'happy': 3, 'love': 4, 'sadness': 5, 'surprise': 6}


# **step 4 : train a random forest model with the dataset**

In [60]:
# define x and y variables
X = df['Text']
Y = df['Emotion']

In [61]:
#split a data set into train and test
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X, Y, train_size = 0.2, random_state = 42)

In [62]:
from sklearn.ensemble import RandomForestClassifier

In [63]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TfidfVectorizer to convert text to numerical features
vectorizer = TfidfVectorizer()

# Fit the vectorizer on the training data and transform both training and testing data
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Now fit the model with the vectorized data
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_vec, Y_train)

# **Step 5: Find the accuracy of the model**

In [65]:
# predictions
Y_pred = model.predict(vectorizer.transform(X_test)) # Transform x_test before prediction

In [66]:
from sklearn.metrics import accuracy_score
# evaluating the model
accuaracy = accuracy_score(Y_test, Y_pred)
print("accuracy:" , accuaracy)

accuracy: 0.7259190321079572


In [67]:
from sklearn.metrics import confusion_matrix, classification_report

# Calculate confusion matrix (use a different variable name)
conf_matrix_result = confusion_matrix(Y_test, Y_pred)
print("confusion_matrix:" , conf_matrix_result)

confusion_matrix: [[1571    0   76  447    4  275    2]
 [   0    0    2    6    0    3    0]
 [  55    0 1297  483   10  257   29]
 [  44    0   43 5056   58  425   13]
 [  19    0    8  621  573  108    2]
 [  92    0   79 1126   13 3680    9]
 [  14    0  104  186    2   97  303]]


In [68]:
#classification_report

c_report = classification_report(Y_test, Y_pred)
print("classification_report:" , c_report)

classification_report:               precision    recall  f1-score   support

       anger       0.88      0.66      0.75      2375
   confusion       0.00      0.00      0.00        11
        fear       0.81      0.61      0.69      2131
       happy       0.64      0.90      0.75      5639
        love       0.87      0.43      0.58      1331
     sadness       0.76      0.74      0.75      4999
    surprise       0.85      0.43      0.57       706

    accuracy                           0.73     17192
   macro avg       0.68      0.54      0.58     17192
weighted avg       0.75      0.73      0.72     17192



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
