In [4]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [6]:
emotion = pd.read_csv('emotion_dataset_balanced.csv')

In [7]:
emotion.head()

Unnamed: 0.1,Unnamed: 0,Emotion,Text,Clean_Text
0,14929,anger,"@charlottehanlan i agree, angry birds is exact...",agree angry birds exactly described
1,21962,anger,CRAP! Didn't think I'd have to know and unders...,CRAP think Id know understand statistics again...
2,15735,anger,Don ’ t criticize Mario or else I ’ ll start ...,’ criticize Mario ’ start fat drunk friends ...
3,18326,anger,Are you done ?,
4,29278,anger,He was coming so fast that she knew he had not...,coming fast knew seen brake sharply frowning ...


In [8]:
print(emotion.shape)

(13524, 4)


In [9]:
print('Emotions are: ', emotion['Emotion'].unique())

Emotions are:  ['anger' 'fear' 'joy' 'neutral' 'sadness' 'surprise']


In [10]:
emotion.isnull().sum()

Unnamed: 0      0
Emotion         0
Text            0
Clean_Text    452
dtype: int64

In [11]:
print(emotion.duplicated().sum())

0


In [12]:
emotion.drop_duplicates(inplace = True)
print(emotion.duplicated().sum())

0


In [13]:
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

In [14]:
emotion['Emotion'].value_counts()

Emotion
anger       2254
fear        2254
joy         2254
neutral     2254
sadness     2254
surprise    2254
Name: count, dtype: int64

In [15]:

plot_title = f"Emotions found"
fig = px.histogram( x=emotion['Emotion'], template="plotly_dark",
                   title=plot_title)
fig.update_traces(marker_color = ['salmon', 'burlywood', 'fuchsia', 'forestgreen',
                                  'purple', 'sienna'], marker_line_width = 1,
                 marker_line_color = 'darkslategrey')
fig.update_xaxes(title = 'Emotions')
fig.update_layout(showlegend=False)
fig.show()

In [16]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [18]:
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
import spacy


In [19]:
from bs4 import BeautifulSoup
import nltk
nltk.download('stopwords')
from tqdm import tqdm_notebook
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shamimkhan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
#filna replace nun with empty string
emotion['Clean_Text'] = emotion['Clean_Text'].fillna('')

In [21]:
def remove_html(text):
  bs = BeautifulSoup(text, 'html.parser')
  return ' ' + bs.get_text() + ' '

def keep_only_letters(text):
  text = re.sub(r'[^a-zA-Z\s]', ' ', text)
  return text

def convert_to_lowercase(text):
  return text.lower()

def clean_reviews(text):
  text = remove_html(text)
  text = keep_only_letters(text)
  text = convert_to_lowercase(text)
  return text


emotion['Clean_Text'] = emotion['Clean_Text'].apply(clean_reviews)

In [22]:
english_stop_words = nltk.corpus.stopwords.words('english')

print(f"Total number of English stopwords: {len(english_stop_words)}")
print("\nFirst 20 stopwords:")
print(english_stop_words[:20])

Total number of English stopwords: 198

First 20 stopwords:
['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been']


In [23]:
def remove_stop_words(text):
  for stopword in english_stop_words:
    stopword = ' ' + stopword + ' '
    text = text.replace(stopword, ' ')
    return text

emotion['Text'] = emotion['Text'].apply(remove_stop_words)

In [24]:
def text_stemming(text):
  stemmer = nltk.porter.PorterStemmer()
  stemmed = ' '.join([stemmer.stem(token) for token in text.split()])
  return stemmed

emotion['Text'] = emotion['Text'].apply(text_stemming)

In [33]:
# suffle the dataframe
emotion = emotion.sample(frac=1).reset_index(drop=True)

In [44]:
emotion

Unnamed: 0.1,Unnamed: 0,Emotion,Text,Clean_Text
0,14904,sadness,am i that bore for peopl to stop responding......,boring people stop responding
1,15031,joy,break! all the good that you'v put out into th...,break good world coming now feel flowing open...
2,11703,neutral,ye .,yes
3,28761,neutral,i hope that it is not formal one when the boss...,hope formal boss
4,8631,joy,day at disneyland with the incred wonder and e...,day disneyland incredibly wonderful extraordi...
...,...,...,...,...
13519,12551,sadness,refresh to start the day by fall flat on your ...,refreshing start day falling flat ass stupid ...
13520,2743,anger,but he still feel mif and mystifi as to whi ma...,feels miffed mystified manager kevin keegan d...
13521,1029,fear,doe anyon els get nervou when member of your h...,nervous member household attempts new recipe ...
13522,29980,joy,thi ha to be the most amaz day of my life. or ...,amazing day life of


In [36]:
emotion_train = emotion[:10000]
emotion_test = emotion[10000:]

vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(use_idf = True, ngram_range=(1,1))
tfidf_features_train = vectorizer.fit_transform(emotion['Text'])
tfidf_features_test = vectorizer.transform(emotion['Text'])
print(tfidf_features_train.shape, tfidf_features_test.shape)

(13524, 18028) (13524, 18028)


In [37]:
tfidf_features_train.ndim

2

In [38]:
emotion_sentences = emotion.iloc[:, 0]
emotion_labels = emotion.iloc[:, 1]

print(len(emotion_sentences), len(emotion_labels))

13524 13524


In [39]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [40]:
from sklearn.calibration import CalibratedClassifierCV

In [41]:
# base_svc = LinearSVC()  # fast solver for text
# svm_model = CalibratedClassifierCV(base_svc,cv=5,method="isotonic", n_jobs=-1)
# method="sigmoid", 
from sklearn.svm import SVC
svm_model = SVC(kernel='poly', probability=True)
svm_model.fit(tfidf_features_train, emotion_labels)

0,1,2
,C,1.0
,kernel,'poly'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,


In [42]:
from sklearn.metrics import ConfusionMatrixDisplay

In [43]:
# Evaluation
y_pred = svm_model.predict(tfidf_features_test)
print(classification_report(emotion_labels, y_pred))
print(confusion_matrix(emotion_labels, y_pred))
print(accuracy_score(emotion_labels, y_pred))

              precision    recall  f1-score   support

       anger       1.00      1.00      1.00      2254
        fear       1.00      1.00      1.00      2254
         joy       1.00      1.00      1.00      2254
     neutral       0.99      0.99      0.99      2254
     sadness       1.00      1.00      1.00      2254
    surprise       1.00      1.00      1.00      2254

    accuracy                           1.00     13524
   macro avg       1.00      1.00      1.00     13524
weighted avg       1.00      1.00      1.00     13524

[[2247    0    0    6    1    0]
 [   1 2251    1    0    1    0]
 [   0    1 2247    5    0    1]
 [   3    2    1 2242    3    3]
 [   0    0    0    3 2251    0]
 [   0    0    1    0    0 2253]]
0.9975598935226264


In [57]:
user_text = input("Enter a sentence to predict emotion: ")
print(f"You entered: {user_text}")
X_new = vectorizer.transform([user_text])
pred_label = svm_model.predict(X_new)[0]
probs      = svm_model.predict_proba(X_new)[0]  # class probabilities

print("\nPredicted emotion:", pred_label)
print("Class probabilities:")
for cls, p in zip(svm_model.classes_, probs):
    print(f"  {cls}: {p:.3f}")

You entered: He is going to the store

Predicted emotion: surprise
Class probabilities:
  anger: 0.109
  fear: 0.052
  joy: 0.045
  neutral: 0.123
  sadness: 0.173
  surprise: 0.498
