In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the training file: one `text;emotion` record per line with no header row,
# so the column names are supplied explicitly.
df = pd.read_csv('train.txt',sep = ';',header=None,names=['text','emotion'])

In [None]:
df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [None]:
df.shape

(16000, 2)

In [None]:
df.isnull().sum()

Unnamed: 0,0
text,0
emotion,0


In [None]:
df.duplicated().sum()

np.int64(1)

In [None]:
# Drop rows that are exact duplicates (same text AND same emotion), keeping the
# first occurrence. Rebinding `df` instead of using `inplace=True` keeps the
# step chain-friendly; the row index is intentionally left as-is.
df = df.drop_duplicates()

In [None]:
df['emotion'].unique()

array(['sadness', 'anger', 'love', 'surprise', 'fear', 'joy'],
      dtype=object)

In [None]:
# Encode each emotion label as an integer id.
# NOTE: `unique()` yields labels in order of first appearance, so the ids depend
# on the row order of this file (here: sadness=0, anger=1, love=2, surprise=3,
# fear=4, joy=5). Reuse `emotion_numbers` as-is when encoding any other data
# split; rebuilding it from a different file may assign different ids.
df_emotions = df['emotion'].unique()
emotion_numbers = {emo: idx for idx, emo in enumerate(df_emotions)}

df['emotion'] = df['emotion'].map(emotion_numbers)

In [None]:
df


Unnamed: 0,text,emotion
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1
...,...,...
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,5
15998,i feel like this was such a rude comment and i...,1


In [None]:
# Normalise case with the vectorized string accessor instead of a per-row
# Python lambda. (Unlike the lambda, `.str.lower()` passes missing values
# through rather than raising; the null check above shows there are none.)
df['text'] = df['text'].str.lower()

In [None]:
df

Unnamed: 0,text,emotion
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1
...,...,...
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,5
15998,i feel like this was such a rude comment and i...,1


In [None]:
import string

# Deletion table for every character in `string.punctuation`. Built once here
# rather than on every call, since the function is applied to each row.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def remove_punc(txt):
  """Return `txt` with all ASCII punctuation characters deleted.

  Characters are removed, not replaced by a space, so "don't" becomes "dont"
  and "well-known" becomes "wellknown". Non-ASCII punctuation (e.g. curly
  quotes) is not in `string.punctuation` and is left untouched.

  Args:
    txt: input string.

  Returns:
    A new string without the characters listed in `string.punctuation`.
  """
  return txt.translate(_PUNCT_TABLE)

In [None]:
# Strip punctuation from every row; the function is passed directly because a
# wrapping lambda adds nothing.
df['text'] = df['text'].apply(remove_punc)

In [None]:
def remove_numbers(txt):
    """Return `txt` with every digit character removed.

    A character is dropped when `str.isdigit()` is true for it, which covers
    ASCII 0-9 as well as other Unicode digits (e.g. superscript digits).

    Args:
        txt: input string.

    Returns:
        A new string containing only the non-digit characters, in order.
    """
    # join() builds the result in one pass instead of repeated `+=` copies.
    return "".join(ch for ch in txt if not ch.isdigit())


# Strip digits from every row; the function is passed directly because a
# wrapping lambda adds nothing.
df['text'] = df['text'].apply(remove_numbers)


In [None]:
df

Unnamed: 0,text,emotion
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1
...,...,...
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,5
15998,i feel like this was such a rude comment and i...,1


In [None]:
def remove_emojis(txt):
  """Return `txt` with every non-ASCII character removed.

  Despite the name, this is not emoji-specific: any character for which
  `str.isascii()` is false is dropped, so accented letters and other
  non-English script are removed as well ("café" becomes "caf").

  Args:
    txt: input string.

  Returns:
    A new string made of the ASCII characters of `txt`, in order.
  """
  return ''.join(ch for ch in txt if ch.isascii())


df['text'] = df['text'].apply(remove_emojis)

In [None]:
df

Unnamed: 0,text,emotion
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1
...,...,...
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,5
15998,i feel like this was such a rude comment and i...,1


In [None]:
import nltk

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
stop_words = set(stopwords.words('english'))

In [None]:
def remove_more_words(txt):
  """Drop stop words from `txt`.

  The text is tokenized with NLTK's `word_tokenize`; tokens present in the
  module-level `stop_words` set are discarded and the remaining tokens are
  re-joined with single spaces.

  Args:
    txt: text to filter.

  Returns:
    The kept tokens separated by single spaces, with no trailing space
    (the original loop always left one). Empty string if no token is kept.
  """
  # NOTE(review): punctuation is stripped earlier in the notebook, so
  # contractions arrive here as e.g. "didnt" and only match `stop_words` if the
  # set contains that apostrophe-free spelling — confirm this is intended.
  return ' '.join(word for word in word_tokenize(txt) if word not in stop_words)

In [None]:
df['text'] = df['text'].apply(remove_more_words)

In [None]:
df

Unnamed: 0,text,emotion
0,didnt feel humiliated,0
1,go feeling hopeless damned hopeful around some...,0
2,im grabbing minute post feel greedy wrong,1
3,ever feeling nostalgic fireplace know still pr...,2
4,feeling grouchy,1
...,...,...
15995,brief time beanbag said anna feel like beaten,0
15996,turning feel pathetic still waiting tables sub...,0
15997,feel strong good overall,5
15998,feel like rude comment im glad,1


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible.
# NOTE(review): the split is not stratified and the classes are imbalanced
# (test supports range from 106 to 1027 in the report at the end of the
# notebook) — consider passing stratify=df['emotion'].
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['emotion'], test_size=0.20, random_state=42)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

bog = CountVectorizer()
tfidf = TfidfTransformer()



In [None]:
# Bag-of-words counts: the vocabulary is learned from the training texts only,
# and the test texts are encoded with that same vocabulary.
X_train_bog = bog.fit_transform(X_train)
X_test_bog = bog.transform(X_test)

In [None]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()

In [None]:
nb.fit(X_train_bog,y_train)

In [None]:
y_pred_bog = nb.predict(X_test_bog)

In [None]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [None]:
accuracy_score(y_test,y_pred_bog)

0.773125

In [None]:
# TF-IDF features are derived from the count matrices built earlier
# (X_train_bog / X_test_bog). Re-fitting the CountVectorizer on the same
# X_train would reproduce exactly the same matrices, so it is not repeated.
# The IDF weights are learned from the training split only.
X_train_tfidf = tfidf.fit_transform(X_train_bog)
X_test_tfidf = tfidf.transform(X_test_bog)

In [None]:
# Re-fits the SAME `nb` estimator object, this time on TF-IDF features. The
# model previously trained on raw counts is replaced; `y_pred_bog` was computed
# before this point and is unaffected, but from here on `nb` must only be used
# with TF-IDF matrices.
nb.fit(X_train_tfidf,y_train)

In [None]:
y_pred_tfidf = nb.predict(X_test_tfidf)

In [None]:
accuracy_score(y_test,y_pred_tfidf)

0.6634375

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lr_model = LogisticRegression(max_iter=1000)

In [None]:
lr_model.fit(X_train_tfidf,y_train)

In [None]:
y_pred_lr = lr_model.predict(X_test_tfidf)

In [None]:
accuracy_score(y_test,y_pred_lr)

0.86125

In [None]:
# classification_report returns a plain string; print it so the table renders
# on separate lines instead of as a quoted repr full of "\n" escapes.
# Row labels are the integer ids assigned in `emotion_numbers`.
print(classification_report(y_test,y_pred_lr))

'              precision    recall  f1-score   support\n\n           0       0.91      0.94      0.92       950\n           1       0.91      0.81      0.86       439\n           2       0.92      0.60      0.73       303\n           3       0.89      0.47      0.62       106\n           4       0.87      0.76      0.81       375\n           5       0.80      0.96      0.87      1027\n\n    accuracy                           0.86      3200\n   macro avg       0.88      0.76      0.80      3200\nweighted avg       0.87      0.86      0.86      3200\n'