In [144]:
#Import the necessary libraries 
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
%matplotlib inline 
import seaborn as sns
sns.set()
import warnings 
warnings.filterwarnings('ignore')

In [146]:
#Load the dataset: a ';'-separated text file without a header row, so the
#two columns are named explicitly
column_names = ['Text', 'Emotion']
df = pd.read_csv("train.txt", sep=';', header=None, names=column_names)

In [148]:
#Preview the first five records of the dataset
df.head(n=5)

Unnamed: 0,Text,Emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [150]:
#Count the missing values in each column of the dataset
null_mask = df.isnull()
null_mask.sum()

Text       0
Emotion    0
dtype: int64

In [152]:
#List the distinct emotion labels present in the dataset
emotion_column = df['Emotion']
emotion_column.unique()

array(['sadness', 'anger', 'love', 'surprise', 'fear', 'joy'],
      dtype=object)

In [154]:
#Encode each emotion label as an integer id; ids follow the order in which
#the labels are returned by unique()
unique_emotions = df['Emotion'].unique()
emotion_numbers = {label: idx for idx, label in enumerate(unique_emotions)}

#Replace the text labels in the Emotion column with their integer ids
df['Emotion'] = df['Emotion'].map(emotion_numbers)

In [156]:
#Preview the data again to confirm the Emotion column now holds integer ids
df.head(n=5)

Unnamed: 0,Text,Emotion
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


In [158]:
#Transform all the text to lower case with pandas' vectorised string accessor
#(missing values are passed through rather than raising AttributeError)
df['Text'] = df['Text'].str.lower()

In [160]:
#Remove all the punctuations from the dataset 
import string 

#Translation table that deletes every character in string.punctuation.
#Built once here so it is not reconstructed for every row of the dataset.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def remove_punc(txt):
    """Return ``txt`` with all ASCII punctuation characters removed.

    Parameters
    ----------
    txt : str
        Text to clean.

    Returns
    -------
    str
        ``txt`` without any character listed in ``string.punctuation``;
        every other character (including whitespace) is kept in order.
    """
    return txt.translate(_PUNCT_TABLE)

In [162]:
#Strip punctuation from every entry of the Text column
text_without_punc = df['Text'].apply(remove_punc)
df['Text'] = text_without_punc

In [164]:
#Remove all the digits from the Text column (if any exist)
def remove_digit(txt):
    """Return ``txt`` with every digit character removed.

    A character is dropped when ``str.isdigit()`` is true for it, so this
    covers ASCII ``0-9`` as well as other Unicode digit characters.

    Parameters
    ----------
    txt : str
        Text to clean.

    Returns
    -------
    str
        ``txt`` without digit characters; all other characters keep their
        original order.
    """
    # str.join assembles the result in one pass instead of growing a string
    # one character at a time.
    return ''.join(ch for ch in txt if not ch.isdigit())

#Strip digits from every entry of the Text column
text_without_digits = df['Text'].apply(remove_digit)
df['Text'] = text_without_digits

In [166]:
#Remove emojis (and any other non-ASCII characters) from the Text column (if any exist)
def remove_emojis(txt):
    """Return ``txt`` with every non-ASCII character removed.

    Only characters for which ``str.isascii()`` is true are kept, so emojis
    are dropped together with any other non-ASCII character such as
    accented letters.

    Parameters
    ----------
    txt : str
        Text to clean.

    Returns
    -------
    str
        The ASCII-only characters of ``txt`` in their original order.
    """
    # str.join assembles the result in one pass instead of growing a string
    # one character at a time.
    return ''.join(ch for ch in txt if ch.isascii())

#Strip emojis / non-ASCII characters from every entry of the Text column
df['Text'] = df['Text'].apply(remove_emojis)

In [168]:
#Import nltk library 
import nltk

In [170]:
#Import the stopwords corpus from nltk
from nltk.corpus import stopwords

In [172]:
#Download the NLTK stopwords corpus (NLTK reports "already up-to-date" when it is present)
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Pardeep
[nltk_data]     Kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [174]:
#Collect the English stopwords into a set so membership tests are fast
#NOTE(review): punctuation is stripped from the text before stopwords are
#removed, so any stopword spelled with an apostrophe would never match the
#cleaned text - confirm this is intended
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

In [176]:
#Inspect one record of the Text column before stopword removal
#(single .loc lookup by row label and column instead of chained indexing)
df.loc[1, 'Text']

'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake'

In [178]:
#Remove the stop words from a piece of text
def remove_stopwords(txt):
    """Return ``txt`` with every stopword dropped.

    The text is split on whitespace, words found in the module-level
    ``stop_words`` collection are discarded, and the remaining words are
    joined back together with single spaces.
    """
    kept_words = [word for word in txt.split() if word not in stop_words]
    return ' '.join(kept_words)



In [180]:
#Apply the stopword removal to every entry of the Text column
text_without_stopwords = df['Text'].apply(remove_stopwords)
df['Text'] = text_without_stopwords

In [182]:
#Inspect the same record of the Text column after removing the stop words
#(single .loc lookup by row label and column instead of chained indexing)
df.loc[1, 'Text']

'go feeling hopeless damned hopeful around someone cares awake'

In [184]:
#Split the data into train (80%) and test (20%) sets with a fixed seed
#NOTE(review): the split is not stratified by emotion - confirm that is
#acceptable for the class balance of this dataset
from sklearn.model_selection import train_test_split

features = df['Text']
labels = df['Emotion']
x_train, x_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

In [186]:
#Import CountVectorizer (bag of words) and TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 

In [188]:
#Import Multinomial naive Bayes and accuracy_score (to check the accuracy of the model)
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [190]:
#Apply bag of words on the Text column
#(the vectorizer is fitted on the training text only and then reused for the test text)
box_vectorizer = CountVectorizer()
x_train_bow = box_vectorizer.fit_transform(x_train)     #Learn the vocabulary + transform (text to numeric)
x_test_bow = box_vectorizer.transform(x_test)           #Transform only, using the already fitted vocabulary

In [92]:
#Fit a multinomial naive Bayes model on the bag-of-words training features
nb_model = MultinomialNB()
nb_model.fit(x_train_bow, y_train)

In [140]:
#Predict the test data with the bag-of-words naive Bayes model and report its accuracy
y_pred_bow = nb_model.predict(x_test_bow)
bow_accuracy = accuracy_score(y_test, y_pred_bow)
print("Accuracy of the naive bayes model with Count Vectorizer is:", bow_accuracy)

Accuracy of the naive bayes model with Count Vectorizer is: 0.768125


In [110]:
#Apply the tf-idf vectorizer on the Text column
#(fitted on the training text only, then reused to transform the test text)
tfidf_vectorizer = TfidfVectorizer()
x_train_tfidf = tfidf_vectorizer.fit_transform(x_train)
x_test_tfidf = tfidf_vectorizer.transform(x_test)

In [128]:
#Fit a second multinomial naive Bayes model, this time on the tf-idf training features
nb2_model = MultinomialNB()
nb2_model.fit(x_train_tfidf, y_train)

In [130]:
#Predict the test data with the tf-idf naive Bayes model
y_pred_tfidf = nb2_model.predict(x_test_tfidf)

In [138]:
#Report the accuracy of the tf-idf naive Bayes model on the test set
tfidf_accuracy = accuracy_score(y_test, y_pred_tfidf)
print("Accuracy of the naive bayes model with tfidf is:", tfidf_accuracy)

Accuracy of the naive bayes model with tfidf is: 0.6609375


In [118]:
#Import Logistic Regression model 
from sklearn.linear_model import LogisticRegression

In [122]:
#Fit a logistic regression model (default settings) on the tf-idf training features
#NOTE(review): warnings are silenced globally in the first cell of the notebook,
#so a convergence warning from this fit would not be shown - confirm the solver converges
logistic_model = LogisticRegression()
logistic_model.fit(x_train_tfidf, y_train)

In [124]:
#Predict the tf-idf test features with the logistic regression model
y_pred_tfidf_logistic = logistic_model.predict(x_test_tfidf)

In [136]:
#Report the accuracy of the tf-idf logistic regression model on the test set
logistic_accuracy = accuracy_score(y_test, y_pred_tfidf_logistic)
print("Accuracy of the Logistic Regression model with tfidf is :", logistic_accuracy)

Accuracy of the Logistic Regression model with tfidf is : 0.8628125
