In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('train.txt',sep=";",header=None,names=['text','emotion'])

In [3]:
df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [4]:
df.isnull().sum()

text       0
emotion    0
dtype: int64

In [5]:
df.shape

(16000, 2)

In [3]:
df['emotion'].unique()

array(['sadness', 'anger', 'love', 'surprise', 'fear', 'joy'],
      dtype=object)

In [4]:
# Give every distinct emotion label an integer code, numbered in the order
# the labels are returned by `unique()`.
unique_emotions = df['emotion'].unique()
emotion_numbers = {label: code for code, label in enumerate(unique_emotions)}

# Overwrite the string labels with their integer codes.
df['emotion'] = df['emotion'].map(emotion_numbers)

In [8]:
df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


In [9]:
# Normalise case so that the vectorizer treats "Happy" and "happy" as one token.
df['text'] = df['text'].map(str.lower)

In [10]:
import string

# Translation table that deletes every character in `string.punctuation`.
# Built once here so it is not reconstructed for each row passed to `remove_punc`.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def remove_punc(txt):
    """Return ``txt`` with all characters in ``string.punctuation`` removed.

    Only the ASCII punctuation listed in ``string.punctuation`` is stripped;
    any other character (letters, digits, whitespace, non-ASCII symbols) is
    left untouched.
    """
    return txt.translate(_PUNCT_TABLE)

In [11]:
df['text'] = df['text'].apply(remove_punc)

In [12]:
df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


In [13]:
def remove_numbers(txt):
    """Return ``txt`` with every digit character removed.

    A character is dropped when ``str.isdigit()`` is true for it, so besides
    ``0``-``9`` this also removes non-ASCII digits such as superscript ``²``.
    """
    # Build the result with a single join rather than repeated string
    # concatenation inside a loop.
    return ''.join(ch for ch in txt if not ch.isdigit())

df['text'] = df['text'].apply(remove_numbers)

In [14]:
df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


In [15]:
def remove_emojis(txt):
    """Return ``txt`` keeping only its ASCII characters.

    Despite the name, this drops *every* non-ASCII character, not just emojis:
    accented letters and non-Latin scripts are removed as well.
    """
    # Build the result with a single join rather than repeated string
    # concatenation inside a loop.
    return ''.join(ch for ch in txt if ch.isascii())

df['text'] = df['text'].apply(remove_emojis)

In [16]:
df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


In [17]:
import nltk

In [18]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [19]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tamas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tamas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
stop_words = set(stopwords.words('english'))
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [21]:
len(stop_words)

198

In [22]:
df.loc[1]['text']

'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake'

In [23]:
df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


In [24]:
def remove(txt, stopword_set=None):
    """Return ``txt`` with stop words removed.

    Parameters
    ----------
    txt : str
        Text to filter. It is split on whitespace and each resulting word is
        compared exactly (case-sensitively) against the stop-word collection.
    stopword_set : collection of str, optional
        Words to drop. When omitted, the notebook-level ``stop_words`` is used,
        which keeps existing ``df['text'].apply(remove)`` calls working.

    Returns
    -------
    str
        The remaining words joined with single spaces.
    """
    if stopword_set is None:
        # Fall back to the global defined earlier in the notebook.
        stopword_set = stop_words
    return ' '.join(word for word in txt.split() if word not in stopword_set)

In [25]:
df['text'] = df['text'].apply(remove)

In [26]:
df.head()

Unnamed: 0,text,emotion
0,didnt feel humiliated,0
1,go feeling hopeless damned hopeful around some...,0
2,im grabbing minute post feel greedy wrong,1
3,ever feeling nostalgic fireplace know still pr...,2
4,feeling grouchy,1


In [27]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizes = CountVectorizer()

X = vectorizes.fit_transform(df['text'])

In [28]:
vectorizes.get_feature_names_out()

array(['aa', 'aaaaaaand', 'aaaaand', ..., 'zum', 'zumba', 'zz'],
      shape=(15046,), dtype=object)

In [29]:
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(16000, 15046))

In [30]:
X.shape

(16000, 15046)

`ngram_range=(min_n, max_n)` sets the range of n-gram sizes the vectorizer extracts. For example, `(2, 2)` keeps only two-word phrases (bigrams), while `(1, 2)` keeps both single words (unigrams) and bigrams.

In [31]:
vectorizes = CountVectorizer(ngram_range=(2,3))

In [32]:
X = vectorizes.fit_transform(df['text'])

In [33]:
vectorizes.get_feature_names_out()

array(['aa full', 'aa full force', 'aa meeting', ...,
       'zumba lame housewife', 'zz top', 'zz top logo'],
      shape=(207694,), dtype=object)

In [34]:
# X.toarray()

In [35]:
X.shape

(16000, 207694)

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizes = TfidfVectorizer()

X = vectorizes.fit_transform(df['text'])

In [37]:
vectorizes.get_feature_names_out()

array(['aa', 'aaaaaaand', 'aaaaand', ..., 'zum', 'zumba', 'zz'],
      shape=(15046,), dtype=object)

In [38]:
X.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(16000, 15046))

In [39]:
X.shape

(16000, 15046)

In [40]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; random_state=42 fixes the shuffle so
# the split is reproducible. The features here are still raw text — they are
# vectorized after the split.
# NOTE(review): the split is not stratified on `emotion`; confirm the class
# balance in train/test is acceptable or pass stratify=df['emotion'].
X_train, X_test, y_train, y_test = train_test_split(df['text'],df['emotion'] , test_size=0.2, random_state=42)


In [41]:
X_train

676      refers course though cant help feeling somehow...
12113                im starting feel im suffering fatigue
7077     feel like probably would liked book little bit...
13005                                  really feel awkward
12123    im feeling little grumpy today lame weather te...
                               ...                        
13418    love leave reader feeling confused slightly de...
5390                                         feel delicate
860                          starting feel little stressed
15795             feel stressed tired worn shape neglected
7270         feel someone rude wrongly done something lose
Name: text, Length: 12800, dtype: object

In [42]:
X_test

8756                             ive made week feel beaten
4660                              feel strategy worthwhile
6095                     feel worthless weak say want find
304                                        feel clever nov
8241                      im moved ive feeling kind gloomy
                               ...                        
15578    feel useful pulpit find ironic often question ...
5746             dried bladders ready day im feeling brave
6395                             feel thrilled matter days
7624     woke morning text mr c declaring walking work ...
15245                                            feel dumb
Name: text, Length: 3200, dtype: object

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer , CountVectorizer

In [44]:
bow_vectorizer = CountVectorizer()

In [45]:
# Fit the vocabulary on the training text only, then reuse that same vocabulary
# to encode the test text so no information from the test set reaches the model.
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

In [46]:
X_train_bow

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 116059 stored elements and shape (12800, 13361)>

In [47]:
X_test_bow

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 26936 stored elements and shape (3200, 13361)>

In [48]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [49]:
nb_model_bow = MultinomialNB()

In [50]:
nb_model_bow.fit(X_train_bow,y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [51]:
pred_bow = nb_model_bow.predict(X_test_bow)

In [52]:
print(accuracy_score(y_test,pred_bow))

0.768125


In [69]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the bag-of-words counts, using scikit-learn defaults.
# NOTE(review): the first cell calls warnings.filterwarnings("ignore"), so a
# ConvergenceWarning would not be displayed here — verify the solver converged
# or raise max_iter.
logistic_model_bow = LogisticRegression()
logistic_model_bow.fit(X_train_bow,y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [70]:
pred_logistic = logistic_model_bow.predict(X_test_bow)
print(accuracy_score(y_test,pred_logistic))

0.8896875


In [71]:
import pickle

# Integer code -> emotion name, saved alongside the model so predictions can be
# decoded later.
# NOTE(review): this dict is hard-coded; it must stay the inverse of the
# `emotion_numbers` encoding applied to df['emotion'] earlier in the notebook.
emotion_mapping = {0: 'sadness', 1: 'anger', 2: 'love', 3: 'surprise', 4: 'fear', 5: 'joy'}

# Each artifact goes to its own pickle file: the model, the vectorizer, and the
# emotion mapping, in that order.
artifacts = {
    "logistic_model_bow.pkl": logistic_model_bow,
    "bow_vectorizer.pkl": bow_vectorizer,
    "emotion_mapping.pkl": emotion_mapping,
}
for filename, artifact in artifacts.items():
    with open(filename, "wb") as f:
        pickle.dump(artifact, f)

print("Model, vectorizer, and emotion mapping saved successfully!")

In [55]:
tfidf_vectorizer = TfidfVectorizer()

In [56]:
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [57]:
nb_model_tfidf = MultinomialNB()

In [58]:
nb_model_tfidf.fit(X_train_tfidf,y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [59]:
pred_tfidf = nb_model_tfidf.predict(X_test_tfidf)

In [60]:
print(accuracy_score(y_test,pred_tfidf))

0.6609375


In [67]:
from sklearn.linear_model import LogisticRegression

logistic_model_tfidf = LogisticRegression()
logistic_model_tfidf.fit(X_train_tfidf,y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [68]:
pred_logistic = logistic_model_tfidf.predict(X_test_tfidf)
print(accuracy_score(y_test,pred_logistic))

0.8628125
