In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as ply
import seaborn as sns

In [44]:
# Read the ';'-separated file. header=None means no row is treated as a header,
# so the two column names are supplied explicitly.
df = pd.read_csv(
    'train.txt',
    sep=';',
    header=None,
    names=['text', 'emotion'],
)

In [45]:
df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [46]:
df.isnull().sum()

text       0
emotion    0
dtype: int64

In [47]:
unique = df['emotion'].unique()

In [48]:
# Give each emotion label in `unique` an integer code, numbered in the order
# the labels are listed there.
emotion_number = {label: code for code, label in enumerate(unique)}

# Encode only while every value in the column is one of the known labels.
# Series.map turns values that are missing from the dict into NaN, so
# re-running this cell on an already-encoded column would otherwise replace
# every label with NaN.
if df['emotion'].isin(list(emotion_number)).all():
  df['emotion'] = df['emotion'].map(emotion_number)

In [49]:
df['emotion'].value_counts()

emotion
5    5362
0    4666
1    2159
4    1937
2    1304
3     572
Name: count, dtype: int64

In [50]:
# Lower-case every entry of the text column using the vectorised string accessor.
df['text'] = df['text'].str.lower()

In [51]:
# NOTE(review): this cell repeats the lower-casing step of the previous cell.
# Lower-casing text that is already lower-case changes nothing, so this cell could be deleted.
df['text'] = df['text'].apply(lambda x : x.lower())

In [52]:
df

Unnamed: 0,text,emotion
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1
...,...,...
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,5
15998,i feel like this was such a rude comment and i...,1


In [53]:
# Now we will remove the punctuation
# we will use the string method to do so 
import string
# Translation table that deletes every character listed in string.punctuation.
# It is built once here instead of on every call to remove_punc.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def remove_punc(txt):
  """Return ``txt`` with all characters from ``string.punctuation`` removed.

  Args:
    txt: the string to clean.

  Returns:
    A new string; characters that are not in ``string.punctuation`` are kept
    unchanged and in their original order.
  """
  return txt.translate(_PUNCT_TABLE)
"""
🔹 str.maketrans('', '', string.punctuation)

This creates a translation table that tells Python:

    "For any character in string.punctuation, remove it (i.e., map it to None)."

Let me explain the syntax of str.maketrans(from, to, delete):

    from: characters to replace (empty in your case)

    to: what to replace them with (also empty here)

    delete: characters to be removed – this is where we pass in string.punctuation.

So essentially, it says:

    “Don’t replace anything, but delete all punctuation characters.”
"""

'\n🔹 str.maketrans(\'\', \'\', string.punctuation)\n\nThis creates a translation table that tells Python:\n\n    "For any character in string.punctuation, remove it (i.e., map it to None)."\n\nLet me explain the syntax of str.maketrans(from, to, delete):\n\n    from: characters to replace (empty in your case)\n\n    to: what to replace them with (also empty here)\n\n    delete: characters to be removed – this is where we pass in string.punctuation.\n\nSo essentially, it says:\n\n    “Don’t replace anything, but delete all punctuation characters.”\n'

In [54]:
df

Unnamed: 0,text,emotion
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1
...,...,...
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,5
15998,i feel like this was such a rude comment and i...,1


In [55]:
# Run remove_punc on every entry of the text column and store the result back.
df['text'] = df['text'].apply(remove_punc)

In [56]:
# now we will remove the numbers
def remove_num(txt):
  """Return ``txt`` with every digit character removed.

  A character is dropped when ``str.isdigit`` is true for it; all other
  characters are kept in their original order.

  Args:
    txt: the string to clean.

  Returns:
    A new string without digit characters.
  """
  # Join once over a generator instead of growing a string one character at a time.
  return ''.join(ch for ch in txt if not ch.isdigit())

# Run remove_num on every entry of the text column and store the result back.
df['text'] = df['text'].apply(remove_num)

In [57]:
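# URL/link removal is NOT applied: the draft function below is wrapped in a string literal, so it never runs.
# The draft is also incomplete (unbalanced parenthesis in the `if`, no `return`, and words are joined without a space).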
"""
def remove_link(txt):
words = txt.split()
clean_text = ''
for word in words:
  if not (word.startswith('https') or word.startswith('www.'):
    clean_text += word + ''
"""

"\ndef remove_link(txt):\nwords = txt.split()\nclean_text = ''\nfor word in words:\n  if not (word.startswith('https') or word.startswith('www.'):\n    clean_text += word + ''\n"

In [58]:
# Now we will remove emojis and other special (non-ASCII) characters from the text

def remove_emoji(txt):
  """Return ``txt`` keeping only its ASCII characters.

  Despite the name, this drops every non-ASCII character (emoji, accented
  letters, typographic quotes, ...), not only emoji.

  Args:
    txt: the string to clean.

  Returns:
    A new string made of the ASCII characters of ``txt`` in their original order.
  """
  # Join once over a generator instead of growing a string one character at a time.
  return ''.join(ch for ch in txt if ch.isascii())
  
# Run remove_emoji on every entry of the text column (drops non-ASCII characters) and store the result back.
df['text'] = df['text'].apply(remove_emoji)

In [59]:
# now we will do the most important part which is removing the stop-words
# we will use nltk for this task 
import nltk

In [60]:
"""
The whole collection of text (here, the data-set) is known as the corpus.
In other words, a corpus is a collection of texts.

Each word in a sentence is known as a token. The process of converting text into tokens is known as tokenization.
What is the difference between tokenization and .split()?
.split() only separates the text on whitespace, while tokenization
works on the words themselves, which makes it smarter.
"""

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [61]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/shaheer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/shaheer/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/shaheer/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [62]:
# NLTK's English stop-word list, converted to a set; the stop-word filter
# function checks each token for membership in it.
stop_words = set(stopwords.words('english'))
len(stop_words)

198

In [63]:
def remove(txt):
  """Tokenise ``txt`` and drop every token that is in ``stop_words``.

  Tokens come from ``word_tokenize``; a token is discarded only on an exact
  match against the module-level ``stop_words`` set.

  Args:
    txt: the string to filter.

  Returns:
    The remaining tokens joined by single spaces.
  """
  kept = [token for token in word_tokenize(txt) if token not in stop_words]
  return ' '.join(kept)

In [64]:
# Tokenise every entry, drop its stop words and store the result back in the text column.
df['text'] = df['text'].apply(remove)
# At this point the text has been lower-cased and stripped of punctuation,
# digits, non-ASCII characters and stop words.

In [65]:
"""
# Now this is the method to find the frequency of the words in the data-set
from sklearn.feature_extraction.text import CountVectorizer

document = [
  "I love  pizza",
  "Pizza is the best",
  "I love pasta",
  "Pasta is great"
]

vectorize = CountVectorizer()
x = vectorize.fit_transform(document)

print("vocablury: ", vectorize.get_feature_names_out())
print("\nBoW Matrix:\n", x.toarray())

# Now there are some feature in it the count vector the most important is the max-count. Now if you give the max coutn it will just selecct the vocablury that has been mostly repeated
# There is another one called boolean. If you make it true then no matter how many times a word is repeated it will always give it as 0 and 1
# Another advantage of this is that it will give make the stop-words remove all by tself
"""

'\n# Now this is the method to find the frequency of the words in the data-set\nfrom sklearn.feature_extraction.text import CountVectorizer\n\ndocument = [\n  "I love  pizza",\n  "Pizza is the best",\n  "I love pasta",\n  "Pasta is great"\n]\n\nvectorize = CountVectorizer()\nx = vectorize.fit_transform(document)\n\nprint("vocablury: ", vectorize.get_feature_names_out())\nprint("\nBoW Matrix:\n", x.toarray())\n\n# Now there are some feature in it the count vector the most important is the max-count. Now if you give the max coutn it will just selecct the vocablury that has been mostly repeated\n# There is another one called boolean. If you make it true then no matter how many times a word is repeated it will always give it as 0 and 1\n# Another advantage of this is that it will give make the stop-words remove all by tself\n'

In [66]:
"""
# This is to apply the bow in our data-set 
from sklearn.feature_extraction.text import CountVectorizer
vectorize = CountVectorizer(max_features=50, stop_words=None)
x = vectorize.fit_transform(df['text'])
print("vocablury: ", vectorize.get_feature_names_out())
print("\nBoW Matrix:\n", x.toarray())
"""

'\n# This is to apply the bow in our data-set \nfrom sklearn.feature_extraction.text import CountVectorizer\nvectorize = CountVectorizer(max_features=50, stop_words=None)\nx = vectorize.fit_transform(df[\'text\'])\nprint("vocablury: ", vectorize.get_feature_names_out())\nprint("\nBoW Matrix:\n", x.toarray())\n'

In [67]:
"""
# making the model using the tfidf
from sklearn.feature_extraction.text import TfidfVectorizer

document = ["I love  pizza",
  "Pizza is the best",
  "I love pasta",
  "Pasta is great"
]

vectorize = TfidfVectorizer()
x = vectorize.fit_transform(document)

print("vocablury: ", vectorize.get_feature_names_out())
print("\nBoW Matrix:\n", x.toarray())
"""

'\n# making the model using the tfidf\nfrom sklearn.feature_extraction.text import TfidfVectorizer\n\ndocument = ["I love  pizza",\n  "Pizza is the best",\n  "I love pasta",\n  "Pasta is great"\n]\n\nvectorize = TfidfVectorizer()\nx = vectorize.fit_transform(document)\n\nprint("vocablury: ", vectorize.get_feature_names_out())\nprint("\nBoW Matrix:\n", x.toarray())\n'

In [68]:
# Features are the cleaned text; targets are the integer-encoded emotions.
x = df['text']
y = df['emotion']
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=42,
)

In [69]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# NOTE(review): `vectorize` is not referenced by any later cell shown in this
# notebook (they create their own `vectorizer` / `tf_vectorize`) - likely removable.
vectorize = CountVectorizer()



In [70]:
from sklearn.naive_bayes import MultinomialNB # Multinomial Naive Bayes: the variant used for text/count features, which are not normally distributed
from sklearn.metrics import accuracy_score

In [71]:

# Bag-of-words features: fit the vectorizer on the training split only, then
# encode the test split with that same fitted vectorizer.
vectorizer = CountVectorizer()
x_train_bow = vectorizer.fit_transform(x_train)
x_test_bow = vectorizer.transform(x_test)

from sklearn.naive_bayes import MultinomialNB

# Train multinomial Naive Bayes on the count matrix and predict the held-out rows.
nb_model = MultinomialNB().fit(x_train_bow, y_train)
y_pred = nb_model.predict(x_test_bow)


In [72]:
print(accuracy_score(y_test, y_pred))

0.7678125


In [73]:
tf_vectorize = TfidfVectorizer()

In [74]:
# Fit the TF-IDF vectorizer on the training split and encode that split.
x_train_tf = tf_vectorize.fit_transform(x_train)
# Encode the test split with the already-fitted vectorizer (no refitting).
x_test_tf = tf_vectorize.transform(x_test)

In [75]:
from imblearn.over_sampling import RandomOverSampler

# Resample the TF-IDF training matrix together with its labels; the test
# split is not passed to the sampler. random_state fixes the sampler's seed.
ros = RandomOverSampler(random_state=42)
x_resampled, y_resampled = ros.fit_resample(x_train_tf, y_train)



In [76]:
# Second Naive Bayes model, this time on the TF-IDF features.
nb_model2 = MultinomialNB()
# NOTE(review): this fits on x_train_tf / y_train, not on the oversampled
# x_resampled / y_resampled built in the previous cell - confirm this is intended.
nb_model2.fit(x_train_tf, y_train)

In [77]:
y_pred = nb_model2.predict(x_test_tf)
print(accuracy_score(y_test, y_pred))

0.6609375


In [78]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with max_iter set to 1000.
model = LogisticRegression(max_iter= 1000)
# Trained on the oversampled TF-IDF training data (x_resampled / y_resampled).
model.fit(x_resampled, y_resampled)


In [79]:
y_pred = model.predict(x_test_tf)
print(accuracy_score(y_test, y_pred))

0.886875


In [80]:
import joblib

# Persist the fitted TF-IDF vectorizer.
joblib.dump(tf_vectorize, "tfidf_vectorizer.pkl")
# Also persist the label -> integer mapping. The model is trained on the
# integer codes assigned through emotion_number, so this mapping is needed to
# turn predictions back into emotion names once the model is reloaded.
joblib.dump(emotion_number, "emotion_number.pkl")
# Persist the trained logistic-regression model. This stays the last
# expression so the cell's displayed output is unchanged.
joblib.dump(model, "logistic_model.pkl")


['logistic_model.pkl']