In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load the semicolon-separated training file. It has no header row, so the
# two column names are supplied here.
df = pd.read_csv(
    'train.txt',
    sep=';',
    header=None,
    names=['text', 'emotion'],
)

In [None]:
# Preview the first rows to check that the 'text' and 'emotion' columns loaded.
df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [None]:
# Count missing values per column.
df.isnull().sum()

Unnamed: 0,0
text,0
emotion,0


In [None]:
# Label encoding: assign each distinct emotion string an integer id, numbered
# in the order the labels are returned by .unique().
unique_emotions = df['emotion'].unique()
emotion_numbers = {label: idx for idx, label in enumerate(unique_emotions)}
df['emotion'] = df['emotion'].map(emotion_numbers)


In [None]:
# Lowercase every text entry.
df['text'] = df['text'].map(str.lower)


In [None]:
# Remove punctuation
import string

# Build the deletion table once rather than once per row.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def remove_punc(txt):
  """Return ``txt`` with every character in ``string.punctuation`` deleted.

  Args:
    txt: the string to clean.

  Returns:
    A new string; characters outside ``string.punctuation`` are unchanged.
  """
  return txt.translate(_PUNCT_TABLE)


# This call used to be indented beneath the function's ``return`` statement,
# which made it unreachable, so punctuation was never actually stripped.
df['text'] = df['text'].apply(remove_punc)


In [None]:
# Preview the first rows after the preceding cleaning cells.
df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


In [None]:
# Strip digits
def remove_numbers(txt):
  """Return ``txt`` with every digit character removed.

  A character is dropped when ``str.isdigit()`` is true for it; every other
  character, whitespace included, is kept in its original order.
  """
  return ''.join(ch for ch in txt if not ch.isdigit())

# Run the digit filter over every row of the text column.
df['text'] = df['text'].map(remove_numbers)


In [None]:
# Remove URLs
def remove_urls(txt):
    """Drop whitespace-separated tokens that look like URLs.

    A token is discarded when it starts with 'http' (which also covers
    'https') or with 'www'. The surviving tokens are re-joined with single
    spaces, so any run of whitespace in the input collapses to one space.
    """
    url_prefixes = ('http', 'www')
    kept = [token for token in txt.split() if not token.startswith(url_prefixes)]
    return ' '.join(kept)

# Apply URL removal to every row, then preview the result.
df['text'] = df['text'].map(remove_urls)

df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


In [None]:
# Remove emojis and other non-ASCII characters
def remove_emojis(txt):
  """Return ``txt`` keeping only the characters for which ``str.isascii()`` is true.

  Emojis are removed as a consequence, and so is every other non-ASCII
  character (accented letters, for example).
  """
  return ''.join(ch for ch in txt if ch.isascii())

# Apply the ASCII-only filter to every row, then preview the result.
df['text'] = df['text'].map(remove_emojis)
df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


In [None]:
# Removing stopwords
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Download the NLTK resources this notebook relies on. 'stopwords' backs the
# stopword list built in a later cell. NOTE(review): 'punkt' is presumably for
# word_tokenize, which is imported here but not called in the visible cells.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Keep the English stopword list as a set, for membership tests in the
# stopword filter defined later.
stop_words=set(stopwords.words('english'))



In [None]:
# Number of distinct stopwords loaded.
len(stop_words)


198

In [None]:
# Look at the text of the row labelled 0 (single .loc lookup by row and column).
df.loc[0, 'text']

'i didnt feel humiliated'

In [None]:
# Display the frame; the notebook renders a truncated view (head and tail rows).
df

Unnamed: 0,text,emotion
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1
...,...,...
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,5
15998,i feel like this was such a rude comment and i...,1


In [None]:
# Removing stopwords
def remove(txt):
  """Return ``txt`` without the tokens that appear in the global ``stop_words``.

  The text is split on whitespace, and the surviving tokens are re-joined
  with single spaces.
  """
  return ' '.join(word for word in txt.split() if word not in stop_words)

In [None]:
# Filter stopwords out of every row of the text column.
df['text'] = df['text'].map(remove)

In [None]:
# Spot-check the row labelled 1 after stopword removal.
df.loc[1, 'text']

'go feeling hopeless damned hopeful around someone cares awake'

In [None]:
# The text has now been through all the preprocessing cells above and is ready
# to be vectorized for the ML models.
df.head()

Unnamed: 0,text,emotion
0,didnt feel humiliated,0
1,go feeling hopeless damned hopeful around some...,0
2,im grabbing minute post feel greedy wrong,1
3,ever feeling nostalgic fireplace know still pr...,2
4,feeling grouchy,1


In [None]:
# Bag-of-words experiments.
# Hold out 20% of the rows as a test set; the seed is fixed so the split is
# repeatable.
from sklearn.model_selection import train_test_split

X, y = df['text'], df['emotion']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Converting X_train and X_test to vectors
from sklearn.feature_extraction.text import TfidfVectorizer , CountVectorizer


In [None]:
# Bag-of-words vectorizer, constructed with CountVectorizer's default settings.
bow_vectorizer = CountVectorizer()

In [None]:
# Fit the vectorizer on the training split only, then reuse that fitted
# vectorizer to transform the test split.
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [None]:
# Multinomial Naive Bayes classifier for the bag-of-words features.
nb_model = MultinomialNB()


In [None]:
# Train Naive Bayes on the bag-of-words training matrix.
nb_model.fit(X_train_bow,y_train)

In [None]:
# Predict emotion ids for the held-out bag-of-words test matrix.
pred_bow = nb_model.predict(X_test_bow)

In [None]:
# Test-set accuracy: Naive Bayes + bag-of-words.
print(accuracy_score(y_test,pred_bow))

0.76875


In [None]:
# TF-IDF experiments.
# NOTE(review): the following cell constructs TfidfVectorizer() again before
# fitting, so this instance is replaced without ever being fitted.
tfidf_vectorizer = TfidfVectorizer()

In [None]:
# Fit TF-IDF on the training split only, then reuse it to transform the test split.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# A separate Naive Bayes instance, so the bag-of-words model (nb_model) is left untouched.
nb2_model = MultinomialNB()
nb2_model.fit(X_train_tfidf,y_train)


In [None]:
# Predict emotion ids for the held-out TF-IDF test matrix.
pred_tfidf = nb2_model.predict(X_test_tfidf)


In [None]:
# Test-set accuracy: Naive Bayes + TF-IDF.
print(accuracy_score(y_test,pred_tfidf))


0.66125


In [None]:
# Now doing with Logistic Regression for best accuracy

from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with the solver's iteration cap set to 1000.
logistic_model = LogisticRegression(max_iter = 1000)

In [None]:
# First fit of logistic_model: TF-IDF features.
logistic_model.fit(X_train_tfidf,y_train)

In [None]:
# Predictions from the TF-IDF-fitted logistic regression.
log_pred = logistic_model.predict(X_test_tfidf)

In [None]:
# Test-set accuracy: logistic regression + TF-IDF.
print(accuracy_score(y_test,log_pred))
# In the recorded run this beat both Naive Bayes variants.

0.8609375


# Best Accuracy Model

In [None]:
# Refit the SAME logistic_model object, this time on bag-of-words features.
# From here on logistic_model holds the bag-of-words fit, not the earlier
# TF-IDF fit, so it must be paired with bow_vectorizer when predicting.
logistic_model.fit(X_train_bow,y_train)

In [None]:
# Predictions from the bag-of-words-fitted logistic regression.
log_pred_1 = logistic_model.predict(X_test_bow)

In [None]:
# Test-set accuracy: logistic regression + bag-of-words.
print(accuracy_score(y_test,log_pred_1))
# Best accuracy so far in the recorded run.

0.8890625


In [None]:
from sklearn import svm

In [None]:
# Support vector classifier, constructed with SVC's default settings.
svm_model = svm.SVC()

In [None]:
# First fit of svm_model: TF-IDF features.
svm_model.fit(X_train_tfidf,y_train)

In [None]:
# Predictions from the TF-IDF-fitted SVM.
svm_pred = svm_model.predict(X_test_tfidf)

In [None]:
# Test-set accuracy: SVM + TF-IDF.
print(accuracy_score(y_test,svm_pred))

0.85125


In [None]:
# Refit the SAME svm_model object on bag-of-words features; from here on it
# holds the bag-of-words fit rather than the earlier TF-IDF fit.
svm_model.fit(X_train_bow,y_train)

In [None]:
# Predictions from the bag-of-words-fitted SVM.
svm_pred_1 = svm_model.predict(X_test_bow)


In [None]:
# Test-set accuracy: SVM + bag-of-words.
print(accuracy_score(y_test,svm_pred_1))

0.82125


In [None]:
import joblib
# Persist the classifier and its vectorizer. When the notebook is run top to
# bottom, logistic_model was last fitted on the bag-of-words matrix, which is
# why it is saved together with bow_vectorizer.
# NOTE(review): the emotion_numbers label mapping is not saved in the visible
# cells; anything that loads these files needs it to turn predicted ids back
# into emotion names.
joblib.dump(logistic_model, 'logistic_model.pkl')
joblib.dump(bow_vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']

In [None]:
# NOTE(review): this writes the same object to the same path as the first dump
# in the previous cell, so it looks redundant.
joblib.dump(logistic_model, 'logistic_model.pkl')


['logistic_model.pkl']

In [None]:
# Rows per encoded emotion id, to see how unbalanced the classes are.
print(df['emotion'].value_counts())


emotion
5    5362
0    4666
1    2159
4    1937
2    1304
3     572
Name: count, dtype: int64


In [None]:
from sklearn.utils import resample

# Class ids as annotated in this notebook: 0 sadness, 5 joy, 1 anger, 4 fear;
# the upsampled ids 2 and 3 are named love and surprise below.
# These ids come from the label-encoding cell, so they depend on the order in
# which the labels first appear in the data.
# NOTE(review): upsampling is applied to the whole frame. If df_balanced is
# split into train/test afterwards, copies of the same row can end up on both
# sides of the split - confirm how it is used.

# Separate the two largest classes from the rest
df_major = df[df.emotion.isin([0, 5])]   # sadness, joy
df_minor = df[~df.emotion.isin([0, 5])]  # others

# Upsample the two smallest classes, sampling with replacement, to 4000 rows each
love_upsampled, surprise_upsampled = (
    resample(df[df.emotion == label], replace=True, n_samples=4000, random_state=42)
    for label in (2, 3)
)
# Repeat for fear and anger if needed

# Combine: sadness, joy, anger and fear as they are, then the upsampled love
# and surprise rows
untouched_classes = [df[df.emotion == label] for label in (0, 5, 1, 4)]
df_balanced = pd.concat(untouched_classes + [love_upsampled, surprise_upsampled])

# Shuffle the rows, with a fixed seed so the order is repeatable
df_balanced = df_balanced.sample(frac=1, random_state=1)
