In [75]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [76]:
# Load the training split: a ';'-separated text file without a header row,
# read into the two columns 'text' and 'emotion'.
# NOTE(review): the absolute /content/... path looks like a Colab working
# directory -- adjust it when running elsewhere.
df = pd.read_csv(
    '/content/train.txt',
    sep=';',
    header=None,
    names=['text', 'emotion'],
)

In [77]:
df['emotion'].unique()

array(['sadness', 'anger', 'love', 'surprise', 'fear', 'joy'],
      dtype=object)

In [78]:
df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [79]:
df.isnull().sum()

Unnamed: 0,0
text,0
emotion,0


**Data Cleaning**

In [80]:
# Give every emotion label an integer id in order of first appearance in the
# 'emotion' column (Series.unique() preserves that order), so the ids depend
# on the row order of the loaded file.
unique_emotions = df['emotion'].unique()
emotion_numbers = dict(zip(unique_emotions, range(len(unique_emotions))))

In [81]:
emotion_numbers

{'sadness': 0, 'anger': 1, 'love': 2, 'surprise': 3, 'fear': 4, 'joy': 5}

In [82]:
# Replace the emotion strings with their integer ids. Series.map() yields NaN
# for any value that is not a key of the dict, so re-running this cell once
# the column already holds integers turns every label into NaN -- reload df
# before executing it again.
df['emotion'] = df['emotion'].map(emotion_numbers)

In [83]:
df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


In [84]:
# Lower-case every document with the vectorized string accessor. Unlike
# `.apply(lambda x: x.lower())`, `.str.lower()` passes missing values through
# instead of raising AttributeError on a NaN entry.
df['text'] = df['text'].str.lower()
df['text']

Unnamed: 0,text
0,i didnt feel humiliated
1,i can go from feeling so hopeless to so damned...
2,im grabbing a minute to post i feel greedy wrong
3,i am ever feeling nostalgic about the fireplac...
4,i am feeling grouchy
...,...
15995,i just had a very brief time in the beanbag an...
15996,i am now turning and i feel pathetic that i am...
15997,i feel strong and good overall
15998,i feel like this was such a rude comment and i...


**Special character removal** (strips ASCII punctuation)

In [85]:
import string

# Translation table that deletes every character in string.punctuation.
# Built once here rather than on each call to remove_punc().
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punc(txt):
  """Return `txt` with all ASCII punctuation characters removed.

  Only the characters listed in `string.punctuation` are stripped; letters,
  digits, whitespace and non-ASCII symbols are left untouched.
  """
  return txt.translate(_PUNCTUATION_TABLE)


In [86]:
# Strip punctuation from every document in the text column.
df['text'] = df['text'].map(remove_punc)

**Digit removal**

In [87]:
def remove_numbers(txt):
  """Return `txt` without any character for which str.isdigit() is true."""
  return ''.join(ch for ch in txt if not ch.isdigit())

# Drop digit characters from every document in the text column.
df['text'] = df['text'].map(remove_numbers)

In [88]:
df

Unnamed: 0,text,emotion
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1
...,...,...
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,5
15998,i feel like this was such a rude comment and i...,1


**Emoji removal** (drops every non-ASCII character)

In [89]:
def remove_emojis(txt):
  """Return `txt` keeping only its ASCII characters.

  Emojis disappear because they are non-ASCII, but so does every other
  non-ASCII character (accented letters, curly quotes, ...).
  """
  return ''.join(filter(str.isascii, txt))

# Keep only the ASCII characters of every document in the text column.
df['text'] = df['text'].map(remove_emojis)

**Stopword removal**

In [90]:
import nltk

In [91]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [92]:
# Fetch the NLTK resources used by the following cells. 'stopwords' backs
# stopwords.words('english'); 'punkt' and 'punkt_tab' are tokenizer data,
# presumably what word_tokenize loads (which of the two is needed depends on
# the installed NLTK version -- TODO confirm).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download("punkt_tab")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [93]:
# NLTK's English stopword list, held as a set because the stopword filter
# further down tests membership once per token.
stop_words = set(stopwords.words('english'))
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [94]:
len(stop_words)

198

In [95]:
def remove(txt):
  """Tokenize `txt` with word_tokenize and drop tokens found in `stop_words`.

  The surviving tokens are returned joined by single spaces.
  """
  kept = filter(lambda token: token not in stop_words, word_tokenize(txt))
  return ' '.join(kept)

# Drop stopwords from every document. Punctuation was already stripped in an
# earlier cell, so contractions such as "didnt" or "im" no longer match the
# NLTK entries ("didn't", "i'm") and stay in the text, while negations like
# "not" and "no" are part of the list and get removed.
df['text'] = df['text'].map(remove)

In [96]:
# Look up row label 1, column 'text' in a single .loc call instead of chained
# indexing (df.loc[1]['text']), which first materializes the whole row.
df.loc[1, 'text']

'go feeling hopeless damned hopeful around someone cares awake'

In [97]:
df.head()

Unnamed: 0,text,emotion
0,didnt feel humiliated,0
1,go feeling hopeless damned hopeful around some...,0
2,im grabbing minute post feel greedy wrong,1
3,ever feeling nostalgic fireplace know still pr...,2
4,feeling grouchy,1


In [98]:
df

Unnamed: 0,text,emotion
0,didnt feel humiliated,0
1,go feeling hopeless damned hopeful around some...,0
2,im grabbing minute post feel greedy wrong,1
3,ever feeling nostalgic fireplace know still pr...,2
4,feeling grouchy,1
...,...,...
15995,brief time beanbag said anna feel like beaten,0
15996,turning feel pathetic still waiting tables sub...,0
15997,feel strong good overall,5
15998,feel like rude comment im glad,1


In [99]:
df.shape

(16000, 2)

In [100]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible. No `stratify` argument is passed, so the class proportions of
# the two parts are left to chance.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['emotion'],
    test_size=0.20,
    random_state=42,
)

In [101]:
X_train

Unnamed: 0,text
676,refers course though cant help feeling somehow...
12113,im starting feel im suffering fatigue
7077,feel like probably would liked book little bit...
13005,really feel awkward
12123,im feeling little grumpy today lame weather te...
...,...
13418,love leave reader feeling confused slightly de...
5390,feel delicate
860,starting feel little stressed
15795,feel stressed tired worn shape neglected


In [102]:
from sklearn.feature_extraction.text import TfidfVectorizer , CountVectorizer

In [103]:
# Plain bag-of-words counts; not referenced again in the cells shown here.
bow_vectorizer = CountVectorizer()

# TF-IDF over unigrams and bigrams, limited to at most 20,000 features.
tfidf_vectorizer = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1, 2),
)


In [105]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [108]:
# Fit the vectorizer on the training texts only, then reuse that fitted
# vectorizer to transform the test texts, so the test split has no influence
# on the learned features.
X_train_idf = tfidf_vectorizer.fit_transform(X_train)
X_test_idf = tfidf_vectorizer.transform(X_test)

In [109]:
from sklearn.utils.class_weight import compute_class_weight

In [110]:
# Hand-set per-class weights passed to the linear models below. Keys are the
# integer ids from emotion_numbers.
# NOTE(review): the weights for ids 0, 1, 4 and 5 look close to
# inverse-frequency values while love/surprise appear deliberately boosted --
# confirm how these numbers were chosen.
class_weights = {
    0: 0.57,   # sadness
    1: 1.23,   # anger
    2: 5.0,    # love
    3: 10.0,   # surprise
    4: 1.38,   # fear
    5: 0.49,   # joy
}



In [111]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [112]:
# Fit a class-weighted logistic regression on the TF-IDF training matrix and
# score it on the held-out split.
lr_model = LogisticRegression(
    class_weight=class_weights,
    max_iter=2000,
    n_jobs=-1,
)
lr_model.fit(X_train_idf, y_train)
lr_pred = lr_model.predict(X_test_idf)

# Per-class precision/recall/F1 followed by overall accuracy.
lr_report = classification_report(y_test, lr_pred, digits=3)
print("\n--- Logistic Regression ---")
print(lr_report)
print(accuracy_score(y_test, lr_pred))



--- Logistic Regression ---
              precision    recall  f1-score   support

           0      0.957     0.907     0.931       946
           1      0.884     0.892     0.888       427
           2      0.733     0.973     0.836       296
           3      0.648     0.947     0.770       113
           4      0.879     0.806     0.841       397
           5      0.944     0.879     0.910      1021

    accuracy                          0.891      3200
   macro avg      0.841     0.901     0.863      3200
weighted avg      0.902     0.891     0.893      3200

0.8909375


In [113]:
from sklearn.svm import LinearSVC

In [114]:
# Fit a class-weighted linear SVM on the same TF-IDF training matrix and score
# it on the held-out split.
svm_model = LinearSVC(
    class_weight=class_weights,
    max_iter=2000,
)
svm_model.fit(X_train_idf, y_train)
svm_pred = svm_model.predict(X_test_idf)

# Per-class precision/recall/F1 followed by overall accuracy.
svm_report = classification_report(y_test, svm_pred, digits=3)
print("\n--- Linear SVM ---")
print(svm_report)
print(accuracy_score(y_test, svm_pred))


--- Linear SVM ---
              precision    recall  f1-score   support

           0      0.947     0.933     0.940       946
           1      0.894     0.892     0.893       427
           2      0.800     0.878     0.837       296
           3      0.789     0.796     0.793       113
           4      0.853     0.859     0.856       397
           5      0.939     0.923     0.931      1021

    accuracy                          0.905      3200
   macro avg      0.870     0.880     0.875      3200
weighted avg      0.907     0.905     0.906      3200

0.9053125


In [115]:
# `mode` lives in scipy.stats; it is imported here because no earlier cell
# brings it into scope, so the cell would otherwise fail with NameError on a
# fresh kernel.
from scipy.stats import mode

# One row per model, one column per test sample.
stacked_preds = np.vstack([lr_pred, svm_pred])

# Vote per sample: axis=0 reduces over the model rows. With only two voters a
# disagreement is a 1-1 tie, which `mode` resolves to a single value (in
# practice the smaller class id), so this is not a true majority vote.
ensemble_pred = mode(stacked_preds, axis=0).mode.flatten()

print("\n--- Ensemble (LR + SVM) ---")
print(classification_report(y_test, ensemble_pred, digits=3))
print("Accuracy:", accuracy_score(y_test, ensemble_pred))


--- Ensemble (LR + SVM) ---
              precision    recall  f1-score   support

           0      0.935     0.938     0.936       946
           1      0.887     0.904     0.896       427
           2      0.743     0.966     0.840       296
           3      0.686     0.929     0.789       113
           4      0.894     0.809     0.849       397
           5      0.967     0.871     0.916      1021

    accuracy                          0.898      3200
   macro avg      0.852     0.903     0.871      3200
weighted avg      0.907     0.898     0.900      3200

Accuracy: 0.898125


In [118]:
import joblib
# Persist everything needed to score new text: the label -> id mapping, the
# two fitted classifiers and the fitted TF-IDF vectorizer. The mapping is
# derived from row order in the training file, so without it the integer
# predictions of the saved models cannot be decoded reliably.
joblib.dump(emotion_numbers, 'emotion_numbers.pkl')
joblib.dump(lr_model, 'lr_model.pkl')
joblib.dump(svm_model, 'svm_model.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']