In [1]:
import pandas as pd

In [20]:
from sklearn.model_selection import train_test_split

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [72]:
from sklearn.metrics import classification_report as clfr

In [59]:
from sklearn.decomposition import TruncatedSVD

In [89]:
import pickle

#### Load the dataset

In [40]:
data = pd.read_csv("preprocessed_data.csv")

In [41]:
data.head()

Unnamed: 0,text,emotions
0,probably mention feel proud actually keep new ...,joy
1,people feel like go grm worthwhile hour,joy
2,feel especially pleased long time come,joy
3,struggle awful feeling say sweet thing deserve...,joy
4,mean stupid trip make great album thing go fee...,joy


### mapping categorical variables to numeric

In [44]:
data["emotions"].unique()

array(['joy', 'sadness', 'anger', 'fear', 'love'], dtype=object)

In [45]:
class_to_val = {}
val_to_class = {}

for index,emotion in enumerate(data.emotions.unique()):
    class_to_val[emotion] = index
    val_to_class[index] = emotion
    

In [46]:
class_to_val

{'joy': 0, 'sadness': 1, 'anger': 2, 'fear': 3, 'love': 4}

In [47]:
val_to_class

{0: 'joy', 1: 'sadness', 2: 'anger', 3: 'fear', 4: 'love'}

In [48]:
data["emotions"] = data["emotions"].apply(lambda x:class_to_val[x])

In [49]:
data.emotions.unique()

array([0, 1, 2, 3, 4], dtype=int64)

### Train_test_split

In [50]:
x_train,x_test,y_train,y_test = train_test_split(data.text,data.emotions,test_size=0.2,random_state=42) #80-20 split

In [51]:
x_train.isna().any()

False

### Tf-idf_vectorizer

In [52]:
tfidf_obj = TfidfVectorizer().fit(x_train)

In [53]:
x_train = tfidf_obj.transform(x_train)

In [54]:
x_test = tfidf_obj.transform(x_test)

In [56]:
x_train.shape

(138208, 21801)

In [57]:
x_test.shape

(34553, 21801)

##### Using SVD to reduce the dimension

In [60]:
svd = TruncatedSVD(n_components=20)
svd.fit(x_train)

TruncatedSVD(n_components=20)

In [61]:
x_train = svd.transform(x_train)
x_test = svd.transform(x_test)
     

In [62]:
x_train.shape

(138208, 20)

In [63]:
x_test.shape

(34553, 20)

## ML models

In [64]:

from sklearn.linear_model import LogisticRegression as LGR

In [69]:
lr_clf = LGR(random_state=42).fit(x_train,y_train)

In [70]:

pred_values_lr_train = lr_clf.predict(x_train)
pred_values_lr_test = lr_clf.predict(x_test)
     

In [73]:
print(f" Logistic Regression performance on train data -> \n{clfr(y_train,pred_values_lr_train)}")

print("\n\n")

print(f" Logistic Regression performance on test data -> \n{clfr(y_test,pred_values_lr_test)}")

 Logistic Regression performance on train data -> 
              precision    recall  f1-score   support

           0       0.27      0.50      0.35     27716
           1       0.26      0.07      0.11     27603
           2       0.32      0.19      0.23     27746
           3       0.32      0.44      0.37     27533
           4       0.36      0.33      0.34     27610

    accuracy                           0.31    138208
   macro avg       0.31      0.31      0.28    138208
weighted avg       0.31      0.31      0.28    138208




 Logistic Regression performance on test data -> 
              precision    recall  f1-score   support

           0       0.27      0.49      0.34      6838
           1       0.25      0.07      0.11      6951
           2       0.32      0.20      0.25      6803
           3       0.34      0.45      0.38      7017
           4       0.35      0.33      0.34      6944

    accuracy                           0.31     34553
   macro avg       0.31    

## K-Nearest Neighbors

In [75]:
from sklearn.neighbors import KNeighborsClassifier

In [77]:

knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(x_train,y_train)

KNeighborsClassifier(n_neighbors=3)

In [78]:

pred_values_knn_train = knn_clf.predict(x_train)
pred_values_knn_test = knn_clf.predict(x_test)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [79]:

print(f" KNN performance on train data -> \n{clfr(y_train,pred_values_knn_train)}")
print("\n\n")
print(f" KNN Regression performance on test data -> \n{clfr(y_test,pred_values_knn_test)}")

 KNN performance on train data -> 
              precision    recall  f1-score   support

           0       0.50      0.84      0.62     27716
           1       0.59      0.65      0.62     27603
           2       0.76      0.57      0.65     27746
           3       0.84      0.58      0.69     27533
           4       0.83      0.62      0.71     27610

    accuracy                           0.65    138208
   macro avg       0.70      0.65      0.66    138208
weighted avg       0.70      0.65      0.66    138208




 KNN Regression performance on test data -> 
              precision    recall  f1-score   support

           0       0.31      0.58      0.41      6838
           1       0.34      0.39      0.37      6951
           2       0.47      0.34      0.40      6803
           3       0.60      0.38      0.47      7017
           4       0.60      0.42      0.49      6944

    accuracy                           0.42     34553
   macro avg       0.47      0.42      0.43     

### Decision Tree

In [80]:
from sklearn.tree import DecisionTreeClassifier

In [81]:

dec_clf = DecisionTreeClassifier(random_state=42)
dec_clf.fit(x_train,y_train)
     

DecisionTreeClassifier(random_state=42)

In [82]:

pred_values_dec_train = dec_clf.predict(x_train)
pred_values_dec_test = dec_clf.predict(x_test)

In [83]:
print(f" Decision tree performance on train data -> \n{clfr(y_train,pred_values_dec_train)}")
print("\n\n")
print(f" Decision performance on test data -> \n{clfr(y_test,pred_values_dec_test)}")

 Decision tree performance on train data -> 
              precision    recall  f1-score   support

           0       0.95      1.00      0.97     27716
           1       0.98      1.00      0.99     27603
           2       0.98      0.99      0.98     27746
           3       1.00      0.97      0.98     27533
           4       1.00      0.94      0.97     27610

    accuracy                           0.98    138208
   macro avg       0.98      0.98      0.98    138208
weighted avg       0.98      0.98      0.98    138208




 Decision performance on test data -> 
              precision    recall  f1-score   support

           0       0.32      0.34      0.33      6838
           1       0.32      0.33      0.33      6951
           2       0.36      0.36      0.36      6803
           3       0.44      0.41      0.42      7017
           4       0.43      0.42      0.42      6944

    accuracy                           0.37     34553
   macro avg       0.37      0.37      0.37 

### SVM

In [85]:
from sklearn.svm import SVC

In [86]:

svm_clf = SVC(C=0.8)
svm_clf.fit(x_train, y_train)

SVC(C=0.8)

In [87]:

pred_values_svm_train = svm_clf.predict(x_train)
pred_values_svm_test = svm_clf.predict(x_test)

In [88]:
print(f"SVM performance on train data -> \n{clfr(y_train,pred_values_svm_train)}")
print("\n\n")
print(f"SVM performance on test data -> \n{clfr(y_test,pred_values_svm_test)}")
     

SVM performance on train data -> 
              precision    recall  f1-score   support

           0       0.34      0.50      0.41     27716
           1       0.34      0.28      0.31     27603
           2       0.43      0.33      0.37     27746
           3       0.50      0.50      0.50     27533
           4       0.51      0.48      0.49     27610

    accuracy                           0.42    138208
   macro avg       0.42      0.42      0.42    138208
weighted avg       0.42      0.42      0.42    138208




SVM performance on test data -> 
              precision    recall  f1-score   support

           0       0.33      0.48      0.39      6838
           1       0.31      0.26      0.29      6951
           2       0.40      0.30      0.35      6803
           3       0.48      0.48      0.48      7017
           4       0.49      0.47      0.48      6944

    accuracy                           0.40     34553
   macro avg       0.40      0.40      0.40     34553
weighte

## Dump models

In [94]:
def pickle_dump(file_name,obj):
    
    with open(file_name,"wb") as f:
        pickle.dump(obj,f)

In [98]:
pickle_dump("svm.pkl",svm_clf)

In [99]:
pickle_dump("dec_tree.pkl",dec_clf)

In [101]:
pickle_dump("knn.pkl",knn_clf)

In [102]:
pickle_dump("logistic_reg.pkl",lr_clf)