In [1]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning,)

Reading the Dataset and look at it briefly

In [2]:
import pandas as pd
data=pd.read_csv("D:\\Python\\DA-ML\\Project\\emotion-labels-text.csv")
data.head()

Unnamed: 0,text,label
0,- arrggh kids that won't settle....need some K...,fear and worry
1,- blood and mucus and he chokes and has to swa...,joy and happiness
2,- Gig was awesome! Am exahausted and so dont w...,joy and happiness
3,- God i'm up early. Hayley still asleep but to...,joy and happiness
4,- had a great time at the 'block party' - so d...,fear and worry


In [5]:
data.shape

(32422, 2)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32422 entries, 0 to 32421
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    32422 non-null  object
 1   label   32422 non-null  object
dtypes: object(2)
memory usage: 506.7+ KB


In [7]:
data["label"].value_counts()

joy and happiness    13202
fear and worry       10711
sadness               6698
anger                 1811
Name: label, dtype: int64

PreProcessing:

In [3]:
import re
import string


def clean_text(text):
    text=text.lower()
    text=re.sub(r'(\s)?@\w+', r'\1',text)
    text=re.sub('\[.*?\]','',text)
    text=re.sub('[%s]'%re.escape(string.punctuation),'',text)
    text=re.sub('\w*\d\w*','',text)
    text=re.sub('[''"",,,]','',text)
    text=re.sub('\n','',text)
    return text

cleaned1=lambda x:clean_text(x)


Cleaning the texts from irrelevant characters.

In [4]:
data['text']=pd.DataFrame(data.text.apply(cleaned1))

data.head()

Unnamed: 0,text,label
0,arrggh kids that wont settleneed some kava fo...,fear and worry
1,blood and mucus and he chokes and has to swal...,joy and happiness
2,gig was awesome am exahausted and so dont wan...,joy and happiness
3,god im up early hayley still asleep but today...,joy and happiness
4,had a great time at the block party so did m...,fear and worry


In [5]:
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
# nltk.download('stopwords')
from nltk.corpus import stopwords


tokenizer=ToktokTokenizer()
stopword_list=nltk.corpus.stopwords.words('english')

In [6]:
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

Removing the words which add no special meaning to the texts (such as "a", "an", "the", ...).

In [12]:
data['text']=pd.DataFrame(data.text.apply(remove_stopwords))
data.head()

Unnamed: 0,text,label
0,arrggh kids wont settleneed kava liam thatll s...,fear and worry
1,blood mucus chokes swallow mirth cut short wro...,joy and happiness
2,gig awesome exahausted dont want revise boo hoo,joy and happiness
3,god im early hayley still asleep today party d...,joy and happiness
4,great time block party mackenzie,fear and worry


Splitting Data

In [7]:
X=data["text"]
y=data["label"]

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train , y_test=train_test_split(X,y, test_size=0.3)

Looking for best classifier:

In [13]:
scores={}

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
from sklearn.tree import DecisionTreeClassifier

clf =  Pipeline([('tfidf', TfidfVectorizer()), 
                  ('classifier',DecisionTreeClassifier(random_state = 30, max_depth=10,splitter='best'))])

clf.fit(X_train,y_train)
y_preds = clf.predict(X_test)

acc_score = clf.score(y_test,y_preds)
print(acc_score)
scores['DecisionTreeClassifier']=acc_score

0.7935643055412769


In [28]:
from sklearn.linear_model import LogisticRegression


lr= Pipeline([('tfidf', TfidfVectorizer()), 
                  ('classifier',LogisticRegression(random_state=30))])
lr.fit(X_train,y_train)
y_preds = lr.predict(X_test)
acc_lr_score = lr.score(y_test,y_preds)
print(acc_lr_score)
scores['LogisticRegression']=acc_lr_score

0.6335972036599157


In [17]:
from sklearn.ensemble import RandomForestClassifier
rf=Pipeline([('tfidf', TfidfVectorizer()), 
                  ('classifier',RandomForestClassifier(random_state=30))])

rf.fit(X_train,y_train)
y_preds=rf.predict(X_test)
acc_score = rf.score(y_test,y_preds)
print(acc_score)
scores['RandomForestClassifier']=acc_score

0.6188958568931839


In [22]:
from sklearn.neighbors import KNeighborsClassifier

model=Pipeline([('tfidf', TfidfVectorizer()),
                ('classifier',KNeighborsClassifier())])


mlp=model.fit(X_train,y_train)
y_preds = clf.predict(X_test)
        
print (f'Accuracy - KNeighborsClassifier: {mlp.score(X_train,y_train):.3f}')
acc_score = mlp.score(y_test,y_preds)
print(f'Accuracy Score - KNeighborsClassifier: {acc_score}')

scores['KNeighborsClassifier']=acc_score

Accuracy - KNeighborsClassifier: 0.557
Accuracy Score - KNeighborsClassifier: 0.5810630204585175


In [24]:
from sklearn.neural_network import MLPClassifier
model=Pipeline([('tfidf', TfidfVectorizer()),
                ('classifier',MLPClassifier( solver='adam',learning_rate="adaptive", verbose=True, 
                    max_iter=200, early_stopping=True, n_iter_no_change=10, 
                    alpha=0.001, momentum=0.5, tol=0.01, random_state=30))])


mlp=model.fit(X_train,y_train)
y_preds = clf.predict(X_test)
        
print (f'Accuracy - MLPClassifier: {mlp.score(X_train,y_train):.3f}')
acc_score = mlp.score(y_test,y_preds)
print(f'Accuracy Score - MLPClassifier: {acc_score}')

scores['MLPClassifier']=acc_score

Iteration 1, loss = 1.22491170
Validation score: 0.566520
Iteration 2, loss = 0.90959142
Validation score: 0.595154
Iteration 3, loss = 0.66501405
Validation score: 0.624229
Iteration 4, loss = 0.49354496
Validation score: 0.618502
Iteration 5, loss = 0.38982293
Validation score: 0.609251
Iteration 6, loss = 0.32324115
Validation score: 0.607048
Iteration 7, loss = 0.27862291
Validation score: 0.601322
Iteration 8, loss = 0.24753397
Validation score: 0.603965
Iteration 9, loss = 0.22361628
Validation score: 0.595595
Iteration 10, loss = 0.20545896
Validation score: 0.592952
Iteration 11, loss = 0.19139849
Validation score: 0.585463
Iteration 12, loss = 0.17961036
Validation score: 0.584141
Iteration 13, loss = 0.17017106
Validation score: 0.581057
Iteration 14, loss = 0.16125757
Validation score: 0.582379
Validation score did not improve more than tol=0.010000 for 10 consecutive epochs. Stopping.
Accuracy - MLPClassifier: 0.831
Accuracy Score - MLPClassifier: 0.4422740824509098


In [25]:
print(scores)

{'DecisionTreeClassifier': 0.7451423871697337, 'LogisticRegression': 0.6388403413179808, 'RandomForestClassifier': 0.6319522977279737, 'KNeighborsClassifier': 0.5810630204585175, 'MLPClassifier': 0.4422740824509098}


Tuning hyperparameters for LogisticRegression and RandomForest, in order to check if they get any better

In [27]:
from sklearn.model_selection import GridSearchCV

In [26]:
tv=TfidfVectorizer(min_df=0,max_df=1,use_idf=True,ngram_range=(1,3))
tv_X_train=tv.fit_transform(X_train)
tv_X_test=tv.transform(X_test)

In [None]:
from sklearn.metrics import accuracy_score


parameters={'penalty':['l1','l2', 'none'],
            'C':[0.01,1,10],
            'solver':['lbfgs', 'newton-cg', 'sag', 'saga'],
            }

model=LogisticRegression()
clf=GridSearchCV(model, parameters, verbose=True, n_jobs=-1)
best_clf=clf.fit(tv_X_train,y_train)
y_preds = best_clf.predict(tv_X_test)

print (f'Best Model for Logistic Regression : {best_clf.best_estimator_}')
        
print (f'Accuracy - Logistic Regression: {best_clf.score(tv_X_train,y_train):.3f}')
acc_score = accuracy_score(y_test,y_preds)

print(f'Accuracy Score - LogisticRegression: {acc_score}')

In [None]:
parameters={'criterion':['gini', 'entropy'],
            'max_depth':[10,5,20],
            'max_features':['sqrt', 'log2']}

model=RandomForestClassifier()
clf=GridSearchCV(model, parameters, verbose=True, n_jobs=-1)
best_clf=clf.fit(tv_X_train,y_train)
y_preds = best_clf.predict(tv_X_test)

print (f'Best Model for RandomForestClassifier : {best_clf.best_estimator_}')
        
print (f'Accuracy - RandomForestClassifier: {best_clf.score(tv_X_train,y_train):.3f}')

acc_score = accuracy_score(y_test,y_preds)
print(f'Accuracy Score - RandomForestClassifier: {acc_score}')

In [15]:

model=Pipeline([('tfidf', TfidfVectorizer()),
                ('classifier',LogisticRegression(C=0.01, penalty='l1', solver='saga'))])


LogReg=model.fit(X_train,y_train)
y_preds = LogReg.predict(X_test)
        
print (f'Accuracy - LogisticRegression: {LogReg.score(X_train,y_train):.3f}')
acc_score = LogReg.score(y_test,y_preds)
print(f'Accuracy Score - LogisticRegression: {acc_score}')

Accuracy - LogisticRegression: 0.410
Accuracy Score - LogisticRegression: 1.0


Looks like LogisticRegression is over fitted. Let's check:

In [16]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_preds)

array([[   0,    0,  561,    0],
       [   0,    0, 3240,    0],
       [   0,    0, 3905,    0],
       [   0,    0, 2021,    0]], dtype=int64)

In [30]:

model=Pipeline([('tfidf', TfidfVectorizer()),
                ('classifier',RandomForestClassifier(max_depth=10))])


RF=model.fit(X_train,y_train)
y_preds = RF.predict(X_test)
        
print (f'Accuracy - RandomForestClassifier: {RF.score(X_train,y_train):.3f}')
acc_score = RF.score(y_test,y_preds)
print(f'Accuracy Score - RandomForestClassifier: {acc_score}')

Accuracy - RandomForestClassifier: 0.417
Accuracy Score - RandomForestClassifier: 0.9922895034440218


In [31]:
cnfsn_mtrx= confusion_matrix(y_test, y_preds)
print(cnfsn_mtrx)

[[   0    3  558    0]
 [   0   60 3180    0]
 [   0    2 3903    0]
 [   0   10 2011    0]]


Both of them has a terrible confusion matrices!
trying another classifier:

In [27]:
from sklearn.tree import DecisionTreeClassifier

clf =  Pipeline([('tfidf', TfidfVectorizer()), 
                  ('classifier',DecisionTreeClassifier(random_state = 30, max_depth=10,splitter='best'))])

clf.fit(X_train,y_train)
y_preds = clf.predict(X_test)

acc_score = clf.score(y_test,y_preds)
print(acc_score)
cnfsn_mtrx= confusion_matrix(y_test, y_preds)
print(cnfsn_mtrx)

0.7935643055412769
[[  31  493   31    6]
 [   0 2924  247   69]
 [   0 2572 1317   16]
 [   0 1730  158  133]]


Considering all the tests and tries, I choose LogisticRegression with default hyperparameters for my model.

In [32]:

lr= Pipeline([('tfidf', TfidfVectorizer()), 
                  ('classifier',LogisticRegression(random_state=30))])
lr.fit(X_train,y_train)
y_preds = lr.predict(X_test)
acc_lr_score = lr.score(y_test,y_preds)
print(acc_lr_score)
scores['LogisticRegression']=acc_lr_score
cnfsn_mtrx= confusion_matrix(y_test, y_preds)
print(cnfsn_mtrx)

0.6335972036599157
[[ 232  179  101   49]
 [   3 2014  830  393]
 [   4  572 3209  120]
 [  17  849  447  708]]


The best confusion matrix, we got so far!
Let's save it.

In [33]:
import pickle
 
with open('D:\\Python\\DA-ML\\Project\\Winner_Model.pkl', 'wb') as f:
    pickle.dump(lr, f)