In [3]:
#Required python libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
import pickle
import re

import nltk
import nltk.data
from string import punctuation 
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords

In [4]:
df_6emotions = pd.read_pickle('../data/raw/emotions_training.pkl')
df_senti=pd.read_csv("../data/raw/sentiments_training.csv", encoding= 'unicode_escape')

##### There are three sentiments like positive,negative and neutral in the data for sentiment analysis.  As we need only the data having neutral label we need to separate it. 

In [5]:
df_senti.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26


In [6]:
df_neutral=df_senti[df_senti.sentiment=='neutral'][['text','sentiment']]
df_neutral

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
5,http://www.dothebouncy.com/smf - some shameles...,neutral
7,Soooo high,neutral
8,Both of you,neutral
10,"as much as i love to be hopeful, i reckon the...",neutral
...,...,...
27468,"few grilled mushrooms and olives, feta cheese ...",neutral
27469,94 more days till BH comes back to LA,neutral
27471,"i`m defying gravity. and nobody in alll of oz,...",neutral
27473,in spoke to you yesterday and u didnt respond...,neutral


In [7]:
df_6emotions.head()

Unnamed: 0,text,emotions
27383,i feel awful about it too because it s my job ...,sadness
110083,im alone i feel awful,sadness
140764,ive probably mentioned this before but i reall...,joy
100071,i was feeling a little low few days back,sadness
2837,i beleive that i am much more sensitive to oth...,love


In [8]:
df_neutral=df_neutral.rename(columns={'sentiment':'emotions'})

In [9]:
df_neutral.head()

Unnamed: 0,text,emotions
0,"I`d have responded, if I were going",neutral
5,http://www.dothebouncy.com/smf - some shameles...,neutral
7,Soooo high,neutral
8,Both of you,neutral
10,"as much as i love to be hopeful, i reckon the...",neutral


In [10]:
df=pd.concat([df_6emotions,df_neutral], ignore_index=True)
df

Unnamed: 0,text,emotions
0,i feel awful about it too because it s my job ...,sadness
1,im alone i feel awful,sadness
2,ive probably mentioned this before but i reall...,joy
3,i was feeling a little low few days back,sadness
4,i beleive that i am much more sensitive to oth...,love
...,...,...
427922,"few grilled mushrooms and olives, feta cheese ...",neutral
427923,94 more days till BH comes back to LA,neutral
427924,"i`m defying gravity. and nobody in alll of oz,...",neutral
427925,in spoke to you yesterday and u didnt respond...,neutral


In [11]:
df=df.reset_index()

In [12]:
def text_cleaning(text):
   
    text = re.sub(r"[^A-Za-z]", " ", str(text))
    
     #remove tags
    text=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",text)
    
    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    
    # Remove punctuation from text
    text = "".join([c for c in text if c not in punctuation])
    stopwords = nltk.corpus.stopwords.words('english')
    text = text.split()
    text = [w for w in text if not w in stopwords]
    text = " ".join(text)
        
    text = text.split()
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in text]
    text = " ".join(lemmatized_words)
    text=text.lower()
    
    return text 

In [13]:
df['cleaned_text'] = df['text'].apply(lambda x: text_cleaning(x))


In [22]:
df['cleaned_text'].to_csv("../data/processed/cleaned_text_neutral.csv", index=False, header=False)

In [14]:
#Defining class for each emotion
df['labels'] = df['emotions'].factorize()[0]
df.head()

Unnamed: 0,index,text,emotions,cleaned_text,labels
0,0,i feel awful about it too because it s my job ...,sadness,feel awful job get position succeed happen,0
1,1,im alone i feel awful,sadness,im alone feel awful,0
2,2,ive probably mentioned this before but i reall...,joy,ive probably mentioned really feel proud actua...,1
3,3,i was feeling a little low few days back,sadness,feeling little low day back,0
4,4,i beleive that i am much more sensitive to oth...,love,beleive much sensitive people feeling tend com...,2


In [15]:
uniquevalues = pd.unique(df[['emotions']].values.ravel())
df_unique=pd.DataFrame(uniquevalues,columns=['emotion'])


In [16]:
df_unique

Unnamed: 0,emotion
0,sadness
1,joy
2,love
3,anger
4,fear
5,surprise
6,neutral


In [17]:
df_unique.to_csv('../labels_prediction/emotions_neutral.csv',index=False)

#### Undersampling the data

In [18]:
import imblearn
from imblearn.under_sampling import RandomUnderSampler

In [31]:
#importing libraries for models and nlp tasks
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

from sklearn import utils


In [20]:
tfidf_vectorizer = TfidfVectorizer()

In [21]:
y =df['labels']

In [22]:
#Train test split of the data
Xtrain, Xtest, ytrain, ytest = train_test_split(df['cleaned_text'], y, test_size=0.3,random_state=1)
Xtrain_tfidf = tfidf_vectorizer.fit_transform(Xtrain)
Xtest_tfidf = tfidf_vectorizer.transform(Xtest)

In [35]:
with open('../tfidfvectors/tfidf_vect_neutral.pkl', 'wb') as file:  
    pickle.dump(tfidf_vectorizer, file) 

##### Taking equal number of samples in test data also

In [23]:
#Train test split of the data
Xtrain, Xtest, ytrain, ytest = train_test_split(df['cleaned_text'], y, test_size=0.3,random_state=1)
Xtrain_tfidf = tfidf_vectorizer.fit_transform(Xtrain)

In [27]:
#ytest.value_counts()
type(ytest)

pandas.core.series.Series

In [28]:
df_test= pd.concat([Xtest, ytest], axis=1)

In [34]:
df_test.head()

Unnamed: 0,cleaned_text,labels
275014,feeling really stressed gave day watched docum...,3
194599,feel like sitting mud puddle sandbox caring ca...,2
122773,feel really dazed im getting,5
177010,feeling stressed looked back line behind,3
249025,intrigued wide eyed wonder child must feel lit...,3


In [30]:
df_test.labels.value_counts()

1    42285
0    36453
3    17346
4    14297
2    10285
5     4408
6     3305
Name: labels, dtype: int64

In [32]:
df_test=utils.shuffle(df_test.groupby("labels").head(3305))

In [33]:
df_test.labels.value_counts()

3    3305
2    3305
5    3305
1    3305
0    3305
4    3305
6    3305
Name: labels, dtype: int64

In [35]:
Xtest_bal=df_test['cleaned_text']
ytest_bal=df_test['labels']

In [36]:
Xtest_bal_tfidf = tfidf_vectorizer.transform(Xtest_bal)

##### For undersampling the data, the text data has to be vectorized, otherwise getting an error. Hence, the data has been split into train and test and applied tfidf vectorization.

In [37]:
undersample = RandomUnderSampler()
X_under, y_under = undersample.fit_resample(Xtrain_tfidf, ytrain)


#### Models
##### Logistic Regression

In [38]:
#Logistic Regression with multinomial
lr_mn = LogisticRegression(multi_class='multinomial', solver='lbfgs')
lr_mn.fit(X_under, y_under)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [39]:
ypred_lr_mn=lr_mn.predict(Xtest_tfidf)

In [40]:
tr_acc_lr_mn = lr_mn.score(X_under, y_under)*100
test_acc_lr_mn =  accuracy_score(ytest,ypred_lr_mn) * 100
print(tr_acc_lr_mn,test_acc_lr_mn)

94.28973688541076 88.77152805365364


In [51]:
pickle.dump(lr_mn, open('../models/lr_neutral.pkl', 'wb'))

In [43]:
#Logistic Regression with One vs Rest
lr_ovr = LogisticRegression(multi_class='ovr', solver='liblinear')
lr_ovr.fit(X_under, y_under)

In [44]:
ypred_lr_ovr=lr_ovr.predict(Xtest_tfidf)

In [45]:
tr_acc_lr_ovr = lr_ovr.score(X_under, y_under)*100
test_acc_lr_ovr =  accuracy_score(ytest,ypred_lr_ovr) * 100
print(tr_acc_lr_ovr,test_acc_lr_ovr)

93.6132087546397 88.73024404302885


In [52]:
pickle.dump(lr_ovr, open('../models/lr_ovr_neutral.pkl', 'wb'))

For balanced test data:

In [41]:
ypred_lr_mn=lr_mn.predict(Xtest_bal_tfidf)

In [42]:
test_acc_lr_mn =  accuracy_score(ytest_bal,ypred_lr_mn) * 100
print(tr_acc_lr_mn,test_acc_lr_mn)

94.28973688541076 91.35941214609899


##### SVM

In [46]:
svm = SVC( kernel ='linear',C = 1, decision_function_shape='ovo')
svm.fit(X_under, y_under)

In [47]:
ypred_svm=svm.predict(Xtest_tfidf)

In [48]:
tr_acc_svm = svm.score(X_under, y_under)*100
test_acc_svm =  accuracy_score(ytest,ypred_svm) * 100
print(tr_acc_svm,test_acc_svm)

94.92421056480957 88.5767921544801


In [49]:
pickle.dump(svm, open('../models/svm_neutral.pkl', 'wb'))