# ForMente model training

The model is deployed on a backend server hosted on Heroku. The backend code can be found in `new_nlp_backend`, and the Heroku configuration files (`Procfile`, `runtime.txt`, `requirements.txt`) can be found in the root directory.

The training procedure is inspired by https://github.com/abishekarun/Text-Emotion-Classification

In [1]:
import pandas as pd
import numpy as np
import re

import nltk
from nltk.corpus import stopwords

from scipy.stats import itemfreq
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer,HashingVectorizer
import pickle

In [2]:
# Load the first labelled tweet corpus; the CSV is decoded as ISO-8859-1.
data1 = pd.read_csv(
    'text_emotion.csv',
    encoding="ISO-8859-1",
)

In [3]:
# Peek at the first two rows to check the column layout.
data1.head(n=2)

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...


In [4]:
# (rows, columns) of the frame as loaded.
data1.shape

(40000, 4)

In [5]:
# Keep only the id, label and text columns; .copy() detaches the result
# from the originally loaded frame.
data1 = data1.loc[:, ['tweet_id', 'sentiment', 'content']].copy()

In [6]:
# Class distribution of the raw labels in the first corpus.
data1['sentiment'].value_counts()

neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: sentiment, dtype: int64

In [7]:
# Fold the 'empty' and 'boredom' labels into 'neutral'; every other label
# is passed through unchanged.
data1['sentiment'] = np.where(
    data1['sentiment'].isin(['neutral', 'empty', 'boredom']),
    'neutral',
    data1['sentiment'],
)

In [8]:
# Fold 'enthusiasm' into 'fun'; every other label is passed through unchanged.
data1['sentiment'] = np.where(
    data1['sentiment'].isin(['fun', 'enthusiasm']),
    'fun',
    data1['sentiment'],
)

In [9]:
# Discard every row labelled 'neutral' (this includes the 'empty' and
# 'boredom' rows that were relabelled as 'neutral' earlier).
data1 = data1[data1['sentiment'] != 'neutral']

In [10]:
# Class distribution of the first corpus after merging and filtering labels.
data1['sentiment'].value_counts()

worry        8459
happiness    5209
sadness      5165
love         3842
fun          2535
surprise     2187
relief       1526
hate         1323
anger         110
Name: sentiment, dtype: int64

In [11]:
# Load the second corpus: a headerless, tab-separated text file.
# The separator is written as the '\t' escape rather than a literal tab
# character, which is invisible and easily converted to spaces by editors.
data2 = pd.read_csv('tweets_clean.txt', sep='\t', header=None)

In [12]:
# Peek at the first two rows of the second corpus.
data2.head(n=2)

Unnamed: 0,0,1,2
0,145353048817012736:,Thinks that @melbahughes had a great 50th birt...,:: surprise
1,144279638024257536:,"Como una expresión tan simple, una sola oració...",:: sadness


In [13]:
# Name the three positional columns so they line up with data1's column names.
data2.columns = [
    'tweet_id',
    'content',
    'sentiment',
]

In [14]:
# Remove the ':: ' marker that precedes each label in this file.
data2['sentiment'] = data2['sentiment'].str.replace(':: ', '')

In [15]:
# Class distribution of the second corpus.
data2['sentiment'].value_counts()

joy         8240
surprise    3849
sadness     3830
fear        2816
anger       1555
disgust      761
Name: sentiment, dtype: int64

In [16]:
# Stack the two corpora row-wise. DataFrame.append is deprecated and was
# removed in pandas 2.0; pd.concat is the supported equivalent. As with
# append's default, the original row labels are kept (not renumbered) and
# columns are aligned by name.
data = pd.concat([data1, data2])

  data = data1.append(data2)


In [17]:
# Peek at the first two rows of the combined frame.
data.head(n=2)

Unnamed: 0,tweet_id,sentiment,content
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...


In [18]:
# Class distribution across both corpora.
data['sentiment'].value_counts()

sadness      8995
worry        8459
joy          8240
surprise     6036
happiness    5209
love         3842
fear         2816
fun          2535
anger        1665
relief       1526
hate         1323
disgust       761
Name: sentiment, dtype: int64

In [19]:
# Restrict the training set to the five emotions the model predicts.
# NOTE(review): 'joy' is not in this list, so rows carrying that label are
# dropped while 'happiness' rows are kept. If the two were meant to be one
# class, 'joy' needs remapping before this filter; confirm the intent.
data = data.loc[
    data['sentiment'].isin(['sadness', 'anger', 'happiness', 'fear', 'love'])
]

In [20]:
# Class distribution of the final five-emotion training set.
data['sentiment'].value_counts()

sadness      8995
happiness    5209
love         3842
fear         2816
anger        1665
Name: sentiment, dtype: int64

In [21]:
# Remove every run of characters that is not an ASCII letter, digit or
# whitespace. regex=True is passed explicitly: the default changed to
# False in pandas 2.0, where this pattern would otherwise be matched as a
# literal string and nothing would be cleaned. The pattern is a raw string
# so that '\s' is not treated as an (invalid) Python escape sequence.
data['content'] = data['content'].str.replace(r'[^A-Za-z0-9\s]+', '', regex=True)

  data['content']=data['content'].str.replace('[^A-Za-z0-9\s]+', '')


In [22]:
# Remove URL-like tokens, case-insensitively. regex=True is passed
# explicitly because the default changed to False in pandas 2.0, and the
# pattern is a raw string so '\S' is not treated as a Python escape.
# The '.' after 'www' is deliberately left unescaped (matches any
# character): punctuation has already been stripped from the text by the
# preceding cleaning step, so a literal dot would never be present here.
data['content'] = data['content'].str.replace(
    r'http\S+|www.\S+', '', case=False, regex=True
)

  data['content']=data['content'].str.replace('http\S+|www.\S+', '', case=False)


In [23]:
# Normalise the tweet text to lower case.
data['content'] = data['content'].str.lower()

In [24]:
# Separate the label column from the feature frame.
target = data['sentiment']
data = data.drop(columns=['sentiment'])

In [25]:
# Encode the string labels as integer class ids.
# NOTE(review): `le` holds the id -> label mapping but is not saved in the
# cells shown here; presumably the backend reproduces the mapping itself.
# Confirm before changing the label set.
le = LabelEncoder()
target = le.fit(target).transform(target)

In [26]:
# Hold out 40% of the rows for evaluation, stratified on the encoded label
# so both splits share the same class proportions; the seed is fixed.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.4,
    stratify=target,
    random_state=42,
)

In [27]:
# Extracting features from text files
# The vocabulary is learned from the training split only and then applied
# unchanged to the test split.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train.content)
X_test_counts = count_vect.transform(X_test.content)
print('Shape of Term Frequency Matrix: ',X_train_counts.shape)

# Persist the fitted vectorizer. The context manager guarantees the file is
# flushed and closed; passing a bare open() to pickle.dump leaves the handle
# open until it is garbage-collected.
with open("vector.pickle", 'wb') as vector_file:
    pickle.dump(count_vect, vector_file)

Shape of Term Frequency Matrix:  (13516, 24881)


In [28]:
# Machine Learning
# Fit a multinomial Naive Bayes classifier on the training counts, predict
# the held-out split, and report accuracy as a percentage.
clf = MultinomialNB()
clf.fit(X_train_counts, y_train)
predicted = clf.predict(X_test_counts)
nb_clf_accuracy = (predicted == y_test).mean() * 100
print(nb_clf_accuracy)

57.56297858173344


In [29]:
# Predicted integer class ids for the test split.
predicted

array([4, 2, 4, ..., 4, 2, 4])

In [30]:
# Persist the trained classifier. The context manager guarantees the file is
# flushed and closed once the dump completes.
with open("model.pickle", 'wb') as model_file:
    pickle.dump(clf, model_file)

In [31]:
# Reload the classifier from disk as a round-trip check; the context manager
# closes the file handle after reading.
# NOTE: pickle.load can execute arbitrary code, so only load pickle files
# that come from a trusted source.
with open("model.pickle", 'rb') as model_file:
    loaded_model = pickle.load(model_file)