In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
data=pd.read_csv('../artifacts/sentiment_analysis.csv')

In [3]:
data.sample(5)

Unnamed: 0,id,label,tweet
4345,4346,0,Looooove my new phone. First photo on my samsu...
6975,6976,0,My new friends now #Samsung #Note #Instalike #...
4553,4554,0,Flash is bright! #family #friends #beauty #fun...
670,671,0,"Me, Myself & I #Mumbai #life #walk #instalove ..."
6388,6389,0,Work with what you have and the rest will fall...


## Data Preprocessing for tweet attribute

In [4]:
data.shape

(7920, 3)

In [5]:
data.duplicated().sum()  #duplicated values checking

np.int64(0)

In [6]:
data.isnull().sum()     #Null values checking

id       0
label    0
tweet    0
dtype: int64

### tweet preprocessing with removing unrelevant links, numbers, symbols to handle whether the possibilty of positive or negative

In [7]:
import re    #Regular expression library
import string

In [8]:
#convert all upercase data to lowercase data


data['tweet']=data['tweet'].apply(lambda x: " " .join(x.lower() for x in x.split()))

In [9]:
data.head(5)

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #pregnancy test https://goo.gl/h1...
1,2,0,finally a transparant silicon case ^^ thanks t...
2,3,0,we love this! would you go? #talk #makememorie...
3,4,0,i'm wired i know i'm george i was made that wa...
4,5,1,what amazing service! apple won't even talk to...


In [10]:
#Remove links 

import re
import string

data['tweet'] = data['tweet'].apply(lambda x: " ".join(re.sub(r'https?://\S+', '', word) for word in x.split()))

In [11]:
data.head(5)

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #pregnancy test #android #apps #...
1,2,0,finally a transparant silicon case ^^ thanks t...
2,3,0,we love this! would you go? #talk #makememorie...
3,4,0,i'm wired i know i'm george i was made that wa...
4,5,1,what amazing service! apple won't even talk to...


In [12]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [13]:
#Remove punctuation marks

def removePunc(text):
    for punc in string.punctuation:
        text=text.replace(punc,' ')
    return text


data['tweet'] = data['tweet'].apply(removePunc)

In [14]:
data.head(5)

Unnamed: 0,id,label,tweet
0,1,0,fingerprint pregnancy test android apps ...
1,2,0,finally a transparant silicon case thanks t...
2,3,0,we love this would you go talk makememorie...
3,4,0,i m wired i know i m george i was made that wa...
4,5,1,what amazing service apple won t even talk to...


In [15]:
#remove numbers

data['tweet'] = data['tweet'].str.replace('\d+',"",regex=True)
data.sample(5)

  data['tweet'] = data['tweet'].str.replace('\d+',"",regex=True)


Unnamed: 0,id,label,tweet
1255,1256,0,my new phone arrives in a couple of days e...
7181,7182,1,having a mini heart attack when my phone neede...
6748,6749,0,simone in love with gelato’s mountain gelato ...
5350,5351,1,wtf all my apps just randomly fucking deletin...
7068,7069,1,enkymion apple has lost iphone naming rights ...


In [16]:
#remove stopwords -: so,I,and,then,why.........

In [17]:
#install natural language tool kit

!pip install nltk



In [18]:
import nltk

nltk.download('stopwords',download_dir='../static/model')   #download already categorized stopwords dataset into machine

[nltk_data] Downloading package stopwords to ../static/model...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [19]:
with open('../static/model/corpora/stopwords/english','r') as stp:  #get stopwords into a list
    sw=stp.read().splitlines()

In [20]:
sw

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [21]:
data['tweet']=data['tweet'].apply(lambda x: " " .join(x.lower() for x in x.split() if x not in sw))  #remove stopwords

In [22]:
data.head(5)

Unnamed: 0,id,label,tweet
0,1,0,fingerprint pregnancy test android apps beauti...
1,2,0,finally transparant silicon case thanks uncle ...
2,3,0,love would go talk makememories unplug relax i...
3,4,0,wired know george made way iphone cute daventr...
4,5,1,amazing service apple even talk question unles...


In [23]:
#remove stemming words -: create,creating,created,creates => create

In [24]:
from nltk.stem import PorterStemmer
st=PorterStemmer()

In [25]:
data['tweet']=data['tweet'].apply(lambda x: " " .join(st.stem(x) for x in x.split()))

In [26]:
data.head(5)

Unnamed: 0,id,label,tweet
0,1,0,fingerprint pregnanc test android app beauti c...
1,2,0,final transpar silicon case thank uncl yay son...
2,3,0,love would go talk makememori unplug relax iph...
3,4,0,wire know georg made way iphon cute daventri home
4,5,1,amaz servic appl even talk question unless pay...


## Building a vocabulary based on the unique words in the tweets for vectorization

In [27]:
from collections import Counter
vocab=Counter()

In [28]:
#get values for vocabulary

for sentence in data['tweet']:
    vocab.update(sentence.split())   #craete the vocabulary by word by word as features

In [None]:
vocab

In [30]:
len(vocab)

15863

In [31]:
#Decrease the features by removing the count of the feature that has below 5 count then reduce the feature size

In [32]:
token=[key for key in vocab if vocab[key]>5]
len(token)

1923

In [33]:
#save vocabulary values as txt 

def save_voc(lines,filename):
    data='\n'.join(lines)
    file=open(filename,'w',encoding='utf-8')
    file.write(data)
    file.close()


save_voc(token,'../static/model/vocabulary.txt')

In [34]:
#split dataset into train and test

x=data['tweet']
y=data['label']

from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.8)

## Vectorization

In [35]:
def vectorizer(dataset, vocabulary):
    vectorized_lst=[]  #create empty list
    for sentence in dataset:  #go sentence by sentence in x
        sentence_lst=np.zeros(len(vocabulary))   #initialize the 0 list with respect to vocab size
        for i in range(len(vocabulary)):    # go word by word in sentece in vocab size
            if vocabulary[i] in sentence.split():  #if found vocab's word then replace 0 value by 1
                sentence_lst[i] = 1
        vectorized_lst.append(sentence_lst)   #append into another list as a list 
    vectorized_lst_new=np.asarray(vectorized_lst, dtype=np.float32)   #create fully completed list into np array
    return vectorized_lst_new

In [36]:
vectorized_x_train=vectorizer(x_train,token)  #vectorize x-train values

In [37]:
vectorized_x_train

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]], shape=(6336, 1923), dtype=float32)

In [38]:
vectorized_x_test=vectorizer(x_test,token)

In [39]:
vectorized_x_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(1584, 1923), dtype=float32)

In [40]:
#check whether the classes are imbalanced or not

In [41]:
y_train.value_counts()   # is imbalanced

label
0    4716
1    1620
Name: count, dtype: int64

In [42]:
!pip install imblearn



In [43]:
#Reduce Imbalanced

from imblearn.over_sampling import SMOTE
smote=SMOTE()
vectorized_x_train_smote,y_train_smote=smote.fit_resample(vectorized_x_train,y_train)


In [44]:
y_train_smote.value_counts()

label
0    4716
1    4716
Name: count, dtype: int64

# Model training and evaluate the model

In [45]:
#Create functions for get accuracy values for trainig and testing  datasets

In [48]:
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score

def training_acc(y_act,y_pred):
    acs=round(accuracy_score(y_act,y_pred),3)
    f1=round(f1_score(y_act,y_pred),3)
    prs=round(precision_score(y_act,y_pred),3)
    res=round(recall_score(y_act,y_pred),3)

    print("Training scores : "+str(acs)+"\t"+str(f1)+"\t"+str(prs)+"\t"+str(res))


def testing_scores(y_act,y_pred):
    acs=round(accuracy_score(y_act,y_pred),3)
    f1=round(f1_score(y_act,y_pred),3)
    prs=round(precision_score(y_act,y_pred),3)
    res=round(recall_score(y_act,y_pred),3)

    print("Testing scores : "+str(acs)+"\t"+str(f1)+"\t"+str(prs)+"\t"+str(res))

In [62]:
# Tune up to relevant hyper_parameters of a algorithm

from sklearn.model_selection import GridSearchCV

def getBestModel(model,param_list):
    grid_obj=GridSearchCV(estimator=model,param_grid=param_list)
    grid_fit=grid_obj.fit(vectorized_x_train_smote,y_train_smote)
    
    #get best parameters
    best_model=grid_fit.best_estimator_
    return best_model

## For Logistic Regression



In [70]:
from sklearn.linear_model import LogisticRegression

lr=LogisticRegression()


lr.fit(vectorized_x_train_smote,y_train_smote)
y_train_pred=lr.predict(vectorized_x_train_smote)
y_test_pred=lr.predict(vectorized_x_test)

training_acc(y_train_smote,y_train_pred)

testing_scores(y_test,y_test_pred)


Training scores : 0.952	0.953	0.931	0.976
Testing scores : 0.881	0.786	0.727	0.855


## For Multinomial NaiveBase 

In [64]:
from sklearn.naive_bayes import MultinomialNB

mnv=MultinomialNB()
mnv.fit(vectorized_x_train_smote,y_train_smote)
y_train_pred=mnv.predict(vectorized_x_train_smote)
y_test_pred=mnv.predict(vectorized_x_test)

training_acc(y_train_smote,y_train_pred)

testing_scores(y_test,y_test_pred)

Training scores : 0.912	0.916	0.873	0.964
Testing scores : 0.878	0.796	0.694	0.933


## For DecisionTreeClassifier

In [66]:
from sklearn.tree import DecisionTreeClassifier

dc=DecisionTreeClassifier()
param_list={'criterion':['gini', 'entropy', 'log_loss'],
            'splitter':['best', 'random']
           }
dc=getBestModel(dc,param_list)

dc.fit(vectorized_x_train_smote,y_train_smote)
y_train_pred=dc.predict(vectorized_x_train_smote)
y_test_pred=dc.predict(vectorized_x_test)

training_acc(y_train_smote,y_train_pred)

testing_scores(y_test,y_test_pred)

Training scores : 1.0	1.0	1.0	1.0
Testing scores : 0.836	0.672	0.692	0.653


## For Random Forest Classifier

In [67]:
from sklearn.ensemble import RandomForestClassifier

rfc=RandomForestClassifier()

param_list={'criterion':['gini', 'entropy', 'log_loss'],
            'class_weight':['balanced', 'balanced_subsample'],
            'n_estimators':[10,30,50,75,100]
           }
rfc=getBestModel(rfc,param_list)


rfc.fit(vectorized_x_train_smote,y_train_smote)
y_train_pred=rfc.predict(vectorized_x_train_smote)
y_test_pred=rfc.predict(vectorized_x_test)

training_acc(y_train_smote,y_train_pred)

testing_scores(y_test,y_test_pred)

  _data = np.array(data, dtype=dtype, copy=copy,


Training scores : 1.0	1.0	1.0	1.0
Testing scores : 0.879	0.759	0.778	0.741


## For Support Vector Classifier

In [69]:
from sklearn.svm import SVC

svc=SVC()

svc.fit(vectorized_x_train_smote,y_train_smote)
y_train_pred=svc.predict(vectorized_x_train_smote)
y_test_pred=svc.predict(vectorized_x_test)

training_acc(y_train_smote,y_train_pred)

testing_scores(y_test,y_test_pred)

Training scores : 0.981	0.982	0.967	0.997
Testing scores : 0.893	0.8	0.766	0.837


## Select Support Vector Classifier and go forward

In [72]:
#save model
import pickle

with open('../static/model/model.pickle','wb') as model:
    pickle.dump(svc,model)