In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
data=pd.read_csv('sentiment_analysis.csv')

In [4]:
data.shape

(7920, 3)

In [5]:
data.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


# Data Preprocessing

In [6]:
data.duplicated().sum()

0

In [7]:
data.isnull().sum()

id       0
label    0
tweet    0
dtype: int64

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7920 entries, 0 to 7919
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      7920 non-null   int64 
 1   label   7920 non-null   int64 
 2   tweet   7920 non-null   object
dtypes: int64(2), object(1)
memory usage: 185.8+ KB


In [9]:
data.describe()

Unnamed: 0,id,label
count,7920.0,7920.0
mean,3960.5,0.255808
std,2286.451399,0.436342
min,1.0,0.0
25%,1980.75,0.0
50%,3960.5,0.0
75%,5940.25,1.0
max,7920.0,1.0


In [10]:
# Text preprocess
#  1. Convert Uppercase to Lowercase
#  2. Remove links
#  3. Remove Punctuations
#  4. Remove Numbers
#  5. Remove stopwords
#  6. Stemming

In [11]:
import re
import string

In [12]:
#Convert uppercase to lowercase

In [13]:
data['tweet']=data['tweet'].apply(lambda x:' '.join(x.lower() for x in x.split()))

In [14]:
data['tweet'].head()

0    #fingerprint #pregnancy test https://goo.gl/h1...
1    finally a transparant silicon case ^^ thanks t...
2    we love this! would you go? #talk #makememorie...
3    i'm wired i know i'm george i was made that wa...
4    what amazing service! apple won't even talk to...
Name: tweet, dtype: object

In [15]:
#Remove links

In [16]:
data['tweet']=data['tweet'].apply(lambda x:' '.join(re.sub(r'^https?:\/\/.*[\r\n]*','',x,flags=re.MULTILINE) for x in x.split()))

In [17]:
data['tweet'].head()

0    #fingerprint #pregnancy test  #android #apps #...
1    finally a transparant silicon case ^^ thanks t...
2    we love this! would you go? #talk #makememorie...
3    i'm wired i know i'm george i was made that wa...
4    what amazing service! apple won't even talk to...
Name: tweet, dtype: object

In [18]:
# Remove Punctuations

In [19]:
def remove_punctuations(text):
    for punctuation in string.punctuation:
        text=text.replace(punctuation,'')
    return text
data['tweet']=data['tweet'].apply(remove_punctuations)

In [20]:
data['tweet'].head()

0    fingerprint pregnancy test  android apps beaut...
1    finally a transparant silicon case  thanks to ...
2    we love this would you go talk makememories un...
3    im wired i know im george i was made that way ...
4    what amazing service apple wont even talk to m...
Name: tweet, dtype: object

In [21]:
#Remove Numbers

In [22]:
data['tweet']=data['tweet'].str.replace('\d+','',regex=True)

In [23]:
#Remove stopwords

In [24]:
import nltk

In [25]:
nltk.download('stopwords',download_dir='../static/model')

[nltk_data] Downloading package stopwords to ../static/model...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
with open('../static/model/corpora/stopwords/english','r') as file:
    sw=file.read().splitlines()

In [27]:
sw

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [28]:
data['tweet']=data['tweet'].apply(lambda x:' '.join(x for x in x.split() if x not in sw))

In [29]:
#Stemming

In [30]:
from nltk.stem import PorterStemmer
ps=PorterStemmer()

In [31]:
data['tweet']=data['tweet'].apply(lambda x:' '.join(ps.stem(x) for x in x.split()))

In [32]:
data['tweet'].head()

0    fingerprint pregnanc test android app beauti c...
1    final transpar silicon case thank uncl yay son...
2    love would go talk makememori unplug relax iph...
3    im wire know im georg made way iphon cute dave...
4    amaz servic appl wont even talk question unles...
Name: tweet, dtype: object

In [33]:
## Building vacabulary

In [34]:
from collections import Counter
vocablulary=Counter()

In [35]:
for sentence in data['tweet']:
    vocablulary.update(sentence.split())

In [36]:
len(vocablulary)

15949

In [37]:
tokens=[key for key in vocablulary if vocablulary[key]>10]

In [38]:
len(tokens)

1145

# Divede Dataset

In [39]:
x=data['tweet']
y=data['label']

In [40]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25)

In [41]:
#Vectorization

In [42]:
def vectorizer(ds,vocabulary):
    vectorized_list=[]
    for sentence in ds:
        sentence_list=np.zeros(len(vocablulary))
        for i in range(len(vocabulary)):
            if vocabulary[i] in sentence.split():
                sentence_list[i]=1
        vectorized_list.append(sentence_list)
    vectorized_list_new=np.asarray(vectorized_list,dtype=np.float32)
    return vectorized_list_new

In [43]:
vectorized_x_train=vectorizer(x_train,tokens)

In [44]:
vectorized_x_test=vectorizer(x_test,tokens)

In [45]:
vectorized_x_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [46]:
y_train.value_counts()

0    4448
1    1492
Name: label, dtype: int64

In [47]:
#Handle imbalenced dataset

In [48]:
from imblearn.over_sampling import SMOTE
obj=SMOTE()
vectorized_x_train_SMOTE,y_train_SMOTE=obj.fit_resample(vectorized_x_train,y_train)

In [49]:
vectorized_x_train_SMOTE.shape

(8896, 15949)

In [50]:
y_train_SMOTE.value_counts()

0    4448
1    4448
Name: label, dtype: int64

In [75]:
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score
def training_ACCURACY (y_actual,y_preadicted):
    acc=accuracy_score(y_actual,y_preadicted)
    pre=precision_score(y_actual,y_preadicted)
    f1=f1_score(y_actual,y_preadicted)
    re_score=recall_score(y_actual,y_preadicted)
    print('training accuracy : ' ,'accuracy_score:',acc ,'precision_score:',pre ,'f1_score:',f1 
         ,'recall_score:',re_score)

In [76]:
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score
def testing_ACCURACY (y_actual,y_preadicted):
    acc=accuracy_score(y_actual,y_preadicted)
    pre=precision_score(y_actual,y_preadicted)
    f1=f1_score(y_actual,y_preadicted)
    re_score=recall_score(y_actual,y_preadicted)
    print('testing accuracy : ' ,'accuracy_score:',acc ,'precision_score:',pre ,'f1_score:',f1 
         ,'recall_score:',re_score)

In [53]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [54]:
#Logistic Regression

In [55]:
lr_model=LogisticRegression()
lr_model.fit(vectorized_x_train_SMOTE,y_train_SMOTE)

In [56]:
lr_pred=lr_model.predict(vectorized_x_train_SMOTE)

In [81]:
training_ACCURACY (y_train_SMOTE,lr_pred)

training accuracy :  accuracy_score: 0.939523381294964 precision_score: 0.9147221043699618 f1_score: 0.9412791966819472 recall_score: 0.9694244604316546


In [79]:
y_test_pred=lr_model.predict(vectorized_x_test)

In [80]:
testing_ACCURACY(y_test,y_test_pred)

testing accuracy :  accuracy_score: 0.8696969696969697 precision_score: 0.729235880398671 f1_score: 0.772887323943662 recall_score: 0.8220973782771536


In [59]:
#Naive bayes

In [60]:
nb_model=MultinomialNB()
nb_model.fit(vectorized_x_train_SMOTE,y_train_SMOTE)

In [61]:
nb_pred=nb_model.predict(vectorized_x_train_SMOTE)

In [82]:
training_ACCURACY (y_train_SMOTE,nb_pred)

training accuracy :  accuracy_score: 0.9097347122302158 precision_score: 0.8808777429467085 f1_score: 0.913029351240117 recall_score: 0.9476169064748201


In [83]:
y_test_pred=nb_model.predict(vectorized_x_test)

In [84]:
testing_ACCURACY(y_test,y_test_pred)

testing accuracy :  accuracy_score: 0.8939393939393939 precision_score: 0.7563291139240507 f1_score: 0.8198970840480274 recall_score: 0.8951310861423221


In [62]:
#DecisionTree

In [63]:
dt_model=DecisionTreeClassifier()
dt_model.fit(vectorized_x_train_SMOTE,y_train_SMOTE)

In [64]:
dt_pred=dt_model.predict(vectorized_x_train_SMOTE)

In [85]:
training_ACCURACY (y_train_SMOTE,dt_pred)

training accuracy :  accuracy_score: 0.9997751798561151 precision_score: 1.0 f1_score: 0.999775129300652 recall_score: 0.9995503597122302


In [86]:
y_test_pred=dt_model.predict(vectorized_x_test)

In [87]:
testing_ACCURACY(y_test,y_test_pred)

testing accuracy :  accuracy_score: 0.8368686868686869 precision_score: 0.7017208413001912 f1_score: 0.6944181646168401 recall_score: 0.6872659176029963


In [65]:
#RandomForest

In [66]:
rf_model=RandomForestClassifier(n_estimators=50)
rf_model.fit(vectorized_x_train_SMOTE,y_train_SMOTE)

In [67]:
rf_pred=rf_model.predict(vectorized_x_train_SMOTE)

In [88]:
training_ACCURACY (y_train_SMOTE,rf_pred)

training accuracy :  accuracy_score: 0.9997751798561151 precision_score: 1.0 f1_score: 0.999775129300652 recall_score: 0.9995503597122302


In [89]:
y_test_pred=rf_model.predict(vectorized_x_test)

In [90]:
testing_ACCURACY(y_test,y_test_pred)

testing accuracy :  accuracy_score: 0.8767676767676768 precision_score: 0.8059071729957806 f1_score: 0.757936507936508 recall_score: 0.7153558052434457


In [91]:
#svm

In [92]:
svm_model=SVC()
svm_model.fit(vectorized_x_train_SMOTE,y_train_SMOTE)

In [93]:
svm_pred=svm_model.predict(vectorized_x_train_SMOTE)

In [94]:
training_ACCURACY (y_train_SMOTE,svm_pred)

training accuracy :  accuracy_score: 0.9783048561151079 precision_score: 0.9613966601604857 f1_score: 0.9786952202229827 recall_score: 0.9966276978417267


In [95]:
y_test_pred=svm_model.predict(vectorized_x_test)

In [96]:
testing_ACCURACY(y_test,y_test_pred)

testing accuracy :  accuracy_score: 0.8742424242424243 precision_score: 0.7624309392265194 f1_score: 0.7688022284122563 recall_score: 0.7752808988764045


# Prediction Pipeline

In [99]:
def preprocessing (text):
    data=pd.DataFrame([text],columns=['tweet'])
    data['tweet']=data['tweet'].apply(lambda x:' '.join(x.lower() for x in x.split()))
    data['tweet']=data['tweet'].apply(lambda x:' '.join(re.sub(r'^https?:\/\/.*[\r\n]*','',x,flags=re.MULTILINE) for x in x.split()))
    data['tweet']=data['tweet'].apply(remove_punctuations)
    data['tweet']=data['tweet'].str.replace('\d+','',regex=True)
    data['tweet']=data['tweet'].apply(lambda x:' '.join(x for x in x.split() if x not in sw))
    data['tweet']=data['tweet'].apply(lambda x:' '.join(ps.stem(x) for x in x.split()))
    return data['tweet']



In [None]:
def vectorizer(ds,vocabulary):
    vectorized_list=[]
    for sentence in ds:
        sentence_list=np.zeros(len(vocablulary))
        for i in range(len(vocabulary)):
            if vocabulary[i] in sentence.split():
                sentence_list[i]=1
        vectorized_list.append(sentence_list)
    vectorized_list_new=np.asarray(vectorized_list,dtype=np.float32)
    return vectorized_list_new

In [134]:
txt="i love it"
preprocess_data=preprocessing (txt)
pred_new=vectorizer(preprocess_data,tokens)

In [135]:
svm_pred=lr_model.predict(pred_new)

In [136]:
svm_pred

array([0], dtype=int64)

In [127]:
# 0= Positive 1= Negative