In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt


In [2]:
data =pd.read_csv('../artifacts/sentiment_analysis.csv')

In [3]:
data.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


### Data Preprocessing

In [4]:
data.shape #looking data shape

(7920, 3)

In [5]:
data.duplicated().sum()

0

In [6]:
data.isnull().sum()

id       0
label    0
tweet    0
dtype: int64

## Text clearing removing unnecesary thing

### I mean like links, numbers,stemming these are not useful to recognize wheather comments are negative or positive

In [7]:
#convert all capital letters to simple
#remove links
#remove punctuations(!,*,#...)
#remove numbers
#remove stopwords(is,and are..)
#stemming(started,starts->start)
import re
import string


In [8]:
data["tweet"].head(5)

0    #fingerprint #Pregnancy Test https://goo.gl/h1...
1    Finally a transparant silicon case ^^ Thanks t...
2    We love this! Would you go? #talk #makememorie...
3    I'm wired I know I'm George I was made that wa...
4    What amazing service! Apple won't even talk to...
Name: tweet, dtype: object

In [9]:
data["tweet"]=data["tweet"].apply(lambda x:" ".join(x.lower() for x in x.split()))

In [10]:
data["tweet"].head(5)

0    #fingerprint #pregnancy test https://goo.gl/h1...
1    finally a transparant silicon case ^^ thanks t...
2    we love this! would you go? #talk #makememorie...
3    i'm wired i know i'm george i was made that wa...
4    what amazing service! apple won't even talk to...
Name: tweet, dtype: object

### remove links

In [11]:
data["tweet"]= data['tweet'].apply(lambda x: " ".join(re.sub(r'^https?:\/\/.*[\r\n]*','',x,flags=re.MULTILINE) for x in x.split()))

In [12]:
data["tweet"].head(5)

0    #fingerprint #pregnancy test  #android #apps #...
1    finally a transparant silicon case ^^ thanks t...
2    we love this! would you go? #talk #makememorie...
3    i'm wired i know i'm george i was made that wa...
4    what amazing service! apple won't even talk to...
Name: tweet, dtype: object

In [13]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

### Remove punctuations

In [14]:
def remove_punctuations(text):
    for punctuation in string.punctuation:
        text=text.replace(punctuation ,'')
    return text    
data["tweet"]=data["tweet"].apply(remove_punctuations)

In [19]:
data["tweet"].tail(5)

7915    live out loud lol liveoutloud selfie smile son...
7916    we would like to wish you an amazing day make ...
7917    helping my lovely  year old neighbor with her ...
7918    finally got my smart pocket wifi stay connecte...
7919    apple barcelona apple store bcn barcelona trav...
Name: tweet, dtype: object

### Remove numbers

In [20]:
data["tweet"] = data['tweet'].str.replace(r'\d+', '', regex=True)

In [21]:
data["tweet"].tail(10)

7910    perfect match instagood applewatch red instagr...
7911    i am completely in love with the new iphone em...
7912    tune in turn on drop out  gtd in one app  mobi...
7913    ok so my galaxy crashed after one day now i ha...
7914    gain followers rt this must follow me i follow...
7915    live out loud lol liveoutloud selfie smile son...
7916    we would like to wish you an amazing day make ...
7917    helping my lovely  year old neighbor with her ...
7918    finally got my smart pocket wifi stay connecte...
7919    apple barcelona apple store bcn barcelona trav...
Name: tweet, dtype: object

### Remove stopwords

In [22]:
!pip install nltk



In [24]:
import nltk

In [25]:
nltk.download('stopwords',download_dir='../static/model')

[nltk_data] Downloading package stopwords to ../static/model...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [26]:
with open('../static/model/corpora/stopwords/english','r')as file:
    sw=file.read().splitlines()

In [27]:
sw

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [28]:
data["tweet"]=data["tweet"].apply(lambda x:" ".join(x for x in x.split() if x not in sw))

In [29]:
data["tweet"].tail(10)

7910    perfect match instagood applewatch red instagr...
7911    completely love new iphone emojis iphone apple...
7912    tune turn drop gtd one app mobile mind meditat...
7913    ok galaxy crashed one day wait til monday skyr...
7914    gain followers rt must follow follow back foll...
7915    live loud lol liveoutloud selfie smile sony mu...
7916    would like wish amazing day make every minute ...
7917    helping lovely year old neighbor ipad morning ...
7918    finally got smart pocket wifi stay connected a...
7919    apple barcelona apple store bcn barcelona trav...
Name: tweet, dtype: object

### Stemming

In [30]:
from nltk.stem import PorterStemmer
ps=PorterStemmer()

In [31]:
data["tweet"]=data["tweet"].apply(lambda x:" ".join(ps.stem(x) for x in x.split()))

In [32]:
data["tweet"].head(5)

0    fingerprint pregnanc test android app beauti c...
1    final transpar silicon case thank uncl yay son...
2    love would go talk makememori unplug relax iph...
3    im wire know im georg made way iphon cute dave...
4    amaz servic appl wont even talk question unles...
Name: tweet, dtype: object

In [33]:
data["tweet"]

0       fingerprint pregnanc test android app beauti c...
1       final transpar silicon case thank uncl yay son...
2       love would go talk makememori unplug relax iph...
3       im wire know im georg made way iphon cute dave...
4       amaz servic appl wont even talk question unles...
                              ...                        
7915    live loud lol liveoutloud selfi smile soni mus...
7916    would like wish amaz day make everi minut coun...
7917    help love year old neighbor ipad morn made rea...
7918    final got smart pocket wifi stay connect anyti...
7919    appl barcelona appl store bcn barcelona travel...
Name: tweet, Length: 7920, dtype: object

In [34]:
data

Unnamed: 0,id,label,tweet
0,1,0,fingerprint pregnanc test android app beauti c...
1,2,0,final transpar silicon case thank uncl yay son...
2,3,0,love would go talk makememori unplug relax iph...
3,4,0,im wire know im georg made way iphon cute dave...
4,5,1,amaz servic appl wont even talk question unles...
...,...,...,...
7915,7916,0,live loud lol liveoutloud selfi smile soni mus...
7916,7917,0,would like wish amaz day make everi minut coun...
7917,7918,0,help love year old neighbor ipad morn made rea...
7918,7919,0,final got smart pocket wifi stay connect anyti...


### Convert strings to Numbers

#### Build a vocabulary

In [36]:
from collections import Counter
vocab=Counter()

In [37]:
vocab

Counter()

In [38]:
for sentence in data["tweet"]:
    vocab.update(sentence.split())

In [39]:
len(vocab)

15949

In [40]:
tokens=[key for key in vocab if vocab[key]>10]

In [41]:
len(tokens) #increase amount of features because normally number of features should be less than amount of data set rowsavoid overfit

1145

In [42]:
def save_vocabulary(lines,filename):
    data='\n'.join(lines)
    file=open(filename,'w',encoding ="utf-8")
    file.write(data)
    file.close()
    
save_vocabulary(tokens, '../static/model/vocabulary.txt')

### Split dataset

In [44]:
x=data["tweet"]
y=data["label"]

In [45]:
x

0       fingerprint pregnanc test android app beauti c...
1       final transpar silicon case thank uncl yay son...
2       love would go talk makememori unplug relax iph...
3       im wire know im georg made way iphon cute dave...
4       amaz servic appl wont even talk question unles...
                              ...                        
7915    live loud lol liveoutloud selfi smile soni mus...
7916    would like wish amaz day make everi minut coun...
7917    help love year old neighbor ipad morn made rea...
7918    final got smart pocket wifi stay connect anyti...
7919    appl barcelona appl store bcn barcelona travel...
Name: tweet, Length: 7920, dtype: object

In [46]:
y

0       0
1       0
2       0
3       0
4       1
       ..
7915    0
7916    0
7917    0
7918    0
7919    0
Name: label, Length: 7920, dtype: int64

In [47]:
!pip install scikit-learn



In [48]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

In [49]:
x_train.shape

(6336,)

In [50]:
y_train.shape

(6336,)

In [51]:
y_test.shape

(1584,)

In [52]:
x_test.shape

(1584,)

In [53]:
x_train

3821    happi monday motivationmonday motiv monday gig...
2057    follow capetownsup instagram sup surf fun cape...
6482    tale majey … андроид android game news io appl...
4557          realwizkhalifa bad ive noth problem product
6022    new blog entri producttest review soni actipat...
                              ...                        
723     joe biden donald trump would lead… news photog...
1825    readi instap instamood iphon iphonesia instago...
866     would like wish amaz day make everi minut coun...
764     id like send big suck appl caus updat took min...
7395    ipad ipad mini fuck everyon new crap charger u...
Name: tweet, Length: 6336, dtype: object

### Vectorization

In [54]:
def vectorizer(ds,vocabulary):
    vectorized_list=[]

    for sentence in ds:
        sentence_list=np.zeros(len(vocabulary))
        
        for i in range(len(vocabulary)):
            if vocabulary[i] in sentence.split():
                sentence_list[i]=1


        vectorized_list.append( sentence_list)
    vectorized_list_new =np.asarray(vectorized_list,dtype=np.float32)
    return  vectorized_list_new
       

In [55]:
vectorized_x_train=vectorizer(x_train,tokens)

In [56]:
vectorized_x_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [57]:
vectorized_x_test=vectorizer(x_test,tokens)

In [58]:
vectorized_x_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [59]:
y_train

3821    0
2057    0
6482    0
4557    0
6022    0
       ..
723     0
1825    0
866     0
764     1
7395    1
Name: label, Length: 6336, dtype: int64

In [60]:
y_train.value_counts()

label
0    4732
1    1604
Name: count, dtype: int64

In [61]:
!pip install imbalanced-learn



### Handle unbalance dataset

In [62]:
from imblearn.over_sampling import SMOTE
smote=SMOTE()
vectorized_x_train_smote,y_train_smote=smote.fit_resample(vectorized_x_train,y_train)
print(vectorized_x_train_smote.shape,y_train_smote.shape)

(9464, 1145) (9464,)


In [63]:
y_train_smote.value_counts()

label
0    4732
1    4732
Name: count, dtype: int64

### Model Trainning & Evaluation

In [64]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [65]:
from sklearn.metrics import accuracy_score, f1_score, precision_score,recall_score
def training_scores(y_act,y_pred):
    acc=round(accuracy_score(y_act,y_pred),3)
    pr=round(precision_score(y_act,y_pred),3)
    rec=round(recall_score(y_act,y_pred),3)
    f1=round(f1_score(y_act,y_pred),3)
    print(f'Training scores:\n\tAccuracy={acc}\n\tPrecision={pr}\n\tRecall={rec}\n\tF1-score={f1}')
    
def validation_scores(y_act,y_pred):
    acc=round(accuracy_score(y_act,y_pred),3)
    pr=round(precision_score(y_act,y_pred),3)
    rec=round(recall_score(y_act,y_pred),3)
    f1=round(f1_score(y_act,y_pred),3)
    print(f'Testing scores:\n\tAccuracy={acc}\n\tPrecision={pr}\n\tRecall={rec}\n\tF1-score={f1}')

### Logistic Regression

In [67]:
lr=LogisticRegression()

lr.fit(vectorized_x_train_smote,y_train_smote)
      
y_train_pred=lr.predict(vectorized_x_train_smote
                       ) 
y_test_pred=lr.predict(vectorized_x_test)

training_scores(y_train_smote,y_train_pred)
validation_scores(y_test,y_test_pred)


Training scores:
	Accuracy=0.938
	Precision=0.916
	Recall=0.964
	F1-score=0.939
Testing scores:
	Accuracy=0.876
	Precision=0.73
	Recall=0.851
	F1-score=0.786
