In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import string

In [2]:
data = pd.read_csv('../artifacts/sentiment_analysis.csv')

In [3]:
data.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [4]:
data.shape

(7920, 3)

In [5]:
data.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
7915    False
7916    False
7917    False
7918    False
7919    False
Length: 7920, dtype: bool

In [6]:
data.duplicated().sum()

np.int64(0)

In [7]:
data.isnull().sum()

id       0
label    0
tweet    0
dtype: int64

#TEXT PREPROCESSING

In [8]:
data["tweet"].head()

0    #fingerprint #Pregnancy Test https://goo.gl/h1...
1    Finally a transparant silicon case ^^ Thanks t...
2    We love this! Would you go? #talk #makememorie...
3    I'm wired I know I'm George I was made that wa...
4    What amazing service! Apple won't even talk to...
Name: tweet, dtype: object

In [9]:
data["tweet"] = data["tweet"].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [10]:
data["tweet"].head()

0    #fingerprint #pregnancy test https://goo.gl/h1...
1    finally a transparant silicon case ^^ thanks t...
2    we love this! would you go? #talk #makememorie...
3    i'm wired i know i'm george i was made that wa...
4    what amazing service! apple won't even talk to...
Name: tweet, dtype: object

REMOVE LINK

In [11]:
data["tweet"] = data["tweet"].apply(lambda x: " ".join(re.sub(r'^https?:\/\/.*[\r\n]*', '',x,flags=re.MULTILINE) for x in x.split()))

In [12]:
data["tweet"].head()

0    #fingerprint #pregnancy test  #android #apps #...
1    finally a transparant silicon case ^^ thanks t...
2    we love this! would you go? #talk #makememorie...
3    i'm wired i know i'm george i was made that wa...
4    what amazing service! apple won't even talk to...
Name: tweet, dtype: object

REMOVE PUNCTUATIONS

In [13]:
def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, ' ')
    return text

data["tweet"] = data["tweet"].apply(remove_punctuations)


REMOVE NUMBERS

In [14]:
data["tweet"].head()

0     fingerprint  pregnancy test   android  apps  ...
1    finally a transparant silicon case    thanks t...
2    we love this  would you go   talk  makememorie...
3    i m wired i know i m george i was made that wa...
4    what amazing service  apple won t even talk to...
Name: tweet, dtype: object

In [15]:
 data["tweet"].tail(10)

7910    perfect match  instagood  applewatch  red  ins...
7911    i am completely in love with the new iphone em...
7912    tune in  turn on  drop out     gtd in one app ...
7913    ok so my galaxy crashed after one day now i ha...
7914    gain followers rt this must follow me i follow...
7915    live out loud  lol  liveoutloud  selfie  smile...
7916    we would like to wish you an amazing day  make...
7917    helping my lovely 90 year old neighbor with he...
7918    finally got my  smart  pocket  wifi stay conne...
7919    apple barcelona     apple  store  bcn  barcelo...
Name: tweet, dtype: object

In [16]:
data["tweet"] = data["tweet"].str.replace('/d+', '', regex=True)

In [17]:
data["tweet"].tail(10)

7910    perfect match  instagood  applewatch  red  ins...
7911    i am completely in love with the new iphone em...
7912    tune in  turn on  drop out     gtd in one app ...
7913    ok so my galaxy crashed after one day now i ha...
7914    gain followers rt this must follow me i follow...
7915    live out loud  lol  liveoutloud  selfie  smile...
7916    we would like to wish you an amazing day  make...
7917    helping my lovely 90 year old neighbor with he...
7918    finally got my  smart  pocket  wifi stay conne...
7919    apple barcelona     apple  store  bcn  barcelo...
Name: tweet, dtype: object

In [18]:
!pip install nltk




[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [19]:
import nltk

In [20]:
nltk.download('stopwords', download_dir='../static/model')

[nltk_data] Downloading package stopwords to ../static/model...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [21]:
with open('../static/model/corpora/stopwords/english','r') as file:
    sw = file.read().splitlines()

In [22]:
sw

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [23]:
data["tweet"] = data["tweet"].apply(lambda x: " ".join(x for x in x.split() if x not in sw))

In [24]:
data["tweet"].tail(10)

7910    perfect match instagood applewatch red instagr...
7911    completely love new iphone emojis iphone apple...
7912    tune turn drop gtd one app mobile mind meditat...
7913    ok galaxy crashed one day wait til monday skyr...
7914    gain followers rt must follow follow back foll...
7915    live loud lol liveoutloud selfie smile sony mu...
7916    would like wish amazing day make every minute ...
7917    helping lovely 90 year old neighbor ipad morni...
7918    finally got smart pocket wifi stay connected a...
7919    apple barcelona apple store bcn barcelona trav...
Name: tweet, dtype: object

In [25]:
data["tweet"].head()

0    fingerprint pregnancy test android apps beauti...
1    finally transparant silicon case thanks uncle ...
2    love would go talk makememories unplug relax i...
3    wired know george made way iphone cute daventr...
4    amazing service apple even talk question unles...
Name: tweet, dtype: object

STEMMMING

In [26]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [27]:
data["tweet"] = data["tweet"].apply(lambda x: " ".join(ps.stem(x) for x in x.split()))

In [28]:
data["tweet"].head()

0    fingerprint pregnanc test android app beauti c...
1    final transpar silicon case thank uncl yay son...
2    love would go talk makememori unplug relax iph...
3    wire know georg made way iphon cute daventri home
4    amaz servic appl even talk question unless pay...
Name: tweet, dtype: object

In [29]:
data

Unnamed: 0,id,label,tweet
0,1,0,fingerprint pregnanc test android app beauti c...
1,2,0,final transpar silicon case thank uncl yay son...
2,3,0,love would go talk makememori unplug relax iph...
3,4,0,wire know georg made way iphon cute daventri home
4,5,1,amaz servic appl even talk question unless pay...
...,...,...,...
7915,7916,0,live loud lol liveoutloud selfi smile soni mus...
7916,7917,0,would like wish amaz day make everi minut coun...
7917,7918,0,help love 90 year old neighbor ipad morn made ...
7918,7919,0,final got smart pocket wifi stay connect anyti...


Building Vocabulary

In [30]:
from collections import Counter
vocab = Counter()

In [31]:
for sentence in data['tweet']:
    print(sentence)

fingerprint pregnanc test android app beauti cute health iger iphoneonli iphonesia iphon
final transpar silicon case thank uncl yay soni xperia sonyexperias…
love would go talk makememori unplug relax iphon smartphon wifi connect
wire know georg made way iphon cute daventri home
amaz servic appl even talk question unless pay 19 95 stupid support
iphon softwar updat fuck phone big time stupid iphon
happi us instap instadaili us soni xperia xperiaz
new type c charger cabl uk … bay amazon etsi new year rob cross tobi young evemun mcmafia taylor spectr 2018 newyear start 2018 recip technolog samsunggalaxys9 iphonex pic twitter com pjiwq59wtc
bout go shop listen music iphon justm music likeforlik followforfollow…
photo fun selfi pool water soni camera picoftheday sun instagood boy cute outdoor
hey appl make new ipod dont make new color 2inch thinner make crash everi five fuckin minit
ha heavi machineri need appl realli drop ball design drinkyourhaterad
contempl give iphon bandwagon simpli c

In [32]:
for sentence in data ['tweet']:
    vocab.update(sentence.split())

In [33]:
vocab

Counter({'iphon': 3754,
         'appl': 2913,
         'samsung': 1399,
         'new': 1146,
         'twitter': 1124,
         'com': 1077,
         'phone': 1032,
         'follow': 890,
         'soni': 821,
         '…': 728,
         'rt': 534,
         'ipad': 527,
         'pic': 519,
         'love': 494,
         'like': 452,
         'day': 424,
         'app': 421,
         'android': 419,
         'life': 416,
         'photo': 394,
         'get': 382,
         'instagram': 359,
         'case': 353,
         'beauti': 324,
         'cute': 324,
         'work': 314,
         'gain': 312,
         'today': 311,
         'back': 298,
         'happi': 297,
         'galaxi': 297,
         'fuck': 294,
         'photographi': 294,
         'game': 284,
         'got': 281,
         'fun': 277,
         'thank': 267,
         'news': 267,
         'music': 264,
         'io': 260,
         'time': 252,
         'make': 247,
         'updat': 241,
         'smile': 232,
    

In [34]:
tokens = [key for key in vocab if vocab[key] > 10]

In [35]:
tokens

['test',
 'android',
 'app',
 'beauti',
 'cute',
 'health',
 'iger',
 'iphoneonli',
 'iphonesia',
 'iphon',
 'final',
 'case',
 'thank',
 'yay',
 'soni',
 'xperia',
 'love',
 'would',
 'go',
 'talk',
 'relax',
 'smartphon',
 'wifi',
 'connect',
 'know',
 'made',
 'way',
 'home',
 'amaz',
 'servic',
 'appl',
 'even',
 'question',
 'pay',
 '19',
 'stupid',
 'support',
 'softwar',
 'updat',
 'fuck',
 'phone',
 'big',
 'time',
 'happi',
 'us',
 'instap',
 'instadaili',
 'new',
 'type',
 'c',
 'charger',
 'cabl',
 'uk',
 '…',
 'amazon',
 'year',
 '2018',
 'newyear',
 'start',
 'technolog',
 'iphonex',
 'pic',
 'twitter',
 'com',
 'shop',
 'listen',
 'music',
 'likeforlik',
 'photo',
 'fun',
 'selfi',
 'water',
 'camera',
 'picoftheday',
 'sun',
 'instagood',
 'boy',
 'outdoor',
 'hey',
 'make',
 'ipod',
 'dont',
 'color',
 'crash',
 'everi',
 'need',
 'realli',
 'drop',
 'ball',
 'design',
 'give',
 'anoth',
 'crazi',
 'purchas',
 'lol',
 'work',
 'hard',
 'play',
 'ipad',
 'batteri',
 'cha

In [36]:
len(tokens)

1230

In [37]:
def save_vocabulary(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w' , encoding ="utf-8")
    file.write(data)
    file.close()

save_vocabulary(tokens, '../static/model/vocabul')

### Divide dataset(train,test)

In [38]:
data

Unnamed: 0,id,label,tweet
0,1,0,fingerprint pregnanc test android app beauti c...
1,2,0,final transpar silicon case thank uncl yay son...
2,3,0,love would go talk makememori unplug relax iph...
3,4,0,wire know georg made way iphon cute daventri home
4,5,1,amaz servic appl even talk question unless pay...
...,...,...,...
7915,7916,0,live loud lol liveoutloud selfi smile soni mus...
7916,7917,0,would like wish amaz day make everi minut coun...
7917,7918,0,help love 90 year old neighbor ipad morn made ...
7918,7919,0,final got smart pocket wifi stay connect anyti...


In [39]:
X = data['tweet']
Y = data['label']

In [40]:
X

0       fingerprint pregnanc test android app beauti c...
1       final transpar silicon case thank uncl yay son...
2       love would go talk makememori unplug relax iph...
3       wire know georg made way iphon cute daventri home
4       amaz servic appl even talk question unless pay...
                              ...                        
7915    live loud lol liveoutloud selfi smile soni mus...
7916    would like wish amaz day make everi minut coun...
7917    help love 90 year old neighbor ipad morn made ...
7918    final got smart pocket wifi stay connect anyti...
7919    appl barcelona appl store bcn barcelona travel...
Name: tweet, Length: 7920, dtype: object

In [41]:
Y

0       0
1       0
2       0
3       0
4       1
       ..
7915    0
7916    0
7917    0
7918    0
7919    0
Name: label, Length: 7920, dtype: int64

In [42]:
!pip install scikit-learn




[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [43]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,Y, test_size = 0.2)

In [44]:
X_train.shape

(6336,)

In [45]:
X_test.shape

(1584,)

###Vectorization


In [46]:
def vectorizer(ds, vocabulary):
    vectorized_lst = []

    for sentence in ds:
        sentence_lst = np.zeros(len(vocabulary))

        for i in range(len(vocabulary)):
            if vocabulary[i] in sentence.split():
                sentence_lst[i] = 1
        vectorized_lst.append(sentence_lst)

    vectorized_lst_new = np.asarray(vectorized_lst, dtype=np.float32)

    return vectorized_lst_new

In [47]:
vectorized_X_train = vectorizer(X_train, tokens)

In [48]:
vectorized_X_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(6336, 1230), dtype=float32)

In [49]:
##for i in vectorized_X_train[0]:
   ## print(i)

In [50]:
vectorized_X_test = vectorizer(X_test, tokens)

In [51]:
vectorized_X_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(1584, 1230), dtype=float32)

In [52]:
##for i in vectorized_X_test[0]:
    ##print(i)

### handle inbalanced data

In [53]:
!pip install imbalanced-learn




[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [54]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()
vectorized_x_train_smote, y_train_smote = smote.fit_resample(vectorized_X_train, Y_train)
print(vectorized_x_train_smote.shape, y_train_smote.shape)

(9414, 1230) (9414,)


In [55]:
y_train_smote.value_counts()

label
0    4707
1    4707
Name: count, dtype: int64

##MODEL TRANING AND EVALUATION

In [56]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC



In [57]:
from sklearn.metrics import accuracy_score ,f1_score, precision_score, recall_score

def training_scores(y_act,y_pred):
    acc = round(accuracy_score(y_act,y_pred), 3)
    pr = round(precision_score(y_act,y_pred), 3)
    rec = round(recall_score(y_act,y_pred), 3)
    f1 = round(f1_score(y_act,y_pred), 3)
    print(f'Traning Scores:\n\tAccuracy = {acc}\n\tPrecision = {pr}\n\tRecall = {rec}\n\tF1-Score = {f1}')

def validation_scores(y_act,y_pred):
    acc = round(accuracy_score(y_act,y_pred), 3)
    pr = round(precision_score(y_act,y_pred), 3)
    rec = round(recall_score(y_act,y_pred), 3)
    f1 = round(f1_score(y_act,y_pred), 3)
    print(f'Testing Scores:\n\tAccuracy = {acc}\n\tPrecision = {pr}\n\tRecall = {rec}\n\tF1-Score = {f1}')

### Logistic Regression

In [58]:
lr = LogisticRegression()
lr.fit(vectorized_x_train_smote,y_train_smote)

y_train_pred = lr.predict(vectorized_x_train_smote)
y_test_pred = lr.predict(vectorized_X_test)
training_scores(y_train_smote,y_train_pred)
validation_scores(Y_test,y_test_pred)



Traning Scores:
	Accuracy = 0.944
	Precision = 0.921
	Recall = 0.972
	F1-Score = 0.946
Testing Scores:
	Accuracy = 0.866
	Precision = 0.696
	Recall = 0.826
	F1-Score = 0.756


### Naive Bayes

In [59]:
mnb = MultinomialNB()
mnb.fit(vectorized_x_train_smote,y_train_smote)

y_train_pred = mnb.predict(vectorized_x_train_smote)
y_test_pred = mnb.predict(vectorized_X_test)
training_scores(y_train_smote,y_train_pred)
validation_scores(Y_test,y_test_pred)

Traning Scores:
	Accuracy = 0.909
	Precision = 0.873
	Recall = 0.956
	F1-Score = 0.913
Testing Scores:
	Accuracy = 0.86
	Precision = 0.664
	Recall = 0.899
	F1-Score = 0.764


### Decision Tree

In [60]:
dtc = DecisionTreeClassifier()
dtc.fit(vectorized_x_train_smote,y_train_smote)

y_train_pred = dtc.predict(vectorized_x_train_smote)
y_test_pred = dtc.predict(vectorized_X_test)
training_scores(y_train_smote,y_train_pred)
validation_scores(Y_test,y_test_pred)

Traning Scores:
	Accuracy = 0.999
	Precision = 1.0
	Recall = 0.999
	F1-Score = 0.999
Testing Scores:
	Accuracy = 0.823
	Precision = 0.649
	Recall = 0.635
	F1-Score = 0.642


### RandomForestClassifier

In [61]:
rf = RandomForestClassifier()
rf.fit(vectorized_x_train_smote,y_train_smote)

y_train_pred = rf.predict(vectorized_x_train_smote)
y_test_pred = rf.predict(vectorized_X_test)
training_scores(y_train_smote,y_train_pred)
validation_scores(Y_test,y_test_pred)

Traning Scores:
	Accuracy = 0.999
	Precision = 1.0
	Recall = 0.999
	F1-Score = 0.999
Testing Scores:
	Accuracy = 0.86
	Precision = 0.735
	Recall = 0.693
	F1-Score = 0.713


### Support Vector Machine

In [62]:
sv = SVC()
sv.fit(vectorized_x_train_smote,y_train_smote)

y_train_pred = sv.predict(vectorized_x_train_smote)
y_test_pred = sv.predict(vectorized_X_test)
training_scores(y_train_smote,y_train_pred)
validation_scores(Y_test,y_test_pred)

Traning Scores:
	Accuracy = 0.978
	Precision = 0.961
	Recall = 0.997
	F1-Score = 0.979
Testing Scores:
	Accuracy = 0.873
	Precision = 0.721
	Recall = 0.806
	F1-Score = 0.761


In [64]:
import pickle

with open('../static/model/model.pickle','wb') as file:
    pickle.dump(lr, file)
