# Sentiment Analysis Using LSTM

In [34]:
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix

In [35]:
df=pd.read_csv('Uber_Dataset1.csv')

In [36]:
df.head()

Unnamed: 0.1,Unnamed: 0,tweet,Subjectivity,Polarity,Analysis,label
0,0,Some very good simulations of bus v car can b...,0.78,0.91,Positive,1
1,1,the all time greatest price and rates ever...,0.875,0.9,Positive,1
2,2,Your services at Bengaluru Airport is so scre...,0.6,0.7,Positive,1
3,3,Lots of good analysis on Uber IPO in the shar...,0.625,0.675,Positive,1
4,4,Seeing this I didnt book any ride from you w...,0.7,0.6,Positive,1


In [37]:
###Drop Nan Values
df=df.dropna()


In [38]:
## Get the Independent Features

X=df.drop('label',axis=1)

In [39]:
## Get the Dependent features
y=df['label']

In [40]:
X.shape

(163, 5)

In [41]:
y.shape

(163,)

In [42]:
import tensorflow as tf

In [43]:
tf.__version__

'2.3.0'

In [44]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [45]:
### Vocabulary size
voc_size=5000

### Onehot Representation

In [46]:
messages=X.copy()

In [47]:
messages['tweet'][1]

'    the all time greatest price and rates ever been paid by any of company great working with uber â‚¹  for trip Shame on you guys  httpstcoFwsemKUEWZ'

In [48]:
messages.reset_index(inplace=True)

In [49]:
import nltk
import re
from nltk.corpus import stopwords

In [50]:
nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [51]:
### Dataset Preprocessing
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
corpus = []
for i in range(0, len(messages)):
    print(i)
    review = re.sub('[^a-zA-Z]', ' ', messages['tweet'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162


In [52]:
corpus

['good simul bu v car done uber pool even compet bu th price',
 'time greatest price rate ever paid compani great work uber trip shame guy httpstcofwsemkuewz',
 'servic bengaluru airport screw cab driver keep cancel request increas price support staff good noth happen month',
 'lot good analysi uber ipo share price determin spend bn ceo manag wall st narr asid comment market postion vs tesla interest httpstcolusnz',
 'see didnt book ride went serv much better fair price look like costli',
 'might good time get rid peak time price hike lose busi rta colour top meter rate frequent lose peak time like peopl stop use uber dubai',
 'gorgeou look phone best phone k quad camera uber fast builtin min gb ram batteri w front amp back honest price sale today noon httpstcoqhwjniyu',
 'explain deliveri charg rs everytim start rs deliveri charg rs complet food price httpstcobsogv',
 'need help uber driver charg price',
 'ye call olaub stay suburb time cab driver cancel trip wont abl afford surg pric

In [53]:
onehot_repr=[one_hot(words,voc_size)for words in corpus] 
onehot_repr

[[3220, 4732, 61, 4081, 1014, 1733, 1451, 1494, 4062, 226, 61, 908, 2970],
 [2864,
  1743,
  2970,
  3795,
  4827,
  2348,
  2688,
  3641,
  3963,
  1451,
  3123,
  4696,
  4083,
  525],
 [2772,
  2799,
  1987,
  254,
  77,
  3381,
  4987,
  12,
  2703,
  4053,
  2970,
  463,
  1788,
  3220,
  3409,
  4442,
  948],
 [1029,
  3220,
  2233,
  1451,
  513,
  4547,
  2970,
  1163,
  4699,
  1845,
  2929,
  4247,
  2772,
  2146,
  1584,
  3456,
  2948,
  4397,
  4612,
  1983,
  2864,
  4285,
  2327],
 [3958, 1761, 4481, 4576, 1991, 1827, 3125, 2678, 2133, 2970, 1793, 817, 1260],
 [3085,
  3220,
  2864,
  2902,
  4014,
  1155,
  2864,
  2970,
  461,
  2469,
  2355,
  3577,
  3291,
  105,
  2275,
  3795,
  705,
  2469,
  1155,
  2864,
  817,
  3217,
  42,
  4817,
  1451,
  4904],
 [3774,
  1793,
  1052,
  4798,
  1052,
  4723,
  3360,
  492,
  1451,
  2812,
  4179,
  541,
  2864,
  549,
  4796,
  279,
  4277,
  3889,
  1791,
  2605,
  2970,
  2387,
  2699,
  3741,
  1357],
 [1320,
  3728,
  2

### Embedding Representation

In [54]:
sent_length=40
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)

[[   0    0    0 ...   61  908 2970]
 [   0    0    0 ... 4696 4083  525]
 [   0    0    0 ... 3409 4442  948]
 ...
 [   0    0    0 ... 4382 3123  516]
 [   0    0    0 ... 3836 4576 1594]
 [   0    0    0 ... 2970 4481 2864]]


In [55]:
embedded_docs[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0, 3220, 4732,   61, 4081, 1014, 1733,
       1451, 1494, 4062,  226,   61,  908, 2970])

In [56]:
## Creating model
embedding_vector_features=40
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(LSTM(100))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 40, 40)            200000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               56400     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None


In [57]:
len(embedded_docs),y.shape

(163, (163,))

In [58]:
import numpy as np
X_final=np.array(embedded_docs)
y_final=np.array(y)

In [59]:
X_final.shape,y_final.shape

((163, 40), (163,))

In [67]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.4, random_state=51)

### Model Training

In [68]:
### Finally Training
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10,batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1d104754f88>

### Performance Metrics And Accuracy

In [69]:
y_pred=model.predict_classes(X_test)

In [70]:
from sklearn.metrics import confusion_matrix

In [71]:
confusion_matrix(y_test,y_pred)

array([[16,  9],
       [ 1, 40]], dtype=int64)

In [72]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.8484848484848485

In [73]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.94      0.64      0.76        25
           1       0.82      0.98      0.89        41

    accuracy                           0.85        66
   macro avg       0.88      0.81      0.83        66
weighted avg       0.86      0.85      0.84        66

