### Data Collection

In [1]:
#extracting data from zip file
import zipfile
PATH = '/content/drive/My Drive/Datasets/database.sqlite.zip'
with zipfile.ZipFile(PATH,'r') as zip_ref:
    zip_ref.extractall('')

In [2]:
import sqlite3
import pandas as pd
cnx = sqlite3.connect('/content/database.sqlite')
#importing the data
df = pd.read_sql_query("SELECT * FROM sentiment",cnx)

In [3]:
df.head()

Unnamed: 0,id,candidate,candidate_confidence,relevant_yn,relevant_yn_confidence,sentiment,sentiment_confidence,subject_matter,subject_matter_confidence,candidate_gold,name,relevant_yn_gold,retweet_count,sentiment_gold,subject_matter_gold,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,1,No candidate mentioned,1.0,yes,1.0,Neutral,0.6578,None of the above,1.0,,I_Am_Kenzi,,5,,,RT @NancyLeeGrahn: How did everyone feel about...,,2015-08-07 09:54:46 -0700,629697200650592256,,Quito
1,2,Scott Walker,1.0,yes,1.0,Positive,0.6333,None of the above,1.0,,PeacefulQuest,,26,,,RT @ScottWalker: Didn't catch the full #GOPdeb...,,2015-08-07 09:54:46 -0700,629697199560069120,,
2,3,No candidate mentioned,1.0,yes,1.0,Neutral,0.6629,None of the above,0.6629,,PussssyCroook,,27,,,RT @TJMShow: No mention of Tamir Rice and the ...,,2015-08-07 09:54:46 -0700,629697199312482304,,
3,4,No candidate mentioned,1.0,yes,1.0,Positive,1.0,None of the above,0.7039,,MattFromTexas31,,138,,,RT @RobGeorge: That Carly Fiorina is trending ...,,2015-08-07 09:54:45 -0700,629697197118861312,Texas,Central Time (US & Canada)
4,5,Donald Trump,1.0,yes,1.0,Positive,0.7045,None of the above,1.0,,sharonDay5,,156,,,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,,2015-08-07 09:54:45 -0700,629697196967903232,,Arizona


### Importing Essential Libraries

In [4]:
# This Python 3 environtment comes with many helpful
import numpy as np
# CountVectorizer is used to convert the text to vector form
from sklearn.feature_extraction.text import CountVectorizer
# Tokenizer converts sentences to list of unique word
from tensorflow.keras.preprocessing.text import Tokenizer
# pad_sequences for padding the sentences to match the smaller sizes of max sentence size
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Sequential model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Embedding,LSTM,SpatialDropout1D
from sklearn.model_selection import train_test_split
import re

Using TensorFlow backend.


### Performing EDA and Data Preprocessing

In [5]:
#only selecting 'text' and 'sentiment' columns
data = df[['text','sentiment']]

In [6]:
#number of unique categories 
set(data['sentiment'])

{'Negative', 'Neutral', 'Positive'}

In [7]:
#values in each categories
data['sentiment'].value_counts()

Negative    8493
Neutral     3142
Positive    2236
Name: sentiment, dtype: int64

In [8]:
# filtering out the neutral ones
data = data[data.sentiment != 'Neutral'] 

In [9]:
data['sentiment'].value_counts()

Negative    8493
Positive    2236
Name: sentiment, dtype: int64

#### Data Cleaning:
Following steps are performed in cleaning the data
1. converting to lower case
2. remove non alpha-numerical values
3. removing 'rt' which is common in all the tweets

In [10]:
data['text'] = data['text'].apply(lambda x:x.lower()) #converts to lower case
data['text'] = data['text'].apply(lambda x:re.sub('[^a-zA-Z0-9\s]','',x)) #only alpha numerical and white spaces are allowed

In [11]:
data['text'].head()

1    rt scottwalker didnt catch the full gopdebate ...
3    rt robgeorge that carly fiorina is trending  h...
4    rt danscavino gopdebate w realdonaldtrump deli...
5    rt gregabbotttx tedcruz on my first day i will...
6    rt warriorwoman91 i liked her and was happy wh...
Name: text, dtype: object

In [12]:
# replacing rt with '' 
for idx,row in data.iterrows():
  row[0] = row[0].replace('rt','')

In [13]:
data['text'].head()

1     scottwalker didnt catch the full gopdebate la...
3     robgeorge that carly fiorina is trending  hou...
4     danscavino gopdebate w realdonaldtrump delive...
5     gregabbotttx tedcruz on my first day i will r...
6     warriorwoman91 i liked her and was happy when...
Name: text, dtype: object

#### Tokenizer

In [14]:
max_features = 2000
#max 2000 words(most common words) and splits on space
tokenizer = Tokenizer(num_words = max_features,split = ' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
print(X)

[[358, 120, 1, 711, 2, 39, 58, 234, 36, 207, 6, 172, 1740, 12, 1301, 1386, 732], [16, 281, 249, 5, 807, 102, 169, 26, 133, 6, 1, 170, 12, 2, 231, 712, 17], [1242, 2, 307, 23, 1920, 1, 1606, 213, 12, 1, 691, 6, 183, 204, 366, 678], [125, 17, 53, 260, 404, 9, 82, 300, 434, 1302, 1741, 1133, 62, 1921, 191, 2, 51], [9, 1134, 169, 8, 21, 1303, 63, 9, 604, 185, 21, 186, 4, 34, 1, 555, 19, 808, 2, 44, 733], [12, 1, 167, 96, 540, 34, 1, 103, 605, 52, 59, 1742, 7, 17, 606, 2], [39, 147, 26, 809, 13, 2, 908], [23, 12, 46, 16], [37, 1922, 47, 4, 1087, 20, 69, 2, 172, 541, 6, 1, 58, 273, 331], [31, 1187, 53, 445, 22, 54, 144, 1, 2, 21], [659, 9, 519, 301, 165, 55, 12, 1304, 180, 13, 47, 2], [302, 367, 1, 6, 877, 14, 92, 457, 268, 5, 269, 2], [264, 458, 949, 14, 445, 22, 107, 12, 17, 2, 467], [607, 52, 184, 129, 173, 108, 29, 1, 2, 51, 275, 45, 6, 108], [757, 878, 5, 574, 12, 1305, 138, 993, 5, 1, 42, 150, 36, 1, 757, 1, 593, 426, 781, 14, 1, 49, 2], [1387, 116, 1306, 14, 1607, 14, 1743, 8, 114, 19

In [15]:
print('before padding : ',len(X[0]))
# padding
X = pad_sequences(X)
print('after padding : ',len(X[0]))
print(X)

before padding :  17
after padding :  28
[[   0    0    0 ... 1301 1386  732]
 [   0    0    0 ...  231  712   17]
 [   0    0    0 ...  204  366  678]
 ...
 [   0    0    0 ...   71   65    3]
 [   0    0    0 ... 1004 1398   73]
 [   0    0    0 ...  194    3  710]]


In [16]:
X.shape

(10729, 28)

In [30]:
Y  = pd.get_dummies(data['sentiment']).values
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.33,random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(7188, 28) (7188, 2)
(3541, 28) (3541, 2)


In [29]:
embed_dim = 256 # output
lstm_out = 512 # LSTM output

model = Sequential()
# Embedding layer is used if we have text data
model.add(Embedding(max_features,embed_dim,input_length=X.shape[1]))
#SpatialDropout1D is used for Embedding layers
model.add(SpatialDropout1D(0.4))
#LSTM layer recurrent_dropout is for recurrent networks
model.add(LSTM(lstm_out,dropout = 0.5,recurrent_dropout=0.5))
# output layer
model.add(Dense(2,activation = 'softmax'))

# compiling
model.compile(loss = 'binary_crossentropy',optimizer = 'adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 28, 256)           512000    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 28, 256)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 512)               1574912   
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 1026      
Total params: 2,087,938
Trainable params: 2,087,938
Non-trainable params: 0
_________________________________________________________________
None


In [32]:
batch_size = 32
model.fit(X_train,Y_train,epochs = 7,batch_size = batch_size,verbose = 1,validation_data=(X_test,Y_test))

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<tensorflow.python.keras.callbacks.History at 0x7f4549d98b00>

In [47]:
def sentiment_of_tweet(tweet):
    twt = [tweet] #1-D array
    #vectorizing the tweet by the pre-fitted tokenizer instance
    twt = tokenizer.texts_to_sequences(twt)
    #padding the tweet to have exactly the same shape as 'embedding_2' input
    twt = pad_sequences(twt,maxlen=28,dtype='int32',value=0)
    sentiment = model.predict(twt,batch_size = 1,verbose = 2)[0]
    
    print('tweet: {}'.format(tweet))
    
    if(np.argmax(sentiment) == 0):
        print('Negative')
    elif(np.argmax(sentiment) == 1):
        print('Positive')

In [48]:
sentiment_of_tweet('The food was very delicious')

1/1 - 0s
tweet: The food was very delicious
Positive


In [49]:
sentiment_of_tweet('The food was worst')

1/1 - 0s
tweet: The food was worst
Negative
