# Importing Libraries

In [14]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split

## Load Dataset 

In [2]:
# Read the labelled tweets from 'Sentiment.csv' (path is relative to the
# notebook's working directory) into a DataFrame.
df = pd.read_csv('Sentiment.csv')

## Check dataset

In [3]:
# Preview the first rows to see which columns the file provides.
df.head()

Unnamed: 0,id,candidate,candidate_confidence,relevant_yn,relevant_yn_confidence,sentiment,sentiment_confidence,subject_matter,subject_matter_confidence,candidate_gold,...,relevant_yn_gold,retweet_count,sentiment_gold,subject_matter_gold,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,1,No candidate mentioned,1.0,yes,1.0,Neutral,0.6578,None of the above,1.0,,...,,5,,,RT @NancyLeeGrahn: How did everyone feel about...,,2015-08-07 09:54:46 -0700,629697200650592256,,Quito
1,2,Scott Walker,1.0,yes,1.0,Positive,0.6333,None of the above,1.0,,...,,26,,,RT @ScottWalker: Didn't catch the full #GOPdeb...,,2015-08-07 09:54:46 -0700,629697199560069120,,
2,3,No candidate mentioned,1.0,yes,1.0,Neutral,0.6629,None of the above,0.6629,,...,,27,,,RT @TJMShow: No mention of Tamir Rice and the ...,,2015-08-07 09:54:46 -0700,629697199312482304,,
3,4,No candidate mentioned,1.0,yes,1.0,Positive,1.0,None of the above,0.7039,,...,,138,,,RT @RobGeorge: That Carly Fiorina is trending ...,,2015-08-07 09:54:45 -0700,629697197118861312,Texas,Central Time (US & Canada)
4,5,Donald Trump,1.0,yes,1.0,Positive,0.7045,None of the above,1.0,,...,,156,,,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,,2015-08-07 09:54:45 -0700,629697196967903232,,Arizona


In [6]:
# (rows, columns) of the loaded frame.
df.shape

(13871, 21)

In [5]:
# List every column name; `text` and `sentiment` are the two used further down.
df.columns

Index(['id', 'candidate', 'candidate_confidence', 'relevant_yn',
       'relevant_yn_confidence', 'sentiment', 'sentiment_confidence',
       'subject_matter', 'subject_matter_confidence', 'candidate_gold', 'name',
       'relevant_yn_gold', 'retweet_count', 'sentiment_gold',
       'subject_matter_gold', 'text', 'tweet_coord', 'tweet_created',
       'tweet_id', 'tweet_location', 'user_timezone'],
      dtype='object')

In [7]:
# Summary statistics, transposed so that each described column becomes a row.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,13871.0,6936.0,4004.357,1.0,3468.5,6936.0,10403.5,13871.0
candidate_confidence,13871.0,0.8556889,0.241388,0.2222,0.6742,1.0,1.0,1.0
relevant_yn_confidence,13871.0,0.9273036,0.1416959,0.3333,1.0,1.0,1.0,1.0
sentiment_confidence,13871.0,0.756936,0.2176821,0.186,0.6517,0.6813,1.0,1.0
subject_matter_confidence,13871.0,0.7828012,0.258215,0.2222,0.6413,1.0,1.0,1.0
retweet_count,13871.0,45.80333,153.9817,0.0,0.0,2.0,44.0,4965.0
tweet_id,13871.0,6.296058e+17,96118630000000.0,6.294531e+17,6.294861e+17,6.296726e+17,6.296882e+17,6.297017e+17


In [8]:
# Inspect the raw tweet text that is tokenized later on.
df['text']

0        RT @NancyLeeGrahn: How did everyone feel about...
1        RT @ScottWalker: Didn't catch the full #GOPdeb...
2        RT @TJMShow: No mention of Tamir Rice and the ...
3        RT @RobGeorge: That Carly Fiorina is trending ...
4        RT @DanScavino: #GOPDebate w/ @realDonaldTrump...
                               ...                        
13866    RT @cappy_yarbrough: Love to see men who will ...
13867    RT @georgehenryw: Who thought Huckabee exceede...
13868    RT @Lrihendry: #TedCruz As President, I will a...
13869    RT @JRehling: #GOPDebate Donald Trump says tha...
13870    RT @Lrihendry: #TedCruz headed into the Presid...
Name: text, Length: 13871, dtype: object

In [9]:
# Number of entries in the text column.
df['text'].shape

(13871,)

##### Unique sentiment labels

In [4]:
# Distinct labels present in the sentiment column.
df["sentiment"].unique()

array(['Neutral', 'Positive', 'Negative'], dtype=object)

In [10]:
# Keep only the polar tweets: rows labelled 'Neutral' are removed so that the
# task becomes a two-class (Positive / Negative) problem.
df = df.loc[df['sentiment'] != 'Neutral']
# Confirm which labels are left after filtering.
df['sentiment'].unique()

array(['Positive', 'Negative'], dtype=object)

# Q1. Print the total number of positive and negative sentiments.

In [11]:
# Count how many tweets carry each of the remaining sentiment labels.
df['sentiment'].value_counts()

Negative    8493
Positive    2236
Name: sentiment, dtype: int64

# Q2. Build a sequential LSTM model to predict positive and negative sentiments.

In [12]:
# Vocabulary cap: the tokenizer keeps only the most frequent words.
max_features = 2000

# Fit a whitespace-splitting tokenizer on every tweet in the filtered frame.
tokenizer = keras.preprocessing.text.Tokenizer(
    num_words=max_features,
    split=' ',
)
tokenizer.fit_on_texts(df['text'].values)

# Turn each tweet into a list of word indices, then pad all of them to a
# common length so they form a single 2-D integer matrix.
x = keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(df['text'].values)
)

In [26]:
# One-hot encode the sentiment labels (one column per label).
y = pd.get_dummies(df['sentiment']).values

# Number of held-out rows that are used for validation during training.
validation_size = 1500

# 80/20 shuffled split with a fixed seed so the partition is repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=108,
    shuffle=True,
)

# Carve the first `validation_size` held-out rows off as the validation set;
# whatever remains stays in test_x / test_y.
x_valid, test_x = test_x[:validation_size], test_x[validation_size:]
y_valid, test_y = test_y[:validation_size], test_y[validation_size:]

In [27]:
# Size of the learned word vectors and of the LSTM hidden state.
embed_dim = 128
lstm_out = 196

# Build the network layer by layer: embedding -> spatial dropout -> LSTM ->
# two-way softmax (one output per sentiment class).
model = keras.models.Sequential()
model.add(keras.layers.Embedding(max_features, embed_dim, input_length=x.shape[1]))
model.add(keras.layers.SpatialDropout1D(0.3))
model.add(keras.layers.LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(keras.layers.Dense(2, activation='softmax'))

# Categorical cross-entropy pairs with the one-hot encoded labels.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

##### Summary

In [28]:
# Layer-by-layer overview of the model: output shapes and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 29, 128)           256000    
                                                                 
 spatial_dropout1d_1 (Spatia  (None, 29, 128)          0         
 lDropout1D)                                                     
                                                                 
 lstm_1 (LSTM)               (None, 196)               254800    
                                                                 
 dense_1 (Dense)             (None, 2)                 394       
                                                                 
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________


In [29]:
# Train on the training split and report loss/accuracy on the validation
# split after every epoch (verbose=2 prints one line per epoch).
model.fit(
    train_x,
    train_y,
    batch_size=32,
    epochs=10,
    verbose=2,
    validation_data=(x_valid, y_valid),
)

Epoch 1/10
269/269 - 47s - loss: 0.4123 - accuracy: 0.8249 - val_loss: 0.3824 - val_accuracy: 0.8413 - 47s/epoch - 175ms/step
Epoch 2/10
269/269 - 43s - loss: 0.3025 - accuracy: 0.8697 - val_loss: 0.3619 - val_accuracy: 0.8520 - 43s/epoch - 159ms/step
Epoch 3/10
269/269 - 43s - loss: 0.2672 - accuracy: 0.8887 - val_loss: 0.3762 - val_accuracy: 0.8587 - 43s/epoch - 159ms/step
Epoch 4/10
269/269 - 45s - loss: 0.2415 - accuracy: 0.8971 - val_loss: 0.3688 - val_accuracy: 0.8533 - 45s/epoch - 169ms/step
Epoch 5/10
269/269 - 50s - loss: 0.2156 - accuracy: 0.9104 - val_loss: 0.3845 - val_accuracy: 0.8493 - 50s/epoch - 187ms/step
Epoch 6/10
269/269 - 58s - loss: 0.1901 - accuracy: 0.9203 - val_loss: 0.4494 - val_accuracy: 0.8467 - 58s/epoch - 215ms/step
Epoch 7/10
269/269 - 48s - loss: 0.1731 - accuracy: 0.9271 - val_loss: 0.4757 - val_accuracy: 0.8420 - 48s/epoch - 179ms/step
Epoch 8/10
269/269 - 43s - loss: 0.1533 - accuracy: 0.9364 - val_loss: 0.5095 - val_accuracy: 0.8440 - 43s/epoch - 160

<keras.callbacks.History at 0x266737b15b0>

# Q3. Based on the model, check the sentiment for the following two sentences:

### a. 'He is a great leader.'
### b. 'He is a terrible leader.'

In [60]:
def check(a):
    """Print the sentiment the trained model predicts for a sentence.

    Parameters
    ----------
    a : str or list of str
        The sentence to classify. A list of sentences is also accepted, in
        which case only the prediction for the first one is printed.

    Returns
    -------
    None
        The label ("negative" or "positive") is printed, not returned.

    Notes
    -----
    Relies on the notebook-level `tokenizer` and `model` objects.
    """
    # texts_to_sequences iterates over its argument, so a bare string would be
    # tokenized one *character* at a time. Wrap it so the whole sentence is
    # treated as a single text.
    texts = [a] if isinstance(a, str) else a
    sequences = tokenizer.texts_to_sequences(texts)

    # Pad to the sequence length the model was built with rather than relying
    # on a hard-coded constant that silently goes stale if the data changes.
    sequences = keras.preprocessing.sequence.pad_sequences(
        sequences,
        maxlen=model.input_shape[1],
        dtype='int32',
        value=0,
    )

    sentiment = model.predict(sequences, batch_size=None, verbose=2)[0]

    # The label columns come from pd.get_dummies on the sentiment column, which
    # orders them alphabetically: index 0 = Negative, index 1 = Positive.
    labels = ("negative", "positive")
    print(labels[int(np.argmax(sentiment))])

In [55]:
# The two sentences from the question, reproduced verbatim (including the
# trailing period on both).
a = 'He is a great leader.'
b = 'He is a terrible leader.'

In [56]:
# Classify sentence (a).
check(a)

1/1 - 0s - 63ms/epoch - 63ms/step
positive


In [59]:
# Classify sentence (b).
check(b)

1/1 - 0s - 79ms/epoch - 79ms/step
negative
