In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
print(tf.__version__)
# Download all ntlk resources
import nltk
# nltk.download('all')

from keras_preprocessing.text import Tokenizer
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional,InputLayer
from sklearn.model_selection import train_test_split

import warnings
import keras

warnings.filterwarnings("ignore") 

2.16.1


In [2]:
df = pd.read_csv('Data_cleaned.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,polarity,text
0,89508,0,buckle up tk ride way back machine - paul simo...
1,1136798,1,rofl see get away comment like that
2,371138,0,bed dissapointed never tweet back night twitter
3,1076439,1,chipotle proud sponsor hoosier lacrosse camp f...
4,513159,0,say hot day strong wind rain


In [4]:
df.isnull().sum()

Unnamed: 0      0
polarity        0
text          380
dtype: int64

In [5]:
df = df.dropna()

In [6]:
df.drop(['Unnamed: 0'],axis=1,inplace=True)

In [7]:
df.head()

Unnamed: 0,polarity,text
0,0,buckle up tk ride way back machine - paul simo...
1,1,rofl see get away comment like that
2,0,bed dissapointed never tweet back night twitter
3,1,chipotle proud sponsor hoosier lacrosse camp f...
4,0,say hot day strong wind rain


In [8]:
X = np.array(df['text'])

In [9]:
X.shape

(99620,)

In [10]:
y= np.array(df['polarity'])

In [11]:
y.shape

(99620,)

In [12]:
# Count total no. of distinct tokens

tokenizer = Tokenizer(filters='@')
tokenizer.fit_on_texts(X)

print('No. of distinct tokens = '+str(len(tokenizer.word_index)))

No. of distinct tokens = 63783


In [13]:
# Define Vocabulary size (no. of most frequent tokens) to consider

max_vocab=50000

In [14]:
# Reload Twitter dataset with new Vocabulary

tokenizer = Tokenizer(num_words=max_vocab,filters='@')
tokenizer.fit_on_texts(X)

In [15]:
# Vectorize input text using Vocabulary

X_vectorized=tokenizer.texts_to_sequences(X)

In [16]:
# Count average length of tweets

sum=0
for sentence in X_vectorized:
  sum+=len(sentence)

print('Average length of tweets = '+str(sum/len(X_vectorized)))

Average length of tweets = 7.34530214816302


In [17]:
# Define Maximum input length of the Model

max_length=15

In [18]:
# Pad or Trim data to defined input length

X_pad = keras.preprocessing.sequence.pad_sequences(X_vectorized,max_length,padding='post',
                                                   truncating='post')

print(X_pad.shape)

(99620, 15)


In [19]:

model = keras.models.Sequential()

model.add(InputLayer(input_shape=(15,)))
model.add(Embedding(max_vocab, 100,input_length=max_length,mask_zero=True,
                      trainable=False))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

In [20]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer,loss='binary_crossentropy',
                metrics=['accuracy'])

In [21]:
model.summary()

In [22]:
np.random.seed(123)

X_train, X_test, Y_train, Y_test = train_test_split (X_pad,y.reshape(y.shape[0],1),test_size=0.05)

print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(94639, 15)
(4981, 15)
(94639, 1)
(4981, 1)


In [23]:
# Train the model

history = model.fit(X_train,Y_train,batch_size=1024,epochs=15,validation_split=0.05)

Epoch 1/15
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 49ms/step - accuracy: 0.5583 - loss: 0.6882 - val_accuracy: 0.6139 - val_loss: 0.6709
Epoch 2/15
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 57ms/step - accuracy: 0.6054 - loss: 0.6672 - val_accuracy: 0.6143 - val_loss: 0.6624
Epoch 3/15
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 57ms/step - accuracy: 0.6107 - loss: 0.6611 - val_accuracy: 0.6160 - val_loss: 0.6573
Epoch 4/15
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 56ms/step - accuracy: 0.6106 - loss: 0.6567 - val_accuracy: 0.6126 - val_loss: 0.6559
Epoch 5/15
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 57ms/step - accuracy: 0.6125 - loss: 0.6530 - val_accuracy: 0.6148 - val_loss: 0.6502
Epoch 6/15
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 55ms/step - accuracy: 0.6204 - loss: 0.6465 - val_accuracy: 0.6207 - val_loss: 0.6478
Epoch 7/15
[1m88/88[0m [32m━━━━

In [25]:
# Evaluate model on Test data

model.evaluate(X_test,Y_test)

[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6591 - loss: 0.6052


[0.6101950407028198, 0.656494677066803]

In [27]:
y_pred=model.predict(X_test)

[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step


In [28]:
y_pred = np.where(y_pred > 0.6, 1,0)

In [29]:
from sklearn.metrics import confusion_matrix , accuracy_score, classification_report

In [30]:
print(classification_report(Y_test,y_pred))

              precision    recall  f1-score   support

           0       0.60      0.83      0.70      2492
           1       0.72      0.46      0.56      2489

    accuracy                           0.64      4981
   macro avg       0.66      0.64      0.63      4981
weighted avg       0.66      0.64      0.63      4981



In [31]:
print(confusion_matrix(Y_test,y_pred))

[[2059  433]
 [1356 1133]]
