In [10]:
# Importing the necessary libraries
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
from sklearn.preprocessing import LabelEncoder
from keras.models import load_model
import numpy as np
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

import tensorflow as tf
# Set the TensorFlow (compat.v1) logging verbosity threshold to ERROR.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

In [2]:
# Mount Google Drive: Sentiment.csv is read from it and the model is saved to it.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
# Loading the Sentiment dataset
data = pd.read_csv('/content/gdrive/MyDrive/Deep Learning/ICP12/Sentiment.csv')

# Keeping only the required columns: 'text' is the input, 'sentiment' is the output.
# .copy() makes `data` an independent frame, so the column assignment below
# modifies it directly rather than a slice of the full CSV frame.
data = data[['text', 'sentiment']].copy()

print("Before preprocessing: \n")
print(data['text'].head())

# Pre-processing, done with vectorized string methods:
#   1. lower-case everything;
#   2. strip every character that is not a lower-case letter, a digit or
#      whitespace (the class is a-z, not A-z: the A-z range also spans
#      [ \ ] ^ _ and `, which would otherwise be kept);
#   3. blank out the retweet marker 'rt' only where it is a standalone token,
#      so words that merely contain "rt" (e.g. "party") are left intact.
# The result is assigned back to the column instead of mutating the rows
# yielded by iterrows(), which pandas does not guarantee to write through
# to the underlying frame.
data['text'] = (
    data['text']
    .str.lower()
    .str.replace(r'[^a-z0-9\s]', '', regex=True)
    .str.replace(r'\brt\b', ' ', regex=True)
)

print("\n After preprocessing: \n")
print(data['text'].head())

Before preprocessing: 

0    RT @NancyLeeGrahn: How did everyone feel about...
1    RT @ScottWalker: Didn't catch the full #GOPdeb...
2    RT @TJMShow: No mention of Tamir Rice and the ...
3    RT @RobGeorge: That Carly Fiorina is trending ...
4    RT @DanScavino: #GOPDebate w/ @realDonaldTrump...
Name: text, dtype: object

 After preprocessing: 

0      nancyleegrahn how did everyone feel about th...
1      scottwalker didnt catch the full gopdebate l...
2      tjmshow no mention of tamir rice and the gop...
3      robgeorge that carly fiorina is trending  ho...
4      danscavino gopdebate w realdonaldtrump deliv...
Name: text, dtype: object


In [4]:
# The vocabulary is capped at 2000 words to keep the computation manageable.
maxFeatures = 2000

# Fit a tokenizer that splits on spaces on the tweet texts.
tokenizer = Tokenizer(num_words=maxFeatures, split=' ')
tweet_texts = data['text'].values
tokenizer.fit_on_texts(tweet_texts)

# Convert each tweet into a sequence of integer word indices, then pad the
# shorter sequences with zeros so that every vector has the same size.
X = pad_sequences(tokenizer.texts_to_sequences(tweet_texts))

In [6]:
# Size of the embedding vectors and number of units in the LSTM layer
embed_dim = 128
lstm_out = 196


def createModel():
    """Build and compile the sentiment classification network.

    The network is Embedding -> LSTM -> Dense(3, softmax). It reads the
    module-level settings maxFeatures, embed_dim and lstm_out, and takes its
    input length from X.shape[1].

    Returns:
        The compiled Sequential model.
    """
    layers = [
        # Embedding layer
        Embedding(maxFeatures, embed_dim, input_length=X.shape[1]),
        # LSTM layer
        LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2),
        # Fully connected output layer, one unit per sentiment class
        Dense(3, activation='softmax'),
    ]
    network = Sequential(layers)
    # categorical_crossentropy because the target is one of 3 categories.
    network.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network


In [7]:
# Encode the sentiment labels as integers, then expand them into one-hot vectors.
labelEncoder = LabelEncoder()
integerEncoded = labelEncoder.fit_transform(data['sentiment'])
y = to_categorical(integerEncoded)

# Hold out 33% of the samples for testing; random_state is fixed at 42.
splits = train_test_split(X, y, test_size=0.33, random_state=42)
X_train, X_test, Y_train, Y_test = splits

In [8]:
# Model summary
# Build one instance of the network with createModel() and print its
# layer-by-layer summary (layer types, output shapes, parameter counts).
model = createModel()
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 28, 128)           256000    
_________________________________________________________________
lstm (LSTM)                  (None, 196)               254800    
_________________________________________________________________
dense (Dense)                (None, 3)                 591       
Total params: 511,391
Trainable params: 511,391
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Grid search over batch size and number of epochs.
# Wrap the model factory so that scikit-learn can use it as an estimator.
model = KerasClassifier(build_fn=createModel, verbose=2)

# Candidate batch sizes and epoch counts to compare.
batch_size = [16, 32, 64]
epochs = [1, 2, 3]
param_grid = {'batch_size': batch_size, 'epochs': epochs}

# Set up the search and fit it on the training split to find the best parameters.
grid = GridSearchCV(estimator=model, param_grid=param_grid)
grid_result = grid.fit(X_train, Y_train)


465/465 - 47s - loss: 0.8313 - accuracy: 0.6464
117/117 - 1s - loss: 0.7460 - accuracy: 0.6783
465/465 - 48s - loss: 0.8244 - accuracy: 0.6476
117/117 - 1s - loss: 0.7871 - accuracy: 0.6665
465/465 - 47s - loss: 0.8296 - accuracy: 0.6455
117/117 - 1s - loss: 0.7593 - accuracy: 0.6767
465/465 - 49s - loss: 0.8228 - accuracy: 0.6440
117/117 - 1s - loss: 0.7641 - accuracy: 0.6738
465/465 - 47s - loss: 0.8253 - accuracy: 0.6438
117/117 - 1s - loss: 0.7885 - accuracy: 0.6722
Epoch 1/2
465/465 - 48s - loss: 0.8325 - accuracy: 0.6410
Epoch 2/2
465/465 - 45s - loss: 0.6842 - accuracy: 0.7077
117/117 - 1s - loss: 0.7375 - accuracy: 0.6875
Epoch 1/2
465/465 - 48s - loss: 0.8332 - accuracy: 0.6406
Epoch 2/2
465/465 - 45s - loss: 0.6891 - accuracy: 0.7136
117/117 - 1s - loss: 0.7364 - accuracy: 0.6869
Epoch 1/2
465/465 - 49s - loss: 0.8290 - accuracy: 0.6408
Epoch 2/2
465/465 - 47s - loss: 0.6775 - accuracy: 0.7135
117/117 - 1s - loss: 0.7442 - accuracy: 0.6869
Epoch 1/2
465/465 - 47s - loss: 0.83

In [13]:
# Report the best score found by the grid search and the parameters that produced it.
best_score = grid_result.best_score_
best_params = grid_result.best_params_
print("Best: %f using %s" % (best_score, best_params))

Best: 0.680511 using {'batch_size': 64, 'epochs': 3}
