In [1]:
# Importing the necessary libraries
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
from sklearn.preprocessing import LabelEncoder
from keras.models import load_model
import numpy as np


In [2]:
# Connecting to google drive for accessing the Sentiment.csv and also to save model
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
# Loading the Sentiment dataset
data = pd.read_csv('/content/gdrive/MyDrive/Deep Learning/ICP12/Sentiment.csv')

# Keeping only the columns that are required. Text is input and Sentiment is the output.
data = data[['text', 'sentiment']]

print("Before preprocessing: \n")
print(data['text'].head())
# Applying pre-processing task
# Converting all the characters to lower case
data['text'] = data['text'].apply(lambda x: x.lower())
# Making using of Regular expression to remove unwanted data like @ / ' # . etc.
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x)))

# Removing 'rt' string that is persent in all the text.
for idx, row in data.iterrows():
    row[0] = row[0].replace('rt', ' ')

print("\n After preprocessing: \n")
print(data['text'].head())

Before preprocessing: 

0    RT @NancyLeeGrahn: How did everyone feel about...
1    RT @ScottWalker: Didn't catch the full #GOPdeb...
2    RT @TJMShow: No mention of Tamir Rice and the ...
3    RT @RobGeorge: That Carly Fiorina is trending ...
4    RT @DanScavino: #GOPDebate w/ @realDonaldTrump...
Name: text, dtype: object

 After preprocessing: 

0      nancyleegrahn how did everyone feel about th...
1      scottwalker didnt catch the full gopdebate l...
2      tjmshow no mention of tamir rice and the gop...
3      robgeorge that carly fiorina is trending  ho...
4      danscavino gopdebate w realdonaldtrump deliv...
Name: text, dtype: object


In [4]:
# Fixing the maxFeatures as 2000 because of computation capabilities
maxFeatures = 2000
# Tokenizing the data
tokenizer = Tokenizer(num_words=maxFeatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)
# Converting into vector 
X = tokenizer.texts_to_sequences(data['text'].values)
# padding zero's if the sentence is small, to make each vector of same size.
X = pad_sequences(X)

In [5]:
# Finding the vector length
print(X.shape)
senLength = X.shape[1]

(13871, 28)


In [6]:
# Defining the embeddingLayer dimension and Lstm nodes
embed_dim = 128
lstm_out = 196

# Function to create the model
def createModel():
    # Defining the model
    model = Sequential()
    model.add(Embedding(maxFeatures, embed_dim, input_length=X.shape[1])) # Embedding layer
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2)) # LSTM layer
    model.add(Dense(3, activation='softmax')) # Fully connected layer
    # categorical_crossentropy as the loss function since dealing with 3 categorical data.
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


In [7]:
# Converting the lables into numerical format
labelEncoder = LabelEncoder()
integerEncoded = labelEncoder.fit_transform(data['sentiment'])
# Converting these numerical format into One-hot encoding
y = to_categorical(integerEncoded)
# Splitting the data into training and testing
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [8]:
# Model summary
model = createModel()
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 28, 128)           256000    
_________________________________________________________________
lstm (LSTM)                  (None, 196)               254800    
_________________________________________________________________
dense (Dense)                (None, 3)                 591       
Total params: 511,391
Trainable params: 511,391
Non-trainable params: 0
_________________________________________________________________


In [9]:
# Training the model for 10 epochs with batch size of 32.
batch_size = 32
model.fit(X_train, Y_train, epochs=10, batch_size=batch_size, verbose=2)

Epoch 1/10
291/291 - 29s - loss: 0.6817 - accuracy: 0.7099
Epoch 2/10
291/291 - 29s - loss: 0.6140 - accuracy: 0.7407
Epoch 3/10
291/291 - 29s - loss: 0.5699 - accuracy: 0.7654
Epoch 4/10
291/291 - 29s - loss: 0.5182 - accuracy: 0.7875
Epoch 5/10
291/291 - 29s - loss: 0.4781 - accuracy: 0.8035
Epoch 6/10
291/291 - 30s - loss: 0.4416 - accuracy: 0.8220
Epoch 7/10
291/291 - 29s - loss: 0.4099 - accuracy: 0.8342
Epoch 8/10
291/291 - 30s - loss: 0.3759 - accuracy: 0.8470
Epoch 9/10
291/291 - 29s - loss: 0.3497 - accuracy: 0.8541
Epoch 10/10
291/291 - 29s - loss: 0.3272 - accuracy: 0.8670


<tensorflow.python.keras.callbacks.History at 0x7f135c4d82d0>

In [11]:
# Model Prediction on test dataset
score, acc = model.evaluate(X_test, Y_test, verbose=2, batch_size=batch_size)
print("Score: ",score,"  Accuracy: ",acc)
print("Metrics",model.metrics_names)

144/144 - 1s - loss: 1.2811 - accuracy: 0.6549
Score:  1.2811222076416016   Accuracy:  0.6548711061477661
Metrics ['loss', 'accuracy']


In [12]:
# Saving the Model
model.save("/content/gdrive/MyDrive/Deep Learning/ICP12/model.h5")

In [9]:
# Loading the saved model
savedModel = load_model('/content/gdrive/MyDrive/Deep Learning/ICP12/model.h5')



Model prediction on the given Sample text

In [10]:
# preprocessing the given data
sentence = ["A lot of good things are happening. We are respected again throughout the world, and that's a great thing"]
sentence[0] = sentence[0].lower()
sentence[0] = re.sub('[^a-zA-z0-9\s]', '', sentence[0])
# Tokenizing the data and converting to the vector format
sentence = tokenizer.texts_to_sequences(sentence)
# Adding the extra zero's to much the length of senLength
sentence = pad_sequences(sentence, maxlen=senLength)

print(sentence)

[[  0   0   0   0   0   0   0   0   0   0   0   0   7 445   5 146 292  35
   30  35 371   2 349   8 262   7 153 265]]


In [12]:
# Model prediction
prob = savedModel.predict(sentence,batch_size=1)
print("The probabilty of sentence is: ",prob)
pred = np.argmax(prob)
print("The model predicted the given sentence as: ",labelEncoder.classes_[pred])
# The model is 93.37% certain the given sentence is Positive.

The probabilty of sentence is:  [[0.06382039 0.00245245 0.9337271 ]]
The model predicted the given sentence as:  Positive
