In [17]:
# Importing the necessary libraries
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
from sklearn.preprocessing import LabelEncoder
from keras.models import load_model
import numpy as np
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

In [2]:
# Connecting to google drive for accessing the Sentiment.csv and also to save model
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
# Loading the Sentiment dataset
data = pd.read_csv('/content/gdrive/MyDrive/Deep Learning/ICP12/spam.csv',encoding="ISO-8859-1")

# Keeping only the columns that are required. v2 is input and v1 is the output. Remaining columns are empty.
data = data[['v1', 'v2']]

print("Before preprocessing: \n")
print(data['v2'].head())
# Applying pre-processing task
# Converting all the characters to lower case
data['v2'] = data['v2'].apply(lambda x: x.lower())
# Making using of Regular expression to remove unwanted data like @ / ' # . etc.
data['v2'] = data['v2'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x)))

# Removing 'rt' string that is persent in all the text.
for idx, row in data.iterrows():
    row[0] = row[0].replace('rt', ' ')

print("\n After preprocessing: \n")
print(data['v2'].head())

Before preprocessing: 

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: v2, dtype: object

 After preprocessing: 

0    go until jurong point crazy available only in ...
1                              ok lar joking wif u oni
2    free entry in 2 a wkly comp to win fa cup fina...
3          u dun say so early hor u c already then say
4    nah i dont think he goes to usf he lives aroun...
Name: v2, dtype: object


In [4]:
# Fixing the maxFeatures as 2000 because of computation capabilities
maxFeatures = 2000
# Tokenizing the data
tokenizer = Tokenizer(num_words=maxFeatures, split=' ')
tokenizer.fit_on_texts(data['v2'].values)
# Converting into vector 
X = tokenizer.texts_to_sequences(data['v2'].values)
# padding zero's if the sentence is small, to make each vector of same size.
X = pad_sequences(X)

In [5]:
# Finding the vector length
print(X.shape)
senLength = X.shape[1]

(5572, 152)


In [6]:
# Defining the embeddingLayer dimension and Lstm nodes
embed_dim = 128
lstm_out = 196

# Function to create the model
def createModel():
    # Defining the model
    model = Sequential()
    model.add(Embedding(maxFeatures, embed_dim, input_length=X.shape[1])) # Embedding layer
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2)) # LSTM layer
    model.add(Dense(2, activation='sigmoid')) # Fully connected layer
    # categorical_crossentropy as the loss function since dealing with 3 categorical data.
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


In [7]:
# Converting the lables into numerical format
labelEncoder = LabelEncoder()
integerEncoded = labelEncoder.fit_transform(data['v1'])
# Converting these numerical format into One-hot encoding
y = to_categorical(integerEncoded)
# Splitting the data into training and testing
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [11]:
# Model summary
model = createModel()
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 152, 128)          256000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Training the model for 10 epochs with batch size of 32.
batch_size = 32
model.fit(X_train, Y_train, epochs=5, batch_size=batch_size, verbose=2)

Epoch 1/5
117/117 - 56s - loss: 0.2215 - accuracy: 0.9314
Epoch 2/5
117/117 - 55s - loss: 0.0434 - accuracy: 0.9863
Epoch 3/5
117/117 - 54s - loss: 0.0237 - accuracy: 0.9917
Epoch 4/5
117/117 - 54s - loss: 0.0154 - accuracy: 0.9965
Epoch 5/5
117/117 - 54s - loss: 0.0061 - accuracy: 0.9981


<tensorflow.python.keras.callbacks.History at 0x7fc78e134310>

In [13]:
# Saving the Model
model.save("/content/gdrive/MyDrive/Deep Learning/ICP12/spamModel.h5")

In [14]:
# Loading the saved model
savedModel = load_model('/content/gdrive/MyDrive/Deep Learning/ICP12/spamModel.h5')



In [15]:
# Model Prediction on test dataset
score, acc = savedModel.evaluate(X_test, Y_test, verbose=2, batch_size=batch_size)
print("Score: ",score,"  Accuracy: ",acc)
print("Metrics",model.metrics_names)

58/58 - 2s - loss: 0.0900 - accuracy: 0.9810
Score:  0.09004170447587967   Accuracy:  0.9809679388999939
Metrics ['loss', 'accuracy']


In [19]:
# Grid Search
model = KerasClassifier(build_fn=createModel,verbose=2)
# Defining the batchsize and epoch. To find which batchsize and epochs is correct.
batch_size= [32, 64]
epochs = [2, 3]
param_grid= dict(batch_size=batch_size, epochs=epochs)
# Defining the GridSearch
grid  = GridSearchCV(estimator=model, param_grid=param_grid)
# Fitting on training dataset and finding the best parameters
grid_result= grid.fit(X_train, y=Y_train)

Epoch 1/2
94/94 - 47s - loss: 0.2137 - accuracy: 0.9324
Epoch 2/2
94/94 - 44s - loss: 0.0466 - accuracy: 0.9866
24/24 - 1s - loss: 0.0572 - accuracy: 0.9839
Epoch 1/2
94/94 - 46s - loss: 0.2095 - accuracy: 0.9270
Epoch 2/2
94/94 - 46s - loss: 0.0524 - accuracy: 0.9856
24/24 - 1s - loss: 0.0519 - accuracy: 0.9839
Epoch 1/2
94/94 - 47s - loss: 0.2265 - accuracy: 0.9230
Epoch 2/2
94/94 - 45s - loss: 0.0471 - accuracy: 0.9859
24/24 - 1s - loss: 0.1008 - accuracy: 0.9719
Epoch 1/2
94/94 - 48s - loss: 0.2626 - accuracy: 0.9136
Epoch 2/2
94/94 - 45s - loss: 0.0551 - accuracy: 0.9839
24/24 - 1s - loss: 0.0542 - accuracy: 0.9839
Epoch 1/2
94/94 - 47s - loss: 0.2508 - accuracy: 0.9136
Epoch 2/2
94/94 - 45s - loss: 0.0512 - accuracy: 0.9856
24/24 - 1s - loss: 0.0579 - accuracy: 0.9759
Epoch 1/3
94/94 - 47s - loss: 0.2328 - accuracy: 0.9233
Epoch 2/3
94/94 - 45s - loss: 0.0489 - accuracy: 0.9849
Epoch 3/3
94/94 - 45s - loss: 0.0312 - accuracy: 0.9923
24/24 - 1s - loss: 0.0577 - accuracy: 0.9826
Ep

In [20]:
# Summarize results from grid serach
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.979909 using {'batch_size': 32, 'epochs': 2}
