In [1]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import tensorflow as tf

from sklearn.metrics import accuracy_score


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten,Conv2D,Dense,MaxPool2D
from tensorflow.keras.utils import to_categorical
from keras.layers import Dense,LSTM,Embedding
from tensorflow.keras.optimizers import Adam

In [2]:
messages = pd.read_csv('spam1.csv')
messages.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
messages.isnull().sum()

v1               0
v2               0
Unnamed: 2    6720
Unnamed: 3    6760
Unnamed: 4    6768
dtype: int64

In [4]:
messages =  messages.iloc[ :,[0,1]]

In [5]:
messages

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
6771,spam,This is the 2nd time we have tried 2 contact u...
6772,ham,Will Ì_ b going to esplanade fr home?
6773,ham,"Pity, * was in mood for that. So...any other s..."
6774,ham,The guy did some bitching but I acted like i'd...


In [6]:
messages.rename(columns = {'v1':'Label','v2':'Messages'},inplace = True)

In [7]:
messages.Label.value_counts()

ham     5854
spam     922
Name: Label, dtype: int64

In [8]:
messages.Label.replace({'spam':1,'ham':0},inplace = True)

In [9]:
messages.Messages = messages.Messages.str.lower()

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
msg_train,msg_test = train_test_split(messages,test_size = 0.2)

In [12]:
msg_train_x = msg_train.iloc[:,1]
msg_train_y = msg_train.iloc[:,0]

In [13]:
msg_test_x = msg_test.iloc[:,1]
msg_test_y = msg_test.iloc[:,0]

In [14]:
msg_train_y.shape

(5420,)

In [15]:
# we cannot build model on one dimensional data. So we use to_categorical() to convert into 2D
msg_train_y = to_categorical(msg_train_y)

In [16]:
max_num_words = 15000 # number of unique words in the dataset
seq_len = 50 # number of words to be processed at a single time
embedding_size = 75 # for each word how many embedding we are going to generate

In [17]:
from keras.preprocessing.text import Tokenizer #it is used to assign number to each unique word
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [18]:
tokenizer = Tokenizer(num_words = max_num_words)

In [19]:
tokenizer

<keras.preprocessing.text.Tokenizer at 0x267519dc8b0>

In [20]:
tokenizer.fit_on_texts(messages.Messages)
msg_train_x = tokenizer.texts_to_sequences(msg_train_x)
msg_train_x = pad_sequences(msg_train_x,maxlen = seq_len)  #before using this every record has different shape. It makes shape of evry record same

In [21]:
msg_test_x = tokenizer.texts_to_sequences(msg_test_x)
msg_test_x = pad_sequences(msg_test_x , maxlen = seq_len)

***Model Building***

In [27]:
model = Sequential()
model.add(Embedding(input_dim = max_num_words,input_length = seq_len,output_dim = embedding_size))

model.add(LSTM(5))
model.add(Dense(2,activation = 'softmax'))

adam = Adam(learning_rate = 0.001)
model.compile(optimizer = 'adam',loss = 'categorical_crossentropy',metrics = ['accuracy'])

In [28]:
model.fit(msg_train_x,msg_train_y,epochs = 10,validation_split = 0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2675443f6a0>

In [29]:
pred = model.predict(msg_test_x)



In [30]:
pred

array([[9.9909520e-01, 9.0486836e-04],
       [9.9897170e-01, 1.0283040e-03],
       [9.9750620e-01, 2.4938581e-03],
       ...,
       [9.9872154e-01, 1.2784805e-03],
       [9.9629092e-01, 3.7091027e-03],
       [1.0188707e-01, 8.9811289e-01]], dtype=float32)

In [33]:
pred_class = np.argmax(pred,axis = 1)

In [34]:
pred_class

array([0, 0, 0, ..., 0, 0, 1], dtype=int64)

In [35]:
from sklearn.metrics import confusion_matrix

In [36]:
confusion_matrix(msg_test_y,pred_class)

array([[1188,    0],
       [   5,  163]], dtype=int64)

In [37]:
accuracy_score(msg_test_y,pred_class)*100

99.63126843657817