In [41]:
#import libraries
import keras
import pandas as pd
from nltk.corpus import stopwords
from nltk import *
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [15]:
#import data
#https://www.kaggle.com/uciml/sms-spam-collection-dataset
data = pd.read_csv('spam.csv', encoding = "ISO-8859-1")
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [17]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
v1            5572 non-null object
v2            5572 non-null object
Unnamed: 2    50 non-null object
Unnamed: 3    12 non-null object
Unnamed: 4    6 non-null object
dtypes: object(5)
memory usage: 217.7+ KB


In [21]:
email_data = data[['v1', 'v2']]
email_data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [27]:
email_data = email_data.rename(columns={'v1':'Target', 'v2':'Email'})

In [28]:
#remove stop words
stop = stopwords.words('english')
email_data['Email'] = email_data['Email'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
email_data.head()

Unnamed: 0,Target,Email
0,ham,"Go jurong point, crazy.. Available bugis n gre..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry 2 wkly comp win FA Cup final tkts 2...
3,ham,U dun say early hor... U c already say...
4,ham,"Nah I think goes usf, lives around though"


In [29]:
#remove punnctuation and connvert to lower
email_data['Email'] = email_data['Email'].apply(lambda x: re.sub('[!@#$:).;,?&]', " ", x.lower()))
email_data['Email'] = email_data['Email'].apply(lambda x: re.sub(' ',' ', x))
email_data.head()

Unnamed: 0,Target,Email
0,ham,go jurong point crazy available bugis n gre...
1,ham,ok lar joking wif u oni
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor u c already say
4,ham,nah i think goes usf lives around though


In [31]:
#separate text(input) and target class
target = email_data['Target'].values

In [33]:
#data preparation and model building
train, test = train_test_split(email_data, test_size=0.2)

In [44]:
# Define the sequence lengths, max number of words and embedding dimensions
max_seq_len = 300
# Top 20000 frequently occurring words
max_num_words = 20000

tokenizer = keras.preprocessing.text.Tokenizer(num_words=max_num_words)
tokenizer.fit_on_texts(train.Email)
train_sequence = tokenizer.texts_to_sequences(train.Email)
test_sequence = tokenizer.texts_to_sequences(test.Email)

In [46]:
#dictionary containing words and thier index
word_index = tokenizer.word_index
print('Found {} unique tokens').format(len(word_index))

Found 7906 unique tokens


In [53]:
#getting only the top frequent words
from keras.preprocessing import sequence
train_data = sequence.pad_sequences(train_sequence, maxlen = max_seq_len)
test_data = sequence.pad_sequences(test_sequence, maxlen = max_seq_len)

In [54]:
print(train_data.shape)

(4457, 300)


In [55]:
print(test_data.shape)

(1115, 300)


In [56]:
train_labels = train['Target']
test_labels = test['Target']

In [57]:
from sklearn.preprocessing import LabelEncoder
#convert character array to numeric array
le = LabelEncoder()
le.fit(train_labels)
train_labels = le.transform(train_labels)
test_labels = le.transform(test_labels)

In [58]:
print(le.classes_)

[u'ham' u'spam']


In [59]:
print(np.unique(train_labels, return_counts=True))
print(np.unique(test_labels, return_counts=True))

(array([0, 1]), array([3854,  603]))
(array([0, 1]), array([971, 144]))


In [62]:
#changing data types
from keras.utils.np_utils import to_categorical
labels_train = to_categorical(np.asarray(train_labels))
labels_test = to_categorical(np.asarray(test_labels))
print('Shape of data tensor:', train_data.shape)
print('Shape of label tensor:', labels_train.shape)
print('Shape of label tensor:', labels_test.shape)

('Shape of data tensor:', (4457, 300))
('Shape of label tensor:', (4457, 2))
('Shape of label tensor:', (1115, 2))


In [99]:
#model building....#import libraries
from keras.layers import Dense, Input, Dropout, Activation, LSTM, Embedding, MaxPooling1D, GlobalMaxPool1D
from keras.layers import Bidirectional, Conv1D, SimpleRNN, BatchNormalization, Flatten
from keras.models import Model
from keras.models import Sequential
from sklearn import metrics

CNN Model

In [69]:
Embedding_dim = 100
model = Sequential()
model.add(Embedding(max_num_words, Embedding_dim, input_length=max_seq_len))
model.add(Dropout(0.5))
model.add(MaxPooling1D(5))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(2, activation='softmax'))

In [70]:
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

In [71]:
#fitting the model
model.fit(train_data, labels_train, batch_size=64, epochs=5, 
          validation_data=(test_data, labels_test))

Instructions for updating:
Use tf.cast instead.
Train on 4457 samples, validate on 1115 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1a1332f4d0>

In [73]:
predicted = model.predict(test_data)
predicted

array([[0.9833114 , 0.01668858],
       [0.9833114 , 0.01668858],
       [0.99216485, 0.00783512],
       ...,
       [0.9833114 , 0.01668858],
       [0.98925346, 0.01074661],
       [0.9861034 , 0.01389665]], dtype=float32)

In [77]:
#model evaluation
from sklearn.metrics import precision_recall_fscore_support as score
precision, recall, fscore, support = score(labels_test,predicted.round())
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))
print("############################")
print(metrics.classification_report(labels_test, predicted.round()))

precision: [0.94083414 0.98809524]
recall: [0.99897013 0.57638889]
fscore: [0.96903097 0.72807018]
support: [971 144]
############################
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       971
           1       0.99      0.58      0.73       144

   micro avg       0.94      0.94      0.94      1115
   macro avg       0.96      0.79      0.85      1115
weighted avg       0.95      0.94      0.94      1115
 samples avg       0.94      0.94      0.94      1115



RNN Model

In [80]:
#importing library
from keras.layers.recurrent import SimpleRNN

In [81]:
model = Sequential()
model.add(Embedding(max_num_words, Embedding_dim, input_length=max_seq_len))
model.add(SimpleRNN(2, input_shape=(None, 1)))
model.add(Dense(2, activation='softmax'))

In [82]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [85]:
model.fit(train_data, labels_train, batch_size=16, epochs=5,
         validation_data=(test_data, labels_test))

Train on 4457 samples, validate on 1115 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1a35146b50>

In [86]:
#prediction on test data
predicted_srnn = model.predict(test_data)
predicted_srnn

array([[0.98319894, 0.01680109],
       [0.9824455 , 0.01755449],
       [0.9869097 , 0.01309026],
       ...,
       [0.9865031 , 0.01349682],
       [0.9817378 , 0.01826221],
       [0.9667154 , 0.03328463]], dtype=float32)

In [88]:
#model evaluation
precision, recall, fscore, support = score(labels_test, predicted_srnn.round())
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))
print("############################")
print(metrics.classification_report(labels_test, predicted_srnn.round()))

precision: [0.98373984 0.97709924]
recall: [0.9969104  0.88888889]
fscore: [0.99028133 0.93090909]
support: [971 144]
############################
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       971
           1       0.98      0.89      0.93       144

   micro avg       0.98      0.98      0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115
 samples avg       0.98      0.98      0.98      1115



LSTM Model

In [92]:
model = Sequential()
model.add(Embedding(max_num_words, Embedding_dim, input_length=max_seq_len))
model.add(LSTM(output_dim=16, activation='relu', inner_activation='hard_sigmoid',
               return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())
model.add(Flatten())
model.add(Dense(2, activation='softmax'))

  after removing the cwd from sys.path.


In [93]:
model.compile(loss='binary_crossentropy', optimizer='adam',metrics = ['accuracy'])

In [94]:
model.fit(train_data, labels_train, batch_size=16, epochs=5,
         validation_data=(test_data, labels_test))

Train on 4457 samples, validate on 1115 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1a495bf3d0>

In [95]:
#prediction on test data
predicted_lstm = model.predict(test_data)
predicted_lstm

array([[9.9986076e-01, 1.3921251e-04],
       [9.9999285e-01, 7.1840141e-06],
       [1.0000000e+00, 5.7008892e-10],
       ...,
       [9.9994648e-01, 5.3544303e-05],
       [9.9999881e-01, 1.1379519e-06],
       [9.9640638e-01, 3.5936092e-03]], dtype=float32)

In [97]:
precision, recall, fscore, support = score(labels_test, predicted_lstm.round())
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))
print("############################")
print(metrics.classification_report(labels_test, predicted_lstm.round()))

precision: [0.98676171 0.98496241]
recall: [0.99794027 0.90972222]
fscore: [0.99231951 0.94584838]
support: [971 144]
############################
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       971
           1       0.98      0.91      0.95       144

   micro avg       0.99      0.99      0.99      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg       0.99      0.99      0.99      1115
 samples avg       0.99      0.99      0.99      1115



Bidirectional LSTM

In [100]:
model =  Sequential()
model.add(Embedding(max_num_words, Embedding_dim, input_length=max_seq_len))
model.add(Bidirectional(LSTM(16, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)))
model.add(Conv1D(16, kernel_size=3, padding='valid', kernel_initializer = "glorot_uniform"))
model.add(GlobalMaxPool1D())
model.add(Dense(50, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(2, activation='softmax'))

In [102]:
model.compile(loss='binary_crossentropy', optimizer='adam',metrics = ['accuracy'])

In [103]:
model.fit(train_data, labels_train, batch_size=16, epochs=3,
         validation_data=(test_data, labels_test))

Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 4457 samples, validate on 1115 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1a4e64ac10>

In [104]:
#prediction on test data
predicted_blstm = model.predict(test_data)
predicted_blstm

array([[9.9998641e-01, 1.3642296e-05],
       [9.9998808e-01, 1.1946274e-05],
       [9.9999869e-01, 1.2670264e-06],
       ...,
       [9.9995065e-01, 4.9368144e-05],
       [9.9999726e-01, 2.7452663e-06],
       [9.9363649e-01, 6.3635134e-03]], dtype=float32)

In [105]:
precision, recall, fscore, support = score(labels_test, predicted_blstm.round())
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))
print("############################")
print(metrics.classification_report(labels_test, predicted_blstm.round()))

precision: [0.98979592 0.99259259]
recall: [0.99897013 0.93055556]
fscore: [0.99436187 0.96057348]
support: [971 144]
############################
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       971
           1       0.99      0.93      0.96       144

   micro avg       0.99      0.99      0.99      1115
   macro avg       0.99      0.96      0.98      1115
weighted avg       0.99      0.99      0.99      1115
 samples avg       0.99      0.99      0.99      1115

