-
Notifications
You must be signed in to change notification settings - Fork 0
/
spam_detection_rnn.py
108 lines (84 loc) · 3.69 KB
/
spam_detection_rnn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
from keras.layers import SimpleRNN, LSTM, GRU, Embedding, Dense, Flatten
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import TensorBoard
from keras.utils import plot_model
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
def main(rnn_model):
    """Train and evaluate an RNN-based SMS spam classifier.

    Reads ./spam_text_message_data.csv (columns 'Category' and 'Message'),
    tokenizes the messages, trains a small recurrent model, reports test
    accuracy, and classifies one hard-coded custom message.

    Parameters
    ----------
    rnn_model : str
        Recurrent layer to use: 'SimpleRNN', 'LSTM', or anything else
        (falls through to GRU).
    """
    max_vocab = 10000   # keep only the 10000 most common words
    max_len = 500       # every sequence padded/truncated to this length

    def message_to_array(msg):
        """Convert a raw message string into a left-padded (1, max_len) int array."""
        words = msg.lower().split(' ')
        # BUG FIX: the original did word_index[word], which raises KeyError
        # for any word the tokenizer never saw; map unknown words to 0.
        # Also drop indices >= max_vocab to mirror Tokenizer(num_words=...),
        # which only emits indices below that cap.
        seq = [i for i in (word_index.get(w, 0) for w in words) if i < max_vocab]
        # BUG FIX: the original np.pad crashed (negative pad width) on
        # messages longer than max_len; truncate first, keeping the tail
        # to match pad_sequences' default truncating='pre'.
        seq = seq[-max_len:]
        padded = np.zeros((1, max_len), dtype=np.int64)
        if seq:
            padded[0, -len(seq):] = seq  # left-pad, like pad_sequences default
        return padded

    data = pd.read_csv("./spam_text_message_data.csv")
    print(data.head())
    print(data.tail())

    messages = np.asarray(data['Message'])
    # Binary labels: ham -> 0, anything else (spam) -> 1.
    labels = np.asarray((data['Category'] != 'ham').astype(int))
    print("Number of messages: ", len(messages))
    print("Number of labels: ", len(labels))

    # Ignore all words except the max_vocab most common words.
    tokenizer = Tokenizer(num_words=max_vocab)
    # Calculate word frequencies, then convert messages to integer sequences.
    tokenizer.fit_on_texts(messages)
    sequences = tokenizer.texts_to_sequences(messages)
    # Dict mapping word -> integer index (used by message_to_array above).
    word_index = tokenizer.word_index

    # 2D array with padding: truncated if longer than max_len, padded if shorter.
    data = pad_sequences(sequences, maxlen=max_len)
    print("data shape: ", data.shape)

    # 80% for training (of which 20% is held out as validation by fit()),
    # the remaining 20% for testing.
    train_samples = int(len(messages) * 0.8)
    messages_train = data[:train_samples]
    labels_train = labels[:train_samples]
    # BUG FIX: the original sliced to len(messages)-2, silently discarding
    # the last two test samples; use the full remainder.
    messages_test = data[train_samples:]
    labels_test = labels[train_samples:]

    embedding_mat_columns = 32
    model = Sequential()
    # The embedding layer learns a (max_vocab x embedding_mat_columns)
    # matrix during training, turning integer codes into dense vectors.
    model.add(Embedding(input_dim=max_vocab,
                        output_dim=embedding_mat_columns,
                        input_length=max_len))
    if rnn_model == 'SimpleRNN':
        model.add(SimpleRNN(units=embedding_mat_columns))
    elif rnn_model == 'LSTM':
        model.add(LSTM(units=embedding_mat_columns))
    else:
        model.add(GRU(units=embedding_mat_columns))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
    model.summary()
    #plot_model(model, to_file='model.png', show_shapes=True, show_layer_names=True)

    # Training the model.
    model.fit(messages_train, labels_train, epochs=10, batch_size=60,
              validation_split=0.2)

    # Testing the model.
    # BUG FIX: Sequential.predict_classes() was removed in TF/Keras 2.6;
    # threshold the sigmoid output of predict() instead.
    pred = (model.predict(messages_test) > 0.5).astype('int32')
    loss, accuracy = model.evaluate(messages_test, labels_test)
    print("Test loss is {0:.2f} accuracy is {1:.2f} ".format(loss, accuracy))

    # Constructing a custom message to check the model.
    custom_msg = 'Congratulations ur awarded 500 of CD vouchers or 125gift guaranteed Free entry for movies'
    test_seq = message_to_array(custom_msg)
    pred = (model.predict(test_seq) > 0.5).astype('int32')
    print(pred)
if __name__ == '__main__':
    # Train and evaluate each recurrent architecture in turn.
    for architecture in ('SimpleRNN', 'LSTM', 'GRU'):
        main(architecture)