In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from keras import optimizers
from keras.models import Sequential
from keras.layers import *
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, classification_report
import re
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('../DATA/SMSSpamCollection', sep='\t', names=["label", "message"])

In [3]:
train_texts, val_texts, train_labels, val_labels = train_test_split(df['message'], df['label'], test_size=0.3)

In [4]:
import re
NON_ALPHANUM = re.compile(r'[\W]')
NON_ASCII = re.compile(r'[^a-z0-1\s]')
def normalize_texts(texts):
    normalized_texts = []
    for text in texts:
        lower = text.lower()
        no_punctuation = NON_ALPHANUM.sub(r' ', lower)
        no_non_ascii = NON_ASCII.sub(r'', no_punctuation)
        normalized_texts.append(no_non_ascii)
    return normalized_texts
        
train_texts = normalize_texts(train_texts.tolist())
val_texts = normalize_texts(val_texts.tolist())

In [5]:
MAX_FEATURES = 1200
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)
train_texts = tokenizer.texts_to_sequences(train_texts)
val_texts = tokenizer.texts_to_sequences(val_texts)

In [6]:
MAX_LENGTH = max(len(train_ex) for train_ex in train_texts)
MAX_LENGTH = 32
train_texts = pad_sequences(train_texts, maxlen=MAX_LENGTH, padding='post')
val_texts = pad_sequences(val_texts, maxlen=MAX_LENGTH, padding='post')

In [7]:
MAX_LENGTH

32

In [8]:
train_labels = train_labels.map(lambda x : 0 if x == 'ham' else 1)
val_labels = val_labels.map(lambda x : 0 if x == 'ham' else 1)

In [9]:
def build_model():
    vocab_size = len(tokenizer.word_index) + 1
    model = Sequential()
    model.add(Embedding(vocab_size, 24, input_length=MAX_LENGTH))
    model.add(Flatten())
    model.add(Dense(500, activation='relu'))
    model.add(Dense(200, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    # compile the model
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    return model
    
model = build_model()

In [12]:
model.fit(train_texts,
          train_labels, 
          batch_size=128,
          epochs=2,
          validation_data=(val_texts, val_labels))

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x2e45ce445e0>

In [13]:
preds = model.predict(val_texts)
print('Accuracy score: {:0.4}'.format(accuracy_score(val_labels, 1 * (preds > 0.5))))
print('F1 score: {:0.4}'.format(f1_score(val_labels, 1 * (preds > 0.5))))
print('ROC AUC score: {:0.4}'.format(roc_auc_score(val_labels, preds)))

Accuracy score: 0.988
F1 score: 0.9507
ROC AUC score: 0.9902


In [14]:
def pred(val):
    val = normalize_texts([val])
    val = tokenizer.texts_to_sequences(val)
    val = pad_sequences(val, maxlen=MAX_LENGTH)
    print(model.predict(val)[0])

In [25]:
df[df['label']=='ham'].iloc[41]['message']

'A gram usually runs like  &lt;#&gt; , a half eighth is smarter though and gets you almost a whole second gram for  &lt;#&gt;'

In [26]:
pred('A gram usually runs like  &lt;#&gt; , a half eighth is smarter though and gets you almost a whole second gram for  &lt;#&gt;')

[1.19081915e-05]


In [18]:
print(classification_report(val_labels, 1 * (preds > 0.5)))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1460
           1       0.99      0.91      0.95       212

    accuracy                           0.99      1672
   macro avg       0.99      0.95      0.97      1672
weighted avg       0.99      0.99      0.99      1672



In [19]:
model.save('../Assets/spam_detector.hdf5')

In [21]:
import pickle

with open('../Assets/spam-tokenizer-5.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=5)

In [22]:
import pickle

with open('../Assets/spam-tokenizer-4.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=4)