In [22]:
import pathlib
import pandas as pd
import pickle
import json


In [23]:
BASE_DIR = pathlib.Path().resolve().parent
DATASET_DIR = BASE_DIR / "datasets"
EXPORT_DIR = DATASET_DIR / "exports"
EXPORT_DIR.mkdir(exist_ok=True, parents=True)

DATASET_CSV_PATH = EXPORT_DIR / "spam-dataset.csv"
TRAINING_DATA_PATH = EXPORT_DIR / "spam-metadata.pkl"
TOKENIZER_DATA_PATH = EXPORT_DIR / "spam-tokenizer.json"

In [24]:
df = pd.read_csv(DATASET_CSV_PATH)
df.head()

Unnamed: 0,label,text,source
0,ham,"Go until jurong point, crazy.. Available only ...",sms-spam
1,ham,Ok lar... Joking wif u oni...,sms-spam
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,sms-spam
3,ham,U dun say so early hor... U c already then say...,sms-spam
4,ham,"Nah I don't think he goes to usf, he lives aro...",sms-spam


In [25]:
data = {}

with open(TRAINING_DATA_PATH, 'rb') as f:
    data = pickle.load(f)
    
data

{'X_train': array([[  0,   0,   0, ..., 112,  33,  77],
        [  0,   0,   0, ...,   3,  12,  18],
        [  0,   0,   0, ...,   0,  12,  46],
        ...,
        [  0,   0,   0, ...,   0,   0,   1],
        [  0,   0,   0, ...,  30, 182,   9],
        [  0,   0,   0, ...,   0,   3, 156]]),
 'X_test': array([[  0,   0,   0, ...,  11,  70,  19],
        [  0,   0,   0, ...,   7, 165,  25],
        [  0,   0,   0, ...,   0,   0,   0],
        ...,
        [  0,   0,   0, ..., 186,  56,   5],
        [  0,   0,   0, ...,  16,  73,  19],
        [  0,   0,   0, ...,  26, 104, 106]]),
 'y_train': array([[1., 0.],
        [1., 0.],
        [1., 0.],
        ...,
        [1., 0.],
        [1., 0.],
        [1., 0.]], dtype=float32),
 'y_test': array([[1., 0.],
        [1., 0.],
        [1., 0.],
        ...,
        [0., 1.],
        [0., 1.],
        [0., 1.]], dtype=float32),
 'max_num_words': 200,
 'max_seq_length': 300,
 'label_legend': {'ham': 0, 'spam': 1},
 'label_legend_inverted':

#### TRANSFORM EXTRACTED DATA

In [27]:
X_train = data['X_train']
X_test = data['X_test']
y_train = data['y_train']
y_test = data['y_test']
max_num_words = data['max_num_words']
max_seq_length = data['max_seq_length']
label_legend = data['label_legend']
label_legend_inverted = data['label_legend_inverted']
tokenizer = data['tokenizer']



#### CREATE LSTM MODEL

In [6]:
from tensorflow.keras.models import Sequential

from tensorflow.keras.layers import Dense, Embedding, SpatialDropout1D, LSTM

In [7]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_num_words, embed_dim, input_length = X_train.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(2, activation='softmax'))
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=['accuracy'])  
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 128)          25600     
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 300, 128)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 196)               254800    
_________________________________________________________________
dense (Dense)                (None, 2)                 394       
Total params: 280,794
Trainable params: 280,794
Non-trainable params: 0
_________________________________________________________________
None


In [8]:
batch_size = 32
epochs = 5

model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=batch_size, verbose=1, epochs=epochs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x24f37d08f10>

In [9]:
MODEL_EXPORT_PATH = EXPORT_DIR / "spam-model.h5"
model.save(MODEL_EXPORT_PATH)

### Predict New Data

In [53]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def predict(text_str, max_words = 280, max_sequence = 280, tokenizer=None):
    if not tokenizer:
        return None
    sequences = tokenizer.texts_to_sequences([text_str])
    x_input = pad_sequences(sequences, maxlen=max_sequence)
    y_output = model.predict(x_input)
    top_y_index = np.argmax(y_output)
    preds = y_output[top_y_index]
    labeled_preds = [{f"{label_legend_inverted[str(i)]}": x} for i, x in enumerate(preds)]
    return labeled_preds
    

    

In [54]:
predict("Hello world", max_words=max_num_words, max_sequence=max_seq_length, tokenizer=tokenizer)

[{'ham': 0.9494517}, {'spam': 0.050548326}]