In [24]:
import os
import json
import boto3
import pickle
import pathlib
import numpy as np
import pandas as pd

from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.preprocessing.text import Tokenizer, tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [27]:
# BASE_DIR is the parent of the (resolved) current working directory;
# every artifact path below lives in its "exports" sub-directory.
BASE_DIR = pathlib.Path(".").resolve().parent
EXPORT_DIR = BASE_DIR.joinpath("exports")

# Artifact paths inside EXPORT_DIR.
SPAM_DATASET_PATH = EXPORT_DIR.joinpath("spam-dataset.csv")
METADATA_EXPORT_PATH = EXPORT_DIR.joinpath("spam-metadata.pkl")
TOKENIZER_EXPORT_PATH = EXPORT_DIR.joinpath("spam-tokenizer.json")

# Rebuild the Keras tokenizer from its JSON export.
# `tokenizer_from_json` takes the JSON document as a string, so the file
# contents are passed through as-is instead of being parsed with `json.load`
# and re-serialised with `json.dumps` first. Skipping that round trip also
# avoids binding the generic name `data`, which the metadata cell reuses for
# a different object.
with open(TOKENIZER_EXPORT_PATH, 'r', encoding='utf-8') as f:
    tokens = f.read()

# Built outside the `with` block so the file is closed before parsing starts.
tokenizer = tokenizer_from_json(tokens)

# Load the dataset with the first CSV column as the index, then preview it.
df = pd.read_csv(SPAM_DATASET_PATH, index_col=[0])
df.head()

Unnamed: 0,class,body,source
0,0.0,"Go until jurong point, crazy.. Available only ...",sms
1,0.0,Ok lar... Joking wif u oni...,sms
2,1.0,Free entry in 2 a wkly comp to win FA Cup fina...,sms
3,0.0,U dun say so early hor... U c already then say...,sms
4,0.0,"Nah I don't think he goes to usf, he lives aro...",sms


In [17]:

# Restore the arrays and settings stored in the metadata pickle.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
with METADATA_EXPORT_PATH.open('rb') as metadata_file:
    data = pickle.load(metadata_file)

# Unpack the stored entries into notebook-level names.
X_test, X_train = data['X_test'], data['X_train']
y_test, y_train = data['y_test'], data['y_train']
legend = data['label_legend']
max_sequence, max_words = data['max_seq_length'], data['max_words']


In [18]:
# Network hyper-parameters.
embed_dim = 128  # output dimension of the Embedding layer
lstm_out = 196   # number of units in the LSTM layer

# Architecture: Embedding -> SpatialDropout1D -> LSTM -> 2-unit softmax.
model = Sequential()

# The input length is taken from the training matrix so the model matches
# the width of the padded sequences it is trained on.
model.add(Embedding(max_words, embed_dim, input_length=X_train.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.3, recurrent_dropout=0.3))
# Remember: softmax yields one probability per class and they sum to 1.
model.add(Dense(2, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy'])

# `summary()` prints the table itself and returns None, so wrapping it in
# print() emits a stray "None" line after the table.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 300, 128)          35840     
                                                                 
 spatial_dropout1d (SpatialD  (None, 300, 128)         0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 196)               254800    
                                                                 
 dense (Dense)               (None, 2)                 394       
                                                                 
Total params: 291,034
Trainable params: 291,034
Non-trainable params: 0
_________________________________________________________________
None


In [19]:
batch_size = 32
epochs = 5

# Keep the History object returned by `fit` so the per-epoch metrics remain
# available after training, rather than discarding it and leaving only an
# object repr with a memory address as the cell output.
# NOTE(review): X_test/y_test are passed as validation data, so these
# validation metrics are not an independent held-out estimate if they are
# used to tune the model.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f490512bb50>

In [20]:
# Write the model into EXPORT_DIR; the Path is converted to a plain string
# before being handed to `model.save`.
MODEL_EXPORT_PATH = EXPORT_DIR.joinpath('spam-model.h5')
model.save(str(MODEL_EXPORT_PATH))

In [37]:
def predict(text_str, max_words=280, max_sequence = 280, tokenizer=None):
    """Score a single text with the notebook-level `model`.

    Args:
        text_str: Raw text to classify.
        max_words: Not used by this function; kept so existing keyword
            calls keep working.
        max_sequence: Length passed to `pad_sequences` as `maxlen`.
        tokenizer: Object exposing `texts_to_sequences`. Without one no
            prediction is made.

    Returns:
        None when `tokenizer` is falsy; otherwise a dict holding the model's
        two output scores under the keys 'han' (output index 0) and 'spam'
        (output index 1).
    """
    if not tokenizer:
        return None

    sequences = tokenizer.texts_to_sequences([text_str])
    x_input = pad_sequences(sequences, maxlen=max_sequence)

    # `model.predict` returns one row per input text and exactly one text is
    # submitted, so the scores are always in row 0. The previous code indexed
    # rows with `np.argmax(y_output)`, which is the index of the winning
    # *class* in the flattened array; it raised IndexError whenever class 1
    # had the higher score.
    preds = model.predict(x_input)[0]

    # NOTE(review): 'han' looks like a typo for 'ham'; the key is left
    # unchanged so existing consumers of this dict keep working.
    return {'han': preds[0], 'spam': preds[1]}

# Example call using the settings and tokenizer loaded earlier in the notebook.
predict(
    "Hello world",
    max_words=max_words,
    max_sequence=max_sequence,
    tokenizer=tokenizer,
)



{'han': 0.9797849, 'spam': 0.020215118}