## Spam Classification Model (Sklearn)

- Wrap a ML model for use as a prediction microservice in seldon-core
- Run locally on Docker to test
- Deploy on seldon-core running on k8s cluster

### Train Locally

In [33]:
import numpy as np 
import pandas as pd
from sklearn.externals import joblib
from pathlib import Path
import string
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pickle
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
model_path: Path=Path('./')

In [35]:
data = pd.read_csv("spam.csv",encoding='latin-1')
data = data.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
data = data.rename(columns={"v1":"class", "v2":"text"})
data.head()

def pre_process(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = [word for word in text.split() if word.lower() not in stopwords.words('english')]
    words = ""
    for i in text:
            stemmer = SnowballStemmer("english")
            words += (stemmer.stem(i))+" "
    return words

features = data['text'].copy()
features = features.apply(pre_process)

vectorizer = TfidfVectorizer("english")
_features = vectorizer.fit_transform(features)
with open('skl-spam-classifier/model/vectorizer.pkl', 'wb') as vect:
    pickle.dump(vectorizer, vect)
    
vectorizer = joblib.load(model_path.joinpath('skl-spam-classifier/model/vectorizer.pkl'))
train_x, test_x, train_y, test_y = train_test_split(_features, data['class'], test_size=0.3, random_state=0)
svc = SVC(kernel='sigmoid', gamma=1.0, probability=True)
svc.fit(train_x,train_y)
# save the model to disk
filename = 'skl-spam-classifier/model/model.pkl'
pickle.dump(svc, open(filename, 'wb'))

clf = joblib.load(model_path.joinpath(filename))

prediction = clf.predict(test_x)
accuracy_score(test_y,prediction)

0.9730861244019139

In [36]:
message = np.array(['click here to win the price'])
data = vectorizer.transform(message).todense()
probas = clf.predict_proba(data)
probas

array([[0.02762687, 0.97237313]])

In [37]:
clf.classes_

array(['ham', 'spam'], dtype=object)

## Spam Classification Model (keras)

- Wrap a ML model for use as a prediction microservice in seldon-core
- Run locally on Docker to test
- Deploy on seldon-core running on k8s cluster

you can find data here: https://www.kaggle.com/benvozza/spam-classification/data

### Train Locally

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
import pickle
from sklearn.externals import joblib
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
from keras.engine.saving import model_from_json
from keras.layers import (
    Bidirectional,
    concatenate,
    Dense,
    Embedding,
    LSTM,
    Masking,
    Reshape,
    SpatialDropout1D,
    TimeDistributed,
)

Instructions for updating:
non-resource variables are not supported in the long term


In [8]:
data = pd.read_csv("spam.csv",encoding='latin-1')

data = data.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
data = data.rename(columns={"v1":"class", "v2":"text"})

X = data.text
Y = data['class']
le = LabelEncoder()
Y = le.fit_transform(Y)
Y = Y.reshape(-1,1)

X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.15)
max_words = 1000
max_len = 150
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

with open('keras-spam-classifier/model/tokenizer.pkl', 'wb') as tok:
    pickle.dump(tokenizer, tok)

    
sequences = tokenizer.texts_to_sequences(X_train)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)


In [17]:
def RNN():
    inputs = Input(name='inputs',shape=[max_len])
    layer = Embedding(max_words,50,input_length=max_len)(inputs)
    layer = LSTM(64)(layer)
    layer = Dense(256,name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(1,name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs,outputs=layer)
    return model

model = RNN()
model.summary()
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])

model.fit(sequences_matrix,Y_train,batch_size=128,epochs=10,
validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])

#save model
model_json = model.to_json()
with open("keras-spam-classifier/model/architecture.json", "w") as json_file:
    json_file.write(model_json)
model.save_weights("keras-spam-classifier/model/weights.h5")

### wrap each model component using s2i

In [39]:
!s2i build keras-spam-classifier/ seldonio/seldon-core-s2i-python3:1.2.1-dev spam-classifier:1.0.0.1

---> Installing application source...
---> Installing dependencies ...
Looking in links: /whl
Collecting scikit-learn==0.21.2 (from -r requirements.txt (line 1))
  Url '/whl' is ignored. It is either a non-existing path or lacks a specific scheme.
Downloading https://files.pythonhosted.org/packages/85/04/49633f490f726da6e454fddc8e938bbb5bfed2001681118d3814c219b723/scikit_learn-0.21.2-cp36-cp36m-manylinux1_x86_64.whl (6.7MB)
Collecting keras (from -r requirements.txt (line 3))
  Url '/whl' is ignored. It is either a non-existing path or lacks a specific scheme.
Downloading https://files.pythonhosted.org/packages/ad/fd/6bfe87920d7f4fd475acd28500a42482b6b84479832bdc0fe9e589a60ceb/Keras-2.3.1-py2.py3-none-any.whl (377kB)
Collecting joblib>=0.11 (from scikit-learn==0.21.2->-r requirements.txt (line 1))
  Url '/whl' is ignored. It is either a non-existing path or lacks a specific scheme.
Downloading https://files.pythonhosted.org/packages/8f/42/155696f85f344c066e17af287359c9786b436b1bf86029b

In [73]:
!docker run --name "spam-classifier" -d --rm -p 5000:5000 spam-classifier:1.0.0.1

59be46468915231d3915161343486d083d547fde192baa7367ff411efe34c52f


In [74]:
!curl -g http://localhost:5000/predict --data-urlencode 'json={"data": {"names": ["message"], "ndarray": ["click here to win the price"]}}'


{"data":{"ndarray":["0.9779371008528993","spam"]},"meta":{}}


In [75]:
!docker rm spam-classifier --force

spam-classifier


In [47]:
!s2i build keras-spam-classifier/ seldonio/seldon-core-s2i-python3:1.2.1-dev keras-spam-classifier:1.0.0.1

---> Installing application source...
---> Installing dependencies ...
Looking in links: /whl
Collecting scikit-learn==0.21.2 (from -r requirements.txt (line 1))
  Url '/whl' is ignored. It is either a non-existing path or lacks a specific scheme.
Downloading https://files.pythonhosted.org/packages/85/04/49633f490f726da6e454fddc8e938bbb5bfed2001681118d3814c219b723/scikit_learn-0.21.2-cp36-cp36m-manylinux1_x86_64.whl (6.7MB)
Collecting keras (from -r requirements.txt (line 3))
  Url '/whl' is ignored. It is either a non-existing path or lacks a specific scheme.
Downloading https://files.pythonhosted.org/packages/ad/fd/6bfe87920d7f4fd475acd28500a42482b6b84479832bdc0fe9e589a60ceb/Keras-2.3.1-py2.py3-none-any.whl (377kB)
Collecting scipy>=0.17.0 (from scikit-learn==0.21.2->-r requirements.txt (line 1))
  Url '/whl' is ignored. It is either a non-existing path or lacks a specific scheme.
Downloading https://files.pythonhosted.org/packages/29/50/a552a5aff252ae915f522e44642bb49a7b7b31677f9580

In [49]:
!docker run --name "keras-spam-classifier" --rm -d -p 5000:5000  keras-spam-classifier:1.0.0.1

cf8ebd9bff95cb81ca0cd39393e5c3f2707d7b6394ddc0ace15e41a47248c3e8


In [50]:
!s2i build Translator/ seldonio/seldon-core-s2i-python3:1.2.1-dev translator:1.0.0.1

---> Installing application source...
---> Installing dependencies ...
Looking in links: /whl
Collecting goslate (from -r requirements.txt (line 1))
  Url '/whl' is ignored. It is either a non-existing path or lacks a specific scheme.
Downloading https://files.pythonhosted.org/packages/39/0b/50af938a1c3d4f4c595b6a22d37af11ebe666246b05a1a97573e8c8944e5/goslate-1.5.1.tar.gz
Collecting futures (from goslate->-r requirements.txt (line 1))
  Url '/whl' is ignored. It is either a non-existing path or lacks a specific scheme.
Downloading https://files.pythonhosted.org/packages/05/80/f41cca0ea1ff69bce7e7a7d76182b47bb4e1a494380a532af3e8ee70b9ec/futures-3.1.1-py3-none-any.whl
Building wheels for collected packages: goslate
Running setup.py bdist_wheel for goslate: started
Running setup.py bdist_wheel for goslate: finished with status 'done'
Stored in directory: /root/.cache/pip/wheels/4f/7f/28/6f52271012a7649b54b1a7adaae329b4246bbbf9d1e4f6e51a
Successfully built goslate
Installing collected pack

In [53]:
!docker run --name "translator" -d --rm -p 5000:5000 translator:1.0.0.1

26d5f31364f5095680bd89aabae468f0c4cec0f8cc7da5de4ea77434e4836692


In [54]:
!curl -g http://localhost:5000/transform-input --data-urlencode 'json={"data": {"names": ["message"], "ndarray": ["Wie läuft dein Tag"]}}'

{"data":{"names":["message"],"ndarray":["How is your day"]},"meta":{}}


In [55]:
!docker rm translator --force

translator


In [59]:
!s2i build Combiner/ seldonio/seldon-core-s2i-python3:1.2.1-dev combiner:1.0.0.1

---> Installing application source...
Build completed successfully


In [61]:
!docker run --name "model-combiner" -d --rm -p 5000:5000 combiner:1.0.0.1

19fbbdfb073c7da1056611aa18d8c7d3da9a533010667638d8c0cb0abfe6e257


In [67]:
#!curl -g http://localhost:5000/aggregate --data-urlencode 'json={"data": {"names": ["message"], "ndarray": [["0.7","Spam"], ["0.80", "Spam"]]}}'

In [68]:
!docker rm model-combiner --force

model-combiner


#### Assuming you have kubernetes cluster running and seldon-core installed, you can deploy your Machine Learning model using:

kubectl apply -f deploy.yaml