In [None]:
import sys, os
# Make the project root (the parent of the notebook's working directory)
# importable by putting it at the front of the module search path.
notebook_dir = os.path.abspath(os.curdir)
proj_root = os.path.dirname(notebook_dir)
sys.path.insert(0, proj_root)
print(proj_root)

In [2]:
%load_ext autoreload
%autoreload 2

import json
import os
import pickle
import tarfile

import pandas as pd
import numpy as np
import requests
from sklearn import svm
from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import sklearn
print(sklearn.__version__)

0.22.1


In [3]:
# Load the breast-cancer data set and attach a single DataFrame holding the
# feature columns plus the label column.
label_col = 'target'
data_bunch = datasets.load_breast_cancer()
feature_cols = data_bunch.feature_names
data_bunch.frame = (
    pd.DataFrame(data_bunch.data, columns=feature_cols)
    .assign(**{label_col: data_bunch.target})
)

In [4]:
# Hold out a reproducible 20% validation split and write both splits to CSV.
data_dir = '../data'
filename = os.path.split(data_bunch.filename)[-1]
val_filename = 'val_' + filename

# Seed the global NumPy RNG so the shuffled row order is the same on every run.
np.random.seed(123)
idx = np.random.permutation(data_bunch.frame.index)
split = int(len(data_bunch.frame) * .2)
val_idx, train_idx = idx[:split], idx[split:]

# The first `split` shuffled rows form the validation set, the rest the training set.
for prefix, rows in (('val_', val_idx), ('train_', train_idx)):
    data_bunch.frame.loc[rows, :].to_csv(os.path.join(data_dir, prefix + filename), index=False)

In [5]:
# Store the validation data in the CP4D project, unless an asset with the
# same name is already registered there.
from app.pipeline import Pipeline as _Pipeline

pipeline = _Pipeline()
pipeline.set_connection()
pipeline.set_project('mlops')

asset_details = pipeline.wml_client.get_asset_details()
existing_names = [asset['name'] for asset in asset_details]
if val_filename not in existing_names:
    # NOTE(review): the second argument is presumably the local file path; the CSV
    # was written under `data_dir`, not the working directory -- confirm this resolves.
    pipeline.wml_client.data_assets.create(val_filename, val_filename)

In [6]:
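# Optional cleanup: delete the data asset (left disabled). NOTE: this compares
# against `filename`, whereas the asset above is created as `val_filename`.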
# asset_details = pipeline.wml_client.get_asset_details()
# for record in asset_details:
#     if record['name'] == filename:
#         pipeline.wml_client.delete(record['asset_id'])

In [7]:
def train(data, target, gamma=0.001, C=100., probability=True):
    """Fit a support-vector classifier and return the fitted estimator.

    Parameters
    ----------
    data
        Training features, passed unchanged to ``SVC.fit``.
    target
        Training labels, passed unchanged to ``SVC.fit``.
    gamma, C, probability
        Forwarded to ``svm.SVC``. The defaults reproduce the values that
        were previously hard-coded, so existing ``train(data, target)``
        calls behave exactly as before.

    Returns
    -------
    The fitted ``svm.SVC`` instance.
    """
    clf = svm.SVC(gamma=gamma, C=C, probability=probability)
    clf.fit(data, target)
    return clf

In [8]:
def serialize_model(clf, _dir, name):
    """Pickle ``clf`` and pack the pickle into ``<_dir>/<name>.tar.gz``.

    Parameters
    ----------
    clf
        Any picklable object (here: the fitted model / pipeline).
    _dir
        Directory that receives the archive. It is created if missing.
    name
        Base name (no extension) of both the intermediate ``.pkl`` file and
        the resulting ``.tar.gz`` archive.

    Returns
    -------
    str
        Path of the written ``.tar.gz`` archive.

    The intermediate ``.pkl`` file is always removed, even when pickling or
    archiving raises.
    """
    # An empty _dir means "current directory"; os.makedirs('') would raise.
    if _dir:
        os.makedirs(_dir, exist_ok=True)

    pklpath = os.path.join(_dir, name + '.pkl')
    tar_filepath = os.path.splitext(pklpath)[0] + '.tar.gz'
    try:
        with open(pklpath, "wb") as f:
            pickle.dump(clf, f)

        with tarfile.open(tar_filepath, 'w:gz') as tar:
            # Store the member under its bare file name so that it is
            # extracted directly into whatever folder extraction targets.
            tar.add(pklpath, arcname=os.path.basename(pklpath))
    finally:
        # Do not leave the temporary pickle behind, on success or failure.
        if os.path.exists(pklpath):
            os.remove(pklpath)
    return tar_filepath

In [9]:
def load_serialized_model(tarpath):
    """Load the pickled object stored in a ``<name>.tar.gz`` archive.

    The archive is expected to contain a member called ``<name>.pkl``. That
    member is extracted next to the archive, unpickled, and the extracted
    file is removed again.

    Parameters
    ----------
    tarpath
        Path to the ``.tar.gz`` archive.

    Returns
    -------
    The unpickled object.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code; only load archives that come
    from a trusted source.
    """
    _dir, tarname = os.path.split(tarpath)

    # str.rstrip('.tar.gz') strips any trailing run of the characters
    # '.', 't', 'a', 'r', 'g', 'z' (e.g. 'target.tar.gz' -> 'targe'), so
    # remove the suffix explicitly instead.
    suffix = '.tar.gz'
    stem = tarname[:-len(suffix)] if tarname.endswith(suffix) else tarname
    pklname = stem + '.pkl'
    pklpath = os.path.join(_dir, pklname)

    with tarfile.open(tarpath, 'r:gz') as tar:
        tar.extract(pklname, path=_dir)

    try:
        with open(pklpath, 'rb') as f:
            model = pickle.load(f)
    finally:
        # Remove the extracted pickle even if unpickling fails.
        os.remove(pklpath)
    return model

In [10]:
# Get data: read the train / validation splits back from `data_dir`, the same
# directory the splits were written to, rather than repeating the literal path.
train_data = pd.read_csv(os.path.join(data_dir, 'train_' + filename))
val_data = pd.read_csv(os.path.join(data_dir, 'val_' + filename))

In [11]:
# Fit the SVM on the training split, wrap it in a single-step sklearn
# Pipeline, and report metrics on the held-out validation split.
X_train, y_train = train_data[feature_cols], train_data[label_col]
clf = train(X_train, y_train)
pipeline = Pipeline(steps=[('svc', clf)])

X_val = val_data.drop(columns=label_col)
preds = pipeline.predict(X_val)
print(classification_report(val_data[label_col], preds))

              precision    recall  f1-score   support

           0       0.93      0.93      0.93        40
           1       0.96      0.96      0.96        73

    accuracy                           0.95       113
   macro avg       0.94      0.94      0.94       113
weighted avg       0.95      0.95      0.95       113



In [12]:
# Persist the fitted pipeline as a .tar.gz archive under ../models and show its path.
tarpath = serialize_model(clf=pipeline, _dir='../models', name='svm')
tarpath

'../models/svm.tar.gz'

In [13]:
# Round-trip check: reload the archived pipeline and re-score the validation
# split; the report should match the one produced before serialization.
pipeline = load_serialized_model(tarpath=tarpath)

val_features = val_data.drop(columns=label_col)
preds = pipeline.predict(val_features)
print(classification_report(val_data[label_col], preds))

              precision    recall  f1-score   support

           0       0.93      0.93      0.93        40
           1       0.96      0.96      0.96        73

    accuracy                           0.95       113
   macro avg       0.94      0.94      0.94       113
weighted avg       0.95      0.95      0.95       113

