In [1]:
#!pip install sklearn==0.23.2
#!pip install sklearn

# create sklearn ML pipeline

## get raw data

In [2]:
col_to_predict = 'survived'
col_primary_identifer = "user_id"

COLLIST_META = [col_primary_identifer]
COLLIST_FEATURE = ['age', 'fare', 'embarked', 'sex', 'pclass']
COLLIST_ALL     = [col_to_predict] + COLLIST_META + COLLIST_FEATURE
COLLIST_NUMERIC = ['age', 'fare']
COLLIST_CATEGORICAL = ['embarked', 'sex', 'pclass']

In [3]:
SEED = 100

import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

In [4]:
RAW_TRAIN_PATH = "./data_dir/train_data.csv"
RAW_TEST_PATH  = "./data_dir/test_data.csv"
RAW_VAL_PATH   = "./data_dir/val_data.csv"

In [5]:
!mkdir ./data_dir

mkdir: cannot create directory ‘./data_dir’: File exists


In [6]:
titanic_url = ('https://raw.githubusercontent.com/amueller/'
               'scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv')
data = pd.read_csv(titanic_url)
data["user_id"] = ["user_"+str(i) for i in range(len(data))]

X = data[COLLIST_ALL].copy()

train_data, test_data, val_data = np.split(X.sample(frac=1, random_state=SEED), [int(.65*len(X)), int(.9*len(X))])
val_data = val_data[COLLIST_FEATURE]

# save the data
train_data[COLLIST_NUMERIC].to_csv(path_or_buf=RAW_TRAIN_PATH, index=False, header=None)
test_data[COLLIST_NUMERIC].to_csv(path_or_buf=RAW_TEST_PATH, index=False, header=None)
val_data[COLLIST_NUMERIC].to_csv(path_or_buf=RAW_VAL_PATH, index=False, header=None)

print("raw_data: {}, train: {}, test: {}, val: {}".format(X.shape, train_data.shape, test_data.shape, val_data.shape))
X.head(2)

print(X.shape)
X.head(2)

raw_data: (1309, 7), train: (850, 7), test: (328, 7), val: (131, 5)
(1309, 7)


Unnamed: 0,survived,user_id,age,fare,embarked,sex,pclass
0,1,user_0,29.0,211.3375,S,female,1
1,1,user_1,0.9167,151.55,S,male,1


In [7]:
!ls ./data_dir

test_data.csv  train_data.csv  val_data.csv


## bulid ml pipeline - locally

In [8]:
X_train = train_data[COLLIST_FEATURE]
X_test  = test_data[COLLIST_FEATURE]
y_train = train_data[col_to_predict]
y_test  = test_data[col_to_predict]

#X_train = X_train.values
#X_test = X_test.values
#y_train = y_train.values
#y_test = y_test.values

In [9]:
train_data[COLLIST_FEATURE].columns.values

array(['age', 'fare', 'embarked', 'sex', 'pclass'], dtype=object)

In [10]:
#['age', 'fare', 'embarked', 'sex', 'pclass']
#  0,        1,       2,       3,      4

In [11]:
# We create the preprocessing pipelines for both numeric and categorical data.
numeric_features = ['age', 'fare']
categorical_features = ['embarked', 'sex', 'pclass']

#numeric_features = [0, 1]
#categorical_features = [2, 3, 4]

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])


categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(solver='lbfgs'))])

### model traning

In [12]:
clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

model score: 0.799


In [13]:
payload = val_data[0:1].values.tolist()
payload

[[8.0, 36.75, 'S', 'male', 2]]

In [17]:
#clf.predict(payload)

In [18]:
#X_train.head(2)

### model tuning

In [19]:
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'classifier__C': [0.1, 1.0, 10, 100],
}

grid_search = GridSearchCV(clf, param_grid, cv=10, iid=False)
grid_search.fit(X_train, y_train)

print(("best logistic regression from grid search: %.3f"
       % grid_search.score(X_test, y_test)))

best logistic regression from grid search: 0.799


In [20]:
model_main = grid_search.best_estimator_
model_main

Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
       verbose...enalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False))])

In [23]:
#model_main.predict(X_train)

### persist the model

In [24]:
import joblib

In [26]:
joblib.dump(model_main, 'sklearn_clf_pipeline_titanic_v11.pkl')

['sklearn_clf_pipeline_titanic_v11.pkl']

# deploy model

In [27]:
from azureml.core import Workspace


ws = Workspace.from_config()
print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep='\n')

bais-ml
ml-resource-group
eastus
328d2afe-2a26-4e47-8e3b-db00b6ada105


## Register input and output datasets

Here, you will register the data used to create the model in your workspace.

In [28]:
val_data.head(2)

Unnamed: 0,age,fare,embarked,sex,pclass
385,8.0,36.75,S,male,2
212,37.0,29.7,C,male,1


In [29]:
type(X_train)

pandas.core.frame.DataFrame

In [30]:
temp_X_train = pd.DataFrame(X_train)
temp_y_train = pd.DataFrame(y_train)

In [31]:
temp_X_train.to_csv(path_or_buf="features.csv", index=False)
temp_y_train.to_csv(path_or_buf="labels.csv", index=False)

In [33]:
pd.read_csv("./features.csv").head(2)
#pd.read_csv("./labels.csv").head(2)

Unnamed: 0,age,fare,embarked,sex,pclass
0,32.5,211.5,C,male,1
1,,19.9667,S,male,3


In [34]:
import numpy as np

from azureml.core import Dataset

#np.savetxt('features.csv', X_train, delimiter=',')
#np.savetxt('labels.csv', y_train, delimiter=',')

datastore = ws.get_default_datastore()
datastore.upload_files(files=['./features.csv', './labels.csv'],
                       target_path='ds_sklearn_clf_titanic/',
                       overwrite=True)

input_dataset_clf = Dataset.Tabular.from_delimited_files(path=[(datastore, 'ds_sklearn_clf_titanic/features.csv')])
output_dataset_clf = Dataset.Tabular.from_delimited_files(path=[(datastore, 'ds_sklearn_clf_titanic/labels.csv')])

Uploading an estimated of 2 files
Uploading ./features.csv
Uploading ./labels.csv
Uploaded ./labels.csv, 1 files out of an estimated total of 2
Uploaded ./features.csv, 2 files out of an estimated total of 2
Uploaded 2 files


## Register model

In [35]:
!ls -al

total 96
drwxrwxrwx 2 root root     0 Sep 21 08:07 .
drwxrwxrwx 2 root root     0 Sep 21 08:06 ..
drwxrwxrwx 2 root root     0 Sep 21 11:30 .config
drwxrwxrwx 2 root root     0 Sep 21 11:26 data_dir
-rwxrwxrwx 1 root root 17019 Sep 21 13:22 features.csv
-rwxrwxrwx 1 root root 27018 Sep 21 13:19 Inference Pipeline Sklearn - Classification - v1 - working.ipynb
-rwxrwxrwx 1 root root 26784 Sep 21 13:22 Inference Pipeline Sklearn - Classification - v2.ipynb
drwxrwxrwx 2 root root     0 Sep 21 10:55 .ipynb_aml_checkpoints
drwxrwxrwx 2 root root     0 Sep 21 10:56 .ipynb_checkpoints
-rwxrwxrwx 1 root root  1709 Sep 21 13:22 labels.csv
-rwxrwxrwx 1 root root  3781 Sep 21 11:21 sklearn_clf_pipeline_titanic.pkl
-rwxrwxrwx 1 root root  3781 Sep 21 13:22 sklearn_clf_pipeline_titanic_v11.pkl
-rwxrwxrwx 1 root root  3781 Sep 21 13:22 sklearn_clf_pipeline_titanic_v1.pkl
-rwxrwxrwx 1 root root  3732 Sep 21 12:25 sklearn_clf_pipeline_titanic_v2.pkl
-rwxrwxrwx 1 root root  3732 Sep 21 13

In [37]:
import sklearn

from azureml.core import Model
from azureml.core.resource_configuration import ResourceConfiguration


model = Model.register(workspace=ws,
                       model_name='titanic-sklearn-model',                # Name of the registered model in your workspace.
                       model_path='sklearn_clf_pipeline_titanic_v11.pkl',  # Local file to upload and register as a model.
                       model_framework=Model.Framework.SCIKITLEARN,  # Framework used to create the model.
                       model_framework_version=sklearn.__version__,  # Version of scikit-learn used to create the model.
                       sample_input_dataset=input_dataset_clf,
                       sample_output_dataset=output_dataset_clf,
                       resource_configuration=ResourceConfiguration(cpu=1, memory_in_gb=0.5),
                       description='titanic clf model to predict survival.',
                       tags={'area': 'titanic', 'type': 'clf'})

print('Name:', model.name)
print('Version:', model.version)

Registering model titanic-sklearn-model
Name: titanic-sklearn-model
Version: 7


## Deploy model

Deploy your model as a web service using [Model.deploy()](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.model.model?view=azure-ml-py#deploy-workspace--name--models--inference-config--deployment-config-none--deployment-target-none-). Web services take one or more models, load them in an environment, and run them on one of several supported deployment targets. For more information on all your options when deploying models, see the [next steps](#Next-steps) section at the end of this notebook.

For this example, we will deploy your scikit-learn model to an Azure Container Instance (ACI).

In [38]:
service_name = 'service-titanic-sklearn-clf-v11'

service = Model.deploy(ws, service_name, [model], overwrite=True)
service.wait_for_deployment(show_output=True)

Running...............
Succeeded
ACI service creation operation finished, operation "Succeeded"


## Prediction

In [39]:
import json

In [40]:
payload = val_data[0:2].values.tolist()

input_payload = json.dumps({
    'data': payload,
    'method': 'predict'  # If you have a classification model, you can get probabilities by changing this to 'predict_proba'.
})

In [41]:
print(payload)
print(input_payload)

[[8.0, 36.75, 'S', 'male', 2], [37.0, 29.7, 'C', 'male', 1]]
{"data": [[8.0, 36.75, "S", "male", 2], [37.0, 29.7, "C", "male", 1]], "method": "predict"}


In [43]:
parsed_data = json.loads(input_payload)["data"]
model_main.predict(parsed_data)

ValueError: Specifying the columns using strings is only supported for pandas DataFrames

In [39]:
#print(service.get_logs())

In [None]:
stop

# terminate end-point

In [46]:
service.delete()

In [70]:
service

AciWebservice(workspace=Workspace.create(name='bais-ml', subscription_id='328d2afe-2a26-4e47-8e3b-db00b6ada105', resource_group='ml-resource-group'), name=service-titanic-sklearn-clf, image_id=None, compute_type=None, state=ACI, scoring_uri=Deleting, tags=http://1638a427-6d04-4def-ac05-4ddaca2f0912.eastus.azurecontainer.io/score, properties={}, created_by={'azureml.git.repository_uri': 'https://github.com/VijaySingh-GSLab/aws-ml.git', 'mlflow.source.git.repoURL': 'https://github.com/VijaySingh-GSLab/aws-ml.git', 'azureml.git.branch': 'branch-azure-vp', 'mlflow.source.git.branch': 'branch-azure-vp', 'azureml.git.commit': '79b23d26106eb8888281461867c38dced14aa2ea', 'mlflow.source.git.commit': '79b23d26106eb8888281461867c38dced14aa2ea', 'azureml.git.dirty': 'True', 'hasInferenceSchema': 'True', 'hasHttps': 'False'})