# Automated ML

In [1]:
from azureml.core import Workspace, Experiment
from azureml.data.dataset_factory import TabularDatasetFactory
from train import clean_data
import pandas as pd
from sklearn.model_selection import train_test_split
import os
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.train.automl import AutoMLConfig
from azureml.widgets import RunDetails

We start by setting up our experiment in our workspace.

In [2]:
ws = Workspace.from_config()

# choose a name for experiment
experiment_name = 'pcos_automl'

experiment=Experiment(ws, experiment_name)

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = experiment.start_logging()

Workspace name: quick-starts-ws-136366
Azure region: southcentralus
Subscription id: 5a4ab2ba-6c51-4805-8155-58759ad589d8
Resource group: aml-quickstarts-136366


## Dataset

The dataset for PCOS is a real-time data set that taken from a survey conducted among 119 women between the
ages of 18 and 22. The dataset is primarily based on their lifestyle and food intake habits. The symptoms i.e.
attributes are classified based on classification algorithms to predict whether the patient may have PCOS or not. The
database consists of 119 samples with 18 attributes belonging to two different classes (maybe or maybe not).
There are 14 binary attributes and 4 categorical attributes. <br/>
PCOS-Survey/PCOSData. (2017). GitHub. Retrieved 30
November 2017, from https://github.com/PCOSSurvey/PCOSData

In [3]:
ds = TabularDatasetFactory.from_delimited_files(path="https://raw.githubusercontent.com/priyanshisharma/AI-Champ/master/pcos_data.csv")

Here, we clean and observe our data.

In [4]:
x, y = clean_data(ds)

df = pd.concat([x, pd.DataFrame(y)], axis = 1)
df.head()

Unnamed: 0,Column1,regular_periods,rapid_weight_gain,excess_hair,dark_patches,pimples,depression_and_anxiety,diabetic_hypertension_family_history,body_weight_maintain_difficulty,oily_skin,...,eat_frequency,regular_excercise,sleep_time,wake_time,hostel_stress,personal_stress,peer_pressure,dietary_stress,fast_food_frequency,PCOS
0,0,hb,y,y,n,y,y,n,y,y,...,hm,n,11.00pm,7.00am,n,y,n,n,w,mb n
1,1,im,y,y,n,n,y,n,y,n,...,hm,n,12.00am,8.00am,n,n,n,y,w,mb
2,2,hb,y,y,n,n,y,n,y,n,...,hm,n,11.00pm,7.00am,n,n,y,n,w,mb n
3,3,ib,n,y,n,n,y,y,y,n,...,hm,n,3.00am,7.30am,n,n,n,y,w,mb n
4,4,im,y,y,n,n,y,y,y,y,...,hm,n,12.00am,8.30am,n,y,n,n,w,mb


Now we split our data in order to fe

In [5]:
len(df)

119

In [6]:
outname2='training_dataset3.csv'
outdir2='training3/'
if not os.path.exists(outdir2):
    os.mkdir(outdir2)

df_train, df_test = train_test_split(df, test_size=0.8)

fullpath2=os.path.join(outdir2,outname2)
df_test.to_csv(fullpath2)

outname='validation_dataset3.csv'
outdir='validation3/'
if not os.path.exists(outdir):
    os.mkdir(outdir)

fullpath=os.path.join(outdir,outname)
df_test.to_csv(fullpath)

In [7]:
len(df_train)

23

Now we store our dataset in our default datastore in order to access it.

In [8]:
datastore = ws.get_default_datastore()

In [9]:
datastore.upload(src_dir = "training3/", target_path = "data/")
datastore.upload(src_dir = "validation3/", target_path = "data/")

Uploading an estimated of 1 files
Target already exists. Skipping upload for data/training_dataset3.csv
Uploaded 0 files
Uploading an estimated of 1 files
Target already exists. Skipping upload for data/validation_dataset3.csv
Uploaded 0 files


$AZUREML_DATAREFERENCE_a0fecb791d2f414b85d23918b92bd3f8

In [10]:
training_data = TabularDatasetFactory.from_delimited_files(path = [(datastore, ("data/training_dataset3.csv"))])
validation_data = TabularDatasetFactory.from_delimited_files(path = [(datastore, ("data/validation_dataset3.csv"))])

In [11]:
len(training_data.to_pandas_dataframe())

96

## AutoML Configuration

We start by setting up our compute cluster, where we will run our automl run.

In [12]:
cpu_cluster_name = "cpucluster-aml"

try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',
                                                           max_nodes=4)
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

cpu_cluster.wait_for_completion(show_output=True)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


We've used the following configuration for our run:

|Setting |Reasons|
|-|-|
|**experiment_timeout_minutes**| Maximum amount of time in minutes that all iterations combined can take before the experiment terminates. I've taken this to be 30 mins due to the presence of 730 rows. |
|**max_concurrent_iterations**|These are the iterations occuring simultaneously and has to be equal to the number of nodes in the cluster(5-1))|
|**n_cross_validations**|Using 5 cross validations to avoi8d overfitting) |
|**primary_metric**|Since the data isn't quite balanced, Weighted Average Precision Score |
|**task**|Classification |
|**compute_target**|This is the compute cluster we will be using |
|**training_data**|This is the training dataset stored in the default datastore  |
|**label_column_name**|This is the target variable|


In [13]:
training_data

{
  "source": [
    "('workspaceblobstore', 'data/training_dataset3.csv')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ParseDelimited",
    "DropColumns",
    "SetColumnTypes"
  ]
}

In [14]:
# TODO: Put your automl settings here
automl_settings = {
    "experiment_timeout_minutes" :30,
    "max_concurrent_iterations": 4,
    "n_cross_validations": 3,
    "primary_metric": 'average_precision_score_weighted',
}

# TODO: Put your automl config here
automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    n_cross_validations=3,
    task="classification",
    primary_metric="average_precision_score_weighted",
    compute_target=cpu_cluster,
    training_data=training_data,
    label_column_name="PCOS",
    max_cores_per_iteration=-1,
    enable_onnx_compatible_models=True
    )

Submitting the run

In [15]:
remote_run = experiment.submit(config = automl_config, show_output = True)

Running on remote.
No run_configuration provided, running on cpucluster-aml with default configuration
Running on remote compute: cpucluster-aml
Parent Run ID: AutoML_2753db88-bcd0-4293-a90c-891b78b4837f

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the

## Run Details
The `Rundetails` widget, as the name suggests gives us greater insight about how the Run is proceeding, enabling us to monitor and understand the situation, thereby dealing with it accordingly.

In [None]:
RunDetails(remote_run).show()

## Best Model

The best performing model is the `VotingEnsemble` with a score of 0.9006. It maybe observed to be derived from the following:

|**Field**|Value|
|-|-|
|**Ensembled Iterations**|0, 14, 15, 6, 26|
|**Ensembled Algorithms**|'LightGBM', 'RandomForest', 'XGBoostClassifier', 'ExtremeRandomTrees', 'XGBoostClassifier'|
|**Ensemble Weights**|0.3333333333333333, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666|
|**Best Individual Pipeline Score**|"0.9005922928114609"|

In [37]:
best_run, fitted_model = remote_run.get_output()
print(best_run)
print(fitted_model)

model_ml = best_run.register_model(model_name='PCOS_auto_ml', model_path='./')

Run(Experiment: pcos_automl,
Id: AutoML_2753db88-bcd0-4293-a90c-891b78b4837f_29,
Type: azureml.scriptrun,
Status: Completed)
Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('prefittedsoftvotingclassifier',...
                                                                                                objective=None,
                                                                                                random_state=None,
                                        

## Retrieve and Save ONNX Model


In [18]:
from azureml.automl.runtime.onnx_convert import OnnxConverter

b_run , onnx_mdl = remote_run.get_output(return_onnx_model=True)
onnx_fl_path = "./best_model.onnx"
OnnxConverter.save_onnx_model(onnx_mdl, onnx_fl_path)

### Predict with the ONNX model, using onnxruntime package

In [36]:
import sys
import json
from azureml.automl.core.onnx_convert import OnnxConvertConstants
from azureml.train.automl import constants

if sys.version_info < OnnxConvertConstants.OnnxIncompatiblePythonVersion:
    python_version_compatible = True
else:
    python_version_compatible = False

import onnxruntime
from azureml.automl.runtime.onnx_convert import OnnxInferenceHelper

def get_onnx_res(run):
    res_path = 'onnx_resource.json'
    run.download_file(name=constants.MODEL_RESOURCE_PATH_ONNX, output_file_path=res_path)
    with open(res_path) as f:
        onnx_res = json.load(f)
    return onnx_res

if python_version_compatible:
    mdl_bytes = onnx_mdl.SerializeToString()
    onnx_res = get_onnx_res(b_run)

    df_test['Column1_1'] = 0.0
    onnxrt_helper = OnnxInferenceHelper(mdl_bytes, onnx_res)
    pred_onnx, pred_prob_onnx = onnxrt_helper.predict(df_test)

    print(pred_onnx)
    print(pred_prob_onnx)
else:
    print('Please use Python version 3.6 or 3.7 to run the inference helper.')

['mb n' 'mb n' 'mb n' 'mb' 'mb' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n'
 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n'
 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n'
 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n'
 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n'
 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n'
 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n'
 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n'
 'mb' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n' 'mb n'
 'mb n' 'mb n' 'mb n' 'mb n' 'mb n']
[[0.01430098 0.13477242 0.85092664]
 [0.02792714 0.1486565  0.8234164 ]
 [0.02050737 0.13300157 0.84649116]
 [0.10077284 0.70155394 0.19767326]
 [0.03942947 0.4998218  0.4607487 ]
 [0.0199182  0.13074069 0.8493412 ]
 [0.03091824 0.14138481 0.82769704]
 [0.04183062 0.20063189 0.7575376 ]
 [0.03044434 0.11633296 0.8532228 ]

## Model Deployment

Being the better performing model, I shall hereby deploy the `VotingEnsemble` model.

In [None]:
from azureml.core.model import Model
from azureml.core import Environment
from azureml.core.model import InferenceConfig
from azureml.core.webservice import AciWebservice

In [None]:
os.makedirs('./amlmodel', exist_ok=True)

best_run.download_file('/outputs/model.pkl',os.path.join('./amlmodel','automl_best_model_cc.pkl'))

for f in best_run.get_file_names():
    if f.startswith('outputs'):
        output_file_path = os.path.join('./amlmodel', f.split('/')[-1])
        print(f'Downloading from {f} to {output_file_path} ...')
        best_run.download_file(name=f, output_file_path=output_file_path)


TODO: In the cell below, send a request to the web service you deployed to test it.

In [None]:
model=best_run.register_model(
            model_name = 'automl-bestmodel-cc', 
            model_path = './outputs/model.pkl',
            model_framework=Model.Framework.SCIKITLEARN,
            description='Cervical Cancer Prediction'
)

In [None]:
# Download the conda environment file and define the environement
best_run.download_file('outputs/conda_env_v_1_0_0.yml', 'conda_env.yml')
myenv = Environment.from_conda_specification(name = 'myenv',
                                             file_path = 'conda_env.yml')

In [None]:
# download the scoring file produced by AutoML
best_run.download_file('outputs/scoring_file_v_1_0_0.py', 'score_auto_cc.py')

# set inference config
inference_config = InferenceConfig(entry_script= 'score_auto_cc.py',
                                    environment=myenv)

In [None]:
# set Aci Webservice config
aci_config = AciWebservice.deploy_configuration(cpu_cores=1, memory_gb=1, auth_enabled=True)

In [None]:
service = Model.deploy(workspace=ws, 
                       name='automl-bestmodel-cc', 
                       models=[model], 
                       inference_config=inference_config,
                       deployment_config=aci_config,
                       overwrite=True)

In [None]:
service

In [None]:
# wait for deployment to finish and display the scoring uri and swagger uri
service.wait_for_deployment(show_output=True)

print('Service state:')
print(service.state)

print('Scoring URI:')
print(service.scoring_uri)

print('Swagger URI:')
print(service.swagger_uri)

In [None]:
import json

# select 3  samples from the dataframe
x_df=df.sample(3)
y_df = x_df.pop('PCOS')

x_df['Column1'] = 0.0

# convert the records to a json data file
recored=x_df.to_dict(orient='records')

scoring_json = json.dumps({'data': recored})
print(scoring_json)

Consumint the endpoint using `endpoint.py`

In [None]:
!python3 endpoint.py

In [None]:
output = service.run(scoring_json)
output

In [None]:
y_df

Enabling logging using `logs.py`.

In [None]:
!python3 logs.py