# AutoML Classification experiment using Local Compute and Pandas DataFrames
### Save trained model as Scikit-Learn model (.pkl) and as ONNX model (.onnx file)
### Data: German credit dataset loaded from Azure ML Dataset

##  Get Azure ML Workspace to use

In [None]:
# azureml-core of version 1.0.72 or higher is required
from azureml.core import Workspace, Dataset

# Load the Workspace described by the default config.json file
ws = Workspace.from_config()

## Load data from Azure ML Datasets into Pandas DataFrame

In [None]:
# Fetch the Azure ML dataset named 'german-credit' from the workspace
aml_dataset = ws.datasets['german-credit']

# Materialize it as a Pandas DataFrame to take a sneak peek at the data and schema
full_df = aml_dataset.to_pandas_dataframe()
# Alternative (not used): .to_pandas_dataframe().dropna() to discard rows with missing values
full_df.head(5)

In [None]:
# Show summary statistics of the DataFrame to get a feel for the dataset
full_df.describe()

## Clean up the initial dataset (Using related Pandas DataFrame)

In [None]:
# 'Sno' is merely a row identifier, so it is removed from the DataFrame
full_df = full_df.drop(columns=['Sno'])

## Split original dataset in test/train sets using Scikit-Learn train_test_split function

In [None]:
from sklearn.model_selection import train_test_split

# Split the DataFrame with Scikit-Learn's train_test_split function.
# The test DataFrame is kept aside (not given to AutoML) and is only used
# at the end for trying predictions with the trained model.

# Split into train/test only: 20% of the rows go to the test set and
# random_state is fixed so the split is reproducible.
train_df, test_df = train_test_split(full_df, test_size=0.2, random_state=1)
train_df.describe()

# Alternative (not used): split into x/y as well as test/train
# y_df = full_df.pop("Risk")
# x_df = full_df
# x_train, x_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.1, random_state=1)

# Another possibility would be to split using Azure ML Datasets (better for remote compute):
# https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.tabulardataset?view=azure-ml-py#random-split-percentage--seed-none-

In [None]:
# Summary statistics of the held-out test set
test_df.describe()

## List and select primary metric to drive the AutoML classification problem

In [None]:
from azureml.train import automl

# List of possible primary metrics is here:
# https://docs.microsoft.com/en-us/azure/machine-learning/how-to-configure-auto-train#primary-metric

# Get a list of valid metrics for your given task
automl.utilities.get_primary_metrics('classification')

# The AutoMLConfig in this notebook uses 'AUC_weighted' as primary metric (closer to 1.00 is better)

## Define AutoML Experiment settings

In [None]:
import logging

# Explanation of Settings: https://docs.microsoft.com/en-us/azure/machine-learning/how-to-configure-auto-train#configure-your-experiment-settings

# AutoMLConfig info on: 
# https://docs.microsoft.com/en-us/python/api/azureml-train-automl-client/azureml.train.automl.automlconfig.automlconfig

# You can provide additional settings as a **kwargs parameter for the AutoMLConfig object
# automl_settings = {
#     "whitelist_models": 'XGBoostClassifier'
# }

from azureml.train.automl import AutoMLConfig

# Classification experiment trained directly on the in-memory training DataFrame.
# Optional arguments that are left out here: experiment_timeout_minutes,
# iteration_timeout_minutes, blacklist_models, or extra settings via **automl_settings.
# The deprecated X= / y= inputs are replaced by training_data + label_column_name.
automl_config = AutoMLConfig(
    task='classification',
    primary_metric='AUC_weighted',
    training_data=train_df,
    label_column_name="Risk",
    n_cross_validations=5,
    enable_early_stopping=True,
    featurization='auto',
    debug_log='automated_ml_errors.log',
    verbosity=logging.INFO,
    enable_onnx_compatible_models=True,
)

# WARNING: If using X and y parameters (deprecated) you get the following warning
# WARNING - The AutoMLConfig inputs you have specified will soon be deprecated. Please use the AutoMLConfig shown in our documentation: https://aka.ms/AutoMLConfig


## Run Experiment with multiple child runs under the covers

In [None]:
from azureml.core import Experiment
from datetime import datetime

# Build an experiment name carrying a minute-resolution timestamp
now = datetime.now()
time_string = now.strftime("%m-%d-%Y-%H-%M")
print(time_string)

experiment_name = "credit-automl-local-" + time_string
print(experiment_name)

# Bind the experiment to the workspace and submit the AutoML configuration,
# streaming the run's output into the cell.
experiment = Experiment(workspace=ws, name=experiment_name)
run = experiment.submit(automl_config, show_output=True)

## Explore results with Widget

In [None]:
# Explore the results of automatic training with a Jupyter widget:
# https://docs.microsoft.com/en-us/python/api/azureml-widgets/azureml.widgets?view=azure-ml-py
from azureml.widgets import RunDetails
RunDetails(run).show()

## Retrieve the 'Best' Scikit-Learn Model

In [None]:
# get_output() returns the best run together with its fitted model
best_run, fitted_model = run.get_output()

# Print both objects, separated by a divider line
print(best_run, '--------', fitted_model, sep='\n')

## Retrieve the 'Best' ONNX Model
Below we select the best pipeline from our iterations. The get_output method returns the best run and the fitted model. The Model includes the pipeline and any pre-processing. Overloads on get_output allow you to retrieve the best run and fitted model for any logged metric or for a particular iteration.
Set the parameter return_onnx_model=True to retrieve the best ONNX model, instead of the Python model.

In [None]:
# return_onnx_model=True retrieves the best ONNX model instead of the Python model
best_run, onnx_mdl = run.get_output(return_onnx_model=True)

### Explicitly save the best ONNX model to a local drive path

In [None]:
from azureml.automl.runtime.onnx_convert import OnnxConverter
# Target path (relative to the working directory) for the saved ONNX model
onnx_fl_path = "./best_model.onnx"
# Write the ONNX model retrieved from the run to that path
OnnxConverter.save_onnx_model(onnx_mdl, onnx_fl_path)

## See files associated with the 'Best run'

In [None]:
# List the names of the files associated with the best run
print(best_run.get_file_names())

# Example (not run): download a single file, such as the driver log
# best_run.download_file('azureml-logs/70_driver_log.txt')

## Download experiment run related files 
Model files (.pkl and .onnx), Environment files to see Conda and Environment dependencies used by AutoML, etc.

In [None]:
# Download the run's files: model files, environment definitions,
# the generated scoring script and the pipeline graph.
for _run_file in (
    'outputs/model.pkl',
    'outputs/model.onnx',
    'outputs/model_onnx.json',
    'outputs/conda_env_v_1_0_0.yml',
    'outputs/env_dependencies.json',
    'outputs/scoring_file_v_1_0_0.py',
    'outputs/pipeline_graph.json',
):
    best_run.download_file(_run_file)

## Register the Scikit-Learn model (.pkl file)
Once you've trained the model, you can save and register it to your workspace. Model registration lets you store and version your models in your workspace to simplify model management and deployment.

Running the following code will register the model to your workspace, and will make it available to reference by name in remote compute contexts or deployment scripts. 

In [None]:
from azureml.core.model import Model

# The AutoML training ran locally in this notebook's Python environment, so the
# installed library versions are recorded instead of '0.XX.X' placeholders.
# NOTE(review): confirm this still holds if the run was trained elsewhere.
import pandas
import sklearn

model_reg = best_run.register_model(
    model_name='creditmodel_automl_loc',          # Name of the registered model in your workspace.
    description='Binary classification model for German credit risk. From AutoML local training',
    model_path='outputs/model.pkl',               # Path of the model file within the run's outputs.
    model_framework=Model.Framework.SCIKITLEARN,  # Framework used to create the model.
    model_framework_version=sklearn.__version__,  # Installed scikit-learn version.
    tags={'ml-task': "binary-classification-automl", 'business-area': "Credit risk"},
    properties={'pandas-version': pandas.__version__},  # Installed pandas version.
    sample_input_dataset=aml_dataset,
)

print(model_reg)

# (Q1:) How can we know what frameworks/libraries versions were used by AutoML when training the model?
#         - If investigating the outputs/conda_env_v_1_0_0.yml file, some libraries like Pandas and Scikit-Learn are not showing any version...

# (Q2:) Why does best_run.get_environment() fail?
# best_run_environment = best_run.get_environment() 
# print(best_run_environment)


## Make Predictions

### Prep Test Data: Extract X values (feature columns) from test dataset and convert to NumPy array for predicting

In [None]:
import pandas as pd

# Separate the label column from the features without mutating test_df:
# the cell can be re-run safely and x_test_df no longer aliases test_df.
if 'Risk' in test_df.columns:
    y_test_df = test_df['Risk']

# errors='ignore' keeps this working when 'Risk' is not among the columns
x_test_df = test_df.drop(columns=['Risk'], errors='ignore')

In [None]:
# Summary statistics of the test features (label column removed)
x_test_df.describe()

### Make Predictions with Scikit-Learn Model

#### (Optional) Download Model from Registry and load in-memory

In [None]:
# Show where the registered model is resolved for this workspace
print(Model.get_model_path('creditmodel_automl_loc', _workspace=ws))

# Fetch the model from the registry and download it into the working
# directory; download() returns the local path of the downloaded model, which
# is used below instead of a hard-coded file name.
model_definition_from_registry = Model(ws, 'creditmodel_automl_loc')
downloaded_model_path = model_definition_from_registry.download(target_dir='.', exist_ok=True)
print(model_definition_from_registry)
print('-------')

# Load the model into memory.
# NOTE: joblib.load unpickles the file, so only load models from a trusted registry.
import joblib
fitted_model = joblib.load(downloaded_model_path)
print(fitted_model)

In [None]:
# Score the held-out test features with the best model
y_predictions = fitted_model.predict(x_test_df)

# Show the first ten predictions under a short heading
print('10 predictions: ', y_predictions[:10], sep='\n')

In [None]:
# Shape of the predictions (presumably one entry per test row; verify against x_test_df)
y_predictions.shape

## Make Predictions with the ONNX model, using onnxruntime package
Needs `pip install onnxruntime==1.0.0` in the environment (also try with version 1.1.0)

In [None]:
import sys
import json
from azureml.automl.core.onnx_convert import OnnxConvertConstants
from azureml.train.automl import constants

# The ONNX inference helper is only used when the running interpreter is older
# than the first Python version the SDK flags as ONNX-incompatible.
python_version_compatible = (
    sys.version_info < OnnxConvertConstants.OnnxIncompatiblePythonVersion
)
    
import onnxruntime
from azureml.automl.runtime.onnx_convert import OnnxInferenceHelper

def get_onnx_res(run):
    """Download the ONNX resource file of *run* and return its parsed content.

    Args:
        run: Object exposing ``download_file(name=..., output_file_path=...)``;
            this notebook passes the best AutoML run.

    Returns:
        The deserialized JSON content of the downloaded ONNX resource file.

    Side effects:
        Writes ``onnx_resource.json`` to the current working directory.
    """
    res_path = 'onnx_resource.json'
    run.download_file(name=constants.MODEL_RESOURCE_PATH_ONNX, output_file_path=res_path)
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(res_path, encoding='utf-8') as f:
        return json.load(f)

# Run ONNX inference only on a supported Python version; otherwise tell the user.
if not python_version_compatible:
    print('Please use Python version 3.6 or 3.7 to run the inference helper.')
else:
    # The helper is built from the serialized model bytes plus the run's ONNX resource data
    mdl_bytes = onnx_mdl.SerializeToString()
    onnx_res = get_onnx_res(best_run)
    onnxrt_helper = OnnxInferenceHelper(mdl_bytes, onnx_res)

    # Predict on the test features: returns the predictions and their probabilities
    pred_onnx, pred_prob_onnx = onnxrt_helper.predict(x_test_df)

    print('Predicting with ONNX model...')
    print(pred_onnx)
    print(pred_prob_onnx)

### Calculate the Accuracy with Test Dataset (Data not used for training)

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the Scikit-Learn model on the held-out test labels
print('Accuracy with Scikit-Learn model:')
print(accuracy_score(y_test_df, y_predictions))

# pred_onnx is only assigned when the ONNX inference ran on a compatible
# Python version; skip the comparison otherwise instead of raising NameError.
if python_version_compatible:
    print('Accuracy with ONNX model:')
    print(accuracy_score(y_test_df, pred_onnx))
else:
    print('Skipping ONNX accuracy: ONNX predictions are not available on this Python version.')