# Automated ML

TODO: Import Dependencies. In the cell below, import all the dependencies that you will need to complete the project.

In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
import pkg_resources
import azureml.core
from azureml.core import Workspace, Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.dataset import Dataset
from azureml.core.environment import Environment
from azureml.core.model import InferenceConfig
from azureml.core.webservice import AciWebservice, Webservice
from azureml.core.model import InferenceConfig, Model
from azureml.train.automl import AutoMLConfig
from azureml.pipeline.steps import AutoMLStep
from azureml.widgets import RunDetails
from pprint import pprint
import json
import requests
import logging
import os
import csv

## Dataset

### Overview
Employee attrition affects every organization. The IBM HR Attrition Case Study aims to determine the factors that lead to employee attrition and to predict which employees are at risk of leaving the company.

The Dataset consists of 35 columns, which will help us predict employee attrition. We will use the AutoML feature of Microsoft Azure to train different models on the dataset, deploy the best model and interact with it as a web service.




**Import Workspace**

In [2]:
# Connect to the workspace via Workspace.from_config() and print a short summary.
ws = Workspace.from_config()

workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print('\n'.join(workspace_summary))

Workspace name: quick-starts-ws-143105
Azure region: southcentralus
Subscription id: 1b944a9b-fdae-4f97-aeb1-b7eea0beac53
Resource group: aml-quickstarts-143105


**Create an Experiment**

In [3]:
# Name under which the AutoML experiment is created in the workspace.
experiment_name = 'employee-attrition-automl'
experiment = Experiment(workspace=ws, name=experiment_name)

# NOTE(review): `run` is not referenced again in the visible cells and is never
# completed there -- confirm whether this interactive logging run is needed.
run = experiment.start_logging()

**Create Compute Cluster**

In [5]:
cluster_name = "notebook143105"

# Look the compute target up by name; provision a new one only when the
# lookup raises ComputeTargetException.
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    print('Creating a new compute target!')
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        max_nodes=4,
    )
    cpu_cluster = ComputeTarget.create(ws, cluster_name, provisioning_config)
else:
    print('Found existing compute target, using it!')

cpu_cluster.wait_for_completion(show_output=True)

# Print the detailed status reported by get_status() for the cluster.
print(cpu_cluster.get_status().serialize())

Found existing compute target, using it!

Running
{'errors': [], 'creationTime': '2021-04-19T03:10:01.054453+00:00', 'createdBy': {'userObjectId': '695a9b50-dd79-4e6b-b760-e29d07a0e1fd', 'userTenantId': '660b3398-b80e-49d2-bc5b-ac1dc93b5254', 'userName': 'ODL_User 143105'}, 'modifiedTime': '2021-04-19T03:12:32.756932+00:00', 'state': 'Running', 'vmSize': 'STANDARD_DS3_V2'}


**Import Dataset**

In [6]:
# Reuse the dataset registered in the workspace under `key` when it exists;
# otherwise build it from the CSV URL and register it.
# NOTE: update the key to match the dataset name
key = "Employee Attrition"
description_text = "IBM HR Analytics Employee Attrition & Performance"

found = key in ws.datasets.keys()

if found:
    dataset = ws.datasets[key]
else:
    # Create the tabular dataset from the delimited file, then register it
    # in the workspace under `key`.
    data = 'https://raw.githubusercontent.com/ObinnaIheanachor/Capstone-Project-Udacity-Machine-Learning-Engineer/main/data/WA_Fn-UseC_-HR-Employee-Attrition.csv'
    unregistered_dataset = Dataset.Tabular.from_delimited_files(data)
    dataset = unregistered_dataset.register(
        workspace=ws,
        name=key,
        description=description_text,
    )

# Materialise as a pandas DataFrame and show summary statistics.
df = dataset.to_pandas_dataframe()
df.describe()

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,...,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92381,802.485714,9.192517,2.912925,1.0,1024.865306,2.721769,65.891156,2.729932,2.063946,...,2.712245,80.0,0.793878,11.279592,2.79932,2.761224,7.008163,4.229252,2.187755,4.123129
std,9.135373,403.5091,8.106864,1.024165,0.0,602.024335,1.093082,20.329428,0.711561,1.10694,...,1.081209,0.0,0.852077,7.780782,1.289271,0.706476,6.126525,3.623137,3.22243,3.568136
min,18.0,102.0,1.0,1.0,1.0,1.0,1.0,30.0,1.0,1.0,...,1.0,80.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,465.0,2.0,2.0,1.0,491.25,2.0,48.0,2.0,1.0,...,2.0,80.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,802.0,7.0,3.0,1.0,1020.5,3.0,66.0,3.0,2.0,...,3.0,80.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,1157.0,14.0,4.0,1.0,1555.75,4.0,83.75,3.0,3.0,...,4.0,80.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0
max,60.0,1499.0,29.0,5.0,1.0,2068.0,4.0,100.0,4.0,5.0,...,4.0,80.0,3.0,40.0,6.0,4.0,40.0,18.0,15.0,17.0


## AutoML Configuration

This is a binary classification problem whose label column, `Attrition`, takes the values `true` or `false`. The experiment times out after `20` minutes (`experiment_timeout_minutes`), at most 5 iterations run concurrently (`max_concurrent_iterations`), and the primary metric is `AUC_weighted`.

In [7]:
# AutoML settings shared with AutoMLConfig below:
#   experiment_timeout_minutes -- wall-clock budget for the whole experiment
#   max_concurrent_iterations  -- upper bound on iterations running in parallel
#   primary_metric             -- metric used to rank the trained models
# NOTE(review): the cluster-creation cell requests max_nodes=4 -- confirm that
# 5 concurrent iterations is intended for the cluster actually in use.
automl_settings = {
    "experiment_timeout_minutes": 20,
    "max_concurrent_iterations": 5,
    "primary_metric": 'AUC_weighted',
}

# AutoML configuration: classification on the registered dataset with
# `Attrition` as the label column, run on the remote compute cluster.
automl_config = AutoMLConfig(
    compute_target=cpu_cluster,
    task="classification",
    training_data=dataset,
    label_column_name="Attrition",
    path='./capstone-project',
    enable_early_stopping=True,
    featurization='auto',
    debug_log="automl_errors.log",
    **automl_settings
)

In [8]:
# Submit the AutoML configuration to the experiment; show_output=True streams
# the run's progress into the cell output.
remote_run = experiment.submit(config=automl_config, show_output=True)

Submitting remote run.
No run_configuration provided, running on notebook143105 with default configuration
Running on remote compute: notebook143105


Experiment,Id,Type,Status,Details Page,Docs Page
employee-attrition-automl,AutoML_4d9148a5-c041-4e96-a3d9-557b261db721,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Cross validation
STATUS:       DONE
DESCRIPTION:  Each iteration of the trained model was validated through cross-validation.
              
DETAILS:      
+---------------------------------+
|Number of folds                  |
|3                                |
+---------------------------------+

****************************************************************************************************

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:

## Run Details


TODO: In the cell below, use the `RunDetails` widget to show the different experiments.

In [9]:

# Display the RunDetails widget for the submitted AutoML run.
RunDetails(remote_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

## Best Model

Our best model is VotingEnsemble with an AUC_weighted of **0.82302**. We will retrieve the best model and display its properties.



In [11]:
# Retrieve the best child run together with its fitted model, then its metrics.
automl_output = remote_run.get_output()
best_run, fitted_model = automl_output
best_run_metrics = best_run.get_metrics()

print(best_run)
print('Best Run Id: ', best_run.id)

Run(Experiment: employee-attrition-automl,
Id: AutoML_4d9148a5-c041-4e96-a3d9-557b261db721_20,
Type: azureml.scriptrun,
Status: Completed)
Best Run Id:  AutoML_4d9148a5-c041-4e96-a3d9-557b261db721_20


In [12]:
# Print the fitted model obtained from remote_run.get_output().
print(fitted_model)

Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('prefittedsoftvotingclassifier',...
                                                                                                  n_estimators=10,
                                                                                                  n_jobs=1,
                                                                                                  oob_score=False,
                                                      

In [13]:
def print_model(fitted_model, prefix=""):
    """Recursively print every step of a fitted pipeline.

    Each step name is printed with ``prefix`` in front of it. A step that has
    both ``estimators`` and ``weights`` attributes is treated as an ensemble:
    its member names and weights are pretty-printed, then each member is
    printed recursively with ``"<member name> - "`` as the prefix. Any other
    step has the result of its ``get_params()`` pretty-printed.

    Args:
        fitted_model: Object exposing ``steps``, an iterable of
            ``(name, step)`` pairs.
        prefix: Text placed before every step name at this level.
    """
    for step in fitted_model.steps:
        step_name, step_obj = step[0], step[1]
        print(prefix + step_name)

        is_ensemble = hasattr(step_obj, 'estimators') and hasattr(step_obj, 'weights')
        if not is_ensemble:
            pprint(step_obj.get_params())
            print()
            continue

        member_names = [member[0] for member in step_obj.estimators]
        pprint({'estimators': member_names, 'weights': step_obj.weights})
        print()
        for member in step_obj.estimators:
            print_model(member[1], member[0] + ' - ')

# Print every step of the fitted model, starting with an empty prefix.
print_model(fitted_model)


datatransformer
{'enable_dnn': None,
 'enable_feature_sweeping': None,
 'feature_sweeping_config': None,
 'feature_sweeping_timeout': None,
 'featurization_config': None,
 'force_text_dnn': None,
 'is_cross_validation': None,
 'is_onnx_compatible': None,
 'logger': None,
 'observer': None,
 'task': None,
 'working_dir': None}

prefittedsoftvotingclassifier
{'estimators': ['9', '6', '12', '7', '8', '1', '19', '10', '11', '15'],
 'weights': [0.06666666666666667,
             0.06666666666666667,
             0.26666666666666666,
             0.06666666666666667,
             0.06666666666666667,
             0.06666666666666667,
             0.13333333333333333,
             0.13333333333333333,
             0.06666666666666667,
             0.06666666666666667]}

9 - maxabsscaler
{'copy': True}

9 - extratreesclassifier
{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 0.1,
 'max_leaf_nodes': None,
 'max_sample

In [14]:
# List every metric recorded for the best run as "<name> - <value>".
for metric_name, metric in best_run_metrics.items():
    print(metric_name, "-", metric)

f1_score_weighted - 0.8538543075956223
precision_score_micro - 0.8659863945578231
average_precision_score_weighted - 0.8860677828200302
average_precision_score_macro - 0.7671140244975878
recall_score_weighted - 0.8659863945578231
accuracy - 0.8659863945578231
f1_score_macro - 0.7120569006195175
f1_score_micro - 0.8659863945578232
weighted_accuracy - 0.9324496327114783
recall_score_micro - 0.8659863945578231
AUC_weighted - 0.8230223307502397
precision_score_weighted - 0.8521098697580415
balanced_accuracy - 0.6881948112652639
log_loss - 0.4460903626306712
matthews_correlation - 0.44183035644626195
norm_macro_recall - 0.3763896225305278
recall_score_macro - 0.6881948112652639
AUC_macro - 0.8230223307502396
precision_score_macro - 0.7607064311137334
AUC_micro - 0.9295029848674163
average_precision_score_micro - 0.9185608350311462
confusion_matrix - aml://artifactId/ExperimentRun/dcid.AutoML_4d9148a5-c041-4e96-a3d9-557b261db721_20/confusion_matrix
accuracy_table - aml://artifactId/Experimen

In [15]:

# Register the best run's model file, recording its AUC_weighted as a property.
model_properties = {'AUC_weighted': best_run_metrics['AUC_weighted']}
automodel = best_run.register_model(
    model_name='automl_model',
    model_path='outputs/model.pkl',
    tags={'Method': 'AutoML'},
    properties=model_properties,
)

print(automodel)

Model(workspace=Workspace.create(name='quick-starts-ws-143105', subscription_id='1b944a9b-fdae-4f97-aeb1-b7eea0beac53', resource_group='aml-quickstarts-143105'), name=automl_model, id=automl_model:1, version=1, tags={'Method': 'AutoML'}, properties={'AUC_weighted': '0.8230223307502397'})


## Model Deployment

Remember that you only have to deploy one of the two models you trained. Perform the steps in the rest of this notebook only if you wish to deploy this model.

TODO: In the cell below, register the model, create an inference config and deploy the model as a web service.

In [16]:
# Download the scoring script and the environment file from the best run's
# outputs, saving each under the given local name.
run_artifacts = (
    ('outputs/scoring_file_v_1_0_0.py', 'score.py'),
    ('outputs/conda_env_v_1_0_0.yml', 'env.yml'),
)
for remote_path, local_path in run_artifacts:
    best_run.download_file(remote_path, local_path)

In [17]:
# ACI deployment configuration: 1 CPU core and 1 GB of memory.
aci_settings = dict(
    cpu_cores=1,
    memory_gb=1,
    description='Predict Employee Attrition with AutoML',
)
aciconfig = AciWebservice.deploy_configuration(**aci_settings)

In [18]:
# Score with the downloaded entry script inside the best run's environment.
scoring_environment = best_run.get_environment()
inference_config = InferenceConfig(
    entry_script="score.py",
    environment=scoring_environment,
)

# Deploy the registered model as a web service using the ACI configuration.
service = Model.deploy(
    ws,
    'automl-webservice',
    [automodel],
    inference_config=inference_config,
    deployment_config=aciconfig,
)

TODO: In the cell below, send a request to the web service you deployed to test it.

In [19]:
# Wait for the web service deployment to finish, printing progress as it goes.
service.wait_for_deployment(show_output=True)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2021-04-19 06:46:53+00:00 Creating Container Registry if not exists.
2021-04-19 06:46:53+00:00 Registering the environment.
2021-04-19 06:46:55+00:00 Use the existing image.
2021-04-19 06:46:55+00:00 Generating deployment configuration.
2021-04-19 06:46:57+00:00 Submitting deployment to compute.
2021-04-19 06:47:02+00:00 Checking the status of deployment automl-webservice..
2021-04-19 06:50:52+00:00 Checking the status of inference endpoint automl-webservice.
Succeeded
ACI service creation operation finished, operation "Succeeded"


In [20]:

# Report the deployed service's state and endpoints.
service_info = (
    ("Service State: ", service.state),
    ("Scoring URI: ", service.scoring_uri),
    ("Swagger URI: ", service.swagger_uri),
)
for label, value in service_info:
    print(label, value)

Service State:  Healthy
Scoring URI:  http://baa3b839-ff6c-4d34-952e-28f72c3f33c6.southcentralus.azurecontainer.io/score
Swagger URI:  http://baa3b839-ff6c-4d34-952e-28f72c3f33c6.southcentralus.azurecontainer.io/swagger.json


TODO: In the cell below, print the logs of the web service and delete the service

In [21]:
# Run the helper script logs.py through the shell.
# NOTE(review): logs.py is not shown in this notebook -- presumably it prints
# the web service logs; confirm against the script.
!python logs.py

2021-04-19T06:50:47,201909900+00:00 - gunicorn/run 
2021-04-19T06:50:47,201887700+00:00 - rsyslog/run 
2021-04-19T06:50:47,216630700+00:00 - nginx/run 
2021-04-19T06:50:47,217653500+00:00 - iot-server/run 
/usr/sbin/nginx: /azureml-envs/azureml_8e5a5a51349877e7d47c6a2872e0ebfd/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_8e5a5a51349877e7d47c6a2872e0ebfd/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_8e5a5a51349877e7d47c6a2872e0ebfd/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_8e5a5a51349877e7d47c6a2872e0ebfd/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_8e5a5a51349877e7d47c6a2872e0ebfd/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)


**Send a request to the deployed web service**

In [22]:
#Import test data
test_df = df.sample(4) # sample data from original dataset
label_df = test_df.pop('Attrition')

test_sample = json.dumps({'data': test_df.to_dict(orient='records')})

print(test_sample)

{"data": [{"Age": 32, "BusinessTravel": "Travel_Frequently", "DailyRate": 379, "Department": "Sales", "DistanceFromHome": 5, "Education": 2, "EducationField": "Life Sciences", "EmployeeCount": 1, "EmployeeNumber": 889, "EnvironmentSatisfaction": 2, "Gender": "Male", "HourlyRate": 48, "JobInvolvement": 3, "JobLevel": 2, "JobRole": "Sales Executive", "JobSatisfaction": 2, "MaritalStatus": "Married", "MonthlyIncome": 6524, "MonthlyRate": 8891, "NumCompaniesWorked": 1, "Over18": true, "OverTime": false, "PercentSalaryHike": 14, "PerformanceRating": 3, "RelationshipSatisfaction": 4, "StandardHours": 80, "StockOptionLevel": 1, "TotalWorkingYears": 10, "TrainingTimesLastYear": 3, "WorkLifeBalance": 3, "YearsAtCompany": 10, "YearsInCurrentRole": 8, "YearsSinceLastPromotion": 5, "YearsWithCurrManager": 3}, {"Age": 45, "BusinessTravel": "Travel_Rarely", "DailyRate": 192, "Department": "Research & Development", "DistanceFromHome": 10, "Education": 2, "EducationField": "Life Sciences", "EmployeeCo

In [23]:
scoring_uri = service.scoring_uri
input_data = test_sample

# Set the content type
headers = {'Content-Type': 'application/json'}

# Make the request. The timeout keeps the cell from hanging indefinitely if
# the endpoint stops responding, and raise_for_status() surfaces HTTP errors
# instead of printing an error body as if it were a prediction.
resp = requests.post(scoring_uri, data=input_data, headers=headers, timeout=60)
resp.raise_for_status()

# Display the response
print(resp.text)

"{\"result\": [false, false, false, false]}"


In [24]:

# Print the logs returned by the deployed web service.
print(service.get_logs())

2021-04-19T07:00:50,781641000+00:00 - iot-server/run 
2021-04-19T07:00:50,780963100+00:00 - gunicorn/run 
2021-04-19T07:00:50,780214700+00:00 - rsyslog/run 
2021-04-19T07:00:50,819536100+00:00 - nginx/run 
/usr/sbin/nginx: /azureml-envs/azureml_8e5a5a51349877e7d47c6a2872e0ebfd/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_8e5a5a51349877e7d47c6a2872e0ebfd/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_8e5a5a51349877e7d47c6a2872e0ebfd/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_8e5a5a51349877e7d47c6a2872e0ebfd/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_8e5a5a51349877e7d47c6a2872e0ebfd/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
rsyslogd

In [None]:
# Clean up: delete the deployed web service.
service.delete()