# Automated ML

### Import libraries

In [1]:
from azureml.core import Workspace, Experiment

### Create workspace and experiment instances

In [2]:
# Obtain the workspace handle that every later cell passes around as `ws`.
ws = Workspace.from_config()

# All AutoML runs in this notebook are submitted under this experiment name.
experiment_name = 'capstone-spam-classification-experiment'
experiment = Experiment(workspace=ws, name=experiment_name)

###  Dataset Overview
The dataset we are using is a spam classification dataset obtained from Kaggle. We are going to perform multi-class text classification.

### Get data

In [3]:
from azureml.core import Dataset
# Fetch the dataset named 'capstone-spam-dataset' from the workspace; it is
# handed to AutoML as the training data further down.
training_dataset = Dataset.get_by_name(
    workspace=ws,
    name='capstone-spam-dataset',
)

### Connect to compute target

In [4]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name of the compute cluster used for the remote AutoML run.
cluster_name = "capstone-compute-cluster"

# Reuse the cluster when a lookup by name succeeds; a ComputeTargetException
# from the lookup means we have to provision a new one.
try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    # To use a different region for the compute, add a location='<region>' parameter
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(
        workspace=ws,
        name=cluster_name,
        provisioning_configuration=compute_config,
    )
else:
    print('Found existing cluster, use it.')

# Block until the cluster is ready, echoing provisioning progress.
compute_target.wait_for_completion(show_output=True)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## AutoML Configuration

TODO: Explain why you chose the AutoML settings and configuration you used below.

In [5]:
from azureml.train.automl import AutoMLConfig

# Settings forwarded as keyword arguments to AutoMLConfig.
automl_settings = {
    # 2-fold cross-validation on the training data.
    "n_cross_validations": 2,
    # Metric AutoML optimises and uses to pick the best model.
    "primary_metric": 'accuracy',
    # Let AutoML stop the experiment once scores stop improving.
    "enable_early_stopping": True,
    # Kept at the 4 nodes requested for the cluster in the provisioning cell:
    # AmlCompute runs one iteration per node, so a higher value would only
    # leave iterations queued waiting for a free node.
    "max_concurrent_iterations": 4,
    # 0.25 h = 15 minutes of total experiment time.
    "experiment_timeout_hours": 0.25,
    # Let AutoML choose the featurization steps.
    "featurization": 'auto',
}

# Classification task on the spam dataset; 'Category' is the label column.
automl_config = AutoMLConfig(
    task='classification',
    compute_target=compute_target,
    training_data=training_dataset,
    label_column_name='Category',
    **automl_settings
)

In [6]:
# Submit the AutoML configuration to the experiment; `remote_run` is the handle
# used by the following cells to monitor the run and fetch its results.
remote_run = experiment.submit(config=automl_config)

Submitting remote run.


Experiment,Id,Type,Status,Details Page,Docs Page
capstone-spam-classification-experiment,AutoML_24da6a42-eeb8-4270-b730-f04a69d8b1be,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation


### Run Details

In [7]:
from azureml.widgets import RunDetails
# Show the run-details widget for the submitted AutoML run.
run_details_widget = RunDetails(remote_run)
run_details_widget.show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [8]:
# Block until the AutoML run finishes; the returned run details become the
# cell output below.
remote_run.wait_for_completion()

{'runId': 'AutoML_24da6a42-eeb8-4270-b730-f04a69d8b1be',
 'target': 'capstone-compute-cluster',
 'status': 'Completed',
 'startTimeUtc': '2022-09-22T08:39:40.892634Z',
 'endTimeUtc': '2022-09-22T08:57:04.946317Z',
 'services': {},
   'message': 'No scores improved over last 10 iterations, so experiment stopped early. This early stopping behavior can be disabled by setting enable_early_stopping = False in AutoMLConfig for notebook/python SDK runs.'}],
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '2',
  'target': 'capstone-compute-cluster',
  'DataPrepJsonString': '{\\"training_data\\": {\\"datasetId\\": \\"1e4bd0ae-220a-456f-b757-6d18b95cbbe3\\"}, \\"datasets\\": 0}',
  'EnableSubsampling': None,
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'classification',
  'dependencies_versi

### Get the best model and display its properties

In [9]:
# Retrieve the best child run together with its fitted model.
best_run, fitted_model = remote_run.get_output()

# Print every metric recorded on the best run, one "name: value" per line.
best_run_metrics = best_run.get_metrics()
for metric_name in best_run_metrics:
    print(f"{metric_name}: {best_run_metrics[metric_name]}")

f1_score_micro: 0.9890563329745246
f1_score_macro: 0.6509232242831369
average_precision_score_macro: 0.6620424359658048
recall_score_weighted: 0.9890563329745246
AUC_micro: 0.9989243032829607
precision_score_weighted: 0.988746529697736
f1_score_weighted: 0.9887248051256088
AUC_weighted: 0.9945814354837839
average_precision_score_weighted: 0.9969940610360829
accuracy: 0.9890563329745246
balanced_accuracy: 0.6425086416103316
log_loss: 0.06003589983333342
weighted_accuracy: 0.9971306081165174
average_precision_score_micro: 0.9973630606520251
precision_score_micro: 0.9890563329745246
norm_macro_recall: 0.4637629624154973
recall_score_micro: 0.9890563329745246
AUC_macro: 0.829965707902784
recall_score_macro: 0.6425086416103316
precision_score_macro: 0.6601240000293928
matthews_correlation: 0.9522639521839865
accuracy_table: aml://artifactId/ExperimentRun/dcid.AutoML_24da6a42-eeb8-4270-b730-f04a69d8b1be_39/accuracy_table
confusion_matrix: aml://artifactId/ExperimentRun/dcid.AutoML_24da6a42-e

### Save the best model


In [10]:
import joblib, os
# Create the local output folder if needed. exist_ok=True keeps this cell
# re-runnable: a plain os.mkdir raises FileExistsError on the second run.
os.makedirs("output", exist_ok=True)

# Serialize the fitted AutoML model locally; joblib.dump returns the list of
# written file names, which is shown as the cell output.
joblib.dump(value=fitted_model, filename='output/best-automl.pkl')

['output/best-automl.pkl']

In [11]:
# List the files stored with the best run (shown as the cell output); the
# model registered later, 'outputs/model.pkl', is among them.
best_run.get_file_names()

['accuracy_table',
 'automl_driver.py',
 'confusion_matrix',
 'logs/azureml/azureml_automl.log',
 'outputs/conda_env_v_1_0_0.yml',
 'outputs/engineered_feature_names.json',
 'outputs/env_dependencies.json',
 'outputs/featurization_summary.json',
 'outputs/generated_code/conda_environment.yaml',
 'outputs/generated_code/script.py',
 'outputs/generated_code/script_run_notebook.ipynb',
 'outputs/internal_cross_validated_models.pkl',
 'outputs/model.pkl',
 'outputs/pipeline_graph.json',
 'outputs/run_id.txt',
 'outputs/scoring_file_pbi_v_1_0_0.py',
 'outputs/scoring_file_v_1_0_0.py',
 'outputs/scoring_file_v_2_0_0.py',
 'system_logs/cs_capability/cs-capability.log',
 'system_logs/hosttools_capability/hosttools-capability.log',
 'system_logs/lifecycler/execution-wrapper.log',
 'system_logs/lifecycler/lifecycler.log',
 'system_logs/metrics_capability/metrics-capability.log',
 'system_logs/snapshot_capability/snapshot-capability.log',
 'user_logs/std_log.txt']

### Register the best model

In [12]:
# Register the best run's 'outputs/model.pkl' in the workspace under the name
# 'automl-best-model'; the resulting Model object is the cell output.
best_run.register_model(
    model_name='automl-best-model',
    model_path='outputs/model.pkl',
)

Model(workspace=Workspace.create(name='quick-starts-ws-208297', subscription_id='3d1a56d2-7c81-4118-9790-f85d1acf0c77', resource_group='aml-quickstarts-208297'), name=automl-best-model, id=automl-best-model:1, version=1, tags={}, properties={})

## Model Deployment

Remember you have to deploy only one of the two models you trained but you still need to register both the models. Perform the steps in the rest of this notebook only if you wish to deploy this model.

TODO: In the cell below, register the model, create an inference config and deploy the model as a web service.

TODO: In the cell below, send a request to the web service you deployed to test it.

TODO: In the cell below, print the logs of the web service and delete the service

**Submission Checklist**
- I have registered the model.
- I have deployed the model with the best accuracy as a webservice.
- I have tested the webservice by sending a request to the model endpoint.
- I have deleted the webservice and shut down all the computes that I have used.
- I have taken a screenshot showing the model endpoint as active.
- The project includes a file containing the environment details.
