In [1]:
from azureml.core import Workspace, Experiment

ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="udacity-project")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = exp.start_logging()

Workspace name: ml-ud-training
Azure region: westeurope
Subscription id: b74a5367-8189-4c03-b954-e3a41b64e370
Resource group: ml-udacity


In [3]:
ws

Workspace.create(name='ml-ud-training', subscription_id='b74a5367-8189-4c03-b954-e3a41b64e370', resource_group='ml-udacity')

In [12]:
from azureml.core.compute import ComputeTarget, AmlCompute


# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

### YOUR CODE HERE ###

from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Specify the cluster details
cluster_name = "udacity-prj1"
cluster_min_nodes = 0
cluster_max_nodes = 3
cluster_vm_size = "Standard_D2_V2"

# Check if the cluster already exists
try:
    cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print("Cluster already exists.")
except ComputeTargetException:
    # If the cluster does not exist, create it
    compute_config = AmlCompute.provisioning_configuration(vm_size=cluster_vm_size,
                                                           min_nodes=cluster_min_nodes,
                                                           max_nodes=cluster_max_nodes)
    cluster = ComputeTarget.create(ws, cluster_name, compute_config)
    cluster.wait_for_completion(show_output=True)


Cluster already exists.


In [3]:
# be sure you have azureml-sdk installed 
# pip install azureml-sdk
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice, uniform
from azureml.core import Environment, ScriptRunConfig
import os

# Specify parameter sampler
#ps = ### YOUR CODE HERE ###

ps = RandomParameterSampling({
    'C': uniform(0.01, 1.0),
    'max_iter': choice(50, 100, 150)
})

#RandomParameterSampling and define two hyperparameters: C and max_iter.
# These are the parameters that you can tune during the hyperparameter tuning process to find 
#the best values for your logistic regression model.
# By using the argparse library in train.py, these hyperparameters can be specified when running 
# the train.py script from the command line. The argparse library allows passing values to these 
# hyperparameters as command-line arguments. In the hyperparameter tuning process the parameter sampler
# will explore different values within the specified ranges
#  for C and max_iter to find the optimal combination that maximizes the chosen evaluation metric.
# So, updating the ps parameter sampler to include C and max_iter as hyperparameters aligns 
# with the hyperparameters defined in the train.py script.


# Specify a Policy
# policy = ### YOUR CODE HERE ###

policy = BanditPolicy(slack_factor=0.1, evaluation_interval=1, delay_evaluation=5)

# The BanditPolicy terminates any run that doesn't meet the slack factor compared to the best performing run. 
# In this example, we set the slack_factor to 0.1, meaning that any run that is worse by 10% compared to the best
# performing run will be terminated. The delay_evaluation parameter in BanditPolicy introduces a delay in the policy evaluation. 
# This means that the policy does not start evaluating runs for early termination until the specified number of 
# intervals have passed. It helps to allow the initial runs to have more time to gather data and stabilize before
# applying the policy. This can be useful in cases where the metric values may initially fluctuate or have higher variability.


if "training" not in os.listdir():
    os.mkdir("./training")

# Setup environment for your training run
sklearn_env = Environment.from_conda_specification(name='sklearn-env', file_path='conda_dependencies.yml')

# Create a ScriptRunConfig Object to specify the configuration details of your training job
#src = ### YOUR CODE HERE ###
src = ScriptRunConfig(source_directory='.',
                      script='train.py', # be sure that train.py exists in the directory
                      compute_target='udacity-prj1', #this is name of cluster you want to run the code 
                      arguments=['--C', '1.0', '--max_iter', '100'],
                      environment=sklearn_env)

# Create a HyperDriveConfig using the src object, hyperparameter sampler, and policy.
# hyperdrive_config = ### YOUR CODE HERE ###

hyperdrive_config = HyperDriveConfig(run_config=src,
                                     hyperparameter_sampling=ps,
                                     policy=policy,
                                     primary_metric_name='Accuracy',
                                     primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                     max_total_runs=10,
                                     max_concurrent_runs=4)

                                     



- I add the cell below to test if train.py file is working correctly 

In [4]:
from azureml.core import ScriptRunConfig, Experiment
from azureml.core import Environment, ScriptRunConfig

# Setup environment for your training run
sklearn_env = Environment.from_conda_specification(name='sklearn-env', file_path='conda_dependencies.yml')

# Create a ScriptRunConfig for testing the script
test_config = ScriptRunConfig(source_directory='.',
                             script='train.py',
                             compute_target='udacity-prj1',
                             environment=sklearn_env)


# Create an experiment for testing
test_experiment_name = 'script_test_experiment'
test_experiment = Experiment(workspace=ws, name=test_experiment_name)

# Submit the script run for testing
test_run = test_experiment.submit(config=test_config)

# Wait for the run to complete
test_run.wait_for_completion(show_output=True)


RunId: script_test_experiment_1684333327_f5db0dc9
Web View: https://ml.azure.com/runs/script_test_experiment_1684333327_f5db0dc9?wsid=/subscriptions/b74a5367-8189-4c03-b954-e3a41b64e370/resourcegroups/ml-udacity/workspaces/ml-ud-training&tid=b33be5d6-5072-448f-bad3-d8b66cf09736

Streaming user_logs/std_log.txt

  from cryptography.hazmat.backends import default_backend
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
Cleaning up all outstanding Run operations, waiting 300.0 seconds
1 items cleaning up...
Cleanup took 0.5705831050872803 seconds

Execution Summary
RunId: script_test_experiment_1684333327_f5db0dc9
Web View: https://ml.azure.com/runs/script_test_experiment_1684333327_f5db0dc9?wsid=/subscri

{'runId': 'script_test_experiment_1684333327_f5db0dc9',
 'target': 'udacity-prj1',
 'status': 'Completed',
 'startTimeUtc': '2023-05-17T14:27:32.524793Z',
 'endTimeUtc': '2023-05-17T14:28:54.87649Z',
 'services': {},
 'properties': {'_azureml.ComputeTargetType': 'amlctrain',
  'ContentSnapshotId': 'bec02fd6-6e0d-44c0-b697-983a70608a78',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json'},
 'inputDatasets': [],
 'outputDatasets': [],
 'runDefinition': {'script': 'train.py',
  'command': '',
  'useAbsolutePath': False,
  'arguments': [],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'udacity-prj1',
  'dataReferences': {},
  'data': {},
  'outputData': {},
  'datacaches': [],
  'jobName': None,
  'maxRunDurationSeconds': 2592000,
  'nodeCount': 1,
  'instanceTypes': [],
  'priority': None,
  'credentialPassthrough': False,
  'identity': None,
  'environment': {'name': 'sklear

In [10]:
# Download the model.joblib file from the experiment run to your local directory
test_run.download_file(name='outputs/model.joblib', output_file_path='model.joblib')

# Load the model from the downloaded file
import joblib
model = joblib.load('model.joblib')

- Experiment to perform hyperdrive 

In [12]:
# Submit your hyperdrive run to the experiment and show run details with the widget.

### YOUR CODE HERE ###
from azureml.core import Experiment

# Create an experiment
experiment_name = 'hyperdrive_experiment'
experiment = Experiment(workspace=ws, name=experiment_name)

# Submit the HyperDrive run
hyperdrive_run = experiment.submit(config=hyperdrive_config)

# Use the RunDetails widget to show the run details
RunDetails(hyperdrive_run).show()


_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [13]:
import joblib
# Get your best run and save the model from that run.

### YOUR CODE HERE ###

# Get your best run and save the model from that run.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
parameter_values = best_run.get_details()['runDefinition']['arguments']

print('Best Run Id: ', best_run.id)
print(' -Accuracy:', best_run_metrics['Accuracy'])
print(' -Regularization Rate:', parameter_values)





Best Run Id:  HD_689dd68a-cd03-42ba-bcb5-7be0d15f1730_6
 -Accuracy: 0.9157309054122408
 -Regularization Rate: ['--C', '1.0', '--max_iter', '100', '--C', '0.2414056777891848', '--max_iter', '150']


- Save the best model

In [16]:
# Download the model.joblib file from the experiment run to your local directory
best_run.download_file(name='outputs/model.joblib', output_file_path='./my_models/model.joblib')

# Load the model from the downloaded file
import joblib
model = joblib.load('model.joblib')

Trying to unpickle estimator LogisticRegression from version 0.24.2 when using version 1.2.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [15]:
model

      ------**Using AutoML**----

In [2]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

### YOUR CODE HERE ###

url= "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

ds = TabularDatasetFactory.from_delimited_files(path=url)

In [3]:
from train import clean_data

# Use the clean_data function to clean your data.
x, y = clean_data(ds)

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.30, random_state=0)


import pandas as pd

train_data = pd.concat([x_train, y_train], axis=1)

In [6]:
from azureml.core import Dataset

# Convert the train_data DataFrame to a CSV file
train_data.to_csv('train_data.csv', index=False)

# Get the default datastore for your workspace
datastore = ws.get_default_datastore()

# Upload the train_data.csv file to the default datastore
datastore.upload_files(files=['./train_data.csv'], target_path='train_data', overwrite=True)

# Create a TabularDataset from the train_data.csv file in the datastore
train_dataset = Dataset.Tabular.from_delimited_files(path=[(datastore, 'train_data/train_data.csv')])

"datastore.upload_files" is deprecated after version 1.0.69. Please use "FileDatasetFactory.upload_directory" instead. See Dataset API change notice at https://aka.ms/dataset-deprecation.


Uploading an estimated of 1 files
Uploading ./train_data.csv
Uploaded ./train_data.csv, 1 files out of an estimated total of 1
Uploaded 1 files


In [7]:
train_dataset

{
  "source": [
    "('workspaceblobstore', 'train_data/train_data.csv')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ParseDelimited",
    "DropColumns",
    "SetColumnTypes"
  ]
}

In [22]:
from azureml.core import Environment

# Create an Environment object
sklearn_env = Environment.from_conda_specification(name='sklearn-env', file_path='conda_dependencies.yml')

# Save the Environment object to a directory
sklearn_env.save_to_directory(path='environment_directory', overwrite=True)

# Load the Environment object from a directory
sklearn_env = Environment.load_from_directory(path='environment_directory')

In [35]:
from azureml.train.automl import AutoMLConfig
from azureml.core import Environment
# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.

# sklearn_env = Environment.from_conda_specification(name='sklearn-env', file_path='conda_dependencies.yml')



automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task='classification',
    primary_metric='accuracy',
    training_data=train_dataset,
    label_column_name='y',
    n_cross_validations=5,
    compute_target='udacity-prj1')
    #environment=sklearn_env)

In [36]:
# Submit your automl run

### YOUR CODE HERE ###

from azureml.core.experiment import Experiment
from azureml.widgets import RunDetails

# Create an Experiment object
experiment = Experiment(workspace=ws, name='my-automl-experiment')

# Submit the AutoMLConfig object to the Experiment
automl_run = experiment.submit(automl_config)


RunDetails(automl_run).show()

Submitting remote run.


Experiment,Id,Type,Status,Details Page,Docs Page
my-automl-experiment,AutoML_e74717cd-89b9-42b3-bbbb-3e647737a11e,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation


_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [38]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-1.7.5-py3-none-manylinux2014_x86_64.whl (200.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.3/200.3 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-1.7.5
Note: you may need to restart the kernel to use updated packages.


In [44]:
# Retrieve and save your best automl model.

### YOUR CODE HERE ###

# Retrieve the best model from the AutoML run
best_run_automl, best_model_automl = automl_run.get_output()

#get_output() return the run with the corresponding best pipeline 
#that has already been tested

# Save the best model
from joblib import dump

dump(best_model_automl, './my_models/model_automl.pkl')



  If you are loading a serialized model (like pickle in Python, RDS in R) generated by
  older XGBoost, please export the model by calling `Booster.save_model` from that version
  first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.

  If you are loading a serialized model (like pickle in Python, RDS in R) generated by
  older XGBoost, please export the model by calling `Booster.save_model` from that version
  first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.

  If you are loading a serialized model (like pickle in Python, RDS in R) generated by
  older XGBoost, please export the model by calling `Booster.save_model` from that version
  first, then load it back in current version. See:

    htt

['./my_models/model_automl.pkl']

In [2]:
from azureml.core import Workspace
from azureml.core.compute import AmlCompute

# Load workspace
ws = Workspace.from_config()

# Get the compute target
compute_target = AmlCompute(ws, "udacity-prj1")

# Delete the compute target
compute_target.delete()