In [1]:
import azureml.core
from azureml.core import Workspace

ws = Workspace.from_config()

# Creating Compute Cluster
from azureml.core.compute import ComputeTarget, AmlCompute

from azureml.core.compute_target import ComputeTargetException


cluster_name = 'your-compute-cluster'

# Step 1: look for an existing compute target with this name in the workspace.
try:
    training_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    cluster_found = True
except ComputeTargetException:
    cluster_found = False

# Step 2: if the lookup failed, provision a new cluster (up to 2 nodes) and
# block until provisioning finishes. Provisioning errors are printed, not raised.
if not cluster_found:
    try:
        compute_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_DS11_V2',
            max_nodes=2,
        )
        training_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        training_cluster.wait_for_completion(show_output=True)
    except Exception as provisioning_error:
        print(provisioning_error)

In [2]:
# Datastores are references to storage locations such as Azure Storage blob containers. Every workspace comes with a default datastore. 

# Fetch the workspace's default datastore and remember its name.
default_ds = ws.get_default_datastore()
default_ds_name = default_ds.name

# Print every datastore name with a flag saying whether it is the default one.
for ds_name in ws.datastores:
    is_default = ds_name == default_ds_name
    print(ds_name, '- Default =', is_default)

workspaceworkingdirectory - Default = False
workspacefilestore - Default = False
workspaceartifactstore - Default = False
workspaceblobstore - Default = True


In [3]:
# Look up a datastore by its registered name; raises KeyError if the workspace
# has no datastore called 'workspaceblobstore'.
datastore = ws.datastores['workspaceblobstore']

In [4]:
# Bare expression: the notebook renders the datastore's repr as the cell output.
datastore

{
  "name": "workspaceblobstore",
  "container_name": "azureml-blobstore-db2e5a15-d42a-49d7-b494-181af9d9e23d",
  "account_name": "mlwdp100labs1149010901",
  "protocol": "https",
  "endpoint": "core.windows.net"
}

In [5]:
# Uploading data to Datastore
from azureml.core import Dataset
from azureml.data.datapath import DataPath
# Destination: the 'diabetes-data/' folder on the default datastore.
upload_target = DataPath(default_ds, 'diabetes-data/')

# Upload the local 'data' directory. The call is left as the cell's last
# expression so the resulting file dataset is rendered as output.
Dataset.File.upload_directory(src_dir='data', target=upload_target)

Validating arguments.
Arguments validated.
'overwrite' is set to False. Any file already present in the target will be skipped.'
Uploading files from '/Users/timothy/Programming/azure_training/module3/working_with_data/data' to 'diabetes-data/'
Copying 1 files with concurrency set to 1
Skipped /Users/timothy/Programming/azure_training/module3/working_with_data/data/test_diabetes.csv, file 1 out of 1. Target already exists.
Files copied=0, skipped=1, failed=0
Creating new dataset


{
  "source": [
    "('workspaceblobstore', '/diabetes-data/')"
  ],
  "definition": [
    "GetDatastoreFiles"
  ]
}

In [6]:
# Work with datasets
# Tabular datasets

from azureml.core import Dataset


default_ds = ws.get_default_datastore()

# Tabular dataset: parse every CSV under 'diabetes-data/' into one table.
tab_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds, 'diabetes-data/*.csv'))

# Preview the first 20 rows. A bare expression in the middle of a cell is
# evaluated and then thrown away, so the preview must be displayed explicitly.
# `display` is provided by the IPython/Jupyter kernel.
display(tab_data_set.take(20).to_pandas_dataframe())

# File dataset: reference the same CSV files as files rather than as a table.
file_data_set = Dataset.File.from_files(path=(default_ds, 'diabetes-data/*.csv'))

# List the paths of the files included in the file dataset.
print('File Paths')
for file_path in file_data_set.to_path():
    print(file_path)

File Paths
/diabetes.csv
/diabetes2.csv
/test_diabetes.csv


In [7]:
# Register both datasets in the workspace so they can be retrieved by name.
# Registration stays best-effort (errors are printed, not raised), but failures
# are tracked so the final message reflects what actually happened.
registration_failures = []

# Register the tabular dataset
try:
    tab_data_set = tab_data_set.register(workspace=ws,
                                         name='diabetes_dataset',
                                         description='diabetes data',
                                         tags={'format':'CSV'},
                                         create_new_version=True)
except Exception as e:
    registration_failures.append('diabetes_dataset')
    print(e)

# Register the file dataset
try:
    file_data_set = file_data_set.register(workspace=ws,
                                           name='diabetes_file_dataset',
                                           description='diabetes files',
                                           tags={'format':'CSV'},
                                           create_new_version=True)
except Exception as e:
    registration_failures.append('diabetes_file_dataset')
    print(e)

# Only claim success when neither registration raised.
if registration_failures:
    print('Dataset registration failed for:', ', '.join(registration_failures))
else:
    print('Datasets registered')

Datasets registered


In [8]:
# Report the name and version of every dataset registered in the workspace.
print('Datasets:')
for dataset_name in ws.datasets:
    dataset = Dataset.get_by_name(ws, dataset_name)
    print(f'\t {dataset.name} version {dataset.version}')


Datasets:
	 diabetes_file_dataset version 1
	 diabetes_dataset version 1


In [9]:
# Train a model from a tabular dataset
import os

# Directory that holds the training script; an existing directory is reused.
experiments_folder = 'diabetes_training_from_tab_dataset'
os.makedirs(name=experiments_folder, exist_ok=True)
print('Experiment Folder Created')

Experiment Folder Created


In [14]:
%%writefile $experiments_folder/diabetes_train_frm_tab.py

import argparse
import azureml.core
from azureml.core import Run
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

parser = argparse.ArgumentParser()
parser.add_argument('--regularization',type=float, dest='reg_rate', default=0.01, help='regularization rate')
parser.add_argument('--input-data',type=str, dest='training_dataset_id', help='training dataset')
args = parser.parse_args()

reg = args.reg_rate

# Get Experiment run context 
run = Run.get_context()

# Get training dataset
diabetes = run.input_datasets['training_data'].to_pandas_dataframe()

# Seperate features and labels 
X, y = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].values, diabetes['Diabetic'].values
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=0)

# Train a logistic model regression
run.log('Regularization Rate',reg)
model = LogisticRegression(C=1/reg, solver='liblinear').fit(X_train,y_train)
y_predicted = model.predict(X_test)

# Check accuracy
accuracy = np.average(y_predicted==y_test)
run.log('Accuracy', accuracy)

# Calculate AUC
pred_probs = model.predict_proba(X_test)
auc = roc_auc_score(y_test, pred_probs[:,1])
run.log('AUC',auc)

os.makedirs('outputs',exist_ok=True)
joblib.dump(value=model, filename='outputs/diabetes_model.pkl')

run.complete()
print('Run Complete')

Overwriting diabetes_training_from_tab_dataset/diabetes_train_frm_tab.py


In [15]:
from azureml.core import ScriptRunConfig, Experiment, Environment
from azureml.core.runconfig import DockerConfiguration
from azureml.widgets import RunDetails

# Python environment for the run, built from the local conda specification file.
env = Environment.from_conda_specification(name='env_for_run', file_path='environment.yml')

# Get the registered training dataset and expose it to the script as 'training_data'.
diabetes_ds = ws.datasets.get('diabetes_dataset')
training_input = diabetes_ds.as_named_input('training_data')

# Create the script run configuration: which script to run, its arguments,
# and the environment / compute target it runs on.
script_arguments = ['--regularization', 0.01,
                    '--input-data', training_input]
script_config = ScriptRunConfig(
    source_directory=experiments_folder,
    script='diabetes_train_frm_tab.py',
    arguments=script_arguments,
    environment=env,
    docker_runtime_config=DockerConfiguration(use_docker=True),
    compute_target=cluster_name,
)

# Submit the run, show the progress widget, and wait for the run to finish.
experiment_name = 'mslearn-diabetes-train'
experiment = Experiment(workspace=ws, name=experiment_name)
run = experiment.submit(config=script_config)
RunDetails(run).show()
run.wait_for_completion()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

{'runId': 'mslearn-diabetes-train_1694576059_8100ad3b',
 'target': 'your-compute-cluster',
 'status': 'Finalizing',
 'startTimeUtc': '2023-09-13T03:34:42.021914Z',
 'services': {},
 'properties': {'_azureml.ComputeTargetType': 'amlctrain',
  'ContentSnapshotId': '6275486b-ea0e-421f-b2ca-9add2c744d9d',
  'azureml.git.repository_uri': 'git@github.com:TimPerera/AzureLearning.git',
  'mlflow.source.git.repoURL': 'git@github.com:TimPerera/AzureLearning.git',
  'azureml.git.branch': 'main',
  'mlflow.source.git.branch': 'main',
  'azureml.git.commit': '91b229ce7a4afd9f26d3501c91765574d3437ea5',
  'mlflow.source.git.commit': '91b229ce7a4afd9f26d3501c91765574d3437ea5',
  'azureml.git.dirty': 'True',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json'},
 'inputDatasets': [{'dataset': {'id': '834559ed-a27e-423f-89a5-09199b69aff4'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'training_data', 'mechanism': 'Direct'}}],
 'outp

In [None]:
from azureml.core import Run
import argparse
import glob

# NOTE(review): this cell has no %%writefile magic and was never executed.
# It looks like the start of a training script for the file dataset — confirm.
# Run directly in a notebook kernel, parse_args() would presumably reject the
# kernel's own command-line arguments — TODO confirm and add %%writefile.
parser = argparse.ArgumentParser()
parser.add_argument('--regularization', type=float,dest='reg_rate',default=0.01)
# Parsed into args.dataset_folder; not read anywhere in the lines visible here.
parser.add_argument('--input-data', type=str, dest='dataset_folder')
args = parser.parse_args()

# Set regularization rate
reg = args.reg_rate

# Get the run context for the current execution
run = Run.get_context()

# Loading the dataset
# Looks up the run input named 'training_files'; the submitting configuration
# must pass a dataset under that exact name (not shown in this notebook so far).
data_path = run.input_datasets['training_files']

