In [15]:
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml import MLClient
from azure.ai.ml.entities import Data

# Authenticate: try the default credential chain first and fall back to an
# interactive browser login if a management-plane token cannot be obtained.
try:
    credential = DefaultAzureCredential()
    # Requesting a token here makes an unusable credential fail immediately
    # instead of on the first workspace call.
    credential.get_token('https://management.azure.com/.default')
except Exception:
    credential = InteractiveBrowserCredential()

# Workspace details are read from a config.json discovered by the SDK.
client = MLClient.from_config(credential=credential)

# Describe the CSV as a single-file (uri_file) data asset and register it.
# NOTE: the asset name is spelled 'diabetes-datasset' and is looked up by that
# exact string elsewhere in this notebook, so it is kept verbatim.
path = 'src/diabetes.csv'
data_asset = Data(
    name='diabetes-datasset',
    description='Diabetes-data-vscode',
    path=path,
    type=AssetTypes.URI_FILE,
)
client.data.create_or_update(data_asset)

Found the config file in: /config.json


Data({'skip_validation': False, 'mltable_schema_url': None, 'referenced_uris': None, 'type': 'uri_file', 'is_anonymous': False, 'auto_increment_version': False, 'auto_delete_setting': None, 'name': 'diabetes-datasset', 'description': 'Diabetes-data-vscode', 'tags': {}, 'properties': {}, 'print_as_yaml': True, 'id': '/subscriptions/7b1b43ca-4b64-43cf-9446-edb35a04d7d1/resourceGroups/rg01databricks/providers/Microsoft.MachineLearningServices/workspaces/ws-azureml-01/data/diabetes-datasset/versions/1', 'Resource__source_path': None, 'base_path': '/mnt/batch/tasks/shared/LS_root/mounts/clusters/satyakebakshi951/code', 'creation_context': <azure.ai.ml.entities._system_data.SystemData object at 0x7f30ec2ffd30>, 'serialize': <msrest.serialization.Serializer object at 0x7f30ec3fc580>, 'version': '1', 'latest_version': None, 'path': 'azureml://subscriptions/7b1b43ca-4b64-43cf-9446-edb35a04d7d1/resourcegroups/rg01databricks/workspaces/ws-azureml-01/datastores/workspaceblobstore/paths/LocalUpload

In [16]:
# Print the name of every data asset returned by the workspace listing.
for data_entry in client.data.list():
    print(data_entry.name)

loan_dataset
diabetes
loan_data
MD-loan_automation-Train_Model-Trained_model-d6a5ebd0
TD-loan_automation-Clean_Missing_Data-Cleaning_transformation-2cd45586
dataset
diabetes-local
diabetes-blob-storage-linked
diabetes-MLTABLE
diabetes-datasset


In [17]:
# Print the name of every datastore attached to the workspace.
datastores = client.datastores.list()
for datastore in datastores:
    print(datastore.name)

blob_training_data
deltalakedatastore
azureml_globaldatasets
workspaceworkingdirectory
workspaceartifactstore
workspacefilestore
workspaceblobstore


In [20]:
import pandas as pd 
# Look up version 1 of the registered asset, then read the CSV it points to.
df_asset = client.data.get(name='diabetes-datasset', version=1)
df = pd.read_csv(df_asset.path)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [21]:
import os 
# Folder that receives the generated training script; creating it is a no-op
# when it already exists, so the cell can be re-run safely.
script_folder = 'scripts'
os.makedirs(name=script_folder, exist_ok=True)

In [50]:
%%writefile $script_folder/training_script.py

import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
from pathlib import Path
from argparse import ArgumentParser
import mlflow
import matplotlib.pyplot as plt
def parser():
    """Parse the training script's command-line options.

    Recognised options (all optional on the command line):

    * ``--input_directory`` (str, default ``None``) - location of the input
      data.
    * ``--max_depth`` (int, default 2)
    * ``--min_samples`` (int, default 2)
    * ``--test_size`` (float, default ``None``)
    * ``--random_state`` (int, default 42)

    Returns
    -------
    argparse.Namespace
        Parsed values, one attribute per option named after the flag.
    """
    # Each entry is (flag, keyword arguments for add_argument). The attribute
    # name on the result is derived by argparse from the flag itself.
    option_table = (
        ('--input_directory', {'type': str}),
        ('--max_depth', {'type': int, 'default': 2}),
        ('--min_samples', {'type': int, 'default': 2}),
        ('--test_size', {'type': float}),
        ('--random_state', {'type': int, 'default': 42}),
    )

    cli = ArgumentParser()
    for flag, spec in option_table:
        cli.add_argument(flag, **spec)
    return cli.parse_args()

def train(args):
    """Fit a decision tree on the input CSV and log its evaluation to MLflow.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide ``input_directory`` (a path readable by
        ``pandas.read_csv``), ``max_depth``, ``min_samples``, ``test_size``
        and ``random_state``.

    Side effects
    ------------
    Logs the ``accuracy`` and ``roc_auc`` metrics, writes ``roc_curve.png``
    to the current working directory and logs that file as an artifact.

    Raises
    ------
    KeyError
        If the CSV has no ``Outcome`` column.
    """
    df = pd.read_csv(args.input_directory)

    # A CSV written with its index comes back with an 'Unnamed: N' column.
    # That is a row counter, not a measurement, so keep it out of the features.
    index_like = [col for col in df.columns if str(col).startswith('Unnamed:')]
    X = df.drop(columns=['Outcome', *index_like])
    y = df['Outcome']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=args.test_size, random_state=args.random_state
    )

    # Seed the tree as well as the split: scikit-learn permutes features at
    # each split, so an unseeded tree can differ between otherwise equal runs.
    DT = DecisionTreeClassifier(
        max_depth=args.max_depth,
        min_samples_split=args.min_samples,
        random_state=args.random_state,
    )
    model = DT.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    # Column 1 holds the predicted probability of the second class.
    y_pred_prob = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    mlflow.log_metric('accuracy', accuracy)
    roc_auc = roc_auc_score(y_test, y_pred_prob)
    mlflow.log_metric('roc_auc', roc_auc)

    fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
    # Use an explicit figure and close it afterwards so nothing is drawn onto
    # (or left behind in) pyplot's implicit global figure.
    fig, ax = plt.subplots()
    ax.plot(fpr, tpr)
    ax.set_xlabel('FPR')
    ax.set_ylabel('TPR')
    ax.set_title('ROC Curve')
    fig.savefig('roc_curve.png')
    plt.close(fig)
    mlflow.log_artifact('roc_curve.png')

# Script entry point: parse the command line, then run training with it.
if __name__ == '__main__':
    train(parser())



 


Overwriting scripts/training_script.py


In [51]:
from azure.ai.ml import command
# Define a command job that runs the training script from ./scripts on the
# named compute target inside a curated scikit-learn environment.
# NOTE(review): 'diabetes.csv' is a relative path, so it presumably has to sit
# next to the script in ./scripts - confirm before submitting.
job = command(
    code='./scripts',
    command='python training_script.py --input_directory diabetes.csv --max_depth=5 --min_samples=5 --test_size=0.5 --random_state=42',
    environment='AzureML-sklearn-0.24-ubuntu18.04-py37-cpu@latest',
    compute='satyakebakshi951',
    display_name='diabetes_job',
)

In [52]:
# Submit the job definition to the workspace and keep the returned job entity.
returned_job=client.create_or_update(job)
# Print the studio link so the submitted run can be opened in the browser.
print(returned_job.studio_url)

Uploading scripts (0.03 MBs): 100%|██████████| 25551/25551 [00:00<00:00, 390239.21it/s]




https://ml.azure.com/runs/gentle_pizza_vdvmjnkw99?wsid=/subscriptions/7b1b43ca-4b64-43cf-9446-edb35a04d7d1/resourcegroups/rg01databricks/workspaces/ws-azureml-01&tid=a287f42c-46eb-424f-ab40-cd784a7b423c
