# Work with data

## Before you start

You'll need the latest version of the **azure-ai-ml** package to run the code in this notebook. Run the cell below to verify that it is installed.

In [1]:
pip show azure-ai-ml

Name: azure-ai-ml
Version: 1.7.2
Summary: Microsoft Azure Machine Learning Client Library for Python
Home-page: https://github.com/Azure/azure-sdk-for-python
Author: Microsoft Corporation
Author-email: azuresdkengsysadmins@microsoft.com
License: MIT License
Location: /home/ta-seen/.local/lib/python3.10/site-packages
Requires: azure-common, azure-core, azure-mgmt-core, azure-storage-blob, azure-storage-file-datalake, azure-storage-file-share, colorama, isodate, jsonschema, marshmallow, msrest, opencensus-ext-azure, pydash, pyjwt, pyyaml, strictyaml, tqdm, typing-extensions
Required-by: 
Note: you may need to restart the kernel to use updated packages.


## Connect to the workspace

In [2]:
# Details of the Azure Machine Learning (AML) workspace to connect to.
# NOTE: the subscription_id value below is a placeholder; replace it with your
# own Azure subscription ID before running the following cells.
subscription_id = "Have to put it"
resource_group = "rg-dp100-labs"
workspace = "mlw-dp100-labs"

In [3]:
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Build the workspace client. Keyword arguments make it explicit which of the
# settings defined above is passed as which parameter.
ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace,
)

In [None]:
# Print the name of each datastore returned for the workspace.
stores = ml_client.datastores.list()
for datastore in stores:
    print(datastore.name)

In [None]:
from azure.ai.ml.entities import AzureBlobDatastore
from azure.ai.ml.entities import AccountKeyConfiguration

# The account name and account key below are placeholders: substitute the
# values for your own storage account before running this cell.
blob_credentials = AccountKeyConfiguration(account_key="XXXX-XXXX")

# Describe the blob container as a datastore named "blob_training_data".
store = AzureBlobDatastore(
    name="blob_training_data",
    description="Blob Storage for training data",
    account_name="YOUR-STORAGE-ACCOUNT-NAME",
    container_name="training-data",
    credentials=blob_credentials,
)

# Create (or update) the datastore in the workspace.
ml_client.create_or_update(store)

In [None]:
# List the datastores again so the one created above can be checked for.
stores = ml_client.datastores.list()
for datastore in stores:
    print(datastore.name)

In [None]:
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes

# Describe a single local CSV file as a URI_FILE data asset.
my_path = './data/diabetes.csv'
my_data = Data(
    name="diabetes-local",
    description="Local file",
    type=AssetTypes.URI_FILE,
    path=my_path,
)

# Create (or update) the data asset in the workspace.
ml_client.data.create_or_update(my_data)

In [None]:
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes

# Folder inside the "blob_training_data" datastore that the asset points to.
datastore_path = 'azureml://datastores/blob_training_data/paths/data-asset-path/'

# Describe that folder as a URI_FOLDER data asset.
my_data = Data(
    name="diabetes-datastore-path",
    description="Data folder",
    type=AssetTypes.URI_FOLDER,
    path=datastore_path,
)

# Create (or update) the data asset in the workspace.
ml_client.data.create_or_update(my_data)

In [None]:
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes

# Local folder used as the source of the MLTable data asset.
# NOTE(review): presumably this folder must contain an MLTable definition
# file alongside the data - confirm against the repository contents.
local_path = 'data/'

my_data = Data(
    name="diabetes-table",
    description="MLTable",
    type=AssetTypes.MLTABLE,
    path=local_path,
)

# Create (or update) the data asset in the workspace.
ml_client.data.create_or_update(my_data)

In [None]:
# Print the name of each data asset returned for the workspace.
datasets = ml_client.data.list()
for data_asset in datasets:
    print(data_asset.name)

In [None]:
import mltable

# Look up version 1 of the "diabetes-table" data asset.
registered_data_asset = ml_client.data.get(name='diabetes-table', version=1)

# Build the URI for the asset from its id and load it as an MLTable.
asset_uri = f"azureml:/{registered_data_asset.id}"
tbl = mltable.load(asset_uri)

# Materialise the table as a pandas DataFrame and preview the first rows.
df = tbl.to_pandas_dataframe()
df.head(5)

In [None]:
import os

# Folder that holds the job script; the %%writefile cell and the job's
# code="./src" setting both refer to it.
script_folder = 'src'
# exist_ok=True keeps this cell safe to re-run when the folder already exists.
os.makedirs(script_folder, exist_ok=True)
print(script_folder, 'folder created')

In [None]:

%%writefile $script_folder/move-data.py
# import libraries
import argparse
import pandas as pd
import numpy as np
from pathlib import Path

def main(args):
    """Copy the input CSV into the output folder as ``diabetes.csv``.

    Args:
        args: Parsed command-line arguments. ``args.input_data`` is the path
            of the CSV file to read and ``args.output_datastore`` is the
            folder the copy is written to.
    """
    # read data
    df = get_data(args.input_data)

    # make sure the destination folder exists before writing into it
    output_dir = Path(args.output_datastore)
    output_dir.mkdir(parents=True, exist_ok=True)

    # to_csv returns None when given a path, so there is no result to keep
    df.to_csv(output_dir / "diabetes.csv", index=False)

# function that reads the data
def get_data(path):
    """Load the CSV file at ``path`` and report how many rows it contains.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The loaded pandas DataFrame.
    """
    frame = pd.read_csv(path)

    # Log the row count so the size of the input is visible in the output
    print('Analyzing {} rows of data'.format(len(frame)))

    return frame

def parse_args():
    """Parse the script's command-line options.

    Returns:
        An ``argparse.Namespace`` with the string attributes ``input_data``
        and ``output_datastore``; each is ``None`` when its option is omitted.
    """
    arg_parser = argparse.ArgumentParser()

    # Both options take a single string value; argparse derives the attribute
    # name from the option name, so no explicit dest is needed.
    for option in ("--input_data", "--output_datastore"):
        arg_parser.add_argument(option, type=str)

    return arg_parser.parse_args()

# run script
if __name__ == "__main__":
    separator = "*" * 60

    # opening: blank lines followed by a separator line in the logs
    print("\n\n")
    print(separator)

    # parse the command line and hand the result to main
    main(parse_args())

    # closing: separator line followed by blank lines
    print(separator)
    print("\n\n")

    

In [None]:
from azure.ai.ml import Input, Output
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml import command

# configure input and output
# The dictionary keys are the names the command string refers to through
# ${{inputs.<key>}} and ${{outputs.<key>}}.
my_job_inputs = {
    "local_data": Input(type=AssetTypes.URI_FILE, path="azureml:diabetes-local:1")
}

my_job_outputs = {
    "datastore_data": Output(type=AssetTypes.URI_FOLDER, path="azureml://datastores/blob_training_data/paths/data-asset-path/")
}

# Define the command job that runs move-data.py from ./src. The option names
# must match those declared in the script (--input_data, --output_datastore).
job = command(
    code="./src",
    command="python move-data.py --input_data ${{inputs.local_data}} --output_datastore ${{outputs.datastore_data}}",
    inputs=my_job_inputs,
    outputs=my_job_outputs,
    environment="AzureML-sklearn-0.24-ubuntu18.04-py37-cpu@latest",
    compute="aml-cluster",
    display_name="move-diabetes-data",
    experiment_name="move-diabetes-data"
)

# Submit the job and print the Studio URL where it can be monitored.
returned_job = ml_client.create_or_update(job)
aml_url = returned_job.studio_url
print("Monitor the job at", aml_url)