### Importing Packages

In [63]:
import azureml.core 
import pandas as pd
import numpy as np 
import logging
from azureml.core import Workspace, ComputeTarget, Datastore,Dataset
from azureml.core.compute import ComputeInstance, AmlCompute
from azureml.data import TabularDataset

ws = Workspace.from_config()

### Explore dataset

In [2]:
from IPython.display import display

# Define the path to your local file
local_file_path = "./data/energy_data.xlsx"

# Read the local file into a pandas DataFrame
df = pd.read_excel(local_file_path, engine='openpyxl')


print("Sample of Dataset")
display(df.head())

print("\n Describe Dataset")
display(df.describe())



Sample of Dataset


Unnamed: 0,Relative Compactness,Surface Area,Wall Area,Roof Area,Overall Height,Orientation,Glazing Area,Glazing Area Distribution,Heating Load,Cooling Load
0,0.98,514.5,294.0,110.25,7.0,2,0.0,0,15.55,21.33
1,0.98,514.5,294.0,110.25,7.0,3,0.0,0,15.55,21.33
2,0.98,514.5,294.0,110.25,7.0,4,0.0,0,15.55,21.33
3,0.98,514.5,294.0,110.25,7.0,5,0.0,0,15.55,21.33
4,0.9,563.5,318.5,122.5,7.0,2,0.0,0,20.84,28.28



 Describe Dataset


Unnamed: 0,Relative Compactness,Surface Area,Wall Area,Roof Area,Overall Height,Orientation,Glazing Area,Glazing Area Distribution,Heating Load,Cooling Load
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,0.764167,671.708333,318.5,176.604167,5.25,3.5,0.234375,2.8125,22.307195,24.58776
std,0.105777,88.086116,43.626481,45.16595,1.75114,1.118763,0.133221,1.55096,10.090204,9.513306
min,0.62,514.5,245.0,110.25,3.5,2.0,0.0,0.0,6.01,10.9
25%,0.6825,606.375,294.0,140.875,3.5,2.75,0.1,1.75,12.9925,15.62
50%,0.75,673.75,318.5,183.75,5.25,3.5,0.25,3.0,18.95,22.08
75%,0.83,741.125,343.0,220.5,7.0,4.25,0.4,4.0,31.6675,33.1325
max,0.98,808.5,416.5,220.5,7.0,5.0,0.4,5.0,43.1,48.03


### Create the Python Script 

- Create a script folder

In [3]:
import os

# create a folder for the script files
script_folder = 'src'
os.makedirs(script_folder, exist_ok=True)
print(script_folder, 'folder created')

src folder created


In [5]:
df.columns

Index(['Relative Compactness', 'Surface Area', 'Wall Area', 'Roof Area',
       'Overall Height', 'Orientation', 'Glazing Area',
       'Glazing Area Distribution', 'Heating Load', 'Cooling Load'],
      dtype='object')

In [6]:
%%writefile $script_folder/train-model-script.py
# import libraries
import mlflow
import argparse
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler
from xgboost import XGBRegressor
import matplotlib.pyplot as plt

def main(args):
    # read data
    df = get_data(args.training_data)

    # split data
    X_train, X_test, y_train, y_test = split_data(df)

    # train model
    model = train_model(args.reg_rate, X_train, X_test, y_train, y_test)

    # evaluate model
    eval_model(model, X_test, y_test)

# function that reads the data
def get_data(path):
    print("Reading data...")
    # df = pd.read_csv(path)
    df = pd.read_excel(path, engine='openpyxl')
    return df

# function that splits the data
def split_data(df):
    print("Splitting data...")
    X, y = df[['Relative Compactness', 'Surface Area', 'Wall Area', 'Roof Area',
       'Overall Height', 'Orientation', 'Glazing Area',
       'Glazing Area Distribution',]].values, df['Heating Load'].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

    return X_train, X_test, y_train, y_test

# function that trains the model
def train_model(X_train, y_train,learning_rate, n_estimators, max_depth):
    print("Training model...")
    
    # Create a pipeline
    pipeline = Pipeline([
        ('scaler', MaxAbsScaler()),  # Normalise data
        ('model', XGBRegressor(learning_rate=learning_rate, n_estimators=n_estimators, max_depth=max_depth))  # XGBoost model
    ])
    
    # Train the model
    model = pipeline.fit(X_train, y_train)
    
    return model

# function that evaluates the model
def eval_model(model, X_test, y_test):
    # calculate predictions
    y_pred = model.predict(X_test)
    
    # calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print('Root Mean Squared Error:', rmse)
    mlflow.log_metric("RMSE", rmse)

    # calculate R-squared
    r2 = r2_score(y_test, y_pred)
    print('R-squared: ', r2)
    mlflow.log_metric("R-squared", r2)

def parse_args():
    # setup arg parser
    parser = argparse.ArgumentParser()

    # add arguments
    parser.add_argument("--training_data", dest='training_data', type=str)
    parser.add_argument("--reg_rate", dest='reg_rate', type=float, default=0.01)
    parser.add_argument("--learning_rate", dest='learning_rate', type=float, default=0.1)
    parser.add_argument("--n_estimators", dest='n_estimators', type=int, default=100)
    parser.add_argument("--max_depth", dest='max_depth', type=int, default=3)

    # parse args
    args = parser.parse_args()

    # return args
    return args

# run script
if __name__ == "__main__":
    # add space in logs
    print("\n\n")
    print("*" * 60)

    # parse args
    args = parse_args()

    # run main function
    main(args)

    # add space in logs
    print("*" * 60)
    print("\n\n")


Writing src/train-model-script.py


In [8]:
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
from azure.ai.ml import MLClient

try:
    credential = DefaultAzureCredential()
    # Check if given credential can get token successfully.
    credential.get_token("https://management.azure.com/.default")
except Exception as ex:
    # Fall back to InteractiveBrowserCredential in case DefaultAzureCredential not work
    credential = InteractiveBrowserCredential()


# Get a handle to workspace
ml_client = MLClient.from_config(credential=credential)

Found the config file in: .\config.json


In [49]:
# upload the data to the default datastore
datastore = ws.get_default_datastore()
# create a dataset referencing the cloud location
dataset = Dataset.File.upload_directory(src_dir='./data', target=(datastore, 'data'))

Validating arguments.
Arguments validated.
'overwrite' is set to False. Any file already present in the target will be skipped.'
Uploading files from 'c:/Users/saeed.misaghian/Documents/Repos_Personal/Azure_ML_Energy_Predict/data' to 'data'
Creating new dataset


In [54]:
from azure.ai.ml import command
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core import Environment
from azureml.core import Environment

data_path = dataset.as_mount()

job = command(
    code="./src",
    command=f"python train-model-script.py --training_data {data_path}",
    environment="AzureML-sklearn-0.24-ubuntu18.04-py37-cpu@latest",
    compute="aml-cluster",
    display_name="building-energy-train",
    experiment_name="building-energy-training", 
    tags={"model_type": "randomforest", "model":"Building energy dataset"}
    )

# submit job
returned_job = ml_client.create_or_update(job)
aml_url = returned_job.studio_url
print("Monitor your job at", aml_url)

AttributeError: 'DatasetConsumptionConfig' object has no attribute 'mount'

In [66]:
from azureml.core import Workspace, Dataset
from azureml.data.datapath import DataPath

# Load your Azure ML workspace
ws = Workspace.from_config()

# Get the default datastore
datastore = ws.get_default_datastore()

# Path to your local data directory
local_data_path = './data'

# Upload the local directory to the datastore under the path 'data'
datastore_path = DataPath(datastore, 'data')
dataset = Dataset.File.upload_directory(src_dir=local_data_path, target=datastore_path)

# Register the uploaded files as a dataset
dataset = dataset.register(workspace=ws,
                          name='training_data',
                          description='Training data')

# Now you can use the dataset 'training_data' in your Azure ML job


Validating arguments.
Arguments validated.
'overwrite' is set to False. Any file already present in the target will be skipped.'
Uploading files from 'c:/Users/saeed.misaghian/Documents/Repos_Personal/Azure_ML_Energy_Predict/data' to 'data'
Creating new dataset


In [79]:
from azureml.core import Dataset

# # Assuming 'training_data' is your dataset name in Azure ML workspace
# dataset = Dataset.get_by_name(ws, name='training_data')

# # Use as_mount() if the dataset is large and you prefer to stream data, or as_download() if the dataset is small
# # dataset_input = dataset.as_mount()
# dataset_input = dataset.as_named_input('training_data_input').as_dataset()

job = command(
    code="./src",
    command="python train-model-script.py --training_data training_data",
    environment="AzureML-sklearn-0.24-ubuntu18.04-py37-cpu@latest",
    compute="aml-cluster",
    display_name="building-energy-train",
    experiment_name="building-energy-training", 
    tags={"model_type": "randomforest", "model": "Building energy dataset"},
    inputs={'training_data': 'training_data'}  # Reference the dataset by name
)

# Submit job
returned_job = ml_client.create_or_update(job)
aml_url = returned_job.studio_url
print("Monitor your job at", aml_url)


Monitor your job at https://ml.azure.com/runs/placid_lock_ynhyz0y711?wsid=/subscriptions/dd022f57-1b53-4cf0-b379-44a3d7d57e27/resourcegroups/ies-pi-dev-uks-rg/workspaces/ies-pi-dev-uks-ml&tid=b33be5d6-5072-448f-bad3-d8b66cf09736


In [None]:
ImportError: cannot import name 'Input' from 'azureml.pipeline.core' (C:\Users\saeed.misaghian\AppData\Roaming\Python\Python39\site-packages\azureml\pipeline\core\__init__.py)

In [117]:
df = pd.read_csv("./data/energy_data_correct.csv",encoding='utf-8')

In [118]:
df

Unnamed: 0,Relative Compactness,Surface Area,Wall Area,Roof Area,Overall Height,Orientation,Glazing Area,Glazing Area Distribution,Heating Load,Cooling Load
0,0.98,514.5,294.0,110.25,7.0,2,0.0,0,15.55,21.33
1,0.98,514.5,294.0,110.25,7.0,3,0.0,0,15.55,21.33
2,0.98,514.5,294.0,110.25,7.0,4,0.0,0,15.55,21.33
3,0.98,514.5,294.0,110.25,7.0,5,0.0,0,15.55,21.33
4,0.90,563.5,318.5,122.50,7.0,2,0.0,0,20.84,28.28
...,...,...,...,...,...,...,...,...,...,...
763,0.64,784.0,343.0,220.50,3.5,5,0.4,5,17.88,21.40
764,0.62,808.5,367.5,220.50,3.5,2,0.4,5,16.54,16.88
765,0.62,808.5,367.5,220.50,3.5,3,0.4,5,16.44,17.11
766,0.62,808.5,367.5,220.50,3.5,4,0.4,5,16.48,16.61


In [114]:
from azureml.core import Workspace, Dataset
from azureml.data.datapath import DataPath

ws = Workspace.from_config()

# Get the default datastore
datastore = ws.get_default_datastore()

# Path to your local data directory
local_data_path = './data'  # Ensure this path contains your CSV files

# Define the datastore path
datastore_path = [(datastore, 'data')]

# Create a TabularDataset to represent tabular data in CSV files
tabular_dataset = Dataset.Tabular.from_delimited_files(path=datastore_path)

# Register the TabularDataset
tabular_dataset = tabular_dataset.register(workspace=ws,
                                           name='training_data',
                                           description='Training data in tabular format')


In [123]:
from azure.ai.ml import MLClient
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes
from azure.identity import DefaultAzureCredential

# Connect to the AzureML workspace
subscription_id="dd022f57-1b53-4cf0-b379-44a3d7d57e27"
resource_group="ies-pi-dev-uks-rg"
workspace_name="ies-pi-dev-uks-ml"

ml_client = MLClient(
    DefaultAzureCredential(), subscription_id, resource_group, workspace_name
)

# Set the version number of the data asset (for example: '1')
VERSION = "1"

# Set the path, supported paths include:
# local: './<path>/<file>' (this will be automatically uploaded to cloud storage)
# blob:  'wasbs://<container_name>@<account_name>.blob.core.windows.net/<path>/<file>'
# ADLS gen2: 'abfss://<file_system>@<account_name>.dfs.core.windows.net/<path>/<file>'
# Datastore: 'azureml://datastores/<data_store_name>/paths/<path>/<file>'
path = "./data/energy_data_correct.csv"

# Define the Data asset object
my_data = Data(
    path=path,
    type=AssetTypes.URI_FILE,
    description="this is an energy building dataset",
    name="energy-building",
    version=VERSION,
)

# Create the data asset in the workspace
ml_client.data.create_or_update(my_data)

Uploading energy_data_correct.csv (< 1 MB): 0.00B [00:00, ?B/s] (< 1 MB): 100%|##########| 34.4k/34.4k [00:00<00:00, 697kB/s]




Data({'skip_validation': False, 'mltable_schema_url': None, 'referenced_uris': None, 'type': 'uri_file', 'is_anonymous': False, 'auto_increment_version': False, 'auto_delete_setting': None, 'name': 'energy-building', 'description': 'this is an energy building dataset', 'tags': {}, 'properties': {}, 'print_as_yaml': True, 'id': '/subscriptions/dd022f57-1b53-4cf0-b379-44a3d7d57e27/resourceGroups/ies-pi-dev-uks-rg/providers/Microsoft.MachineLearningServices/workspaces/ies-pi-dev-uks-ml/data/energy-building/versions/1', 'Resource__source_path': None, 'base_path': 'c:\\Users\\saeed.misaghian\\Documents\\Repos_Personal\\Azure_ML_Energy_Predict', 'creation_context': <azure.ai.ml.entities._system_data.SystemData object at 0x000001B2A076F130>, 'serialize': <msrest.serialization.Serializer object at 0x000001B2A0769760>, 'version': '1', 'latest_version': None, 'path': 'azureml://subscriptions/dd022f57-1b53-4cf0-b379-44a3d7d57e27/resourcegroups/ies-pi-dev-uks-rg/workspaces/ies-pi-dev-uks-ml/datast

In [125]:

import pandas as pd
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

ml_client = MLClient.from_config(credential=DefaultAzureCredential())
data_asset = ml_client.data.get("energy-building", version="1")


Found the config file in: .\config.json


In [133]:
data_asset.path

'azureml://subscriptions/dd022f57-1b53-4cf0-b379-44a3d7d57e27/resourcegroups/ies-pi-dev-uks-rg/workspaces/ies-pi-dev-uks-ml/datastores/workspaceblobstore/paths/LocalUpload/fc172b527e1913e669b6e71d249ab547/energy_data_correct.csv'

In [142]:
from azure.ai.ml import MLClient
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes
from azure.identity import DefaultAzureCredential
from azure.ai.ml import Input

# Connect to the AzureML workspace
subscription_id="dd022f57-1b53-4cf0-b379-44a3d7d57e27"
resource_group="ies-pi-dev-uks-rg"
workspace_name="ies-pi-dev-uks-ml"

# ml_client = MLClient(
#     DefaultAzureCredential(), subscription_id, resource_group, workspace_name
# )

# # Set the version number of the data asset (for example: '1')
# VERSION = "1"

# # Set the path, supported paths include:
# # local: './<path>/<file>' (this will be automatically uploaded to cloud storage)
# # blob:  'wasbs://<container_name>@<account_name>.blob.core.windows.net/<path>/<file>'
# # ADLS gen2: 'abfss://<file_system>@<account_name>.dfs.core.windows.net/<path>/<file>'
# # Datastore: 'azureml://datastores/<data_store_name>/paths/<path>/<file>'
# path = "./data/energy_data_correct.csv"

# # Define the Data asset object
# my_data = Data(
#     path=path,
#     type=AssetTypes.URI_FILE,
#     description="this is an energy building dataset",
#     name="energy-building",
#     version=VERSION,
# )

# # Create the data asset in the workspace
# ml_client.data.create_or_update(my_data)


ml_client = MLClient.from_config(credential=DefaultAzureCredential())
data_asset = ml_client.data.get("energy-building", version="1")


inputs = {
    "training_data": Input(type=AssetTypes.URI_FILE, path=data_asset.path)
}

job = command(
    code="./src",
    command="python train-model-script.py --training_data {inputs.energy-building}",
    environment="AzureML-sklearn-0.24-ubuntu18.04-py37-cpu@latest",
    compute="aml-cluster",
    display_name="building-energy-train",
    experiment_name="building-energy-training", 
    tags={"model_type": "randomforest", "model": "Building energy dataset"},
    inputs={'training_data': 'energy-building'}  # Reference the dataset by name
)

# Submit job
returned_job = ml_client.create_or_update(job)
aml_url = returned_job.studio_url
print("Monitor your job at", aml_url)

Found the config file in: .\config.json
Use of {} for parameters is deprecated, instead use ${{}}.


Monitor your job at https://ml.azure.com/runs/keen_helmet_9g73s36yjd?wsid=/subscriptions/dd022f57-1b53-4cf0-b379-44a3d7d57e27/resourcegroups/ies-pi-dev-uks-rg/workspaces/ies-pi-dev-uks-ml&tid=b33be5d6-5072-448f-bad3-d8b66cf09736


In [137]:
df

Unnamed: 0,Relative Compactness,Surface Area,Wall Area,Roof Area,Overall Height,Orientation,Glazing Area,Glazing Area Distribution,Heating Load,Cooling Load
0,0.98,514.5,294.0,110.25,7.0,2,0.0,0,15.55,21.33
1,0.98,514.5,294.0,110.25,7.0,3,0.0,0,15.55,21.33
2,0.98,514.5,294.0,110.25,7.0,4,0.0,0,15.55,21.33
3,0.98,514.5,294.0,110.25,7.0,5,0.0,0,15.55,21.33
4,0.90,563.5,318.5,122.50,7.0,2,0.0,0,20.84,28.28
...,...,...,...,...,...,...,...,...,...,...
763,0.64,784.0,343.0,220.50,3.5,5,0.4,5,17.88,21.40
764,0.62,808.5,367.5,220.50,3.5,2,0.4,5,16.54,16.88
765,0.62,808.5,367.5,220.50,3.5,3,0.4,5,16.44,17.11
766,0.62,808.5,367.5,220.50,3.5,4,0.4,5,16.48,16.61


In [138]:
df.iloc[:768,:10]

Unnamed: 0,Relative Compactness,Surface Area,Wall Area,Roof Area,Overall Height,Orientation,Glazing Area,Glazing Area Distribution,Heating Load,Cooling Load
0,0.98,514.5,294.0,110.25,7.0,2,0.0,0,15.55,21.33
1,0.98,514.5,294.0,110.25,7.0,3,0.0,0,15.55,21.33
2,0.98,514.5,294.0,110.25,7.0,4,0.0,0,15.55,21.33
3,0.98,514.5,294.0,110.25,7.0,5,0.0,0,15.55,21.33
4,0.90,563.5,318.5,122.50,7.0,2,0.0,0,20.84,28.28
...,...,...,...,...,...,...,...,...,...,...
763,0.64,784.0,343.0,220.50,3.5,5,0.4,5,17.88,21.40
764,0.62,808.5,367.5,220.50,3.5,2,0.4,5,16.54,16.88
765,0.62,808.5,367.5,220.50,3.5,3,0.4,5,16.44,17.11
766,0.62,808.5,367.5,220.50,3.5,4,0.4,5,16.48,16.61
