In [7]:
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# authenticate
credential = DefaultAzureCredential()
# # Get a handle to the workspace
ml_client = MLClient(
    credential=credential,
    subscription_id="3be6c7a8-098b-44b1-ae75-4f0591d6bab0",
    resource_group_name="Resource_Group_1",
    workspace_name="Test_Workspace_1",
)

In [8]:
import os

dependencies_dir = "./dependencies"
os.makedirs(dependencies_dir, exist_ok=True)

In [25]:
%%writefile {dependencies_dir}/conda.yaml
name: model-env
channels:
  - conda-forge
dependencies:
  - python=3.8
  - numpy=1.21.2
  - pip=21.2.4
  - scikit-learn=0.24.2
  - scipy=1.7.1
  - pandas>=1.1,<1.2
  - pip:
    - inference-schema[numpy-support]==1.3.0
    - mlflow== 2.4.1
    - azureml-mlflow==1.42.0
    - psutil>=5.8,<5.9
    - tqdm>=4.59,<4.60
    - ipykernel~=6.0
    - matplotlib

Overwriting ./dependencies/conda.yaml


In [26]:
import mlflow
print(mlflow.__version__)

from mlflow.entities import Dataset

2.4.1


In [9]:
from azure.ai.ml.entities import Environment

custom_env_name = "aml-scikit-test1"

custom_job_env = Environment(
    name=custom_env_name,
    description="Custom environment for Credit Card Defaults job",
    tags={"scikit-learn": "0.24.2"},
    conda_file=os.path.join(dependencies_dir, "conda.yaml"),
    image="mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest",
)
custom_job_env = ml_client.environments.create_or_update(custom_job_env)

print(
    f"Environment with name {custom_job_env.name} is registered to workspace, the environment version is {custom_job_env.version}"
)

Environment with name aml-scikit-test1 is registered to workspace, the environment version is 12


In [17]:
import os

train_src_dir = "./src"
os.makedirs(train_src_dir, exist_ok=True)

In [29]:
import pandas as pd

df = pd.read_csv("azureml://subscriptions/3be6c7a8-098b-44b1-ae75-4f0591d6bab0/resourcegroups/Resource_Group_1/workspaces/Test_Workspace_1/datastores/workspaceblobstore/paths/UI/2023-07-03_175409_UTC/bank-full.csv", sep=';')

#df.isna().sum().sum()

df = df.drop(columns=['pdays', 'duration'])

# Perform (one-hot) encoding for categorical variables 
df_encoded = pd.get_dummies(df, columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'poutcome', 'contact'], dtype = 'int')

# We manually encode months as their respective numbers
month_mapping = {
    'jan': 1,
    'feb': 2,
    'mar': 3,
    'apr': 4,
    'may': 5,
    'jun': 6,
    'jul': 7,
    'aug': 8,
    'sep': 9,
    'oct': 10,
    'nov': 11,
    'dec': 12
}

df_encoded['month'] = df_encoded['month'].map(month_mapping).astype(int)

# Normalize the Balance column by decimal scaling (-1 / 1)
scaling_factor = 10 ** (len(str(df_encoded['balance'].abs().max().astype(int))) - 1)
df_normalized = df_encoded.copy()
df_normalized['balance'] = df_encoded['balance'] / scaling_factor

# Normalize the Age, Day and Month column by Min-Max scaling (0 / 1)
df_normalized['age'] = (df_normalized['age'] - df_normalized['age'].min()) / (df_normalized['age'].max() - df_normalized['age'].min())
df_normalized['day'] = (df_normalized['day'] - df_normalized['day'].min()) / (df_normalized['day'].max() - df_normalized['day'].min())
df_normalized['month'] = (df_normalized['month'] - df_normalized['month'].min()) / (df_normalized['month'].max() - df_normalized['month'].min())

# Remap target variable (y) to binary integers
y_mapping = {'yes': 1, 'no': 0}
df_normalized['y'] = df_normalized['y'].map(y_mapping).astype(int)
#df_normalized['target'] = df_normalized['y'].map(y_mapping).astype(int)
#df_normalized = df_normalized.drop(columns=['y'])

df_normalized.head()


Unnamed: 0,age,balance,day,month,campaign,previous,y,job_admin.,job_blue-collar,job_entrepreneur,...,housing_yes,loan_no,loan_yes,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown,contact_cellular,contact_telephone,contact_unknown
0,0.519481,0.02143,0.133333,0.363636,1,0,0,0,0,0,...,1,1,0,0,0,0,1,0,0,1
1,0.337662,0.00029,0.133333,0.363636,1,0,0,0,0,0,...,1,1,0,0,0,0,1,0,0,1
2,0.194805,2e-05,0.133333,0.363636,1,0,0,0,0,1,...,1,0,1,0,0,0,1,0,0,1
3,0.376623,0.01506,0.133333,0.363636,1,0,0,0,1,0,...,1,1,0,0,0,0,1,0,0,1
4,0.194805,1e-05,0.133333,0.363636,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,1


In [41]:
# Downloading the normalized dataframe to upload it to Azure Blob Storage

df_normalized.to_csv('Users/SDUQUEP298/bank-marketing-model/bank-full-normalized.csv')

OSError: Cannot save file into a non-existent directory: 'Users/SDUQUEP298/bank-marketing-model'

# TRAINING THE MODEL 

In [3]:
from imblearn.over_sampling import SMOTE

SyntaxError: invalid syntax (71457944.py, line 1)

In [23]:
%%writefile {train_src_dir}/main.py
import os
import argparse
import pandas as pd
import mlflow
import mlflow.sklearn
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.linear_model import Perceptron
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

def main():
    """Main function of the script."""

    # input and output arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", type=str, help="path to input data")
    parser.add_argument("--test_train_ratio", type=float, required=False, default=0.25)
    parser.add_argument("--n_estimators", required=False, default=100, type=int)
    parser.add_argument("--learning_rate", required=False, default=0.1, type=float)
    parser.add_argument("--registered_model_name", type=str, help="model name")
    args = parser.parse_args()
   
    # Start Logging
    mlflow.start_run()

    # enable autologging
    mlflow.sklearn.autolog()

    ###################
    #<prepare the data>
    ###################
    print(" ".join(f"{k}={v}" for k, v in vars( ).items()))

    print("input data:", args.data)
    
    credit_df = pd.read_csv(args.data, index_col=0)

    # Hardcoded path
    #credit_df = pd.read_csv("azureml://subscriptions/3be6c7a8-098b-44b1-ae75-4f0591d6bab0/resourcegroups/Resource_Group_1/workspaces/Test_Workspace_1/datastores/workspaceblobstore/paths/UI/2023-07-03_175409_UTC/bank-full-normalized.csv")
    #train_df, test_df = train_test_split(
    #    credit_df,
    #    test_size=0.1
    #)

    mlflow.log_metric("num_samples", credit_df.shape[0])
    mlflow.log_metric("num_features", credit_df.shape[1] - 1)

    #Split train and test datasets
    train_df, test_df = train_test_split(
        credit_df,
        test_size=args.test_train_ratio,
    )
    ####################
    #</prepare the data>
    ####################

    ##################
    #<train the model>
    ##################
    # Extracting the label column
    y_train = train_df.pop("y")

    # convert the dataframe values to array
    X_train = train_df.values

    # Extracting the label column
    y_test = test_df.pop("y")

    # convert the dataframe values to array
    X_test = test_df.values

    print(f"Training with data of shape {X_train.shape}")


  ############################
  # CLASSIFICATION ALGORITHM #
  ############################

    '''
    #Gradient Boosting
    clf = GradientBoostingClassifier(
        n_estimators=args.n_estimators, learning_rate=args.learning_rate
    )
    clf.fit(X_train, y_train)
    model = clf
    '''
    '''
    # Hyperparameter tuning for the Random Forest Classifier using random grid 
    
    # Number of trees in random forest
    n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
    # Number of features to consider at every split
    max_features = ['auto', 'sqrt']
    # Maximum number of levels in tree
    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
    max_depth.append(None)
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 10]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]
    # Create the random grid
    random_grid = {'n_estimators': n_estimators,
                'max_features': max_features,
                'max_depth': max_depth,
                'min_samples_split': min_samples_split,
                'min_samples_leaf': min_samples_leaf,
                'bootstrap': bootstrap}
        

    '''
    #Random Forest
    rfc = RandomForestClassifier(
        n_estimators=200,
        min_samples_split=5,
        min_samples_leaf=4,
        max_features='auto',
        bootstrap=True,
    )

    '''
    #Oversampling the minority target variable to improve performance
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_train_oversampled, y_train_oversampled = resample(X_train_scaled[y_train == 1],
                                                        y_train[y_train == 1],
                                                        replace=True,
                                                        n_samples=X_train_scaled[y_train == 0].shape[0],
                                                        random_state=42
                                                        )
    '''

    rfc.fit(X_train, y_train)
    model = rfc
    

    '''
    # Random search of parameters, using 3 fold cross validation, 
    # search across 100 different combinations, and use all available cores
    rf_random = RandomizedSearchCV(estimator = rfc, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
    # Fit the random search model
    rf_random.fit(X_train, y_train)
    model = rf_random

    print("Grid search best paremeters:")
    print(rf_random.best_params_)
    rf_random.best_params_
    '''
    '''
    #SVM
    SVM = svm.LinearSVC()
    SVM.fit(X_train, y_train)
    model = SVM
    
    

    #Linear perceptron
    perceptron = Perceptron()
    perceptron.fit(X_train, y_train)
    model = perceptron

    '''

    y_pred = model.predict(X_test)

    print(classification_report(y_test, y_pred))

    # parameter to determine the algorithm
    


  ############################
  # /CLASSIFICATION ALGORITHM #
  ############################



    ###################
    #</train the model>
    ###################

    ##########################
    #<save and register model>
    ##########################
    # Registering the model to the workspace
    print("Registering the model via MLFlow")
    mlflow.sklearn.log_model(
        sk_model=model,
        registered_model_name=args.registered_model_name,
        artifact_path=args.registered_model_name,
    )

    # Saving the model to a file
    mlflow.sklearn.save_model(
        sk_model=model,
        path=os.path.join(args.registered_model_name, "trained_model"),
    )
    ###########################
    #</save and register model>
    ###########################
    
    # Stop Logging
    mlflow.end_run()


if __name__ == "__main__":
    main()

Overwriting ./src/main.py


In [11]:
from azure.ai.ml.entities import ComputeInstance, AmlCompute

ci_basic_name = 'SDUQUE1'

# Get compute
cpu_cluster = ml_client.compute.get(ci_basic_name)
print(cpu_cluster)

enable_node_public_ip: true
id: /subscriptions/3be6c7a8-098b-44b1-ae75-4f0591d6bab0/resourceGroups/Resource_Group_1/providers/Microsoft.MachineLearningServices/workspaces/Test_Workspace_1/computes/SDUQUE1
last_operation:
  operation_name: Start
  operation_status: Succeeded
  operation_time: '2023-07-08T21:58:02.878Z'
  operation_trigger: User
location: westeurope
name: SDUQUE1
network_settings:
  private_ip_address: 10.0.0.4
  public_ip_address: 20.23.191.157
os_image_metadata:
  current_image_version: 23.04.07
  is_latest_os_image_version: true
  latest_image_version: 23.04.07
provisioning_state: Succeeded
services:
- display_name: Jupyter
  endpoint_uri: https://sduque1.westeurope.instances.azureml.ms/tree/
- display_name: Jupyter Lab
  endpoint_uri: https://sduque1.westeurope.instances.azureml.ms/lab
size: STANDARD_DS3_V2
ssh_public_access_enabled: false
ssh_settings:
  admin_username: azureuser
  ssh_port: '4000'
state: Running
type: computeinstance



In [22]:
# Configure the command

from azure.ai.ml import command
from azure.ai.ml import Input

registered_model_name = "bank-marketing-model-tuned-random-forest"

job = command(
    inputs=dict(
        data=Input(
            type="uri_file",
            path="azureml://subscriptions/3be6c7a8-098b-44b1-ae75-4f0591d6bab0/resourcegroups/Resource_Group_1/workspaces/Test_Workspace_1/datastores/workspaceblobstore/paths/UI/2023-07-04_165901_UTC/bank-full-normalized.csv",
            #path='https://testworkspace16014057046.blob.core.windows.net/azureml-blobstore-eb4ec91f-0dc1-4782-911e-cdc4e29fd8c6/UI/2023-07-04_165901_UTC/bank-full-normalized.csv',
            #path='https://ml.azure.com/fileexplorerAzNB?wsid=/subscriptions/3be6c7a8-098b-44b1-ae75-4f0591d6bab0/resourcegroups/Resource_Group_1/providers/Microsoft.MachineLearningServices/workspaces/Test_Workspace_1&tid=b4760713-c835-4043-b494-8efa9f5b2e1c&activeFilePath=Users/SDUQUEP298/bank-full-normalized.csv',
        ),
        test_train_ratio=0.15,
        learning_rate=0.25,
        registered_model_name=registered_model_name,
    ),
    code="./src/",  # location of source code
    command="python main.py --data ${{inputs.data}} --test_train_ratio ${{inputs.test_train_ratio}} --learning_rate ${{inputs.learning_rate}} --registered_model_name ${{inputs.registered_model_name}}",
    environment="aml-scikit-test1@latest",
    compute="SDUQUE1",
    display_name="bank-marketing-prediction-tuned-random-forest",
)

In [24]:
ml_client.create_or_update(job)

[32mUploading src (0.01 MBs): 100%|██████████| 6836/6836 [00:00<00:00, 167458.60it/s]
[39m



Experiment,Name,Type,Status,Details Page
bank-marketing-model,ashy_forest_kkjtfpvk5j,command,Starting,Link to Azure Machine Learning studio
