# Adult Income

This notebook focuses on modeling with automated machine learning (AutoML) on Azure Machine Learning.

Workflow:

1. Create storage account and container.
2. Upload CSV file into container.
3. Create Azure ML Workspace.
4. Create datastore.
5. Create MLTable data asset.
6. Create ML pipeline.

![Workflow](img/azure_ml_workflow.png)

## 1. Load credentials

In [9]:
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

In [2]:
# Build the credential object first, then hand it to the workspace client.
# `MLClient.from_config` resolves the workspace from a config file.
credential = DefaultAzureCredential()
ml_client = MLClient.from_config(
    credential=credential,
)

Found the config file in: /config.json


## 2. Data preparation

In [None]:
import os
from dotenv import load_dotenv

### 2.1. Get datastore

In [3]:
from azure.ai.ml.entities import AzureBlobDatastore
from azure.ai.ml.entities import AccountKeyConfiguration

In [12]:
# Read the storage settings from the environment (load_dotenv() runs first so
# values from a .env file are available to os.getenv).
load_dotenv()
ai_storage_account_key = os.getenv("STORAGE_ACCOUNT_KEY")
storage_account_name = os.getenv("STORAGE_ACCOUNT_NAME")
container_name = os.getenv("CONTAINER_NAME")
datastore_name = "workspacetabulardata"

try:
    # Reuse the datastore when the workspace already has one with this name
    datastore = ml_client.datastores.get(datastore_name)
    print(f"Datastore {datastore_name} found")
except Exception:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt
    # and SystemExit are not swallowed.
    # NOTE(review): any lookup failure (not only "not found") falls through to
    # creation; narrowing to the SDK's not-found exception would be stricter.

    # Fail early with a readable message when a required setting is missing,
    # rather than sending None values to the service.
    required_settings = {
        "STORAGE_ACCOUNT_KEY": ai_storage_account_key,
        "STORAGE_ACCOUNT_NAME": storage_account_name,
        "CONTAINER_NAME": container_name,
    }
    missing_settings = [key for key, value in required_settings.items() if not value]
    if missing_settings:
        raise ValueError(
            f"Cannot create datastore {datastore_name}: missing environment variable(s) "
            + ", ".join(missing_settings)
        )

    # Create datastore in case it is not found
    datastore = AzureBlobDatastore(
        name = datastore_name,
        description = "Azure Blob Storage for custom tabular data",
        account_name = storage_account_name,
        container_name = container_name,
        protocol = "https",
        credentials = AccountKeyConfiguration(
            account_key = ai_storage_account_key
        ),
    )

    # Keep the object returned by the service so `datastore` refers to the
    # workspace-registered entity in both branches.
    datastore = ml_client.create_or_update(datastore)
    print(f"Datastore {datastore_name} created")

Datastore workspacetabulardata found


### 2.2. Create data asset (MLTable)

In [17]:
import mltable
from mltable import MLTableHeaders, MLTableFileEncoding
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes

#### 2.2.1. Create

In [16]:
# Blob location of the training CSV (container and account come from the
# settings loaded earlier in the notebook)
file_name = "adult_train_cleaned.csv"
path = f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net/{file_name}"

# from_delimited_files takes a list of {"file": <uri>} entries
paths = [dict(file=path)]

# Describe how the CSV is parsed: comma-separated, UTF-8, header row present,
# column types inferred, no extra source-path column
tbl = mltable.from_delimited_files(
    paths=paths,
    header=MLTableHeaders.all_files_same_headers,
    delimiter=",",
    encoding=MLTableFileEncoding.utf8,
    infer_column_types=True,
    include_path_column=False,
)

# Preview the first five records
tbl.show(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [18]:
# Name and version under which the asset is registered
data_asset_name = "adult_income_mltable"
data_asset_version = "1.0"

# Write the MLTable definition (the data loading steps) to a local folder
mltable_folder = "./adult_income"
tbl.save(mltable_folder)

# Describe the data asset that points at the saved MLTable folder
my_data = Data(
    name=data_asset_name,
    version=data_asset_version,
    type=AssetTypes.MLTABLE,
    path=mltable_folder,
    description="Adult income dataset MLTable",
)

# Register the asset in the workspace; the returned object is the cell output
ml_client.data.create_or_update(my_data)

[32mUploading adult_income (0.0 MBs):   0%|          | 0/386 [00:00<?, ?it/s][32mUploading adult_income (0.0 MBs): 100%|██████████| 386/386 [00:00<00:00, 3365907.16it/s]
[39m



Data({'path': 'azureml://subscriptions/2daeb7d0-57d4-43e5-911d-8abf10773fe3/resourcegroups/rg-test-01/workspaces/ws-ml-20240921/datastores/workspaceblobstore/paths/LocalUpload/02715d69992f4b4dc390839c9e207884/adult_income/', 'skip_validation': False, 'mltable_schema_url': None, 'referenced_uris': ['wasbs://mytabulardata@stml20240921.blob.core.windows.net/adult_train_cleaned.csv'], 'type': 'mltable', 'is_anonymous': False, 'auto_increment_version': False, 'auto_delete_setting': None, 'name': 'adult_income_mltable', 'description': 'Adult income dataset MLTable', 'tags': {}, 'properties': {}, 'print_as_yaml': False, 'id': '/subscriptions/2daeb7d0-57d4-43e5-911d-8abf10773fe3/resourceGroups/rg-test-01/providers/Microsoft.MachineLearningServices/workspaces/ws-ml-20240921/data/adult_income_mltable/versions/1.0', 'Resource__source_path': '', 'base_path': '/mnt/batch/tasks/shared/LS_root/mounts/clusters/cistandardds11v2/code/Users/200566948', 'creation_context': <azure.ai.ml.entities._system_da

#### 2.2.2. Read

In [19]:
# Look up the registered data asset by its name and version
data_asset = ml_client.data.get(
    name=data_asset_name,
    version=data_asset_version,
)

# Load the MLTable through the asset id and materialise it as a DataFrame
tbl = mltable.load(f"azureml:/{data_asset.id}")
df = tbl.to_pandas_dataframe()

# Show the first rows as the cell output
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## 3. Set up compute resources

In [20]:
from azure.ai.ml.entities import AmlCompute

In [21]:
# Name of the compute cluster used for training
cc_name = "ccstandardds11v2"

try:
    # Reuse the compute cluster when it already exists in the workspace
    compute_cluster = ml_client.compute.get(cc_name)
    print(f"Compute cluster {cc_name} found")
except Exception:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt
    # and SystemExit are not swallowed.
    # NOTE(review): any lookup failure (not only "not found") falls through to
    # creation; narrowing to the SDK's not-found exception would be stricter.

    # Create compute cluster in case it is not found
    compute_cluster = AmlCompute(
        name = cc_name,
        type = "amlcompute",
        size = "Standard_DS11_v2",
        location = "eastus2",
        min_instances = 0,
        max_instances = 2,
        idle_time_before_scale_down = 120,
        tier = "dedicated",
    )

    # Wait for provisioning to finish and keep the returned object so
    # `compute_cluster` refers to the workspace resource in both branches.
    compute_cluster = ml_client.begin_create_or_update(compute_cluster).result()
    print(f"Compute cluster {cc_name} created")

Compute cluster ccstandardds11v2 found


## 4. Project code

In [22]:
from azure.ai.ml import automl
from azure.ai.ml import Input

In [23]:
# Reference the registered MLTable asset (by name and version) as the job's training input
training_data_input = Input(
    type=AssetTypes.MLTABLE,
    path=f"azureml:{data_asset_name}:{data_asset_version}",
)

In [31]:
# Define the AutoML classification job: predict the "income" column on the
# compute cluster, scored by accuracy with 3-fold cross-validation
classification_job = automl.classification(
    experiment_name="automl-adult-income-classification",
    compute=cc_name,
    training_data=training_data_input,
    target_column_name="income",
    primary_metric="accuracy",
    n_cross_validations=3,
    enable_model_explainability=True,
)

In [33]:
# Cap the run: overall timeout, per-trial timeout, number of trials, and early termination
# Author's note: min iterations = 4 (TODO confirm what this limit refers to)
classification_job.set_limits(
    timeout_minutes=20,
    trial_timeout_minutes=10,
    max_trials=5,
    enable_early_termination=True,
)

In [34]:
# Restrict the search to two algorithms and request ONNX-compatible models
classification_job.set_training(
    allowed_training_algorithms=[
        "LogisticRegression",
        "DecisionTree",
    ],
    enable_onnx_compatible_models=True,
)

In [None]:
# Submit the configured AutoML job to the workspace
returned_job = ml_client.jobs.create_or_update(classification_job)

# Print the Studio URL of the submitted job so it can be monitored
aml_url = returned_job.studio_url
print("Monitor your job at", aml_url)