# Import Dependencies

In [1]:
from azureml.core import Dataset, Experiment, Workspace
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import RunConfiguration
from azureml.train.automl import AutoMLConfig
import logging
import pandas as pd

In [2]:
import os
print("Current working directory:", os.getcwd())
print("Files in current directory:", os.listdir())


Current working directory: /mnt/batch/tasks/shared/LS_root/mounts/clusters/computeinstancehouse/code/Users/morgan.senechal/AutoML
Files in current directory: ['.amlignore', '.amlignore.amltmp', '.ipynb_aml_checkpoints', 'AutoML.ipynb']


# Load Data and Fill NA/NaN Values with 0

In [4]:
train_df = pd.read_csv('train.csv').fillna(0)
test_df = pd.read_csv('test.csv').fillna(0)

# Upload Data to Azure Blob

In [5]:
ws = Workspace.from_config()

default_store = ws.get_default_datastore()

In [7]:

default_store.upload_files(
    ['train.csv'],
    target_path='kaggle-house-prices-training',
    overwrite=True,
    show_progress=True
)

Uploading an estimated of 1 files
Uploading train.csv
Uploaded train.csv, 1 files out of an estimated total of 1
Uploaded 1 files


$AZUREML_DATAREFERENCE_43581835950e456f9fe6043e50955e02

In [8]:
# Charger le fichier de test
default_store.upload_files(
    ['test.csv'], # Liste des chemins de fichiers à charger
    target_path='kaggle-house-prices-testing', # Chemin cible dans le datastore
    overwrite=True, # Écraser les fichiers existants du même nom
    show_progress=True # Montrer la progression du téléchargement
)

Uploading an estimated of 1 files
Uploading test.csv
Uploaded test.csv, 1 files out of an estimated total of 1
Uploaded 1 files


$AZUREML_DATAREFERENCE_37d8d4befada4acd8570f90e0110df53

# Create and Register Training Dataset

In [9]:
train_dataset = Dataset.Tabular.from_delimited_files(
  default_store.path('kaggle-house-prices-training')
)
train_dataset = train_dataset.register(ws, 'kaggle-house-prices-training')

In [10]:
test_dataset= Dataset.Tabular.from_delimited_files(
  default_store.path('kaggle-house-prices-testing')
)
test_dataset = test_dataset.register(ws, 'kaggle-house-prices-testing')

# Create Compute Cluster If Not Exists

In [11]:
amlcompute_cluster_name = "sandbox"

try:
    aml_compute = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
except ComputeTargetException:
    print('Compute cluster %s not found. Attempting to create it now.' % amlcompute_cluster_name)
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='Standard_DS2_v2',
        max_nodes=1
    )
    aml_compute = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)

aml_compute.wait_for_completion(show_output=True)

Compute cluster sandbox not found. Attempting to create it now.
InProgress........
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


# Define Compute RunConfiguration

In [12]:
aml_run_config = RunConfiguration()
aml_run_config.target = aml_compute
aml_run_config.environment.docker.enabled = True
aml_run_config.environment.python.user_managed_dependencies = False

aml_run_config.environment.python.conda_dependencies = CondaDependencies.create(
    conda_packages=['packaging']
)

'enabled' is deprecated. Please use the azureml.core.runconfig.DockerConfiguration object with the 'use_docker' param instead.


# Define AutoMLConfig

In [13]:
automl_settings = {
    "n_cross_validations": 3,
    "primary_metric": 'normalized_root_mean_squared_error',
    "enable_early_stopping": True,
    "experiment_timeout_hours": 1,
    "max_concurrent_iterations": 4,
    "max_cores_per_iteration": -1,
    "verbosity": logging.INFO,
}

automl_config = AutoMLConfig(
    task = 'regression',
    compute_target = aml_compute,
    training_data = train_dataset,
    label_column_name = 'SalePrice',
    **automl_settings
)

# Create Experiment

In [14]:
experiment_name = 'kaggle-house-prices-training'
experiment = Experiment(workspace=ws, name=experiment_name)

# Run AutoML Training Job Experiment and Wait for Completion

In [15]:
remote_run = experiment.submit(automl_config, show_output=False)
remote_run.wait_for_completion()

Submitting remote run.


Experiment,Id,Type,Status,Details Page,Docs Page
kaggle-house-prices-training,AutoML_9e289f8c-f630-429a-a192-0b2dff2decbc,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation


{'runId': 'AutoML_9e289f8c-f630-429a-a192-0b2dff2decbc',
 'target': 'sandbox',
 'status': 'Completed',
 'startTimeUtc': '2024-03-16T15:32:27.147851Z',
 'endTimeUtc': '2024-03-16T16:04:36.655073Z',
 'services': {},
   'message': 'No scores improved over last 10 iterations, so experiment stopped early. This early stopping behavior can be disabled by setting enable_early_stopping = False in AutoMLConfig for notebook/python SDK runs.'}],
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'normalized_root_mean_squared_error',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '3',
  'target': 'sandbox',
  'DataPrepJsonString': '{\\"training_data\\": {\\"datasetId\\": \\"2a898af7-e00e-42f8-bcf6-49794955fc48\\"}, \\"datasets\\": 0}',
  'EnableSubsampling': None,
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'regression',
  'dependencies_versions': '{"azu

# Retrieve the Best Model

In [16]:
best_run, fitted_model = remote_run.get_output()

Package:azureml-automl-runtime, training version:1.52.0.post1, current version:1.51.0.post1
Package:azureml-core, training version:1.52.0, current version:1.51.0
Package:azureml-dataprep, training version:4.11.4, current version:4.10.8
Package:azureml-dataprep-rslex, training version:2.18.4, current version:2.17.12
Package:azureml-dataset-runtime, training version:1.52.0, current version:1.51.0
Package:azureml-defaults, training version:1.52.0, current version:1.51.0
Package:azureml-interpret, training version:1.52.0, current version:1.51.0
Package:azureml-mlflow, training version:1.52.0, current version:1.51.0
Package:azureml-pipeline-core, training version:1.52.0, current version:1.51.0
Package:azureml-responsibleai, training version:1.52.0, current version:1.51.0
Package:azureml-telemetry, training version:1.52.0, current version:1.51.0
Package:azureml-train-automl-client, training version:1.52.0, current version:1.51.0.post1
Package:azureml-train-automl-runtime, training version:1.

# Generate Predictions

In [18]:
test_df['SalePrice'] = fitted_model.predict(test_df)
kaggle_submission = test_df[['Id', 'SalePrice']]
kaggle_submission.to_csv('Users/morgan.senechal/AutoML', index=False)