In [1]:
!sudo apt update -y -q
!sudo apt install g++ -y -q
!pip install -q azureml-sdk azureml-widgets azureml-train-automl

Hit:1 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:2 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Get:4 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]
Fetched 252 kB in 1s (311 kB/s)
Reading package lists...
Building dependency tree...
Reading state information...
19 packages can be upgraded. Run 'apt list --upgradable' to see them.
Reading package lists...
Building dependency tree...
Reading state information...
g++ is already the newest version (4:7.4.0-1ubuntu2.3).
0 upgraded, 0 newly installed, 0 to remove and 19 not upgraded.


In [2]:
from pathlib import Path
from datetime import datetime
from dateutil.relativedelta import relativedelta
from sklearn.model_selection import train_test_split
import logging
import pandas as pd
import os

from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.experiment import Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.dataset import Dataset
from azureml.core.run import get_run

In [3]:
filename = 'OUTPUT_WBI_exposer_cyclones_v14.csv'
input_file_path = Path.cwd()\
    .parent.parent.parent.parent\
    .joinpath('#task4-eda', 'datasets', filename)

In [4]:
input_file_path

PosixPath('/home/jovyan/#task4-eda/datasets/OUTPUT_WBI_exposer_cyclones_v14.csv')

In [5]:
cyclones_df = pd.read_csv(input_file_path, delimiter=';')

In [6]:
cyclones_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 991 entries, 0 to 990
Data columns (total 51 columns):
SID                                                   991 non-null object
NAME                                                  991 non-null object
ISO                                                   991 non-null object
YEAR                                                  991 non-null int64
COORDS                                                991 non-null object
COORDS_MAX_WINDS                                      991 non-null object
COORDS_MIN_DIST2LAND                                  991 non-null object
BASIN                                                 991 non-null object
SUB BASIN                                             991 non-null object
MONTH_START                                           991 non-null int64
MONTH_END                                             991 non-null int64
DATE_START                                            991 non-null object
DATE_END  

In [7]:
ws = Workspace.from_config()

In [8]:
train_df, test_df = train_test_split(cyclones_df, test_size=0.2, random_state=223)

if not os.path.isdir('data'):
    os.mkdir('data')

train_df.to_csv('data/train.csv', index=False)
test_df.to_csv('data/test.csv', index=False)
ds = ws.get_default_datastore()
ds.upload(src_dir='./data', target_path='cyclonedata', overwrite=True, show_progress=True)

training_data = Dataset.Tabular.from_delimited_files(path=ds.path('cyclonedata/train.csv'), )

Uploading an estimated of 3 files
Uploading ./data/.ipynb_checkpoints/test-checkpoint.csv
Uploading ./data/test.csv
Uploading ./data/train.csv
Uploaded ./data/test.csv, 1 files out of an estimated total of 3
Uploaded ./data/.ipynb_checkpoints/test-checkpoint.csv, 2 files out of an estimated total of 3
Uploaded ./data/train.csv, 3 files out of an estimated total of 3
Uploaded 3 files


In [9]:
cpu_cluster_name = "test-cluster"

try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_DS2_V2',
        max_nodes=2,
        min_nodes=2)
    
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

cpu_cluster.wait_for_completion(show_output=True)

automl_settings = {
    "iteration_timeout_minutes": 5,
    "iterations": 10,
    "primary_metric": 'normalized_root_mean_squared_error',
    "preprocess": True,
    "verbosity": logging.INFO,
    "n_cross_validations": 5,
    "compute_target": cpu_cluster,
    "max_cores_per_iteration": -1,
    "max_concurrent_iterations": 2
}

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [10]:
automl_config = AutoMLConfig(task='regression',
                             debug_log='automated_ml_errors.log',
                             training_data=training_data,
                             label_column_name='TOTAL_AFFECTED',
                             **automl_settings)



In [11]:
experiment = Experiment(ws, "cyclone-experiment")
# run = get_run(experiment, 'AutoML_963f860f-806d-4e86-b399-2c24f537f73c')
run = experiment.submit(automl_config, show_output=True)

Running on remote or ADB.
Running on remote compute: test-cluster
Parent Run ID: AutoML_dafb46d5-e777-4f0e-afda-ba8aaa355b8b

Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Missing feature values imputation
STATUS:       DONE
DESCRIPTION:  If the missing values are expected, let the run complete. Otherwise cancel the current run and use a script to customize the handling of missing feature values that may be more appropriate based on the data type and business requirement.
              Learn more about missing value imputation: https://aka.ms/AutomatedMLFeaturization
DETAILS:      
+---------------------------------+---------------------------------+---------

In [12]:
best_run, fitted_model = run.get_output()
print(best_run)
print(fitted_model)

Run(Experiment: cyclone-experiment,
Id: AutoML_dafb46d5-e777-4f0e-afda-ba8aaa355b8b_8,
Type: azureml.scriptrun,
Status: Completed)
RegressionPipeline(pipeline=Pipeline(memory=None,
     steps=[('datatransformer', DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
        feature_sweeping_config=None, feature_sweeping_timeout=None,
        featurization_config=None, force_text_dnn=None,
        is_cross_validation=None, is_onnx_compatible=None, logger=None,
        obser...=200000, subsample_freq=7, verbose=-1))]))],
               flatten_transform=None, weights=[1.0]))]),
          stddev=None)


In [13]:
columns = test_df.columns.tolist()
columns.remove('TOTAL_AFFECTED')

y_test = test_df['TOTAL_AFFECTED']
x_test = test_df.loc[:,columns]

y_predict = fitted_model.predict(x_test)

In [18]:
from sklearn.metrics import mean_squared_error, r2_score, mean_squared_log_error
from math import sqrt

y_actual = y_test.values.flatten().tolist()
rmse = sqrt(mean_squared_error(y_actual, y_predict))
print('rmse: ', rmse)
r2 = r2_score(y_actual, y_predict)
print('r2: ', r2)
msle = sqrt(mean_squared_log_error(y_actual, y_predict))
print('msle: ', msle)

rmse:  2334574.3848183285
r2:  0.18283851754988156
msle:  3.7346262843470184
