# Automated ML


In [1]:
from azureml.core import Workspace, Experiment
ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="creditcardfraud")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

Workspace name: workspace-rainel
Azure region: brazilsouth
Subscription id: 611bccaf-ced7-4b1d-9395-57559c451c39
Resource group: raineldias88-rg


## Dataset

### Overview
I am using credit card transactions data from Kaggle ( more details on readme file). Problem statement is to predict whether the transaction is fraud (1) or not (0). This is a supervised binary classification task. 

In [2]:
from azureml.core import Dataset
dataset = Dataset.get_by_name(ws, name='creditcard')
df = dataset.to_pandas_dataframe()

In [3]:
#Check the data frame by running head
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
## check the stats
df.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284806.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.841366,3.918649e-15,5.682686e-16,-8.761736e-15,2.811118e-15,-1.552103e-15,2.04013e-15,-1.698953e-15,-1.958151e-16,-3.14764e-15,...,1.471982e-16,8.042109e-16,5.28245e-16,4.458267e-15,1.426896e-15,1.70164e-15,-3.671606e-16,-1.218152e-16,88.349619,0.001727
std,47488.22833,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.25,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84691.5,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.75,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


In [5]:
# check how output column is distributed

df['Class'].value_counts(normalize=True)

0    0.998273
1    0.001727
Name: Class, dtype: float64

#### As we can see above that the data is heavily imbalanced, there is below 1% fraud cases, so I will be using AUC weighted metric for evaluation instead of using just the accuracy.

In [6]:
# Using pandas profiling package to get exploratory data analysis (EDA) report on HTML format.
# The report is added to my repo as output_profile, this captures detailed EDA and descriptive statistics on data

from pandas_profiling import ProfileReport
prof = ProfileReport(df)
prof.to_file(output_file='output_profile.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

## Getting the Data Ready for Modeling

In [7]:
# check if compute clusters are set up
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cpu_cluster_name = "compute-cluster"
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', max_nodes=4)
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
cpu_cluster.wait_for_completion(show_output=True)

InProgress..
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


### Splitting the data in training and test, and uploading dataset to the workspace for AutoML 

In [8]:
## Split the Training data and test data
y = df.iloc[:,-1].values   # output variable
X = df.iloc[:, :-1].values  # feature variables

In [9]:
#split the data set into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
import pandas as pd
df_train = pd.concat([pd.DataFrame(X_train), pd.DataFrame(y_train)], axis=1)

In [11]:
df_train.shape # merged training data to be passed to automl

(227845, 31)

In [12]:
columns= df.columns

In [13]:
df_train.columns= columns

In [14]:
df_train['Class'] = df_train['Class'].astype(bool)

In [15]:
df_train.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,143352.0,1.955041,-0.380783,-0.315013,0.330155,-0.509374,-0.086197,-0.627978,0.035994,1.05456,...,0.238197,0.968305,0.053208,-0.278602,-0.044999,-0.21678,0.045168,-0.047145,9.99,False
1,117173.0,-0.400975,-0.626943,1.555339,-2.017772,-0.107769,0.16831,0.017959,-0.401619,0.040378,...,-0.153485,0.421703,0.113442,-1.004095,-1.176695,0.361924,-0.370469,-0.144792,45.9,False
2,149565.0,0.072509,0.820566,-0.561351,-0.709897,1.080399,-0.359429,0.787858,0.117276,-0.131275,...,-0.314638,-0.872959,0.083391,0.148178,-0.431459,0.11969,0.206395,0.070288,11.99,False
3,93670.0,-0.535045,1.014587,1.750679,2.76939,0.500089,1.00227,0.847902,-0.081323,0.371579,...,0.063525,0.443431,-0.072754,0.448192,-0.655203,-0.181038,-0.093013,-0.064931,117.44,False
4,82655.0,-4.026938,1.897371,-0.429786,-0.029571,-0.855751,-0.480406,-0.435632,1.31376,0.536044,...,-0.480691,-0.230369,0.250717,0.066399,0.470787,0.245335,0.286904,-0.322672,25.76,False


In [16]:
from azureml.core import Dataset, Datastore
from azureml.data.datapath import DataPath

def_blob_store = ws.get_default_datastore()
Dataset.Tabular.register_pandas_dataframe(df_train, target= def_blob_store, name= "df_train_tabular", description=None, tags=None, show_progress=True)

Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/11bbceed-75f0-40ed-b9b9-3ff2956827d6/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.


{
  "source": [
    "('workspaceblobstore', 'managed-dataset/11bbceed-75f0-40ed-b9b9-3ff2956827d6/')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ReadParquetFile",
    "DropColumns"
  ],
  "registration": {
    "id": "068abf42-312c-4a90-a7e9-84ee5e60a968",
    "name": "df_train_tabular",
    "version": 1,
    "workspace": "Workspace.create(name='workspace-rainel', subscription_id='611bccaf-ced7-4b1d-9395-57559c451c39', resource_group='raineldias88-rg')"
  }
}

In [17]:
dataset = Dataset.get_by_name(ws, name='df_train_tabular')
dataset.to_pandas_dataframe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,143352.0,1.955041,-0.380783,-0.315013,0.330155,-0.509374,-0.086197,-0.627978,0.035994,1.054560,...,0.238197,0.968305,0.053208,-0.278602,-0.044999,-0.216780,0.045168,-0.047145,9.99,False
1,117173.0,-0.400975,-0.626943,1.555339,-2.017772,-0.107769,0.168310,0.017959,-0.401619,0.040378,...,-0.153485,0.421703,0.113442,-1.004095,-1.176695,0.361924,-0.370469,-0.144792,45.90,False
2,149565.0,0.072509,0.820566,-0.561351,-0.709897,1.080399,-0.359429,0.787858,0.117276,-0.131275,...,-0.314638,-0.872959,0.083391,0.148178,-0.431459,0.119690,0.206395,0.070288,11.99,False
3,93670.0,-0.535045,1.014587,1.750679,2.769390,0.500089,1.002270,0.847902,-0.081323,0.371579,...,0.063525,0.443431,-0.072754,0.448192,-0.655203,-0.181038,-0.093013,-0.064931,117.44,False
4,82655.0,-4.026938,1.897371,-0.429786,-0.029571,-0.855751,-0.480406,-0.435632,1.313760,0.536044,...,-0.480691,-0.230369,0.250717,0.066399,0.470787,0.245335,0.286904,-0.322672,25.76,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227840,75618.0,1.173488,0.100792,0.490512,0.461596,-0.296377,-0.213165,-0.165254,0.119221,-0.114199,...,-0.186027,-0.574283,0.161405,-0.006140,0.091444,0.109235,-0.020922,0.003967,1.98,False
227841,159000.0,-0.775981,0.144023,-1.142399,-1.241113,1.940358,3.912076,-0.466107,1.360620,0.400697,...,0.037078,-0.019575,0.241830,0.682820,-1.635109,-0.770941,0.066006,0.137056,89.23,False
227842,79795.0,-0.146609,0.992946,1.524591,0.485774,0.349308,-0.815198,1.076640,-0.395316,-0.491303,...,0.052649,0.354089,-0.291198,0.402849,0.237383,-0.398467,-0.121139,-0.196195,3.94,False
227843,87931.0,-2.948638,2.354849,-2.521201,-3.798905,1.866302,2.727695,-0.471769,2.217537,0.580199,...,-0.332759,-1.047514,0.143326,0.678869,0.319710,0.426309,0.496912,0.335822,1.00,False


## AutoML Configuration

Exit criteria is to run autoML for 30 mins to save time and resources. Since the data is imbalanced I am picking AUC weighted metric. When computing the weighted area under the ROC curve, weights vary with the values of the true positive rate (TPrate) among regions in a bid to focus on the accuracy of minority class that is more important in common.

In [18]:
from azureml.train.automl import AutoMLConfig

automl_settings = {
    "experiment_timeout_minutes": 30,
    "primary_metric": 'AUC_weighted'
}


automl_config = AutoMLConfig(
    task='classification',
    training_data= dataset,
    label_column_name='Class',
    n_cross_validations=5,
    compute_target=cpu_cluster,
    **automl_settings
)

## Run Details

In [19]:
# Submitting the Experiment

from azureml.widgets import RunDetails
remote_run = exp.submit(automl_config)
RunDetails(remote_run).show()
remote_run.wait_for_completion(show_output=True)

Submitting remote run.


Experiment,Id,Type,Status,Details Page,Docs Page
creditcardfraud,AutoML_2bd69a7a-1b23-48eb-a4f3-d6068a5628c5,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation


_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

Experiment,Id,Type,Status,Details Page,Docs Page
creditcardfraud,AutoML_2bd69a7a-1b23-48eb-a4f3-d6068a5628c5,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: DatasetEvaluation. Gathering dataset statistics.
Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

********************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+------------------------------+--------------------------------+--------------------------------------+
|Size of the smallest clas

{'runId': 'AutoML_2bd69a7a-1b23-48eb-a4f3-d6068a5628c5',
 'target': 'compute-cluster',
 'status': 'Completed',
 'startTimeUtc': '2022-11-19T10:50:56.059166Z',
 'endTimeUtc': '2022-11-19T11:40:01.708902Z',
 'services': {},
   'message': 'Experiment timeout reached, hence experiment stopped. Current experiment timeout: 0 hour(s) 30 minute(s)'}],
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'AUC_weighted',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'compute-cluster',
  'DataPrepJsonString': '{\\"training_data\\": {\\"datasetId\\": \\"068abf42-312c-4a90-a7e9-84ee5e60a968\\"}, \\"datasets\\": 0}',
  'EnableSubsampling': None,
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'classification',
  'dependencies_versions': '{"azureml-widgets": "1.47.0", "azureml-training-tabular": "1.47.0", "azureml-train": "1.47.0", "azureml-tra

## Best Model

In [20]:
# Getting the best model

best_run, fittedb_model = remote_run.get_output()

print(best_run)

Run(Experiment: creditcardfraud,
Id: AutoML_2bd69a7a-1b23-48eb-a4f3-d6068a5628c5_20,
Type: azureml.scriptrun,
Status: Completed)


In [21]:
# checking the accuracy of model
best_run.get_metrics()['AUC_weighted']

0.9791951184894012

In [25]:
# Saving the Best Model in .pkl format, and registering the model

import joblib

description = "AutoML model trained on Credit Card Fraud Dataset"

os.makedirs('outputs', exist_ok=True)
joblib.dump(fittedb_model, filename="outputs/automl-model.pkl")
automl_model = remote_run.register_model(model_name='automl-creditcard', description=description)

## Model Deployment

The accuracy of AutoML is higher than Hyper Drive method, so I am choosing this autoML model to deploy

In [26]:
# Deploying Model as Azure Container Instance service

from azureml.core.webservice import AciWebservice

aci_config = AciWebservice.deploy_configuration(
    cpu_cores=1,
    memory_gb=1,
    description='Predict if a transaction is Fraud (1) or not (0)',
    auth_enabled=True)

In [27]:
from azureml.core.model import Model
from azureml.automl.core.shared import constants
from azureml.core.model import InferenceConfig


model = Model(ws, 'automl-creditcard')
myenv = best_run.get_environment()
entry_script = 'score.py'
best_run.download_file('outputs/scoring_file_v_1_0_0.py', entry_script)
best_run.download_file(constants.CONDA_ENV_FILE_PATH, 'myenv.yml')

inference_config = InferenceConfig(entry_script=entry_script, environment=myenv)

service = Model.deploy(workspace=ws, 
                       name='automl-creditcard-1', 
                       models=[model], 
                       inference_config=inference_config, 
                       deployment_config=aci_config,
                       overwrite=True)

service.wait_for_deployment(show_output=True)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2022-11-19 11:56:41+00:00 Creating Container Registry if not exists..
2022-11-19 12:06:41+00:00 Registering the environment.
2022-11-19 12:06:42+00:00 Use the existing image..
2022-11-19 12:06:42+00:00 Submitting deployment to compute..
2022-11-19 12:06:50+00:00 Checking the status of deployment automl-creditcard-1..
2022-11-19 12:09:18+00:00 Checking the status of inference endpoint automl-creditcard-1.
Succeeded
ACI service creation operation finished, operation "Succeeded"


### Getting two data points from Test set to make it json ready for scoring

In [28]:
#getting test data ready
test_data = pd.DataFrame(X_test[:2]) #pick two data points for test
features = columns[:-1]
test_data.columns= features
test_data

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,41505.0,-16.53,8.58,-18.65,9.51,-13.79,-2.83,-16.7,7.52,-8.51,...,-1.51,1.19,-1.13,-2.36,0.67,-1.41,-0.46,-2.02,-1.04,364.19
1,44261.0,0.34,-2.74,-0.13,-1.39,-1.45,1.02,-0.52,0.22,0.9,...,0.51,-0.21,-0.94,-0.53,-1.16,0.31,-0.75,0.04,0.1,520.12


In [29]:
test_data['json'] = test_data.apply(lambda x: x.to_json(), axis=1)

In [30]:
test_data['json'][0]


'{"Time":41505.0,"V1":-16.5265065691,"V2":8.5849717959,"V3":-18.6498531852,"V4":9.5055935151,"V5":-13.7938185271,"V6":-2.8324042994,"V7":-16.701694296,"V8":7.5173439037,"V9":-8.5070586368,"V10":-14.1101844415,"V11":5.2992363496,"V12":-10.8340064815,"V13":1.6711202533,"V14":-9.3738585836,"V15":0.3608056416,"V16":-9.8992465408,"V17":-19.2362923698,"V18":-8.3985519949,"V19":3.1017353689,"V20":-1.5149234353,"V21":1.1907386948,"V22":-1.127670009,"V23":-2.3585787698,"V24":0.673461329,"V25":-1.4136996746,"V26":-0.4627623614,"V27":-2.0185752488,"V28":-1.0428041697,"Amount":364.19}'

In [31]:
test_data['json'][1]

'{"Time":44261.0,"V1":0.3398120639,"V2":-2.7437452373,"V3":-0.134069511,"V4":-1.3857293091,"V5":-1.4514133205,"V6":1.0158865939,"V7":-0.5243790569,"V8":0.2240603761,"V9":0.8997460049,"V10":-0.5650116836,"V11":-0.0876702573,"V12":0.9794269879,"V13":0.0768828168,"V14":-0.2178838121,"V15":-0.1368295877,"V16":-2.1428920902,"V17":0.1269560647,"V18":1.7526615075,"V19":0.4325462237,"V20":0.5060438852,"V21":-0.2134358436,"V22":-0.9425250246,"V23":-0.5268191745,"V24":-1.1569918974,"V25":0.3112105102,"V26":-0.7466466791,"V27":0.0409958027,"V28":0.1020378246,"Amount":520.12}'

In [32]:
# print the scoring uri info 
print("Health_Status: "+ service.state)
print("Key: " + service.get_keys()[0])
print("Scoring URI: " + service.scoring_uri)


Health_Status: Healthy
Key: twnjY5J7CLsm9ag2wBhzlARilolExocw
Scoring URI: http://d268e639-bcbc-4b57-be21-b48640ac7421.brazilsouth.azurecontainer.io/score


### Add these json formatted test data points to the endpoint.py file which will be passed for scoring

In [33]:
%run endpoint.py

{"result": [true, false]}


## Print logs and Delete Resources

In [34]:
## Lets enable app insights logging
service.update(enable_app_insights=True)

In [37]:
# print the logs
print(service.get_logs())

2022-11-19T12:23:13,991220932+00:00 - iot-server/run 
2022-11-19T12:23:13,991287531+00:00 - rsyslog/run 
2022-11-19T12:23:14,003051078+00:00 - gunicorn/run 
2022-11-19T12:23:14,008855103+00:00 | gunicorn/run | 
2022-11-19T12:23:14,011838364+00:00 | gunicorn/run | ###############################################
2022-11-19T12:23:14,016950197+00:00 | gunicorn/run | AzureML Container Runtime Information
2022-11-19T12:23:14,020862946+00:00 - nginx/run 
2022-11-19T12:23:14,024843294+00:00 | gunicorn/run | ###############################################
2022-11-19T12:23:14,026438374+00:00 | gunicorn/run | 
2022-11-19T12:23:14,028035853+00:00 | gunicorn/run | 
2022-11-19T12:23:14,046499812+00:00 | gunicorn/run | AzureML image information: openmpi3.1.2-ubuntu18.04, Materializaton Build:20220930.v4
2022-11-19T12:23:14,049419974+00:00 | gunicorn/run | 
2022-11-19T12:23:14,055813191+00:00 | gunicorn/run | 
2022-11-19T12:23:14,060283433+00:00 | gunicorn/run | PATH environment variable: /azureml-env

In [38]:
# deleting the webservice
service.delete()