# SAP HANA Cloud - Auto ML Hands On

### Documentation
- SAP HANA Python Client API for Machine Learning Algorithms: https://pypi.org/project/hana-ml/

- SAP HANA Predictive Analysis Library (PAL): https://help.sap.com/viewer/2cfbc5cf2bc14f028cfbe2a2bba60a50/1.0.12/en-US

SAP HANA ML Library
You will be using the 'SAP HANA Python Client API for Machine Learning Algorithms' (the `hana-ml` package).

In [None]:
# Install the hana_ml package, or upgrade it if it is already installed.
!pip install --upgrade hana_ml

In [None]:
import hana_ml
# Show which hana_ml version is loaded in this kernel.
print(hana_ml.__version__)

In [None]:
# Connection settings for the SAP HANA Cloud instance.
# Replace every placeholder below with your own values before connecting,
# and do not share the notebook with real credentials filled in.
hana_address = '<your hostname>'  # your hostname as string
# NOTE(review): 443 is assumed to be the SQL port of a HANA Cloud instance - confirm for yours.
hana_port = 443  # your port as integer
hana_user = '<your user>'  # your user as string
hana_password = '<your password>'  # your password as string
hana_encrypt = True  # for HANA Cloud

In [None]:
import hana_ml.dataframe as dataframe

# Open the connection to SAP HANA with the explicit host/port/user/password settings.
# NOTE(review): sslValidateCertificate='false' presumably switches off TLS
# certificate validation - confirm this is acceptable outside a hands-on setup.
conn = dataframe.ConnectionContext(
    address=hana_address,
    port=hana_port,
    user=hana_user,
    password=hana_password,
    encrypt=hana_encrypt,
    sslValidateCertificate='false',
)

Alternatively, connect through a SAP HANA user store key, which keeps the login credentials out of the notebook.

In [None]:
import hana_ml.dataframe as dataframe

# Open the connection through the stored user key instead of explicit credentials.
# NOTE(review): sslValidateCertificate='false' presumably switches off TLS
# certificate validation - confirm this is acceptable outside a hands-on setup.
conn = dataframe.ConnectionContext(
    userkey='MYHANACLOUD',
    encrypt='true',
    sslValidateCertificate='false',
)

In [None]:
import pandas as pd
# Read the CSV file into a pandas DataFrame; change the path to your own directory.
csv_path = r"YourPath\BB_data.csv"
df = pd.read_csv(csv_path)
# Show the first rows as a quick sanity check.
df.head()

In [None]:
# Upper-case every column name.
df.columns = [label.upper() for label in df.columns]

In [None]:
# Insert a sequential transaction ID (0 .. n-1) as the first column; it is used as key later on.
# range(len(df)) yields the same values as df.reset_index().index without copying the frame.
df.insert(0, 'TRANSACTION_ID', range(len(df)))

In [None]:
# Inspect the first rows of the prepared data.
df.head()

In [None]:
%%time
# Upload the pandas DataFrame into the SAP HANA table TRANSACTIONS and keep a
# SAP HANA dataframe that points to the table with the uploaded data.
df_remote = dataframe.create_dataframe_from_pandas(
    connection_context=conn,
    pandas_df=df,
    table_name='TRANSACTIONS',
    force=True,
    replace=False,
)

In [None]:
# Point df_remote at the TRANSACTIONS table through the open connection.
df_remote = conn.table("TRANSACTIONS")

In [None]:
# Check the size (row count) of the data in SAP HANA.
df_remote.count()

In [None]:
# Check the variable types in SAP HANA.
df_remote.dtypes()

In [None]:
# Cast the variable FRAUD to NVARCHAR(20).
df_remote = df_remote.cast('FRAUD', 'NVARCHAR(20)')

In [None]:
# Cast the amount and balance columns to DOUBLE, one column after another.
double_columns = (
    'AMOUNT',
    'OLD_BALANCE_ORIGIN',
    'NEW_BALANCE_ORIGIN',
    'OLD_BALANCE_DEST',
    'NEW_BALANCE_DEST',
)
for column_name in double_columns:
    df_remote = df_remote.cast(column_name, 'DOUBLE')

In [None]:
# Check the variable types again after the casts.
df_remote.dtypes()

In [None]:
# Describe the data in SAP HANA and fetch the result to the client.
df_remote.describe().collect()

In [None]:
%%time
#create training and testing set
from hana_ml.algorithms.pal import partition

# Name the ID column explicitly (the same key is passed to fit() and predict()),
# fix the random seed so the split is reproducible, and stratify on the FRAUD
# label so that both subsets keep a comparable share of each class.
df_remote_train, df_remote_test, df_remote_val = partition.train_test_val_split(
    data=df_remote,
    id_column='TRANSACTION_ID',
    random_seed=1234,
    partition_method='stratified',
    stratified_column='FRAUD',
    training_percentage=0.5,
    testing_percentage=0.5,
    validation_percentage=0,
)

In [None]:
# Report how many rows ended up in the training and the test subset.
train_rows = df_remote_train.count()
print(f'Size of training subset: {train_rows}')
test_rows = df_remote_test.count()
print(f'Size of test subset: {test_rows}')

In [None]:
from hana_ml import dataframe
from hana_ml.dataframe import ConnectionContext
from hana_ml.algorithms.pal.utility import DataSets, Settings
from hana_ml.algorithms.pal.partition import train_test_val_split
from hana_ml.algorithms.pal.auto_ml import AutomaticClassification, AutomaticRegression
from hana_ml.visualizers.automl_progress import PipelineProgressStatusMonitor
from hana_ml.visualizers.automl_report import BestPipelineReport
from hana_ml.visualizers.unified_report import UnifiedReport
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import time
import json
import uuid

In [None]:
# Create the workload class PAL_AUTOML_WORKLOAD with its priority, statement
# memory limit and statement thread limit.
# NOTE(review): presumably this statement fails if the workload class already
# exists, e.g. when the cell is run a second time - confirm.
workload_class_ddl = '''
CREATE WORKLOAD CLASS "PAL_AUTOML_WORKLOAD" SET 'PRIORITY' = '3', 'STATEMENT MEMORY LIMIT' = '3' , 'STATEMENT THREAD LIMIT' = '20'
'''
conn.execute_sql(workload_class_ddl)

In [None]:
import uuid

# Build an ID for this AutoML scenario; it is passed to the classifier below
# as progress_indicator_id.
scenario_id = "{}_AutoMLc_{}".format("<YourName>", uuid.uuid1())
print(scenario_id)

# Initial AutoML scenario parameters.
# NOTE(review): elite_number equals population_size here - confirm this is intended.
auto_c = AutomaticClassification(
    generations=2,
    population_size=5,
    offspring_size=5,
    elite_number=5,
    random_seed=1234,
    progress_indicator_id=scenario_id,
)

In [None]:
# Reinitialize the AutoML operators and their parameters, using the open connection.
auto_c.reset_config_dict(conn)

In [None]:
# Display the current AutoML configuration.
auto_c.display_config_dict()

In [None]:
# Modify the AutoML Classification Scenario

# Remove every resampler operator from the configuration.
for resampler_name in ("SAMPLING", "SMOTE", "TomekLinks"):
    auto_c.delete_config_dict(resampler_name)

# Show what is left in the Resampler category.
auto_c.display_config_dict(category="Resampler")

In [None]:
# Drop the whole Transformer category from the configuration.
auto_c.delete_config_dict(category="Transformer")

In [None]:
# Remove the classifiers that should not take part in the AutoML search.
dropped_classifiers = (
    "DT_Classifier",
    "SVM_Classifier",
    "NB_Classifier",
    "MLP_Classifier",
    "RDT_Classifier",
)
for classifier_name in dropped_classifiers:
    auto_c.delete_config_dict(classifier_name)

# Show the classifiers that remain.
auto_c.display_config_dict(category="Classifier")

In [None]:
# Override parameter values and ranges of the remaining classifiers.
auto_c.update_config_dict("M_LOGR_Classifier", "ENET_LAMBDA", [0.001, 0.01, 0.1])
auto_c.display_config_dict("M_LOGR_Classifier")

# HGBT overrides, applied in the order listed.
hgbt_settings = {
    "ETA": [1e-2, 1e-1, 0.5],
    "MAX_DEPTH": {'range': [1, 1, 11]},
    "NODE_SIZE": {'range': [1, 1, 21]},
}
for param_name, param_values in hgbt_settings.items():
    auto_c.update_config_dict("HGBT_Classifier", param_name, param_values)
auto_c.display_config_dict("HGBT_Classifier")


In [None]:
# Review the complete AutoML Classification configuration after the changes.
auto_c.display_config_dict()

In [None]:
%%time
# Run the AutoML fit under the workload class PAL_AUTOML_WORKLOAD.
auto_c.enable_workload_class(workload_class_name="PAL_AUTOML_WORKLOAD")

# Create and start a PipelineProgressStatusMonitor for this AutoML object
# before the training call.
progress_status_monitor = PipelineProgressStatusMonitor(connection_context=conn,
                                                        automatic_obj=auto_c)

progress_status_monitor.start()

# Training. An exception raised by fit() propagates unchanged, so no
# try/except that merely re-raises it is needed.
auto_c.fit(data=df_remote_train, key='TRANSACTION_ID', label="FRAUD")

In [None]:
# Read the pipeline from row 0, column 1 of the second model table
# (presumably the best pipeline found by the fit - confirm).
pipeline = auto_c.model_[1].collect().iat[0, 1]
# Pass key and label explicitly, matching the fit() call, so that
# TRANSACTION_ID is treated as the ID column and FRAUD as the target.
res_ev = auto_c.evaluate(df_remote_test,
                         pipeline=pipeline,
                         key='TRANSACTION_ID',
                         label='FRAUD')
print(res_ev.collect())

In [None]:
# Predict on the test subset with the FRAUD column removed.
test_features = df_remote_test.deselect("FRAUD")
res = auto_c.predict(test_features, key='TRANSACTION_ID')
print(res.collect())

In [None]:
from hana_ml.model_storage import ModelStorage

# HANA schema in which models are to be saved; replace with your own schema.
MODEL_SCHEMA = 'YourSchema'
model_storage = ModelStorage(
    connection_context=conn,
    schema=MODEL_SCHEMA,
)

In [None]:
# Set a name and version on the AutoML object, then save it through the model storage.
auto_c.name = 'AutoML Classification' 
auto_c.version = 1
model_storage.save_model(model=auto_c)