# Python Machine Learning client for SAP HANA, release 2.20 enhancements sample notebook

The changelog of enhancements in release version 2.20.240319 can be found [in the documentation](https://help.sap.com/doc/cd94b08fe2e041c2ba778374572ddba9/2024_1_QRC/en-US/change_log.html); the enhancements are summarized below.  <br><br>
__New functions__<br>
    - Added Bubble Plot function: `hana_ml.visualizers.eda.bubble_plot` and Parallel Co-ordinate Plot function: `hana_ml.visualizers.eda.parallel_coordinates` in EDA. <br>
    - Added time series permutation feature importance function: `hana_ml.algorithms.pal.tsa.permutation_importance`.<br>
    - Added MLP Recommender class: `hana_ml.algorithms.pal.recommender.MLPRecommender`.<br>
<br>
__Enhancements__<br>
    - Enhanced progress monitor for AutoML so that it can be displayed at any time, especially for scheduled jobs.<br>
    - Support algorithm-specific parameters in automl/pipeline predict. This takes effect in both the pipeline and automl modules.<br>
    - Enhanced AutoML Auto SQL Content Integration for progress log management.<br>
    - Enhanced pipeline report to display connection scores.<br>
    - Enhanced AutoML config_dict templates with new operators.<br>
    - Enhanced AutoML with connection constraints option.<br>
    - Enhanced AutoML with different logging levels.<br>
    - Enhanced AutoML with random/grid search option.<br>
    - Enhanced outlier_detection with voting logic.<br>
    - Enhanced online BCPD in massive mode.<br>

## Connect to your SAP HANA Cloud instance

The latest Python ML client for SAP HANA package update ready for installation can be found at: https://pypi.org/project/hana-ml/

In [None]:
## Import the Python Machine Learning client library for SAP HANA and report its version
import hana_ml

client_version = hana_ml.__version__
print(client_version)

Connect using the secure user connection store (hdbuserstore) and a connection key, see  [documentation](https://help.sap.com/docs/SAP_HANA_CLIENT/f1b440ded6144a54ada97ff95dac7adf/708e5fe0e44a4764a1b6b5ea549b88f4.html).  

In [None]:
# Create the SAP HANA Cloud connection
import hana_ml.dataframe as dataframe
from hana_ml.dataframe import ConnectionContext
from hana_ml.algorithms.pal.utility import Settings

# Alternative ways to connect (explicit host and user, or a user store key):
#cc = dataframe.ConnectionContext( address="<hana-cloud-hostname>", port=443,  user="<HANA-user>")
#cc = dataframe.ConnectionContext( userkey=<your-key-name>, sslValidateCertificate=False)

# Read url, port, user and password from an external configuration file
connection_settings = Settings.load_config("../../config/e2edata.ini", "api")
url, port, user, pwd = connection_settings
cc = ConnectionContext(url, port, user, pwd)

# Check the connection: print the connected flag first, then the server version
is_connected = cc.connection.isconnected()
print(is_connected)
server_version = cc.hana_version()
print(server_version)

## AutoML enhancement

In [None]:
# Load the sample data sets used by the AutoML example
from hana_ml.algorithms.pal.utility import DataSets

# Upload the titanic data to SAP HANA and unpack the four returned dataframes
# NOTE(review): load_iris_data later in this notebook is unpacked as
# (full, training, validation, test) - confirm the order of the 3rd/4th item here
titanic_sets = DataSets.load_titanic_data(cc)
full, train, test, validate = titanic_sets

In [None]:
# Show the descriptive statistics overviewing the titanic SAP HANA training dataframe
# (kept as the bare last expression of the cell so the notebook renders the result)
train.stats

### AutoML fit and best pipeline report with connection constraints option

In [None]:
# Configure AutoML classification with the connection constraints option
import uuid
from hana_ml.algorithms.pal.auto_ml import AutomaticClassification

# Unique id passed as progress_indicator_id of this AutoML object
progress_id = f"automl_{uuid.uuid1()}"

# Collect the constructor arguments first, then build the AutoML object from them
automl_settings = dict(
    generations=5,
    population_size=10,
    offspring_size=10,
    config_dict='light',
    connections='default',
    scorings={'AUC': 0.5, 'ACCURACY': 0.5},
    progress_indicator_id=progress_id,
)
auto_c = AutomaticClassification(**automl_settings)

auto_c.disable_workload_class_check()

In [None]:
# Fit the AutoML classification model with a progress monitor attached
from hana_ml.visualizers.automl_progress import PipelineProgressStatusMonitor

# The monitor is started before fit() is called
progress_status_monitor = PipelineProgressStatusMonitor(automatic_obj=auto_c,
                                                        connection_context=cc)
progress_status_monitor.start()

# The SURVIVED column is cast to VARCHAR(10) before it is used as the label
titanic_train = train.cast({"SURVIVED": "VARCHAR(10)"})
auto_c.fit(data=titanic_train, key="PASSENGER_ID", label="SURVIVED")

auto_c.runtime

In [None]:
# Build and display the best pipeline report (it contains the new connection tab)
from hana_ml.visualizers.unified_report import UnifiedReport

best_pipeline_report = UnifiedReport(auto_c).build()
best_pipeline_report.display()

### Random Search Option in AutoML

In [None]:
# Prepare a time series data set to illustrate the new random search option of AutoML
import pandas as pd
from hana_ml.algorithms.pal.auto_ml import AutomaticTimeSeries
from hana_ml.dataframe import create_dataframe_from_pandas

data = [0,3.530127019,-5.930127019,-2.4,1.130127019,-8.330127019,-4.8,-1.269872981,-10.73012702,-7.2,-3.669872981,-13.13012702,-9.6,-6.069872981,-15.53012702,-12,-8.469872981,-17.93012702,-14.4,-10.86987298,-20.33012702,-16.8,-13.26987298,-22.73012702,-19.2,-15.66987298,-25.13012702,-25.13012702,-20.37484444,-22.19120076,-28.06905328,-29.8854096,-25.13012702,-20.37484444,-22.19120076,-28.06905328,-29.8854096,-25.13012702,-20.37484444,-22.19120076,-28.06905328,-29.8854096,-25.13012702,-20.37484444,-22.19120076,-28.06905328,-29.8854096,-25.13012702,-20.37484444,-22.19120076,-28.06905328,-29.8854096,5,8.909157412,9.874639561,7.169418696,2.830581304,0.125360439,1.090842588,5,8.909157412,9.874639561,7.169418696,2.830581304,0.125360439,1.090842588,5,8.909157412,9.874639561,7.169418696,2.830581304,0.125360439,1.090842588,5,8.909157412,9.874639561,7.169418696,2.830581304,0.125360439,1.090842588,5,8.909157412,9.874639561,7.169418696,2.830581304,0.125360439,1.090842588]

# One row per observation: a running integer ID plus the series value
series_ids = list(range(len(data)))
df = pd.DataFrame({'ID': series_ids, 'SERIES': data})

# Upload the pandas frame to SAP HANA under the table name '#PAL_DATA_TBL'
dt_ml = create_dataframe_from_pandas(cc, df, table_name='#PAL_DATA_TBL', force=True)

#ts = cc.table('PAL_DATA_TBL')
row_count = len(dt_ml.collect())
print(row_count)

In [None]:
# Define an AutoML time series scenario that is created with search_method='random'
progress_id = f"automl_ts_{uuid.uuid1()}"

ts_settings = dict(
    progress_indicator_id=progress_id,
    config_dict="default",
    search_method='random',
)
auto_ts = AutomaticTimeSeries(**ts_settings)

auto_ts.disable_workload_class_check()

# Fit on the uploaded series: 'ID' is the key column, 'SERIES' the endogenous column
auto_ts.fit(data=dt_ml, key='ID', endog="SERIES")
auto_ts.runtime

In [None]:
# Create a future dataframe with periods=10 and run the prediction on it
df_predict = auto_ts.make_future_dataframe(periods=10)
res = auto_ts.predict(df_predict, key="ID")
forecast = res.collect()
print(forecast)

### Progress Monitor enhancement to support the scheduled AutoML tasks

In [None]:
# Schedule the fit of the AutoML object `auto_c` as a job
from datetime import datetime, timezone

from hana_ml.algorithms.pal.utility import Settings
from hana_ml.hana_scheduler import HANAScheduler

# Called without arguments, i.e. with the log level the library uses by default
Settings.set_log_level()

hana_schedule = HANAScheduler(cc)

# Job name with a short random suffix (first 8 characters of a UUID)
job_name = "my_job_{}".format(str(uuid.uuid1())[:8])

# Take ONE timestamp, directly in UTC. A fixed "local hour - 8" offset is only
# correct for a UTC+8 client and yields a negative (invalid) hour before 08:00
# local time; three separate now() calls could also straddle a second boundary.
# NOTE(review): assumes the scheduler evaluates the cron fields in UTC, as the
# original "#UTC BJ" remark implies - confirm for your system.
now_utc = datetime.now(timezone.utc)
cron = "* * * mon,tue,wed,thu,fri {} {} {}".format(now_utc.hour,
                                                   now_utc.minute,
                                                   now_utc.second)

# Create the scheduled job for the AutoML-fit task using "create_training_schedule"
hana_schedule.create_training_schedule(job_name=job_name,
                                       obj=auto_c,
                                       cron=cron,
                                       output_table_names=['DM_BEST_PIPELINE_', 'DM_MODEL_', 'DM_INFO_'],
                                       force=True)

In [None]:
# Print the generated name of the scheduled job
print(job_name)

In [None]:
# Display the schedule status through the HANAScheduler instance
hana_schedule.display_schedule_status()

In [None]:
# Attach a simple progress monitor to the scheduled AutoML classification job
from hana_ml.visualizers.automl_progress import SimplePipelineProgressStatusMonitor

progress_status_monitor = SimplePipelineProgressStatusMonitor(connection_context=cc)

# Monitor the progress id of the scheduled object (auto_c). The global
# `progress_id` was re-assigned for the AutoML time series run earlier in this
# notebook, so it no longer identifies the classification job whose AUC is
# highlighted here.
progress_status_monitor.start(progress_indicator_id=auto_c.progress_indicator_id,
                              highlight_metric='AUC')

In [None]:
# Delete the scheduled job by its name, then set the client log level to 'ERROR'
hana_schedule.delete_schedule(job_name)
Settings.set_log_level('ERROR')

## Time Series Analysis and Forecasting enhancements

### Time series outlier detection with voting

In [None]:
# Create the small time series used to illustrate outlier detection
col_name = ["ID", "timestamp", "y"]
data = [[1,  '2008-03-09',  13],
        [2,  '2008-03-10',  16],
        [3,  '2008-03-11',  14],
        [4,  '2008-03-12',  10],
        [5,  '2008-03-13',  10],
        [6,  '2008-03-14',  510],
        [7,  '2008-03-15',  510],
        [8,  '2008-03-16',  510],
        [9,  '2008-03-17',  510],
        [10, '2008-03-18',  516]]

# Build the pandas frame first, then upload it to SAP HANA
pdf_single = pd.DataFrame(data, columns=col_name)
df_s = create_dataframe_from_pandas(
    connection_context=cc,
    pandas_df=pdf_single,
    table_name='TEST_SINGLE_OUTLIER_DETECTION',
    force=True,
    replace=True,
)



In [None]:
# Configure time series outlier detection together with a per-method voting_config
from hana_ml.algorithms.pal.tsa.outlier_detection import OutlierDetectionTS

# Settings of the "dbscan" entry of voting_config
# NOTE(review): the key "distiance_method" looks like a misspelling of
# "distance_method" - confirm against the OutlierDetectionTS documentation
dbscan_settings = {'minpts':1,
                   "eps":0.5,
                   "distiance_method":"euclidean",
                   "dbscan_normalization":True,
                   "dbscan_outlier_from_cluster":False}

# One entry per outlier method, each with its own parameters
voting_settings = {"z1": {"threshold":100},
                   "z2": {"threshold":1},
                   "iqr": {"threshold":2},
                   "mad":{"threshold":3},
                   "isolationforest": {"contamination":0.4},
                   "dbscan": dbscan_settings}

# NOTE(review): outlier_method is 'z1' although this cell demonstrates voting -
# confirm whether 'voting' was intended
od = OutlierDetectionTS(window_size=5,
                        detect_seasonality=False,
                        outlier_method='z1',
                        threshold=5,
                        contamination=0.5,
                        voting_outlier_method_criterion=0.5,
                        dbscan_normalization=False,
                        voting_config=voting_settings,
                        residual_usage="outlier_correction")
res = od.fit_predict(data=df_s, key='ID', endog='y')

# Print the statistics first, then the first two result rows
stats_output = od.stats_.collect()
print(stats_output, '\n')
print(res.head(2).collect())

### Segmented (massive-parallel) time series Online Bayesian Change Point Detection (Online BCPD)

In [None]:
# Create two grouped time series ('100' and '200') to illustrate massive online BCPD
col_name = ["GROUP_id_NAR", "GROUP_id_INT", "ID", "timestamp", "y"]
data = [['100', 100, 1,  '2008-03-09',  13],
        ['100', 100, 2,  '2008-03-10',  16],
        ['100', 100, 3,  '2008-03-11',  14],
        ['100', 100, 4,  '2008-03-12',  10],
        ['100', 100, 5,  '2008-03-13',  10],
        ['100', 100, 6,  '2008-03-14',  510],
        ['100', 100, 7,  '2008-03-15',  510],
        ['100', 100, 8,  '2008-03-16',  510],
        ['100', 100, 9,  '2008-03-17',  510],
        ['100', 100, 10, '2008-03-18',  516],
        ['200', 200, 1,  '2008-07-09',  0],
        ['200', 200, 2,  '2008-07-10',  0],
        ['200', 200, 3,  '2008-07-11',  0],
        ['200', 200, 4,  '2008-07-12',  13],
        ['200', 200, 5,  '2008-07-13',  10],
        ['200', 200, 6,  '2008-07-14',  10],
        ['200', 200, 7,  '2008-07-15',  12],
        ['200', 200, 8,  '2008-07-16',  10],
        ['200', 200, 9,  '2008-07-17',  18],
        ['200', 200, 10, '2008-07-18',  0]]

# Build the pandas frame first, then upload it to SAP HANA
pdf_groups = pd.DataFrame(data, columns=col_name)
df_m = create_dataframe_from_pandas(
    connection_context=cc,
    pandas_df=pdf_groups,
    table_name='DATA_MASSIVE_ONLINE_BCPD_NOTEBOOK',
    force=True,
    replace=True,
)

In [None]:
# Run online BCPD with massive=True and one parameter set per group
from hana_ml.algorithms.pal.tsa.changepoint import OnlineBCPD

# Parameters keyed by the group value found in the group_key column
group_settings = {
    '100': {'threshold':100, 'prune' :False, 'delay':100},
    '200': {'threshold':200, 'prune' :True,  'delay':200},
}
obcpd = OnlineBCPD(massive=True, group_params=group_settings)

init_model, cp = obcpd.fit_predict(data=df_m,
                                   group_key="GROUP_id_NAR",
                                   key='timestamp',
                                   endog='y')
model_rows = init_model.head(30).collect()
print(model_rows)
##print(cp.collect())

### Time Series external feature importance evaluation using value permutation 

In [None]:
# Create the training and prediction data used to illustrate permutation importance
col_name = ["ID_INT", "ID_TIMESTAMP", "endog", "exog", "cate"]

# Training rows, uploaded as DATA_ARIMA_TBL_NOTEBOOK
data_s = [[ 0 , '2018-03-01 00:00:00', 1001.186965,  1001.381398, 0],
          [ 1 , '2018-03-01 01:00:00',  999.743681,  1000.449621, 0],
          [ 2 , '2018-03-01 02:00:00',  998.273447,  1000.202493, 0],
          [ 3 , '2018-03-01 03:00:00',  998.163017,   999.517705, 0],
          [ 4 , '2018-03-01 04:00:00', 1001.023297,  1000.149800, 0],
          [ 5 , '2018-03-01 05:00:00', 1001.802838,   999.582847, 1],
          [ 6 , '2018-03-01 06:00:00', 1000.902042,  1001.485573, 0],
          [ 7 , '2018-03-01 07:00:00',  999.679829,  1000.480978, 0],
          [ 8 , '2018-03-01 08:00:00', 1000.374643,   998.741293, 1],
          [ 9 , '2018-03-01 09:00:00',  998.764213,   997.642889, 0],
          [ 10, '2018-03-01 10:00:00',  997.452251,   997.075054, 0],
          [ 11, '2018-03-01 11:00:00',  996.122439,   998.310629, 0],
          [ 12, '2018-03-01 12:00:00',  994.911779,   998.415049, 0],
          [ 13, '2018-03-01 13:00:00',  994.764867,   998.523278, 0],
          [ 14, '2018-03-01 14:00:00',  995.957687,   998.204968, 0],
          [ 15, '2018-03-01 15:00:00',  994.190517,   996.617601, 0],
          [ 16, '2018-03-01 16:00:00',  994.111706,   996.972823, 0],
          [ 17, '2018-03-01 17:00:00',  994.128363,   997.372093, 0],
          [ 18, '2018-03-01 18:00:00',  993.700500,   995.882251, 1],
          [ 19, '2018-03-01 19:00:00',  993.504805,   995.968155, 0],
          [ 20, '2018-03-01 20:00:00',  993.394271,   996.612626, 2],
          [ 21, '2018-03-01 21:00:00',  994.302608,   997.103210, 0],
          [ 22, '2018-03-01 22:00:00',  993.106145,   997.017516, 0],
          [ 23, '2018-03-01 23:00:00',  993.412343,   997.597426, 0],
          [ 24, '2018-03-02 00:00:00',  993.304751,   996.252572, 0]]
pdf_train = pd.DataFrame(data_s, columns=col_name)
df_s = create_dataframe_from_pandas(
    connection_context=cc,
    pandas_df=pdf_train,
    table_name='DATA_ARIMA_TBL_NOTEBOOK',
    force=True,
    replace=True,
)

# Rows for the following hours, uploaded as DATA_ARIMA_PREDICT_TBL_NOTEBOOK
data_p_s = [[ 1,  '2018-03-02 01:00:00',  998.37443, 998.364324, 0],
            [ 2,  '2018-03-02 02:00:00',  1002.74643, 997.995616, 0],
            [ 3,  '2018-03-02 03:00:00',  1001.31243, 996.582251, 1],
            [ 4,  '2018-03-02 04:00:00',  1003.97401, 994.965102, 1],
            [ 5,  '2018-03-02 05:00:00',  1000.37643, 995.522703, 0]]
pdf_predict = pd.DataFrame(data_p_s, columns=col_name)
data_p_s = create_dataframe_from_pandas(
    connection_context=cc,
    pandas_df=pdf_predict,
    table_name='DATA_ARIMA_PREDICT_TBL_NOTEBOOK',
    force=True,
    replace=True,
)

In [None]:
from hana_ml.algorithms.pal.tsa.arima import ARIMA
from hana_ml.algorithms.pal.tsa.permutation_importance import permutation_importance

# Fit ARIMA using the integer ID column as key and two external columns
ar = ARIMA()
ar.fit(df_s, key='ID_INT', endog='endog', exog=['exog', 'cate'])
model_preview = ar.model_.head(2).collect()
print(model_preview)

# Optional forecast on the prediction rows (kept disabled):
#res = ar.predict(data_p_s.deselect(['ID_TIMESTAMP', 'endog']), key='ID_INT')
#print(res.collect())

# Evaluate the importance of the external columns on the prediction data
pires = permutation_importance(data=data_p_s,
                               model=ar.model_,
                               key='ID_INT',
                               endog='endog',
                               exog=['exog', 'cate'])
importance = pires.collect()
print(importance)

## Recommendation functions

### Multilayer Perceptron (MLP) Neural Network Recommender

In [None]:
# Load the boston housing data and pick items 1 and 3 of the returned sequence
bst_full = DataSets.load_boston_housing_data(cc)
bst_train = bst_full[1]
bst_test = bst_full[3]

In [None]:
# Fit the MLP recommender on the boston housing training data
from hana_ml.algorithms.pal.recommender import MLPRecommender

mlpr_reg = MLPRecommender(learning_rate=0.1, num_epochs=1000, random_state=2023)

# Build a fresh feature list instead of calling .remove() on the object returned
# by bst_train.columns, so the dataframe's own column list can never be altered.
# Every column except the key ('ID') and the label ('MEDV') is used as a feature.
cols = [col for col in bst_train.columns if col not in ('ID', 'MEDV')]

# The same feature list is passed for both feature sets
mlpr_reg.fit(data=bst_train, key='ID', label='MEDV',
             selected_feature_set1=cols,
             selected_feature_set2=cols)

mlpr_reg.runtime

In [None]:
# Predict on the test data and show the first 10 result rows
res = mlpr_reg.predict(bst_test, key='ID')
first_rows = res.head(10)
first_rows.collect()

## Exploratory Data Analysis and Visualizations

### Bubble Plot & Parallel Co-ordinate Plot

In [None]:
import numpy as np
import random
import pandas as pd
from hana_ml.dataframe import create_dataframe_from_pandas

# Generate num_point random rows for the bubble plot
num_point = 100
x = np.random.rand(num_point)
y = np.random.rand(num_point)
w = np.random.rand(num_point)
z = 1000 * np.random.rand(num_point)
cate = [random.randrange(1, 3) for _ in range(num_point)]
label = [random.randrange(1, 5) for _ in range(num_point)]
color = np.random.rand(num_point)

# Column 'S' holds the scaled values in z
content = {'X': x, 'Y': y, 'S': z, 'W': w, 'cate': cate, 'label': label}
df_pd = pd.DataFrame(content)
print(df_pd.dtypes)

# Upload the random data to SAP HANA, then cast 'cate' to NVARCHAR(20)
df = create_dataframe_from_pandas(
    cc,
    pandas_df=df_pd,
    table_name='DATA',
    force=True,
    replace=True,
)
df_new = df.cast({"cate": "NVARCHAR(20)"})

# Load the iris data used by the parallel coordinates plot
iris_sets = DataSets.load_iris_data(cc)
full_set, training_set, validation_set, test_set = iris_sets

In [None]:
from hana_ml.visualizers.eda import parallel_coordinates, bubble_plot

# Bubble plot of X against Y, with column 'S' passed as the size argument
bubble_options = dict(x='X', y='Y', size='S', title="bubble plot", width=600, height=400)
fig = bubble_plot(df_new, **bubble_options)
fig.show()

In [None]:
# Parallel coordinates plot of the four iris measurement columns, labelled by SPECIES
iris_features = full_set.deselect("ID")
measure_cols = ['SEPALLENGTHCM', 'SEPALWIDTHCM', 'PETALLENGTHCM', 'PETALWIDTHCM']
fig = parallel_coordinates(data=iris_features,
                           label='SPECIES',
                           cols=measure_cols,
                           width=600,
                           height=400)
fig.show()