# Example - Using SAP HANA ML / Predictive Analysis Library (PAL) __pipelines__ from Python

The python machine learning client documentation [pipeline method](https://help.sap.com/doc/1d0ebfe5e8dd44d09606814d83308d4b/2.0.06/en-US/hana_ml.algorithms.pal_algorithm.html#module-hana_ml.algorithms.pal.pipeline), explains the python hana_ml method, while mapped to the HANA PAL SQL pipeline methods for execution refer and compare with [PAL pipeline documentation](https://help.sap.com/docs/HANA_CLOUD_DATABASE/319d36de4fd64ac3afbf91b1fb3ce8de/de964934276b415aa6ae0c71ce72f351.html?locale=en-US), lists the [supported pipeline operators](https://help.sap.com/docs/HANA_CLOUD_DATABASE/319d36de4fd64ac3afbf91b1fb3ce8de/fd58a3f156a24d80af2791ae60fb27bb.html?locale=en-US).

## Load packages and connect to SAP HANA Cloud

In [1]:
# Import HANA-ML package 
import hana_ml
from hana_ml import dataframe
print(hana_ml.__version__)

2.14.22102800


Connect using the secure user connection store (hdbuserstore) and a connection key, see [documentation](https://help.sap.com/docs/SAP_HANA_CLIENT/f1b440ded6144a54ada97ff95dac7adf/708e5fe0e44a4764a1b6b5ea549b88f4.html).  

In [3]:
# Create the SAP HANA CLoud connection
#conn = dataframe.ConnectionContext( address="<hana-cloud-hostname>", port=443,  user="<HANA-user>")
conn = dataframe.ConnectionContext(userkey='<your-connection-key>', sslValidateCertificate=False)

# Check SAP HANA Cloud release version 
conn.hana_version()

'4.00.000.00.1663669796 (fa/CE2022.30)'

## Upload data, create a HANA dataframe and explore data

In [4]:
# Upload a dataset from local CSV-file 
import pandas as pd
df = dataframe.create_dataframe_from_pandas(
        conn,
        pd.read_csv("./diabetes.csv", sep=',', header=0, skiprows=1,
                    names=["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", 
                           "Insulin","BMI", "DiabetesPedigreeFunction", "Age", "CLASS"]
                   ),
        table_name="DIABETES_TABLE",
        force=True,
        replace=True,
        drop_exist_tab=True,
        table_structure={"Pregnancies": "INT", "Glucose" : "INT", "BloodPressure" : "INT", 
                         "SkinThickness" : "INT", "Insulin": "INT", "BMI"  : "DOUBLE", 
                         "DiabetesPedigreeFunction"  : "DOUBLE", "Age": "INT", "CLASS": "NVARCHAR(10)"})
print(df.select_statement)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 10.00it/s]

SELECT "Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction", "Age", "CLASS" FROM (SELECT * FROM "DIABETES_TABLE") dt





In [5]:
# Now let's add ID column and look at the acual data. 
df = df.add_id(id_col='ID')

# Note, only Collect() will transfer data from the HANA to the python client
# Here head(6) will filter TOP 6 rows before the collect()-transfer to Python
display(df.head(6).collect())

# Review the changed HANA dataframe SQL statement
display(df.select_statement)

Unnamed: 0,ID,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,CLASS
0,1,1,85,66,29,0,26.6,0.351,31,0
1,2,8,183,64,0,0,23.3,0.672,32,1
2,3,1,89,66,23,94,28.1,0.167,21,0
3,4,0,137,40,35,168,43.1,2.288,33,1
4,5,5,116,74,0,0,25.6,0.201,30,0
5,6,3,78,50,32,88,31.0,0.248,26,1


'SELECT CAST(ROW_NUMBER() OVER() AS INTEGER) + 0 AS "ID", * FROM (SELECT "Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction", "Age", "CLASS" FROM (SELECT * FROM "DIABETES_TABLE") dt)'

In [6]:
# Explore the target variable 'CLASS' and its value distribution (aggregate-count())
df.agg([('count', 'ID', 'N')], group_by='CLASS').collect()

Unnamed: 0,CLASS,N
0,0,500
1,1,267


In [7]:
# Explore column descriptive statistics using describe-method
display(df.describe().collect())

Unnamed: 0,column,count,unique,nulls,mean,std,min,max,median,25_percent_cont,25_percent_disc,50_percent_cont,50_percent_disc,75_percent_cont,75_percent_disc
0,ID,767,767,0,384.0,221.558118,1.0,767.0,384.0,192.5,192.0,384.0,384.0,575.5,576.0
1,Pregnancies,767,17,0,3.842243,3.370876,0.0,17.0,3.0,1.0,1.0,3.0,3.0,6.0,6.0
2,Glucose,767,136,0,120.859192,31.978468,0.0,199.0,117.0,99.0,99.0,117.0,117.0,140.0,140.0
3,BloodPressure,767,47,0,69.101695,19.368154,0.0,122.0,72.0,62.0,62.0,72.0,72.0,80.0,80.0
4,SkinThickness,767,51,0,20.517601,15.954059,0.0,99.0,23.0,0.0,0.0,23.0,23.0,32.0,32.0
5,Insulin,767,186,0,79.90352,115.283105,0.0,846.0,32.0,0.0,0.0,32.0,32.0,127.5,128.0
6,BMI,767,248,0,31.990482,7.889091,0.0,67.1,32.0,27.3,27.3,32.0,32.0,36.6,36.6
7,DiabetesPedigreeFunction,767,516,0,0.471674,0.331497,0.078,2.42,0.371,0.2435,0.243,0.371,0.371,0.625,0.626
8,Age,767,52,0,33.219035,11.752295,21.0,81.0,29.0,24.0,24.0,29.0,29.0,41.0,41.0
9,CLASS,767,2,0,,,,,,,,,,,


## Define and execute a PAL algorithm pipeline

In [None]:
from hana_ml.algorithms.pal.preprocessing import FeatureNormalizer
fn = FeatureNormalizer(method="z-score")
n.fit(df1, key='ID')


In [10]:
# Import algorithms
from hana_ml.algorithms.pal.decomposition import PCA
from hana_ml.algorithms.pal.trees import HybridGradientBoostingClassifier

#Import Pipeline
from hana_ml.algorithms.pal.pipeline import Pipeline

#Define pipeline
my_pipeline = Pipeline([
     ('pca', PCA(scaling=True, scores=True)),
     ('hgbt', HybridGradientBoostingClassifier(n_estimators=4, split_threshold=0, learning_rate=0.5, fold_num=5,max_depth=6))])
fit_params = {'pca__key': 'ID',
              'pca__label': 'CLASS',
              'hgbt__key': 'ID',
              'hgbt__label': 'CLASS',
              'hgbt__categorical_variable': 'CLASS'}

#Generate pipeline JSON
my_pipeline.fit(data=df, fit_params=fit_params, use_pal_pipeline_fit=False)
json_string=my_pipeline.generate_json_pipeline()
json_string


'{"hgbt": {"args": {"ITER_NUM": 4, "MAX_DEPTH": 6, "GAMMA": 0.0, "FOLD_NUM": 5, "ETA": 0.5, "HAS_ID": 1, "CATEGORICAL_VARIABLE": "CLASS,CLASS"}, "inputs": {"data": {"pca": {"args": {"SCALING": 1, "SCORES": 1}, "inputs": {"data": "ROWDATA"}}}}}}'

In [11]:
# Visualize the pipeline
my_pipeline.plot(iframe_height=450)

In [12]:
my_pipeline.fit(data=df, fit_params=fit_params)

<hana_ml.algorithms.pal.pipeline.Pipeline at 0x24e7749bd60>

In [14]:
print(conn.last_execute_statement)


DO
BEGIN
DECLARE param_name VARCHAR(5000) ARRAY;
DECLARE int_value INTEGER ARRAY;
DECLARE double_value DOUBLE ARRAY;
DECLARE string_value VARCHAR(5000) ARRAY;
param_name[1] := N'HAS_ID';
int_value[1] := 0;
double_value[1] := NULL;
string_value[1] := NULL;
param_name[2] := N'PIPELINE';
int_value[2] := NULL;
double_value[2] := NULL;
string_value[2] := N'{"HGBT_Classifier": {"args": {"ITER_NUM": 4, "MAX_DEPTH": 6, "GAMMA": 0.0, "FOLD_NUM": 5, "ETA": 0.5}, "inputs": {"data": {"CATPCA": {"args": {"SCALING": 1, "SCORES": 1}, "inputs": {"data": "ROWDATA"}}}}}}';
params = UNNEST(:param_name, :int_value, :double_value, :string_value);
in_0 = SELECT "ID", "Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction", "Age", "CLASS" FROM (SELECT CAST(ROW_NUMBER() OVER() AS INTEGER) + 0 AS "ID", * FROM (SELECT "Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction", "Age", "CLASS" FROM (SELECT * FROM "DIABE

In [13]:
my_pipeline.get_fit_execute_statement()
##get_score_execute_statement()
#get_predict_execute_statement()

'DO\nBEGIN\nDECLARE param_name VARCHAR(5000) ARRAY;\nDECLARE int_value INTEGER ARRAY;\nDECLARE double_value DOUBLE ARRAY;\nDECLARE string_value VARCHAR(5000) ARRAY;\nparam_name[1] := N\'HAS_ID\';\nint_value[1] := 0;\ndouble_value[1] := NULL;\nstring_value[1] := NULL;\nparam_name[2] := N\'PIPELINE\';\nint_value[2] := NULL;\ndouble_value[2] := NULL;\nstring_value[2] := N\'{"HGBT_Classifier": {"args": {"ITER_NUM": 4, "MAX_DEPTH": 6, "GAMMA": 0.0, "FOLD_NUM": 5, "ETA": 0.5}, "inputs": {"data": {"CATPCA": {"args": {"SCALING": 1, "SCORES": 1}, "inputs": {"data": "ROWDATA"}}}}}}\';\nparams = UNNEST(:param_name, :int_value, :double_value, :string_value);\nin_0 = SELECT "ID", "Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction", "Age", "CLASS" FROM (SELECT CAST(ROW_NUMBER() OVER() AS INTEGER) + 0 AS "ID", * FROM (SELECT "Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction", "Age", "CLASS" FROM

In [None]:
conn.close()