# Demo - SAP HANA Cloud Machine Learning

 SAP HANA Cloud openSAP course, Week4 / Unit4: Machine Learning
 
 Demo scenario: classify people's likelihood of acquiring diabetes


## Load the SAP HANA Cloud Python packages

In [44]:
#Import HANA database client library for Python
import hdbcli
from hdbcli import dbapi
#print(hdbcli.__version__)

#Import hana_ml package with HANA DataFrame and PAL algorithm classes
import hana_ml
print(hana_ml.__version__)

# Load Dataframe and Connection classes
from hana_ml import dataframe
from hana_ml.dataframe import ConnectionContext

# Load algorithm classes
from hana_ml.algorithms import pal
from hana_ml.algorithms import apl
from hana_ml.algorithms.pal.unified_classification import UnifiedClassification
from hana_ml.algorithms.pal.unified_regression import UnifiedRegression


2.11.21121103


## Connect to SAP HANA Cloud instance

In [None]:
# Alternative form that passes every connection property, including the password, explicitly:
#conn = dataframe.ConnectionContext( address="<hana-system>", port=<SQL-port>,  user="<HANA-user>", password="<password>")
# Placeholder: replace with the SQL endpoint of your SAP HANA Cloud instance before running.
hc_url = '<hanacloud SQL endpoint URL>'

# Connect on port 443 with a placeholder user; no password argument is passed here.
# NOTE(review): presumably hana_ml asks for the password interactively when it is omitted -- confirm.
conn = dataframe.ConnectionContext( address=hc_url, port=443, user="<user>" )


In [None]:
# Ask the connection for the database version; as the last expression of the cell the result is displayed.
conn.hana_version()

##  Create HANA dataframe and explore the data from Python

In [None]:
# Create a HANA dataframe from a schema-qualified SAP HANA Cloud table
# and print the SELECT statement that stands behind it.
source_schema = "MLLAB_SHARE"
source_table = "DIABETES_DATA"
diabetes_hdf = conn.table(source_table, schema=source_schema)
print(diabetes_hdf.select_statement)

In [None]:
# Overview of the dataframe in SAP HANA Cloud:
# row count, column count, column names and column data types.
print('Number of records', diabetes_hdf.count())
column_names = diabetes_hdf.columns
print('Number of columns', len(column_names))
print(column_names)
print(diabetes_hdf.dtypes())

In [None]:
# Reshape the dataframe: move ID to the first column and cast the target
# column CLASS to NVARCHAR(10) so that it is treated as categorical.
# Both steps return a new dataframe, so they can be chained.
diabetes_hdf = diabetes_hdf.to_head('ID').cast('CLASS', 'NVARCHAR(10)')
print(diabetes_hdf.select_statement)

In [None]:
# Now let's look at the actual data. collect() transfers data from HANA to the Python client.
# Note: head(6) restricts the result to the TOP 6 rows before the collect() transfer to Python.
first_rows_hdf = diabetes_hdf.head(6)
first_rows_hdf.collect()

In [None]:
# Descriptive statistics per column via describe(); only the first 10 rows
# of the statistics result are transferred to the client.
column_stats_hdf = diabetes_hdf.describe()
column_stats_hdf.head(10).collect()

In [None]:
# Value distribution of the target variable 'CLASS':
# count the ID values per CLASS group and name the result column N.
class_counts_hdf = diabetes_hdf.agg([('count', 'ID', 'N')], group_by='CLASS')
class_counts_hdf.collect()

## Data Preparation

In [None]:
# Partition the input data set into training, test and validation subsets,
# stratified on the target column CLASS.
# default: training_percentage = 0.8, testing_percentage = 0.1, validation_percentage = 0.1

from hana_ml.algorithms.pal.partition import train_test_val_split as split
d_train, d_test, d_val = split(
    data=diabetes_hdf,
    partition_method='stratified',
    stratified_column='CLASS',
)


In [None]:
# Report the number of rows in each partition (training, validation, test).
row_count_msg = 'Number of rows in {} subset: {}'
for subset_name, subset_hdf in (('training', d_train), ('validation', d_val), ('test', d_test)):
    print(row_count_msg.format(subset_name, subset_hdf.count()))


##  Train a SAP HANA PAL Hybrid Gradient Boosting classification model

In [None]:
# Use UnifiedClassification PAL procedure interface in Python
from hana_ml.algorithms.pal.unified_classification import UnifiedClassification

# Iterate over different Parameter Settings of the Algorithm, find best setting
from hana_ml.algorithms.pal.model_selection import GridSearchCV

In [None]:

# Classifier to be tuned: Hybrid Gradient Boosting Tree via the unified interface.
HGBT_MODEL = UnifiedClassification('HybridGradientBoostingTree')

# Candidate values per hyperparameter handed to the grid search.
hgbt_param_grid = {
    'learning_rate': [0.1, 0.4, 0.7, 1],
    'n_estimators': [4, 6, 8, 10],
    'split_threshold': [0.1, 0.4, 0.7, 1],
}

# Resampling setup: 5 folds, method 'cv', fixed random state, 'auc' as reference metric.
cv_train_control = {
    'fold_num': 5,
    'resampling_method': 'cv',
    'random_state': 1,
    'ref_metric': ['auc'],
}

MODEL_SEARCH = GridSearchCV(
    estimator=HGBT_MODEL,
    param_grid=hgbt_param_grid,
    train_control=cv_train_control,
    scoring='error_rate',
)

# Run the search on the training subset; ID is the key and CLASS the label.
MODEL_SEARCH.fit(
    data=d_train,
    key='ID',
    label='CLASS',
    partition_method='stratified',
    partition_random_state=1,
    stratified_column='CLASS',
    build_report=False,
)

In [None]:
# Other result attributes of the model that can be collected the same way (uncomment to view):
#HGBT_MODEL.confusion_matrix_.collect()
#HGBT_MODEL.statistics_.collect()
#HGBT_MODEL.metrics_.collect()
# Fetch the optimal_param_ result to the client.
# NOTE(review): presumably the parameter values selected by the grid search -- confirm.
HGBT_MODEL.optimal_param_.collect()

In [None]:
# Importance result of the model, sorted by the IMPORTANCE column in descending order.
ranked_importance_hdf = HGBT_MODEL.importance_.sort('IMPORTANCE', desc=True)
ranked_importance_hdf.collect()

## Validate Model

In [None]:
# Score the model on the test subset. score() returns several results;
# the element at index 1 is kept and fetched to the client.
score_outputs = HGBT_MODEL.score(d_test, key='ID', max_result_num=10, ntiles=20)
score_res = score_outputs[1]
score_res.collect()

## Predict with Model

In [None]:
# Create Feature list for Prediction with test data: every column of d_test
# except the record key (ID) and the target (CLASS).
# The list is built fresh instead of calling remove() on d_test.columns, so the
# list object handed out by the dataframe is never modified in place and later
# cells still see ID and CLASS on d_test.
features = [col for col in d_test.columns if col not in ('ID', 'CLASS')]

# Apply the trained model to the test data, using ID as the key.
pred_res = HGBT_MODEL.predict(d_test, key='ID', features=features)


In [None]:
# Fetch the first 5 rows of the prediction result to the client for display.
pred_res.head(5).collect()

In [None]:
# Derived columns: the attr and pct fields of the first two entries of the
# REASON_CODE JSON array, each given as an (SQL expression, alias) pair.
top2_reason_cols = [
    ('json_query("REASON_CODE", \'$[0].attr\')', 'Top1'),
    ('json_query("REASON_CODE", \'$[0].pct\')', 'PCT_1'),
    ('json_query("REASON_CODE", \'$[1].attr\')', 'Top2'),
    ('json_query("REASON_CODE", \'$[1].pct\')', 'PCT_2'),
]
pred_res.select('ID', 'SCORE', 'CONFIDENCE', 'REASON_CODE', *top2_reason_cols).head(5).collect()

In [None]:
# Print the statement returned by get_predict_execute_statement() for the model.
predict_statement = HGBT_MODEL.get_predict_execute_statement()
print(predict_statement)

In [None]:
#end of Demo








































#end

# Extended Demo Section

## Load data from csv file

In [None]:
#Download file from https://www.kaggle.com/uciml/pima-indians-diabetes-database
# dataset under CC0 public domain license https://creativecommons.org/publicdomain/zero/1.0/
import pandas as pd

# Read the downloaded CSV into a pandas dataframe on the client ...
diabetes_pdf = pd.read_csv("../datasets/diabetes.csv")

# ... and create the HANA table DIABETES_DATA from it.
# NOTE(review): force=True presumably replaces an existing table of that name -- confirm.
df = dataframe.create_dataframe_from_pandas(
    conn,
    diabetes_pdf,
    table_name="DIABETES_DATA",
    force=True,
)
df.collect()
#.deselect("Unnamed: 0")

## Detailed Model Validation

In [None]:
# Visualize Confusion Matrix
import matplotlib.pyplot as plt
from hana_ml.visualizers.metrics import MetricsVisualizer

# Single-axes figure that the visualizer draws into.
f, ax1 = plt.subplots(1, 1)
mv1 = MetricsVisualizer(ax1)

# Plot the model's confusion matrix without normalization.
confusion_hdf = HGBT_MODEL.confusion_matrix_
ax1 = mv1.plot_confusion_matrix(confusion_hdf, normalize=False)

In [None]:
# Create Feature list for Prediction with test data: every column of the
# training subset except the record key (ID) and the target (CLASS).
# The list is built fresh instead of calling remove() on d_train.columns, so
# the list object handed out by the dataframe is never modified in place.
features = [col for col in d_train.columns if col not in ('ID', 'CLASS')]

# Apply the trained model to the test data, using ID as the key.
pred_res = HGBT_MODEL.predict(d_test, key='ID', features=features)

In [None]:
# Fetch the first 5 rows of the prediction result to the client for display.
pred_res.head(5).collect()

In [None]:
# Derived columns: the attr and pct fields of the first entry of the
# REASON_CODE JSON array, each given as an (SQL expression, alias) pair.
top1_reason_cols = [
    ('json_query("REASON_CODE", \'$[0].attr\')', 'Top1'),
    ('json_query("REASON_CODE", \'$[0].pct\')', 'PCT_1'),
]
pred_res.select('ID', 'SCORE', 'CONFIDENCE', 'REASON_CODE', *top1_reason_cols).head(2).collect()

In [None]:
# Build Beeswarm Shapley Explainer Plot for Test Data
# NOTE(review): pydotplus and graphviz are not referenced by name in this cell;
# presumably they are imported because the debriefing visualizer depends on them -- confirm.
import pydotplus
import graphviz
from hana_ml.visualizers.model_debriefing import TreeModelDebriefing

# The explainer is built from the prediction result (which carries the REASON_CODE column)
# together with the test data, using ID as key and CLASS as label.
shapley_explainer = TreeModelDebriefing.shapley_explainer(pred_res, d_test, key='ID', label='CLASS')
# Render the summary plot of the explainer.
shapley_explainer.summary_plot()

## Model SQL Generation

In [None]:
# Python SQL generation with release 2.11
# Further accessors on the model related to the generated SQL; uncomment to print them:
#print(HGBT_MODEL.get_pal_function())
#print(HGBT_MODEL.get_fit_parameters())
#print(HGBT_MODEL.get_fit_output_table_names())
#print(HGBT_MODEL.fit_hdbprocedure)
#print(HGBT_MODEL.get_predict_execute_statement())
# Print the statement returned by get_fit_execute_statement() for the model.
print(HGBT_MODEL.get_fit_execute_statement())

## Model Storage 

In [None]:
# Save Models and Model Quality Information to MLLAB-Sandbox
from hana_ml.model_storage import ModelStorage

# Model storage bound to the current connection.
MLLAB_models = ModelStorage(connection_context=conn)

# Name and version are set on the estimator held by the grid search.
# NOTE(review): MODEL_SEARCH was created with estimator=HGBT_MODEL, so presumably
# this is the same object that is saved below -- confirm.
tuned_estimator = MODEL_SEARCH.estimator
tuned_estimator.name = 'HGBT DIABETES Classification Model' 
tuned_estimator.version = 1

MLLAB_models.save_model(model=HGBT_MODEL)
# or MLLAB_models.save_model(model=MODEL_SEARCH.estimator)





In [None]:
# Ask the model storage for its list of models and print it.
list_models = MLLAB_models.list_models()
print(list_models)


In [None]:
# CleanUp

#MLLAB_models.delete_models(name='HGBT DIABETES Classification Model')
#MLLAB_models.clean_up()

## Model Performance Reports

In [None]:
from hana_ml.visualizers.unified_report import UnifiedReport

# Load version 1 of the stored model by its name ...
mymodel = MLLAB_models.load_model('HGBT DIABETES Classification Model', 1)

# ... then build its unified report and display it.
model_report = UnifiedReport(mymodel)
model_report.build().display()