## Check the setup and connect to the database

In [None]:
# Execute the setup notebook inside this kernel. Later cells use `myconn`, which is
# not defined anywhere in this notebook — presumably it is created there (verify).
%run "010-check_setup.ipynb"

# Tables from SAP HANA

In [None]:
# Reference table DATA_LABELED in schema TITANIC; this is the input passed to fit() below.
hdf_titanic_train=myconn.table('DATA_LABELED', schema='TITANIC')

# Random Decision Tree classification

Experiment 2: Split into Train and Test for fit

In [None]:
from hana_ml.algorithms.pal.unified_classification import UnifiedClassification

In [None]:
# Unified classification object configured to use the 'RandomDecisionTree' function.
# NOTE(review): no seed is passed for the algorithm itself (only the partition gets one
# in fit()), so results may differ between runs — confirm whether that is intended.
uc_rdt_v2 = UnifiedClassification(func='RandomDecisionTree')

🤓 **Let's discuss**:
- [Train vs Test vs Validation](https://en.wikipedia.org/wiki/Training,_validation,_and_test_data_sets)

Split the input dataset into two parts during [fit()](https://help.sap.com/doc/cd94b08fe2e041c2ba778374572ddba9/latest/en-US/pal/algorithms/hana_ml.algorithms.pal.unified_classification.UnifiedClassification.html#hana_ml.algorithms.pal.unified_classification.UnifiedClassification.fit): training (80%) and testing (the remaining 20%), using a [stratified partition](https://en.wikipedia.org/wiki/Stratified_sampling) on the `Survived` column.

Meaning of partitioning parameters: https://help.sap.com/docs/hana-cloud-database/sap-hana-cloud-sap-hana-database-predictive-analysis-library-pal/partition-partition-af41e5f

In [None]:
# Fit the classifier and let fit() partition the input into train/test itself.
uc_rdt_v2.fit(
    data=hdf_titanic_train,
    key='PassengerId',
    label='Survived',
    # Partitioning: 80% of the rows for training, the rest for testing,
    # stratified on the label column, with a fixed partition seed.
    partition_method='stratified',
    stratified_column='Survived',
    training_percent=0.8,
    partition_random_state=2,
    # Keep the row-to-partition assignment as an output table for the analysis below.
    output_partition_result=True,
);

### Analyze the partition of the table

Get output table names: https://help.sap.com/doc/cd94b08fe2e041c2ba778374572ddba9/2024_1_QRC/en-US/pal/algorithms/hana_ml.algorithms.pal.pal_base.PALBase.html#hana_ml.algorithms.pal.pal_base.PALBase.get_fit_output_table_names

In [None]:
# Report the row count of every output table produced by fit().
for fit_output_table in uc_rdt_v2.get_fit_output_table_names():
    row_count = myconn.table(fit_output_table).count()
    print(f"Table {fit_output_table} has {row_count} record(s).")

In [None]:
# A table with `PARTITION_TYPE` in its name is the result of the partition.
fit_output_tables = uc_rdt_v2.get_fit_output_table_names()
dbtable_with_partition_results = next(
    (a_table for a_table in fit_output_tables if 'PARTITION_TYPE' in a_table),
    None,
)
if dbtable_with_partition_results is None:
    # Stop here with an actionable message instead of printing "... is None"
    # and then failing later inside myconn.table(None).
    raise LookupError(
        "No fit output table with 'PARTITION_TYPE' in its name was found; "
        "was fit() called with output_partition_result=True?"
    )
print(f'The table that stores partition results is {dbtable_with_partition_results}')

# Last expression of the cell: display the partition assignment table.
myconn.table(dbtable_with_partition_results).collect()

In [None]:
# Number of rows assigned to each partition TYPE, ordered by TYPE.
hdf_partition_result = myconn.table(dbtable_with_partition_results)
hdf_partition_counts = hdf_partition_result.agg(
    agg_list=[("count", "TYPE", "TYPE_Count")],
    group_by="TYPE",
)
hdf_partition_counts.sort("TYPE").collect()


In [None]:
# Row count and percentage share per partition TYPE, computed in SQL with a
# window function over the grouped counts.
# - The table name is double-quoted so it is matched exactly as returned by
#   get_fit_output_table_names(), regardless of its case or special characters.
# - The statement carries no trailing ';' because .sort() embeds it as a
#   sub-select of an outer query, where a terminator is not valid.
partition_share_sql = f'''SELECT TYPE, COUNT(*) AS "COUNT", ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 3) AS "DISTRIBUTION_PERCENTAGE"
    FROM "{dbtable_with_partition_results}"
    GROUP BY TYPE'''
(myconn
    .sql(partition_share_sql)
    .sort("TYPE")
    .collect()
)

## Analyze programmatically the Classifier's...

### ...parameters

In [None]:
# Show the parameters reported by the classifier object.
display(uc_rdt_v2.get_parameters())

Now more output tables are populated, incl. Confusion Matrix and Metrics

In [None]:
# For every entry of model_, print the query behind it and how many rows it returns.
for hdf_model_output in uc_rdt_v2.model_:
    query, row_count = hdf_model_output.select_statement, hdf_model_output.count()
    print(f"{query} returns {row_count} record(s)")

### ...model

In [None]:
# First entry of model_ is taken as the model table; preview only its third
# column for the first 3 rows.
hdf_uc_rdt_model = uc_rdt_v2.model_[0]
third_column = hdf_uc_rdt_model.columns[2]
display(hdf_uc_rdt_model.head(3).select(third_column).collect())

### ...confusion matrix from the training (fitting) execution

In [None]:
# Fetch and display the confusion matrix attached to the fitted classifier.
uc_rdt_v2.confusion_matrix_.collect()

🤓 **Let's discuss**:
1. Confusion Matrix
1. Statistics
1. Metrics

### ...statistics 

(see https://help.sap.com/docs/hana-cloud-database/sap-hana-cloud-sap-hana-database-predictive-analysis-library/model-evaluation-model-evaluation-73bbec0)

In [None]:
# Show only the statistics row(s) whose STAT_NAME is 'ACCURACY'.
uc_rdt_v2.statistics_.filter("STAT_NAME='ACCURACY'").collect()

### ...metrics

In [None]:
# Fetch and display the full metrics table attached to the fitted classifier.
uc_rdt_v2.metrics_.collect()

What metrics are in the table?

In [None]:
# List the distinct values of the NAME column, i.e. which metrics the table contains.
uc_rdt_v2.metrics_.distinct(cols='NAME').collect()

Let's check the CUMulative GAINS

In [None]:
# Keep only the CUMGAINS metric rows and order them by X.
uc_rdt_v2.metrics_.filter("NAME = 'CUMGAINS'").sort('X').collect()

If you want to manually plot Cumulative Gains:
```python
uc_rdt_v2.metrics_.filter("NAME = 'CUMGAINS'").sort('X').collect().plot(x='X', y='Y');
```

## Generate a model report

In [None]:
from hana_ml.visualizers.unified_report import UnifiedReport


In [None]:
# Build the report for the fitted classifier and render it in the notebook.
UnifiedReport(uc_rdt_v2).build().display()

## Debrief the model

In [None]:
from hana_ml.visualizers.model_debriefing import TreeModelDebriefing

In [None]:
# Debrief the first entry of model_ with the DOT-based view (iframe_height=700);
# the trailing `;` suppresses the cell's return-value output.
TreeModelDebriefing.tree_debrief_with_dot(uc_rdt_v2.model_[0], iframe_height=700);

# Store the model

In [None]:
from hana_ml.model_storage import ModelStorage

Model storage: https://help.sap.com/doc/cd94b08fe2e041c2ba778374572ddba9/2024_4_QRC/en-US/hana_ml.model_storage.html

In [None]:
# Model storage handle bound to the current database connection.
ms = ModelStorage(myconn)

In [None]:
# Label the model with a name and version, then persist it; an existing entry
# is replaced and no report is stored alongside it.
uc_rdt_v2.name, uc_rdt_v2.version = 'CodeJam-Titanic-Classification', 2
ms.save_model(
    model=uc_rdt_v2,
    if_exists='replace',
    save_report=False,
)

In [None]:
# Show everything the model storage currently lists.
display(ms.list_models())

In [None]:
import json

# The JSON column of the model listing holds a JSON document as text. Parse it
# with json.loads instead of eval(): this avoids executing stored text as Python
# code and removes the need for the `null`/`false`/`true` helper globals.
json.loads(ms.list_models(name=uc_rdt_v2.name).at[0, 'JSON'])

In [None]:
# Check which query (and therefore which table) backs the fitted model in this
# Python session, for comparison with the same check after loading from storage.
uc_rdt_v2.model_[0].select_statement

# Load the saved model

In [None]:
# Listing restricted to this model's name, transposed (.T) so fields are shown as rows.
ms.list_models(uc_rdt_v2.name).T

In [None]:
# Rebind `uc_rdt_v2` to the model loaded from storage, looked up by the name and
# version of the current object. From here on the variable refers to the loaded model.
uc_rdt_v2=ms.load_model(uc_rdt_v2.name, version=uc_rdt_v2.version)

In [None]:
# Same check as before loading: which query backs the model now held in `uc_rdt_v2`.
uc_rdt_v2.model_[0].select_statement

# Call prediction

In [None]:
# Reference table DATA_TO_PREDICT in schema TITANIC; this is the input to predict().
hdf_titanic_test=myconn.table('DATA_TO_PREDICT', schema='TITANIC')

In [None]:
# Run prediction on the test table with the model currently bound to `uc_rdt_v2`
# (the one loaded from storage above), keyed by PassengerId.
hdf_res_v2 = uc_rdt_v2.predict(hdf_titanic_test, key = 'PassengerId')

In [None]:
# Preview the first 4 prediction rows: key, SCORE and CONFIDENCE columns only.
display(hdf_res_v2.select('PassengerId', 'SCORE', 'CONFIDENCE').head(4).collect())

# [**Optional**] Compare to the ground truth

In [None]:
# Reference table DATA_COMPLETE in schema TITANIC; used below as the ground-truth
# source (its `survived` column is compared against the predictions).
hdf_titanic_complete=myconn.table('DATA_COMPLETE', schema='TITANIC')

In [None]:
# Join the prediction result with the test table on PassengerId so the predictions
# carry the passenger columns (Name and Ticket are needed for the next join).
hdf_res_ext=hdf_res_v2.set_index('PassengerId').join(hdf_titanic_test.set_index('PassengerId'))

In [None]:
# Preview the first 3 rows of the joined predictions.
hdf_res_ext.head(3).collect()

In [None]:
# Attach the ground truth to the predictions:
# 1. join on (Name, Ticket) of the predictions against (name, ticket) of DATA_COMPLETE
#    — presumably because the two tables share no common id column (verify);
# 2. keep the id, name, ticket, SCORE and `survived`, plus the computed column
#    IS_CORRECT = 1-ABS(SCORE-"survived") — this is 1 on a match and 0 on a mismatch,
#    assuming both values are 0/1 (TODO confirm);
# 3. cast SCORE to INT. Note the cast is applied after IS_CORRECT has been defined.
hdf_res_incl_groundtruth=(hdf_res_ext.set_index(['Name', 'Ticket']).join(hdf_titanic_complete.set_index(['name', 'ticket']))
                 .select('PassengerId', 'Name', 'Ticket', 'SCORE','survived',('1-ABS(SCORE-"survived")', 'IS_CORRECT'))
                 .cast('SCORE', 'INT')

)

In [None]:
# Preview the first 3 rows of predictions combined with the ground truth.
hdf_res_incl_groundtruth.head(3).collect()

## Using `metrics`

In [None]:
import hana_ml.algorithms.pal.metrics as pal_metrics

Accuracy score: https://help.sap.com/doc/cd94b08fe2e041c2ba778374572ddba9/2024_4_QRC/en-US/pal/algorithms/hana_ml.algorithms.pal.metrics.accuracy_score.html#accuracy-score

In [None]:
# Accuracy of the predictions: `survived` is the true label, SCORE the predicted one.
pal_metrics.accuracy_score(data=hdf_res_incl_groundtruth, label_true='survived', label_pred='SCORE')

Confusion Matrix: https://help.sap.com/doc/cd94b08fe2e041c2ba778374572ddba9/2024_4_QRC/en-US/pal/algorithms/hana_ml.algorithms.pal.metrics.confusion_matrix.html#confusion-matrix

In [None]:
# Compute the confusion matrix on the same true/predicted columns, keyed by PassengerId.
# Two results are returned; judging by the names, `hdf_cm` is the confusion matrix and
# `hdf_cr` presumably a classification report (verify against the API docs).
hdf_cm, hdf_cr = pal_metrics.confusion_matrix(data=hdf_res_incl_groundtruth, key='PassengerId', label_true='survived', label_pred='SCORE')

In [None]:
# Display the first result of confusion_matrix().
hdf_cm.collect()

In [None]:
# Display the second result of confusion_matrix().
hdf_cr.collect()

In [None]:
from hana_ml.visualizers.metrics import MetricsVisualizer
# Plot the confusion matrix with plotly enabled and normalization turned off;
# the trailing `;` suppresses the cell's return-value output.
MetricsVisualizer(enable_plotly=True).plot_confusion_matrix(hdf_cm, normalize=False);