## Check the setup and connect to the database

In [None]:
# Execute the shared setup notebook inside this kernel.
# NOTE(review): later cells use `myconn`, which is not defined in this notebook —
# presumably it is created by the setup notebook; confirm.
%run "010-check_setup.ipynb"

# Tables from SAP HANA

In [None]:
# Handle to the DATA_LABELED table in the TITANIC schema (used as training data below).
hdf_titanic_train = myconn.table('DATA_LABELED', schema='TITANIC')

# Random Decision Tree classification

Experiment 3: Exclude high-cardinality features (names, tickets, and cabins) to improve the generalization of the model.

In [None]:
from hana_ml.algorithms.pal.unified_classification import UnifiedClassification

In [None]:
# Unified classification wrapper configured to use the 'RandomDecisionTree' function.
uc_rdt_v3 = UnifiedClassification(func='RandomDecisionTree')

In [None]:
# Columns passed to the model as features in this experiment.
features_low_cardinality = [
    'Age',
    'SibSp',
    'ParCh',
    'PClass',
    'Fare',
    'Gender',
    'Embarked',
]

In [None]:
# Fit on the labeled table: training_percent=0.8 with a stratified partition on
# the label column and a fixed partition_random_state.
# The trailing `;` suppresses the cell's output.
uc_rdt_v3.fit(
    data=hdf_titanic_train,
    key='PassengerId',
    label='Survived',
    features=features_low_cardinality,  # pass only low-cardinality features
    training_percent=0.8,
    partition_method='stratified',
    stratified_column='Survived',
    partition_random_state=2,
);

## Generate a model report

In [None]:
from hana_ml.visualizers.unified_report import UnifiedReport

# Build the report for the fitted classifier, then display it in the notebook.
(
    UnifiedReport(uc_rdt_v3)
    .build()
    .display()
)

## Debrief the model

In [None]:
from hana_ml.visualizers.model_debriefing import TreeModelDebriefing

In [None]:
# Debrief the first element of the fitted `model_`; `;` hides the return value.
TreeModelDebriefing.tree_debrief_with_dot(
    uc_rdt_v3.model_[0],
    iframe_height=700,
);

# Store the model

In [None]:
from hana_ml.model_storage import ModelStorage

Model storage: https://help.sap.com/doc/cd94b08fe2e041c2ba778374572ddba9/2024_1_QRC/en-US/hana_ml.model_storage.html

In [None]:
# Model storage bound to the existing connection `myconn`.
ms = ModelStorage(myconn)

In [None]:
# Identify the model for storage: version 3 of the CodeJam Titanic classifier.
uc_rdt_v3.version = 3
uc_rdt_v3.name = 'CodeJam-Titanic-Classification'

In [None]:
# Save the fitted model under its name/version with if_exists='replace';
# the report is not saved (save_report=False).
ms.save_model(
    model=uc_rdt_v3,
    if_exists='replace',
    save_report=False,
)

In [None]:
# Show what the model storage currently lists.
ms.list_models()

In [None]:
import json

# The 'JSON' column of the listing holds the stored model's metadata as JSON text.
# Parse it with the JSON parser rather than eval(): eval() would execute whatever
# expression the database returns and required defining null/false/true shims
# in the notebook namespace just to make JSON literals evaluate.
json.loads(
    ms.list_models(name=uc_rdt_v3.name, version=uc_rdt_v3.version).at[0, 'JSON']
)

# Call prediction

In [None]:
# Handle to the DATA_TO_PREDICT table in the TITANIC schema.
hdf_titanic_test = myconn.table('DATA_TO_PREDICT', schema='TITANIC')

In [None]:
# Run prediction with the fitted classifier, keyed by PassengerId.
hdf_res_v3 = uc_rdt_v3.predict(hdf_titanic_test, key='PassengerId')

In [None]:
# Peek at both ends of the predictions ordered by CONFIDENCE (descending).
# NOTE(review): confirm that tail(3) respects the preceding sort rather than
# falling back to a different ordering.
display(
    hdf_res_v3
    .sort('CONFIDENCE', desc=True)
    .head(3)
    .collect()
)
display(
    hdf_res_v3
    .sort('CONFIDENCE', desc=True)
    .tail(3)
    .collect()
)

# [**Optional**] Compare to the ground truth

In [None]:
# Handle to the DATA_COMPLETE table in the TITANIC schema (joined below for the ground truth).
hdf_titanic_complete = myconn.table('DATA_COMPLETE', schema='TITANIC')

In [None]:
# Join the predictions with the rows they were predicted for, matching on PassengerId.
hdf_res_ext = (
    hdf_res_v3.set_index('PassengerId')
    .join(hdf_titanic_test.set_index('PassengerId'))
)

In [None]:
# Preview the first three joined rows.
hdf_res_ext.head(3).collect()

In [None]:
# Join predictions to the complete data set on (Name, Ticket) — the complete
# table uses lower-case column names — and keep only the columns needed for
# evaluation. IS_CORRECT is computed in SQL as 1-ABS(SCORE-"survived").
# NOTE(review): that expression yields 1/0 only if both values are 0/1 — confirm.
hdf_res_incl_groundtruth = (
    hdf_res_ext.set_index(['Name', 'Ticket'])
    .join(hdf_titanic_complete.set_index(['name', 'ticket']))
    .select(
        'PassengerId',
        'Name',
        'Ticket',
        'SCORE',
        'survived',
        ('1-ABS(SCORE-"survived")', 'IS_CORRECT'),
    )
    .cast('SCORE', 'INT')
)

## Using `metrics`

In [None]:
import hana_ml.algorithms.pal.metrics as pal_metrics

In [None]:
# Accuracy of the predicted SCORE against the 'survived' ground-truth column.
pal_metrics.accuracy_score(
    data=hdf_res_incl_groundtruth,
    label_true='survived',
    label_pred='SCORE',
)

In [None]:
# confusion_matrix returns two results, unpacked here as hdf_cm and hdf_cr.
hdf_cm, hdf_cr = pal_metrics.confusion_matrix(
    data=hdf_res_incl_groundtruth,
    key='PassengerId',
    label_true='survived',
    label_pred='SCORE',
)

In [None]:
# Fetch and show the first result of confusion_matrix.
hdf_cm.collect()

In [None]:
# Fetch and show the second result of confusion_matrix.
# NOTE(review): presumably the per-class classification report — confirm.
hdf_cr.collect()

In [None]:
from hana_ml.visualizers.metrics import MetricsVisualizer

# Plot the confusion matrix with Plotly, showing raw (non-normalized) values;
# the trailing `;` hides the returned object.
(
    MetricsVisualizer(enable_plotly=True)
    .plot_confusion_matrix(hdf_cm, normalize=False)
);

🤓 **Let's discuss**:
- Comparison of the last two models you trained