## Check the setup and connect to the database

In [None]:
# Execute the setup notebook inside this kernel so that the names it defines
# become available here. NOTE(review): `myconn`, used by the cells below, is not
# defined anywhere in this notebook - presumably it is created by this setup
# notebook; confirm.
%run "010-check_setup.ipynb"

# Tables from SAP HANA

In [None]:
# Handle to the labeled (training) data: table DATA_LABELED in schema TITANIC.
hdf_titanic_train = myconn.table(
    'DATA_LABELED',
    schema='TITANIC',
)

In [None]:
# Column statistics of the training data, restricted to the columns that
# actually contain missing values ("nulls" <> 0), fetched to the client.
(
    hdf_titanic_train
    .describe()
    .filter(condition='"nulls"<>0')
    .collect()
)

In [None]:
# Handle to the unlabeled data to score: table DATA_TO_PREDICT in schema TITANIC.
hdf_titanic_test = myconn.table(
    'DATA_TO_PREDICT',
    schema='TITANIC',
)

In [None]:
# Column statistics of the data to predict, restricted to the columns that
# contain missing values ("nulls" <> 0), fetched to the client.
(
    hdf_titanic_test
    .describe()
    .filter(condition='"nulls"<>0')
    .collect()
)

🤓 **Let's discuss**:
- Missing values

## Imputation in the DATA_LABELED table

Imputation: https://help.sap.com/doc/cd94b08fe2e041c2ba778374572ddba9/2024_1_QRC/en-US/pal/algorithms/hana_ml.algorithms.pal.preprocessing.Imputer.html#imputer

In [None]:
from hana_ml.algorithms.pal.preprocessing import Imputer

In [None]:
# Default imputation strategy for all columns. Per the hana_ml Imputer docs,
# 'most_frequent-median' fills categorical columns with their most frequent
# value and numerical columns with their median.
impute = Imputer(strategy='most_frequent-median')

In [None]:
# Fit the imputer on the training data and fill its missing values in one step.
#  - PClass is declared categorical (presumably it is stored as an integer,
#    so it would otherwise be handled as a numerical column - confirm).
#  - Cabin overrides the default strategy with 'non', i.e. it is left as is.
hdf_titanic_train_imputed = impute.fit_transform(
    hdf_titanic_train,
    categorical_variable=['PClass'],
    strategy_by_col=[('Cabin', 'non')],
)

In [None]:
# Column statistics after imputation; use the "nulls" column to check which
# columns (if any) still contain missing values.
hdf_titanic_train_imputed.describe().collect()

In [None]:
# Show the SQL SELECT statement that backs the imputed DataFrame.
hdf_titanic_train_imputed.select_statement

Persist the imputed table so that it can be reused in future experiments.

In [None]:
# Materialize the imputed training data as table DATA_LABELED_IMPUTED
# (force=True replaces an existing table of that name) and rebind the variable
# to the DataFrame returned by save().
# NOTE(review): after this cell the variable refers to the persisted table, so
# re-running only this cell may drop the very table it selects from - confirm,
# and re-run the imputation cell first when repeating this step.
hdf_titanic_train_imputed = hdf_titanic_train_imputed.save(
    where='DATA_LABELED_IMPUTED',
    force=True,
)

# Random Decision Tree classification

Experiment 4: Exclude high-cardinality features (names, tickets, and cabins) to improve the generalization of the model

In [None]:
from hana_ml.algorithms.pal.unified_classification import UnifiedClassification

In [None]:
# Unified classification interface configured to use the Random Decision Tree
# algorithm; "v4" matches experiment 4 of this series.
uc_rdt_v4 = UnifiedClassification(func='RandomDecisionTree')

In [None]:
# Feature columns used for training in this experiment; the high-cardinality
# columns (names, tickets, cabins) are deliberately left out.
features_low_cardinality = [
    'Age',
    'SibSp',
    'ParCh',
    'PClass',
    'Fare',
    'Gender',
    'Embarked',
]

In [None]:
# Train the classifier on the imputed training data.
#  - PassengerId is the row key, Survived is the label.
#  - The data is partitioned with 80% used for training, stratified on
#    Survived; the fixed partition_random_state makes the split repeatable.
# The trailing ';' suppresses the cell's output.
uc_rdt_v4.fit(
    data=hdf_titanic_train_imputed,
    key='PassengerId',
    label='Survived',
    features=features_low_cardinality,
    training_percent=0.8,
    partition_method='stratified',
    stratified_column='Survived',
    partition_random_state=2,
);

## Generate a model report

In [None]:
from hana_ml.visualizers.unified_report import UnifiedReport
# Build the report for the trained model and render it in the notebook.
UnifiedReport(uc_rdt_v4).build().display()

## Debrief the model

In [None]:
from hana_ml.visualizers.model_debriefing import TreeModelDebriefing

# Store the model

In [None]:
from hana_ml.model_storage import ModelStorage

Model storage: https://help.sap.com/doc/cd94b08fe2e041c2ba778374572ddba9/latest/en-US/hana_ml.model_storage.html

In [None]:
# Model storage bound to the current database connection.
ms = ModelStorage(myconn)

In [None]:
# Set the name and version on the model before it is saved; the version
# number matches experiment 4.
# NOTE(review): presumably ModelStorage uses this name/version pair to
# identify the stored model - confirm against the ModelStorage docs.
uc_rdt_v4.name = 'CodeJam-Titanic-Classification'
uc_rdt_v4.version = 4

In [None]:
# Persist the trained model. if_exists='replace' overwrites a previously
# stored model with the same name and version; the model report is not saved.
ms.save_model(
    model=uc_rdt_v4,
    if_exists='replace',
    save_report=False,
)

In [None]:
# List the models currently held in the model storage, to verify the save.
ms.list_models()

# Imputation in the DATA_TO_PREDICT table

In [None]:
# Handle to the unlabeled data to score: table DATA_TO_PREDICT in schema
# TITANIC. This repeats the lookup made near the top of the notebook, so this
# section can be read on its own.
hdf_titanic_test = myconn.table(
    'DATA_TO_PREDICT',
    schema='TITANIC',
)

In [None]:
# Columns of the data to predict that contain missing values ("nulls" <> 0).
(
    hdf_titanic_test
    .describe()
    .filter(condition='"nulls"<>0')
    .collect()
)

In [None]:
# Impute the data to predict with the same settings as the training data:
# PClass is treated as categorical and Cabin is left untouched ('non').
# NOTE(review): fit_transform re-fits the imputer on DATA_TO_PREDICT, so the
# fill values are derived from the prediction data rather than from
# DATA_LABELED. Consider whether impute.transform(...) with the statistics
# learned on the training data is what is intended - confirm.
hdf_titanic_test_imputed = impute.fit_transform(
    hdf_titanic_test,
    categorical_variable=['PClass'],
    strategy_by_col=[('Cabin', 'non')],
)

In [None]:
# Column statistics after imputation; use the "nulls" column to check which
# columns (if any) still contain missing values.
hdf_titanic_test_imputed.describe().collect()

In [None]:
# Show the SQL SELECT statement that backs the imputed DataFrame.
hdf_titanic_test_imputed.select_statement

In [None]:
# Materialize the imputed prediction data as table DATA_TO_PREDICT_IMPUTED
# (force=True replaces an existing table of that name) and rebind the variable
# to the DataFrame returned by save().
# NOTE(review): after this cell the variable refers to the persisted table, so
# re-running only this cell may drop the very table it selects from - confirm,
# and re-run the imputation cell first when repeating this step.
hdf_titanic_test_imputed = hdf_titanic_test_imputed.save(
    where='DATA_TO_PREDICT_IMPUTED',
    force=True,
)

In [None]:
from IPython.display import HTML
# List the tables returned by the connection, sorted by table name, and render
# the result as an HTML table in the notebook.
HTML(myconn.get_tables().sort_values(by='TABLE_NAME').to_html())

🤓 **Let's discuss**:
- Tables seen in your schema

# Call prediction

In [None]:
# Score the imputed prediction data with the trained model; PassengerId
# identifies each row.
hdf_res_v4 = uc_rdt_v4.predict(
    hdf_titanic_test_imputed,
    key='PassengerId',
)

In [None]:
# Fetch only the first three prediction rows to the client and display them.
display(hdf_res_v4.head(3).collect())