## Check the setup and connect to the database

In [None]:
%run "010-check_setup.ipynb"

# Tables from SAP HANA

In [None]:
hdf_titanic_train = myconn.table("DATA_LABELED_IMPUTED")

🤓 **Let's discuss**:
- The table we are using now and the features it has

# [Feature Engineering](https://en.wikipedia.org/wiki/Feature_engineering)

In [None]:
hdf_titanic_train.get_table_structure()

In [None]:
hdf_titanic_train.head(5).collect()

## New feature 1: Fare per person

Calculate the number of persons per ticket and the fare per person.

In [None]:
# Preview of the new feature: aggregate per ticket (average fare and ticket count),
# derive the fare per person, and show the first nine columns of the describe() summary.
(
    hdf_titanic_train
    .agg(
        [
            ("AVG", "Fare", "AvgFarePerTicket"),
            ("COUNT", "Ticket", "PersonsPerTicket"),
        ],
        group_by=["Ticket"],
    )
    .select(
        "*",
        ('ROUND("AvgFarePerTicket"/"PersonsPerTicket", 2)', "FarePerPerson"),
    )
    .describe()
    .collect()
    .iloc[:, :9]
)

Define a reusable function that is applied to both the DATA_LABELED and DATA_TO_PREDICT datasets.

In [None]:
def fe_add_fare_per_person(hdf_in: hdf.DataFrame) -> hdf.DataFrame:
    """Add the ``PersonsPerTicket`` and ``FarePerPerson`` columns to ``hdf_in``.

    The input is grouped by ``Ticket``. For every ticket the average ``Fare``
    and the COUNT of ``Ticket`` are computed, and the fare per person is the
    average fare divided by that count, rounded to two decimals. The per-ticket
    figures are then joined to ``hdf_in`` with ``Ticket`` set as the index on
    both sides.

    Args:
        hdf_in: Input data frame; the code reads its ``Ticket`` and ``Fare`` columns.

    Returns:
        The join of ``hdf_in`` with the per-ticket figures. The helper column
        ``AvgFarePerTicket`` is removed before the join.
    """
    ticket_col = "Ticket"
    per_ticket_aggregates = [
        ("AVG", "Fare", "AvgFarePerTicket"),
        ("COUNT", ticket_col, "PersonsPerTicket"),
    ]
    fare_per_person_expr = ('ROUND("AvgFarePerTicket"/"PersonsPerTicket", 2)', "FarePerPerson")

    # Step 1: one row per ticket holding the count and the derived fare per person.
    per_ticket = hdf_in.agg(per_ticket_aggregates, group_by=[ticket_col])
    per_ticket = per_ticket.select("*", fare_per_person_expr)

    # Step 2: drop the helper column and attach the per-ticket figures to the input rows.
    per_ticket_indexed = per_ticket.deselect("AvgFarePerTicket").set_index(ticket_col)
    return hdf_in.set_index(ticket_col).join(other=per_ticket_indexed)

In [None]:
hdf_titanic_train_fe1 = fe_add_fare_per_person(hdf_titanic_train)

In [None]:
hdf_titanic_train_fe1.get_table_structure()

In [None]:
hdf_titanic_train_fe1 = hdf_titanic_train_fe1.cast({"PersonsPerTicket": "INT"})

### DATA_TO_PREDICT table

In [None]:
hdf_titanic_test = myconn.table("DATA_TO_PREDICT_IMPUTED")

In [None]:
hdf_titanic_test_fe1 = fe_add_fare_per_person(hdf_titanic_test)

In [None]:
hdf_titanic_test_fe1.head(5).collect()

In [None]:
hdf_titanic_test_fe1 = hdf_titanic_test_fe1.cast({"PersonsPerTicket": "INT"})

## New feature 2: First letter of the last name

Define a reusable function that is applied to both the DATA_LABELED and DATA_TO_PREDICT datasets.

In [None]:
def fe_add_1letter_lastname(hdf_in: hdf.DataFrame) -> hdf.DataFrame:
    """Add a ``NameFirstLetter`` column: the upper-cased first character of ``Name``.

    Args:
        hdf_in: Input data frame; the code reads its ``Name`` column.
            Assumes ``Name`` starts with the last name, as the section
            heading suggests -- TODO confirm against the data.

    Returns:
        A data frame with all columns of ``hdf_in`` plus ``NameFirstLetter``.
    """
    first_letter_expr = ('UPPER(LEFT("Name", 1))', "NameFirstLetter")
    return hdf_in.select("*", first_letter_expr)

In [None]:
hdf_titanic_train_fe2 = fe_add_1letter_lastname(hdf_titanic_train_fe1)

In [None]:
hdf_titanic_train_fe2.head(5).collect()

In [None]:
from hana_ml.visualizers.eda import EDAVisualizer

In [None]:
# Bar chart of row counts per first letter in the training data; only the second
# element of the value returned by bar_plot is kept.
_, df_letter_distribution = EDAVisualizer(enable_plotly=True).bar_plot(
    data=hdf_titanic_train_fe2,
    column="NameFirstLetter",
    aggregation={"NameFirstLetter": "count"},
)

### DATA_TO_PREDICT table

In [None]:
hdf_titanic_test_fe2=fe_add_1letter_lastname(hdf_titanic_test_fe1)

In [None]:
hdf_titanic_test_fe2.head(5).collect()

In [None]:
# Bar chart of row counts per first letter in the data to predict; only the second
# element of the value returned by bar_plot is kept.
_, df_letter_distribution = EDAVisualizer(enable_plotly=True).bar_plot(
    data=hdf_titanic_test_fe2,
    column="NameFirstLetter",
    aggregation={"NameFirstLetter": "count"},
)

## New feature 3: The title from names

Define a reusable function that is applied to both the DATA_LABELED and DATA_TO_PREDICT datasets.

In [None]:
def fe_add_title(hdf_in: hdf.DataFrame) -> hdf.DataFrame:
    """Add a ``Title`` column extracted from ``Name`` with a regular expression.

    The SQL expression asks for a run of letters and whitespace in ``Name``
    that is directly followed by a period and a whitespace character.

    Args:
        hdf_in: Input data frame; the code reads its ``Name`` column.

    Returns:
        A data frame with all columns of ``hdf_in`` plus ``Title``.
    """
    # Raw string: the regex backslashes (\s, \.) must reach the database unchanged.
    # In a normal string literal they are invalid Python escape sequences, which
    # raise a DeprecationWarning/SyntaxWarning on current Python versions.
    # NOTE(review): \s is part of the character class, so a match may start with
    # whitespace -- confirm whether the title should additionally be trimmed.
    title_expr = r"""SUBSTR_REGEXPR('([A-Za-z\s]+)(?=\.\s)' IN "Name")"""
    hdf_out = hdf_in.select("*", (title_expr, "Title"))
    return hdf_out

In [None]:
hdf_titanic_train_fe3 = fe_add_title(hdf_titanic_train_fe2)

In [None]:
hdf_titanic_train_fe3.head(5).collect()

In [None]:
from hana_ml.visualizers.eda import EDAVisualizer

In [None]:
# Pie chart of the extracted titles in the training data. The element at index 1 of
# the returned value is sorted by COUNT; the trailing ";" suppresses the cell output.
EDAVisualizer(enable_plotly=True).pie_plot(
    data=hdf_titanic_train_fe3, column="Title", legend=True, explode=0
)[1].sort_values(by="COUNT", ascending=False);

### DATA_TO_PREDICT table

In [None]:
hdf_titanic_test_fe3=fe_add_title(hdf_titanic_test_fe2)

In [None]:
# Pie chart of the extracted titles in the data to predict. The element at index 1 of
# the returned value is sorted by COUNT; the trailing ";" suppresses the cell output.
EDAVisualizer(enable_plotly=True).pie_plot(
    data=hdf_titanic_test_fe3, column="Title", legend=True, explode=0
)[1].sort_values(by="COUNT", ascending=False);

### Unified Report for the DataSet

In [None]:
from hana_ml.visualizers.unified_report import UnifiedReport

In [None]:
# Build the unified report for the engineered training data (keyed by PassengerId)
# and display it.
UnifiedReport(hdf_titanic_train_fe3).build(key="PassengerId").display()

# Save the new datasets

In [None]:
hdf_titanic_train_fe3=hdf_titanic_train_fe3.save('DATA_LABELED_FE', force=True)

In [None]:
hdf_titanic_test_fe3=hdf_titanic_test_fe3.save('DATA_TO_PREDICT_FE', force=True)

🤓 **Let's discuss**:
- Can you think of any other features that could be engineered?

# Manual selection of features

In [None]:
# Column roles used when building the training and prediction inputs.
column_id = "PassengerId"
column_label = "Survived"

# Manually chosen model inputs: engineered columns together with original ones.
features_subset = [
    "NameFirstLetter",
    "FarePerPerson",
    "SibSp",
    "ParCh",
    "Gender",
    "PClass",
    "Embarked",
    "Title",
    "Age",
    "PersonsPerTicket",
]

In [None]:
# Training input: keep only the key, the selected features and the label (this is
# what excludes the high-cardinality variables), and cast the label to NVARCHAR(1).
hdf_titanic_train_v2 = (
    myconn.table("DATA_LABELED_FE")
    .select([column_id] + features_subset + [column_label])
    .cast(column_label, "NVARCHAR(1)")
)

In [None]:
print(hdf_titanic_train_v2.select_statement)

In [None]:
hdf_titanic_train_v2.head(5).collect()

# RandomDecisionTrees

In [None]:
from hana_ml.algorithms.pal.unified_classification import UnifiedClassification

In [None]:
uc_rdt_v5 = UnifiedClassification(func="RandomDecisionTree")

In [None]:
# Fit on the selected columns with a stratified 80 % training partition.
# The key and label names come from column_id / column_label (defined together with
# features_subset) instead of repeated string literals, so that training stays in
# sync with the column selection and with the later predict(key=column_id) call.
uc_rdt_v5.fit(
    data=hdf_titanic_train_v2,
    key=column_id,
    label=column_label,
    training_percent=0.8,
    partition_method="stratified",
    stratified_column=column_label,
    partition_random_state=2,  # fixed seed so the partition is reproducible
);

## Generate a model report

In [None]:
from hana_ml.visualizers.unified_report import UnifiedReport
UnifiedReport(uc_rdt_v5).build().display()

## Debrief the model

In [None]:
from hana_ml.visualizers.model_debriefing import TreeModelDebriefing

# Store the model

In [None]:
from hana_ml.model_storage import ModelStorage

Model storage: https://help.sap.com/doc/cd94b08fe2e041c2ba778374572ddba9/2024_1_QRC/en-US/hana_ml.model_storage.html

In [None]:
ms = ModelStorage(myconn)

In [None]:
uc_rdt_v5.name = 'CodeJam-Titanic-Classification'
uc_rdt_v5.version = 5

In [None]:
ms.save_model(model=uc_rdt_v5, if_exists='replace', save_report=False)

In [None]:
ms.list_models()

# Call prediction

In [None]:
# Prediction input: the key and the same feature columns as used for training;
# no label column is selected.
hdf_titanic_test_v2 = (
    myconn.table("DATA_TO_PREDICT_FE")
    .select([column_id] + features_subset)
)

In [None]:
hdf_res = uc_rdt_v5.predict(hdf_titanic_test_v2, key=column_id)

In [None]:
display(hdf_res.head(4).collect())

# Unsupervised [Feature selection](https://en.wikipedia.org/wiki/Feature_selection)

Feature selection: https://help.sap.com/doc/cd94b08fe2e041c2ba778374572ddba9/2024_1_QRC/en-US/pal/algorithms/hana_ml.algorithms.pal.preprocessing.FeatureSelection.html#featureselection

In PAL: https://help.sap.com/docs/hana-cloud-database/sap-hana-cloud-sap-hana-database-predictive-analysis-library/feature-selection-feature-selection-29a47ef

In [None]:
from hana_ml.algorithms.pal.preprocessing import FeatureSelection

In [None]:
hdf_titanic_train_fe3.get_table_structure()

In [None]:
import ast

# Run each feature-selection method and print the five features it selects.
for fs_method in ['chi-squared', 'gini-index', 'information-gain', 'MRMR', 'JMI', 'IWFS']:
    print(f"{fs_method}: ", end="")
    fs = FeatureSelection(fs_method=fs_method, top_k_best=5)
    fs_df = fs.fit_transform(
        data=hdf_titanic_train_fe3,
        key='PassengerId',
        label="Survived",
        excluded_feature=['Fare', 'Ticket'],
    )
    # The result cell (first row, second column) holds a dict literal as text.
    # Parse it with ast.literal_eval instead of eval so that text fetched from the
    # database is never executed as code, and read the cell with one positional
    # .iloc[row, col] lookup rather than the deprecated integer lookup on a Series.
    selection_result = ast.literal_eval(fs.result_.collect().iloc[0, 1])
    print(selection_result["__SelectedFeatures__"])

🤓 **Let's discuss**:
- When would feature selection be applicable?