In [1]:
%load_ext kedro.ipython

In [2]:
import pandas as pd
import numpy as np

# Render every column of wide DataFrames instead of eliding the middle ones.
pd.set_option("display.max_columns", None)

In [3]:
# NOTE(review): `catalog` is not defined anywhere in this notebook; presumably
# the kedro.ipython extension loaded in the first cell injects it — confirm.
train = catalog.load("train")
parameters = catalog.load("parameters")

# Column names are read from the project parameters rather than hard-coded.
id_col = parameters["col_maps"]["id_col"]
target_col = parameters["col_maps"]["target_col"]

## Data Processing

### Helper

In [4]:
def gen_counts_for_cat_feature(
    data: pd.DataFrame, cat_col: str, feature_name: str
) -> pd.DataFrame:
    """Add a column with the number of rows sharing each row's ``cat_col`` value.

    Args:
        data: Input frame; it is left unmodified.
        cat_col: Name of the categorical column whose values are counted.
        feature_name: Name of the count column to create. An existing column
            with this name is overwritten.

    Returns:
        A copy of ``data`` with ``feature_name`` added. The caller's index and
        row order are preserved. Rows whose ``cat_col`` value is missing get
        ``NaN`` (so the column is float whenever any value is missing).

    Raises:
        KeyError: If ``cat_col`` is not a column of ``data``.
    """
    # value_counts() ignores missing values by default, so those rows map to
    # NaN below, the same result the previous groupby + left merge produced.
    occurrences = data[cat_col].value_counts()

    result = data.copy()
    # Mapping the counts back (rather than merging a grouped frame) keeps the
    # original index instead of resetting it to a RangeIndex, and cannot
    # silently create `_x` / `_y` suffixed columns when `feature_name` exists.
    result[feature_name] = data[cat_col].map(occurrences)
    return result


def create_num_features_from_cat_features(input_data: pd.DataFrame) -> pd.DataFrame:
    """Append occurrence-count features for a fixed set of categorical columns.

    For each of ``CabinNumber``, ``CabinDeck`` and ``LastName`` a new column is
    added (``PeopleInCabinNumber``, ``PeopleInCabinDeck`` and ``FamilySize``
    respectively) via ``gen_counts_for_cat_feature``.

    Args:
        input_data: Frame that already contains the three source columns.

    Returns:
        A new frame with the three count columns added; ``input_data`` itself
        is not modified.
    """
    # Source categorical column -> settings for the derived count feature.
    cat_counts_params = {
        "CabinNumber": {"FeatureName": "PeopleInCabinNumber"},
        "CabinDeck": {"FeatureName": "PeopleInCabinDeck"},
        "LastName": {"FeatureName": "FamilySize"},
    }

    enriched = input_data.copy()
    for source_col, settings in cat_counts_params.items():
        count_col = settings["FeatureName"]
        enriched = gen_counts_for_cat_feature(enriched, source_col, count_col)

    return enriched

### Nodes

In [5]:
# Display the loaded parameters to show which keys the node below reads.
parameters


[1m{[0m
    [32m'col_maps'[0m: [1m{[0m[32m'id_col'[0m: [32m'PassengerId'[0m, [32m'target_col'[0m: [32m'Transported'[0m[1m}[0m,
    [32m'model_name'[0m: [32m'spaceship_titanic_model'[0m
[1m}[0m

In [6]:
from typing import Any


def preprocess_data(
    input_data: pd.DataFrame, parameters: dict[str, Any]
) -> pd.DataFrame:
    """Derive cabin, family and count features and encode the target as 0/1.

    Steps, in order:

    1. Split ``Cabin`` on ``"/"`` into ``CabinDeck``, ``CabinNumber`` and
       ``CabinSide``.
    2. Take the second space-separated token of ``Name`` as ``LastName``.
    3. Add the count features produced by
       ``create_num_features_from_cat_features``.
    4. Replace the target column with ``1`` where it is truthy and ``0``
       otherwise.

    Args:
        input_data: Raw frame containing ``Cabin``, ``Name`` and the target
            column; it is not modified.
        parameters: Project parameters; ``parameters["col_maps"]["target_col"]``
            names the target column.

    Returns:
        A new frame with the derived columns added and the target encoded.

    Raises:
        KeyError: If ``Cabin``, ``Name`` or the target column is missing from
            ``input_data``, or the expected keys are missing from
            ``parameters``.
    """
    processed_data = input_data.copy()

    # `expand=True` yields as many columns as the longest split in the data.
    # Capping at two splits and reindexing to exactly three columns keeps the
    # assignment valid when every Cabin is missing (a single all-NaN column)
    # or a value contains extra "/" separators (the remainder then stays in
    # the third part). Well-formed "deck/number/side" values are unaffected.
    cabin_parts = (
        processed_data["Cabin"]
        .str.split("/", n=2, expand=True)
        .reindex(columns=range(3))
    )
    processed_data[["CabinDeck", "CabinNumber", "CabinSide"]] = cabin_parts

    # NOTE(review): takes the *second* space-separated token, which assumes
    # "First Last" names; single-token names yield NaN — confirm this is intended.
    processed_data["LastName"] = processed_data["Name"].str.split(" ").str[1]

    processed_data = create_num_features_from_cat_features(processed_data)

    target_col = parameters["col_maps"]["target_col"]
    # NOTE(review): np.where tests truthiness, so a missing (NaN) target would
    # be encoded as 1 — verify the target has no missing values upstream.
    processed_data[target_col] = np.where(processed_data[target_col], 1, 0)
    return processed_data

### Pipeline

In [7]:
# Run the node on the training split; the result is rendered as the cell's
# last expression and is not stored in a variable.
preprocess_data(train, parameters)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,CabinDeck,CabinNumber,CabinSide,LastName,PeopleInCabinNumber,PeopleInCabinDeck,FamilySize
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0,B,0,P,Ofracculy,18.0,779.0,1.0
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1,F,0,S,Vines,18.0,2794.0,4.0
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0,A,0,S,Susent,18.0,256.0,6.0
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0,A,0,S,Susent,18.0,256.0,6.0
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1,F,1,S,Santantines,15.0,2794.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,0,A,98,P,Noxnuther,11.0,256.0,3.0
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,0,G,1499,S,Mondalley,2.0,2559.0,2.0
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,1,G,1500,S,Connon,3.0,2559.0,6.0
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,0,E,608,S,Hontichre,3.0,876.0,6.0
