In [1]:
from load_dataset import load_dataset
import missingno as msno
import pandas as pd
import numpy as np
import random as rnd

from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer, KBinsDiscretizer

In [2]:
# Seed both random sources (stdlib `random` and NumPy's legacy global generator) with the
# same value so that the random imputations performed further down are reproducible.
SEED = 11
rnd.seed(SEED)
np.random.seed(SEED)

In [3]:
def extract_data(df:pd.DataFrame):
    """
    Derive group, cabin and name features from the raw passenger table.

    The input frame is NOT modified; a new frame with a fresh 0..n-1 index is returned.

    Columns read: 'PassengerId' ("<group>_<number>"), 'Cabin' ("<deck>/<num>/<side>"),
    'HomePlanet', 'Destination', 'Name' ("<first> <last>").

    Columns produced / changed:
      * 'Full_Id'               - the original 'PassengerId' string
      * 'GroupId'               - group part of the id (int64)
      * 'PassengerId'           - number inside the group (int64)
      * 'Group_size'            - number of rows sharing the GroupId (int64)
      * 'Number_of_cabinmates'  - number of rows sharing the Cabin (NaN when Cabin is missing)
      * 'Deck', 'Side'          - categorical parts of Cabin
      * 'Num'                   - numeric part of Cabin (float64, NaN when Cabin is missing)
      * 'HomePlanet', 'Destination' - cast to category
      * 'First_name', 'Last_name'   - parts of Name ('Last_name' is categorical)
      * 'Number_of_namesakes'   - number of rows sharing the Last_name (float64, NaN when missing)
    """
    # reset_index returns a new object, so the caller's frame is left untouched, and it
    # gives unique row labels even when the input was built by concatenating frames.
    df = df.reset_index(drop=True).rename(columns={'PassengerId': 'Full_Id'})

    # From the full id get GroupId, the passenger number inside the group and the group size
    id_parts = df['Full_Id'].str.split('_', expand=True)
    df['GroupId'] = id_parts[0].astype('int64')
    df['PassengerId'] = id_parts[1].astype('int64')
    df['Group_size'] = df['GroupId'].map(df['GroupId'].value_counts()).astype('int64')

    # Number of passengers registered in the same cabin (stays NaN for unknown cabins)
    df['Number_of_cabinmates'] = df['Cabin'].map(df['Cabin'].value_counts())

    # From cabin get Deck, cabin number on deck and Side.
    # reindex guarantees the three parts exist even if no cabin string could be split.
    cabin_parts = df['Cabin'].str.split('/', expand=True).reindex(columns=[0, 1, 2])
    df['Deck'] = cabin_parts[0].astype('category')
    df['Num'] = cabin_parts[1].astype('float64')   # Should be int, but NaNs are still present at this stage
    df['Side'] = cabin_parts[2].astype('category')

    df['HomePlanet'] = df['HomePlanet'].astype('category')
    df['Destination'] = df['Destination'].astype('category')

    # Split Name at the FIRST space only, so a name with additional spaces cannot
    # produce more than two parts (everything after the first space is the last name).
    name_parts = df['Name'].str.split(' ', n=1, expand=True).reindex(columns=[0, 1])
    df['First_name'] = name_parts[0]
    df['Last_name'] = name_parts[1].astype('category')

    # Number of passengers with the same last name; float because NaNs are present
    last_name_counts = name_parts[1].value_counts()
    df['Number_of_namesakes'] = name_parts[1].map(last_name_counts).astype('float64')

    return df

In [4]:
def add_spending_features(df:pd.DataFrame):
    """
    Fill missing bills with 0 and add aggregated spending columns.

    The frame is modified in place and also returned. Added columns:
      * 'TotalSpend' - sum of all five service bills
      * 'EntSpend'   - ShoppingMall + Spa + VRDeck
      * 'LivSpend'   - RoomService + FoodCourt
    """
    service_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

    # A missing bill is treated as "nothing spent" (0 is the most common value)
    df[service_columns] = df[service_columns].fillna(0.0)

    room, food, mall, spa, vr_deck = (df[name] for name in service_columns)

    df['TotalSpend'] = room + food + mall + spa + vr_deck
    df['EntSpend'] = mall + spa + vr_deck
    df['LivSpend'] = room + food

    return df

In [5]:
# This function was adapted, with my modifications, from https://www.kaggle.com/code/ravi20076/sptitanic-bootstrapensemble-pipeline

def TrtNullAgeCrSlp(df:pd.DataFrame):
    """
    Fill nulls in 'CryoSleep' and 'Age' and add the 'Is_Child' flag.

    The input frame is not modified; a new frame with a fresh 0..n-1 index is returned.
    Requires the columns 'CryoSleep', 'Age', 'TotalSpend' and 'GroupId'.

    CryoSleep (rules are applied in this order, each only to values that are still null):
    1. TotalSpend == 0            -> 1 (passengers in cryosleep don't spend)
    2. Age <= 12                  -> 1
    3. TotalSpend > 0             -> 0
    The column is then cast to int8 (this raises if a null survives all three rules).

    Age:
    1. Null ages of spenders / non-cryosleep passengers get the median age of the
       members of the same group who are older than 12 (a child cannot spend).
    2. All remaining nulls get the overall median age.
    The column is then cast to int8.

    Is_Child (int8) is 1 where the filled Age is <= 12, else 0.
    """
    # Fresh copy with unique row labels, so that every mask below aligns row by row
    df = df.reset_index(drop=True)

    # 1. Filling nulls in cryosleep based on spending and age details
    cryo = df['CryoSleep'].astype('float64')   # True/False/NaN -> 1.0/0.0/NaN
    cryo = cryo.mask(cryo.isna() & (df['TotalSpend'] == 0.0), 1.0)
    # A child (age <= 12) with unknown cryosleep is assumed to be asleep
    cryo = cryo.mask(cryo.isna() & (df['Age'] <= 12), 1.0)
    # Spenders are assumed to be awake
    cryo = cryo.mask(cryo.isna() & (df['TotalSpend'] > 0.0), 0.0)
    df['CryoSleep'] = cryo.astype(np.int8)

    # 2. Median age of the group's members older than 12, looked up for every row
    #    (NaN when the group has no such member)
    older_than_12 = df.loc[df['Age'] > 12, ['GroupId', 'Age']]
    group_median_age = df['GroupId'].map(older_than_12.groupby('GroupId')['Age'].median())

    needs_group_age = df['Age'].isna() & ((df['TotalSpend'] > 0.0) | (df['CryoSleep'] == 0))
    df.loc[needs_group_age, 'Age'] = group_median_age[needs_group_age]
    # Overall median for whatever is still missing
    df['Age'] = df['Age'].fillna(df['Age'].median())

    # 3. Child flag is derived from the filled (still fractional) age, before the int cast
    df['Is_Child'] = (df['Age'] <= 12).astype(np.int8)

    df['Age'] = df['Age'].astype(np.int8)
    return df

In [6]:
# This function was adapted, with my modifications, from https://www.kaggle.com/code/ravi20076/sptitanic-bootstrapensemble-pipeline
def TrtNullVIPCabin(df: pd.DataFrame):
    """
    Fill nulls in 'VIP', 'Num', 'Deck' and 'Side' using the GroupId.

    We assume that members of the same group share VIP status and cabin details.
    The input frame is not modified; a new frame with a fresh 0..n-1 index is returned.

    * 'VIP'  - filled with the maximum VIP value of the group, then with 0; cast to int8.
    * 'Num'  - filled with the maximum cabin number of the group, then with 0; cast to int16.
    * 'Deck', 'Side' - filled with the most frequent known value among the members of
      groups with more than one passenger (ties resolve to the first value in sorted
      order); what is still missing is drawn at random following the observed
      distribution of the column.
    * The spending columns are downcast to float32 to conserve memory.

    The global `rnd` generator is used. For 'Deck' and then for 'Side', exactly one random
    value per row of the frame is drawn (whether or not the row needs it), so the state of
    the generator after the call depends only on the number of rows.
    """
    df = df.reset_index(drop=True)

    def group_mode(column):
        """Most frequent known `column` value of each multi-passenger group, per row of df."""
        usable = (df['Group_size'] > 1) & df[column].notna()
        if not usable.any():
            return pd.Series(np.nan, index=df.index, dtype=object)
        # Plain objects instead of categoricals keep the fill values independent of the
        # category set; nulls are excluded, so every group has at least one value.
        known = df.loc[usable, column].astype(object)
        per_group = known.groupby(df.loc[usable, 'GroupId']).agg(lambda values: values.mode().iloc[0])
        return df['GroupId'].map(per_group)

    def random_by_distribution(column):
        """One random `column` value per row, weighted by the observed value frequencies."""
        share = df[column].value_counts() / len(df)
        draws = rnd.choices(share.index.to_list(), weights=share.to_list(), k=len(df))
        return pd.Series(draws, index=df.index)

    # Assuming that members of the same group have the same VIP status
    vip = df['VIP'].astype('float64')   # True/False/NaN -> 1.0/0.0/NaN
    vip = vip.fillna(vip.groupby(df['GroupId']).transform('max'))
    df['VIP'] = vip.fillna(0.0).astype(np.int8)

    # Assuming that members of the same group have the same cabin number
    cabin_num = df['Num'].fillna(df.groupby('GroupId')['Num'].transform('max'))
    df['Num'] = cabin_num.fillna(0.0).astype(np.int16)

    # Deck first, then Side: the order fixes which random draws each column receives
    for column in ('Deck', 'Side'):
        df[column] = df[column].fillna(group_mode(column))
        # Single passengers (and groups without any known value) are filled at random
        df[column] = df[column].fillna(random_by_distribution(column))

    # Downcasting columns to conserve memory
    billing_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'TotalSpend', 'EntSpend', 'LivSpend']
    df[billing_columns] = df[billing_columns].astype(np.float32)

    return df

In [7]:
def TrtNullHomeDest(df: pd.DataFrame):
    """
    Fill nulls in 'HomePlanet' and 'Destination'.

    The input frame is not modified; a new frame with a fresh 0..n-1 index is returned.

    * 'HomePlanet' - filled with the most frequent known value among the members of
      groups with more than one passenger (ties resolve to the first value in sorted
      order); what is still missing is drawn at random following the observed
      distribution of the column.
    * 'Destination' - looks independent from the group, so it is only filled at random
      following its observed distribution.

    The global `rnd` generator is used. For 'HomePlanet' and then for 'Destination',
    exactly one random value per row of the frame is drawn (whether or not the row needs
    it), so the state of the generator after the call depends only on the number of rows.
    """
    df = df.reset_index(drop=True)

    def random_by_distribution(column):
        """One random `column` value per row, weighted by the observed value frequencies."""
        share = df[column].value_counts() / len(df)
        draws = rnd.choices(share.index.to_list(), weights=share.to_list(), k=len(df))
        return pd.Series(draws, index=df.index)

    # Most frequent known home planet of each multi-passenger group
    usable = (df['Group_size'] > 1) & df['HomePlanet'].notna()
    if usable.any():
        # Plain objects instead of categoricals; nulls are excluded, so every group that
        # appears here has at least one value and the mode is never empty.
        known = df.loc[usable, 'HomePlanet'].astype(object)
        per_group = known.groupby(df.loc[usable, 'GroupId']).agg(lambda values: values.mode().iloc[0])
        df['HomePlanet'] = df['HomePlanet'].fillna(df['GroupId'].map(per_group))

    # Single passengers (and groups without any known value) are filled at random
    df['HomePlanet'] = df['HomePlanet'].fillna(random_by_distribution('HomePlanet'))

    # Destination is filled at random by the same kind of distribution
    df['Destination'] = df['Destination'].fillna(random_by_distribution('Destination'))

    return df

In [8]:
def TrtNullNumbers(df: pd.DataFrame):
    """
    Fill the remaining count features, drop the raw text columns and downcast integers.

    The input frame is not modified; a new frame is returned.

    * Nulls in 'Number_of_cabinmates' and 'Number_of_namesakes' become 1.
    * 'Cabin', 'Name', 'First_name' and 'Last_name' are dropped.
    * 'Age', 'Group_size', 'PassengerId', 'Number_of_cabinmates' and
      'Number_of_namesakes' are cast to int8, 'GroupId' to int16.
    * If present, 'Transported' is moved to the last position.
    """
    count_columns = ['Number_of_cabinmates', 'Number_of_namesakes']

    result = (
        df.assign(**{name: df[name].fillna(1) for name in count_columns})
        # Drop features which are not needed any more
        .drop(columns=['Cabin', 'Name', 'First_name', 'Last_name'])
        .astype({
            'Age': 'int8',
            'GroupId': 'int16',
            'Group_size': 'int8',
            'PassengerId': 'int8',
            'Number_of_cabinmates': 'int8',
            'Number_of_namesakes': 'int8',
        })
    )

    if 'Transported' in result.columns:
        # pop removes the column, the assignment appends it again as the last one
        result['Transported'] = result.pop('Transported')

    return result

In [9]:
def TransformCategories(df: pd.DataFrame):
    """
    One-hot encode 'HomePlanet', 'Destination', 'Deck' and 'Side'.

    A new frame is returned in which the four categorical columns are replaced by int8
    indicator columns appended after the remaining original columns. Indicator names are
    built from a per-feature prefix ('HP_', 'Dest_', 'Deck_', 'Side_') plus the category,
    e.g. 'HP_Earth' or 'Side_S'. For a feature with exactly two categories only one
    indicator is kept (OneHotEncoder(drop='if_binary') drops the first category).

    The names are derived from the categories the encoder actually fitted, so every label
    is guaranteed to describe the column it is attached to, and the encoded block carries
    the index of `df`, so rows stay aligned for any index.
    """
    prefixes = {'HomePlanet': 'HP_', 'Destination': 'Dest_', 'Deck': 'Deck_', 'Side': 'Side_'}
    cat_columns = list(prefixes)

    enc = OneHotEncoder(drop='if_binary')
    encoded = enc.fit_transform(df[cat_columns]).toarray()

    # drop_idx_ is None when nothing is dropped at all, otherwise it holds, per feature,
    # the position of the dropped category (or None for features that keep everything)
    dropped = enc.drop_idx_ if enc.drop_idx_ is not None else [None] * len(cat_columns)
    new_labels = [
        prefixes[column] + str(category)
        for column, categories, skipped in zip(cat_columns, enc.categories_, dropped)
        for position, category in enumerate(categories)
        if skipped is None or position != skipped
    ]

    # int8 is enough for 0/1 indicators and reduces the frame size
    enc_df = pd.DataFrame(encoded, columns=new_labels, index=df.index).astype(np.int8)

    return pd.concat([df.drop(columns=cat_columns), enc_df], axis=1)

In [10]:
def TransformBillingLog(df: pd.DataFrame):
    """
    Replace the spending columns with their power-transformed values.

    A PowerTransformer with default settings is fitted on the eight spending columns of
    `df` and its output is written back into those columns. The frame is modified in
    place and also returned.
    """
    billing_columns = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','TotalSpend', 'EntSpend', 'LivSpend']

    power_transformer = PowerTransformer()
    transformed = power_transformer.fit_transform(df[billing_columns])

    # The transformer returns a bare array: re-attach the index and column names of `df`
    # so that the assignment lines up row by row whatever index the frame carries.
    df[billing_columns] = pd.DataFrame(transformed, index=df.index, columns=billing_columns)

    return df

In [11]:
def TransformBillingBins(df: pd.DataFrame):
    """
    Replace every spending column by a three-level code (int8).

    Per column, with q90 being the 0.9 quantile of that column:
      * 0 - nothing spent (value == 0)
      * 1 - 0 < value <= q90
      * 2 - value > q90
    Values matching none of the rules keep their original value before the int8 cast.
    The frame is modified in place and also returned.
    """
    billing_columns = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','TotalSpend', 'EntSpend', 'LivSpend']

    # Due to the unusual distribution a custom binning is applied.
    # All codes are computed from the untouched columns first, then written back.
    binned = {}
    for name in billing_columns:
        spend = df[name]
        cutoff = spend.quantile(q=0.9)
        codes = np.select(
            [
                (spend == 0).to_numpy(),
                ((spend > 0) & (spend <= cutoff)).to_numpy(),
                (spend > cutoff).to_numpy(),
            ],
            [0, 1, 2],
            default=spend.to_numpy(),
        )
        binned[name] = pd.Series(codes, index=df.index)

    for name in billing_columns:
        df[name] = binned[name].astype(np.int8)

    return df


In [12]:
def NumericScale(df: pd.DataFrame):
    """
    Standardise the numeric feature columns.

    A StandardScaler is fitted on the listed numeric columns of `df` and its output is
    written back into those columns. If present, 'Transported' is moved to the last
    position, as this is the last transformation. The frame is modified in place and
    also returned.
    """
    num_columns = ['Age', 'RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','TotalSpend', 'EntSpend', 'LivSpend', 'GroupId', 'PassengerId', 'Group_size', 'Number_of_cabinmates', 'Num', 'Number_of_namesakes']

    scaled = StandardScaler().fit_transform(df[num_columns])

    # The scaler returns a bare array: re-attach the index and column names of `df`
    # so that the assignment lines up row by row whatever index the frame carries.
    df[num_columns] = pd.DataFrame(scaled, index=df.index, columns=num_columns)

    if "Transported" in df.columns:
        # pop removes the column, the assignment appends it again as the last one
        df['Transported'] = df.pop('Transported')

    return df


In [13]:
def CompleteTransform(df: pd.DataFrame, binning=False):
    """
    Run the whole preparation pipeline on a raw passenger frame.

    The stages are applied one after another, each receiving the result of the previous
    one. With `binning=True` the spending columns go through TransformBillingBins,
    otherwise through TransformBillingLog; every other stage is shared by both variants.
    """
    billing_stage = TransformBillingBins if binning else TransformBillingLog

    stages = (
        extract_data,
        add_spending_features,
        TrtNullAgeCrSlp,
        TrtNullVIPCabin,
        TrtNullHomeDest,
        TrtNullNumbers,
        TransformCategories,
        billing_stage,
        NumericScale,
    )

    result = df
    for stage in stages:
        result = stage(result)
    return result

In [14]:
# Load the data through the project-local helper imported at the top of the notebook.
# NOTE(review): presumably returns (train frame, test frame, sample submission) — confirm against load_dataset.
df, df_test, subm_example = load_dataset()

spaceship-titanic.zip: Skipping, found more recently modified local copy (use --force to force download)


In real life we do not have the data we will predict on before it arrives, and we do not really know the distribution of real-world data. Therefore, all transformations should be fitted on the train set only and then applied to the test set as if its distribution were unknown.

In Kaggle competitions we sometimes have access to the test data. In this case we have the complete list of passengers on board in advance and do not expect new passengers to appear. So we can combine the train and test sets and transform them together, then split them again, train the model on the train set and make predictions on the test set.

In [15]:
# Attach the columns of the submission example to the test rows; pd.merge joins on the
# columns the two frames have in common.
# NOTE(review): presumably this adds a placeholder 'Transported' so that test rows share
# the train schema — confirm against the contents of subm_example.
df_test_fake_outcome = pd.merge(df_test, subm_example)
# Remember which ids belong to the test set, to split the combined frame again later
tst_ids = df_test_fake_outcome['PassengerId']

# ignore_index=True gives the stacked frame unique 0..n-1 row labels; without it the
# labels of the train and test frames would repeat, which makes any label-aligned
# operation on the combined frame ambiguous.
complete_df = pd.concat([df, df_test_fake_outcome], ignore_index=True)

In [16]:
# transform completely
complete_df_log = CompleteTransform(complete_df)
complete_df_bin = CompleteTransform(complete_df, binning=True)

In [17]:
# Split both transformed frames back into train and test parts using the test ids.
# 'Full_Id' was only needed for this split; the test parts additionally lose 'Transported'.
test_id_list = list(tst_ids)

is_test_log = complete_df_log['Full_Id'].isin(test_id_list)
df_train_log = complete_df_log[~is_test_log].drop('Full_Id', axis=1)
df_test_log = complete_df_log[is_test_log].drop(['Transported', 'Full_Id'], axis=1)

is_test_bin = complete_df_bin['Full_Id'].isin(test_id_list)
df_train_bin = complete_df_bin[~is_test_bin].drop('Full_Id', axis=1)
df_test_bin = complete_df_bin[is_test_bin].drop(['Transported', 'Full_Id'], axis=1)

In [18]:
# Write the four prepared frames to the data folder, without the row index
prepared_outputs = (
    (df_train_log, './/data//prepared_train_log.csv'),
    (df_test_log, './/data//prepared_test_log.csv'),
    (df_train_bin, './/data//prepared_train_bin.csv'),
    (df_test_bin, './/data//prepared_test_bin.csv'),
)
for prepared_frame, csv_path in prepared_outputs:
    prepared_frame.to_csv(csv_path, index=False)
