In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import scipy

import matplotlib.pyplot as plt

In [2]:
train_data = pd.read_csv("./train.csv")
test_data = pd. read_csv("./test.csv")

sub_ids = test_data["PassengerId"].to_frame()


In [3]:
missing_in_train = sum([True for idx,row in train_data.iterrows() if any(row.isnull())])
missing_in_test = sum([True for idx,row in test_data.iterrows() if any(row.isnull())])

f"There are {missing_in_train} missing rows in train and {missing_in_test} missing rows in test"

'There are 2087 missing rows in train and 996 missing rows in test'

In [4]:
def make_frame_ready(source_frame: pd.DataFrame) -> pd.DataFrame:
    

    #make new column group from the passengerId
    source_frame["Group"] = source_frame["PassengerId"].apply(lambda x: x.split("_")[0])

    #make new column family from the name of the passenger
    source_frame["Family"] = source_frame["Name"].apply(lambda x: str(x).split(" ")[-1])


    #impute missing family from group
    #source_frame["Family"] = source_frame.groupby("Group")["Family"].ffill().bfill()
    source_frame["Family"] = source_frame["Family"].fillna(source_frame.groupby("Group")["Family"].agg(lambda x: pd.Series.mode(x, dropna=True)))

    #TODO impute missing cabins from families
    source_frame["Cabin"] = source_frame["Cabin"].fillna(source_frame.groupby("Group")["Cabin"].agg(lambda x: pd.Series.mode(x, dropna=True)))
    source_frame["Cabin"].ffill(inplace=True)

    #split cabin infor into three parts
    source_frame[["Deck", "Num", "shipSide"]] = source_frame["Cabin"].str.split("/", expand=True)
    source_frame["Num"] = source_frame["Num"].astype(np.float64)

    #Put cabin number into bins
    #source_frame["NumGroup"] = pd.cut(source_frame["Num"], bins=12)
    
    #create age bins
    source_frame["AgeGroup"] = np.where(source_frame["Age"] <= 12, 0, 1)

    #set spending for cryosleepers
    source_frame.loc[source_frame["CryoSleep"] == True ,["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]] = 0.0

    # set spending of all kids to zero
    source_frame.loc[source_frame["Age"] <= 12, ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]] = 0.0

    #create totalSpending column
    source_frame["totalSpent"] = source_frame[["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]].sum(axis=1)

    #set age of all people not spending to average age for people 12 and under
    source_frame["Age"] = np.where((source_frame.CryoSleep == False) & (
    source_frame.Age.isna()) & (source_frame.totalSpent == 0), 5, source_frame.Age)
    
    #impute VIP status by spending   
    source_frame.loc[(source_frame.VIP.isnull()) & (source_frame.totalSpent > 3500), "VIP"] = True
    source_frame["VIP"].fillna(False, inplace=True)
    


    return source_frame

In [5]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [6]:
train_data = make_frame_ready(train_data)

In [7]:
test_data = make_frame_ready(test_data)#[["HomePlanet", "CryoSleep", "Destination", "Age", "VIP", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "Deck", "shipSide", "NumBin", "AgeBin", "totalSpent"]]

In [8]:
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline

In [9]:
#train_data = train_data[["HomePlanet", "CryoSleep", "Destination", "Age", "VIP", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "Deck", "shipSide", "totalSpent", "Transported"]]

In [10]:
train_data.select_dtypes(object).columns.to_list()

['PassengerId',
 'HomePlanet',
 'CryoSleep',
 'Cabin',
 'Destination',
 'Name',
 'Group',
 'Family',
 'Deck',
 'shipSide']

In [11]:
cat_cols = ["HomePlanet", "CryoSleep",
            "Destination", "VIP", "Deck", "shipSide", "AgeGroup", "NumGroup"]
num_cols = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck","totalSpent", "Num"]

In [12]:
num_pipe = Pipeline(
    [
        ("num_impute", SimpleImputer(strategy="mean")),
        ("num_scale", RobustScaler())
    ]
)

In [13]:
cat_pipe = Pipeline(
    [
        ("cat_impute", SimpleImputer(strategy="most_frequent")),
        ("cat_encode", OneHotEncoder())
    ]
)

In [14]:
ct_X = ColumnTransformer(
    [
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols)
    ]
)


In [15]:
#train_data.dropna(inplace=True)

In [16]:
X = train_data.drop(columns=["Transported"])
y = train_data["Transported"]

In [17]:
X = ct_X.fit_transform(X)

ValueError: A given column is not a column of the dataframe

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [None]:
from tensorflow import keras
import keras_tuner as kt

In [None]:
es = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    mode="auto",
    patience=10,
    restore_best_weights=True
)

In [None]:
hidden_layer_size = 2048

model = keras.Sequential()

model.add(keras.layers.Dense(25, activation='relu'))
#model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dense(hidden_layer_size, activation='relu'))
model.add(keras.layers.Dropout(rate= 0.2))
#model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dense(hidden_layer_size, activation='relu'))
#model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dense(hidden_layer_size, activation='relu'))
model.add(keras.layers.Dropout(rate=0.2))
#model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dense(1, activation="sigmoid"))

model.compile(optimizer ="adam", loss = "binary_crossentropy", metrics = ["accuracy"])

In [None]:
def model_builder(hp):
    model = keras.Sequential()

    model.add(keras.layers.Dense(25, activation="relu"))

    for i in range(hp.Int("num_layers", 2, 9)):
        model.add(keras.layers.Dense(units=hp.Int(f"units_{i}", min_value=512, max_value=4196, step=512), 
                    activation="relu"))

        if i % 2 == 1 and hp.Boolean("dropout"):
            model.add(keras.layers.Dropout(0.15))

    model.add(keras.layers.Dense(1, activation="sigmoid"))


    learning_rate = hp.Float("lr", min_value=1e-4, max_value=1e-2, sampling="log")

    optim_choice = hp.Choice("optim", ["adam", "Nadam", "sgd"])

    if optim_choice == "adam":
        optim = keras.optimizers.Adam(learning_rate=learning_rate) 
    elif optim_choice == "Nadam":    
        optim = keras.optimizers.Nadam(learning_rate=learning_rate) 
    elif optim_choice == "sgd":    
        optim = keras.optimizers.SGD(learning_rate=learning_rate)
        

    model.compile(
        optimizer= optim,
        loss="binary_crossentropy",
        metrics=["accuracy"]

    )


    return model

In [None]:
tuner = kt.BayesianOptimization(
    hypermodel=model_builder,
    objective="val_loss",
    max_trials=100,
    overwrite=True
    
)

In [None]:
tuner.search(X, y, epochs=35, validation_split=0.2, batch_size=32, callbacks=[es])

In [None]:
#hist = model.fit(X_train, y_train, epochs=1000, batch_size=4, validation_split=0.2, callbacks=[es])

In [None]:
missing_in_train = sum([True for idx,row in train_data.iterrows() if any(row.isnull())])
missing_in_test = sum([True for idx,row in test_data.iterrows() if any(row.isnull())])

f"There are {missing_in_train} missing rows in train and {missing_in_test} missing rows in test"

In [None]:
best_hps = tuner.get_best_hyperparameters(5)

model = model_builder(best_hps[0])

In [None]:
es_prod = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=10,
    restore_best_weights=True
)

In [None]:
model.fit(X, y, epochs=400, batch_size=4, validation_split=0.2, callbacks=[es_prod])

In [None]:
test_data = ct_X.transform(test_data)

In [None]:
preds =  model.predict(test_data)

In [None]:
subs = pd.DataFrame(preds)

In [None]:
subs = sub_ids.join(subs)

In [None]:
subs = subs.rename({0: "Transported"}, axis=1)

In [None]:
subs["Transported"] = subs["Transported"].apply(lambda x: bool(round(x)))

In [None]:
subs.to_csv("./submission.csv", index=False)