In [2]:
import pandas as pd
import numpy as np
np.random.seed(42)
import seaborn as sns
import scipy

import matplotlib.pyplot as plt

In [3]:
train_data = pd.read_csv("./train.csv", encoding="utf-8")
test_data = pd. read_csv("./test.csv", encoding="utf-8")

sub_ids = test_data["PassengerId"].to_frame()


In [4]:
missing_in_train = sum([True for idx,row in train_data.iterrows() if any(row.isnull())])
missing_in_test = sum([True for idx,row in test_data.iterrows() if any(row.isnull())])

f"There are {missing_in_train} missing rows in train and {missing_in_test} missing rows in test"

'There are 2087 missing rows in train and 996 missing rows in test'

In [5]:
def make_frame_ready(source_frame: pd.DataFrame) -> pd.DataFrame:
    

    #make new column group from the passengerId
    source_frame["Group"] = source_frame["PassengerId"].apply(lambda x: x.split("_")[0])

    #make new column family from the name of the passenger
    source_frame["Family"] = source_frame["Name"].apply(lambda x: str(x).split(" ")[-1])


    #impute missing family from group
    #source_frame["Family"] = source_frame.groupby("Group")["Family"].ffill().bfill()
    source_frame["Family"] = source_frame["Family"].fillna(source_frame.groupby("Group")["Family"].agg(lambda x: pd.Series.mode(x, dropna=True)))

    #TODO impute missing cabins from families
    source_frame["Cabin"] = source_frame["Cabin"].fillna(source_frame.groupby("Group")["Cabin"].agg(lambda x: pd.Series.mode(x, dropna=True)))
    source_frame["Cabin"].ffill(inplace=True)

    #split cabin infor into three parts
    source_frame[["Deck", "Num", "shipSide"]] = source_frame["Cabin"].str.split("/", expand=True)
    source_frame["Num"] = source_frame["Num"].astype(np.float64)

    #Put cabin number into bins
    source_frame["NumGroup"] = pd.cut(source_frame["Num"], bins=12).cat.codes

    
    #create age bins
    source_frame["AgeGroup"] = pd.cut(source_frame["Age"], bins=8).cat.codes

    #set spending for cryosleepers
    source_frame.loc[source_frame["CryoSleep"] == True ,["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]] = 0.0

    # set spending of all kids to zero
    source_frame.loc[source_frame["Age"] <= 12, ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]] = 0.0

    #create totalSpending column
    source_frame["totalSpent"] = source_frame[["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]].sum(axis=1)

    source_frame["SpendingGroup"] = pd.cut(source_frame["totalSpent"], bins = 10).cat.codes


    #set age of all people not spending to average age for people 12 and under
    source_frame["Age"] = np.where((source_frame.CryoSleep == False) & (
    source_frame.Age.isna()) & (source_frame.totalSpent == 0), 5, source_frame.Age)
    
    #impute VIP status by spending   
    source_frame.loc[(source_frame.VIP.isnull()) & (source_frame.totalSpent > 3500), "VIP"] = True
    source_frame["VIP"].fillna(False, inplace=True)
    


    return source_frame

In [6]:
train_data = make_frame_ready(train_data)

In [7]:
test_data = make_frame_ready(test_data)

In [8]:
train_data = train_data[["HomePlanet", "CryoSleep", "Destination", "Age", "VIP", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "Deck", "shipSide", "totalSpent", "NumGroup", "AgeGroup", "SpendingGroup", "Transported"]]
test_data = test_data[["HomePlanet", "CryoSleep", "Destination", "Age", "VIP", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "Deck", "shipSide", "totalSpent"]]

In [9]:
train_data.select_dtypes(object).columns.to_list()

['HomePlanet', 'CryoSleep', 'Destination', 'Deck', 'shipSide']

In [10]:
cat_cols = ["HomePlanet", "CryoSleep",
            "Destination", "VIP", "Deck", "shipSide", "AgeGroup", "NumGroup", "SpendingGroup"]
num_cols = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "totalSpent"]

In [11]:
train_data.dropna(inplace=True)

In [12]:
X = train_data.drop(columns=["Transported"])
y = train_data["Transported"]

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [14]:
from catboost import CatBoostClassifier, Pool, cv

In [15]:
cv_dataset = Pool(data=X, label=y, cat_features=cat_cols)

In [29]:
from sklearn.model_selection import GridSearchCV

grid = dict()

#grid["loss_function"] = ["Logloss"]
grid["n_estimators"] = [900, 1000, 1100]
grid["max_depth"] = [6, 7, 8, 9]
grid["learning_rate"] = [1e-3, 3e-2, 1e-2, 1e-1]
grid["l2_leaf_reg"] = [1, 3, 5, 7, 9]
#grid["verbose"] = [True]
grid

{'n_estimators': [900, 1000, 1100],
 'max_depth': [6, 7, 8, 9],
 'learning_rate': [0.001, 0.03, 0.01, 0.1],
 'l2_leaf_reg': [1, 3, 5, 7, 9]}

In [16]:
y = [str(x) for x in y]

In [24]:
model = CatBoostClassifier(verbose=False, early_stopping_rounds=25)

In [30]:
res = model.grid_search(grid, cv_dataset, plot=True, cv=3, verbose=False)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


bestTest = 0.492285052
bestIteration = 899

0:	loss: 0.4922851	best: 0.4922851 (0)	total: 8.29s	remaining: 33m 1s
Stopped by overfitting detector  (25 iterations wait)

bestTest = 0.3675930645
bestIteration = 693

1:	loss: 0.3675931	best: 0.3675931 (1)	total: 14.7s	remaining: 29m 12s

bestTest = 0.3808566406
bestIteration = 899

2:	loss: 0.3808566	best: 0.3675931 (1)	total: 22.4s	remaining: 29m 33s
Stopped by overfitting detector  (25 iterations wait)

bestTest = 0.3681034666
bestIteration = 187

3:	loss: 0.3681035	best: 0.3675931 (1)	total: 24.3s	remaining: 23m 52s

bestTest = 0.494414033
bestIteration = 899

4:	loss: 0.4944140	best: 0.3675931 (1)	total: 32.6s	remaining: 25m 33s
Stopped by overfitting detector  (25 iterations wait)

bestTest = 0.3718871897
bestIteration = 505

5:	loss: 0.3718872	best: 0.3675931 (1)	total: 37.2s	remaining: 24m 10s

bestTest = 0.3815609757
bestIteration = 899

6:	loss: 0.3815610	best: 0.3675931 (1)	total: 45s	remaining: 24m 56s
Stopped by overfitting d

In [32]:
pdres = pd.DataFrame.from_dict(res["cv_results"])

In [31]:
res["params"]

{'depth': 8, 'l2_leaf_reg': 1, 'iterations': 900, 'learning_rate': 0.03}

In [33]:
best_model = CatBoostClassifier(verbose=True, early_stopping_rounds=25)

In [34]:
best_model.fit(cv_dataset, res["params"])

CatBoostError: Incorrect value of y: X is catboost.Pool object, y must be initialized inside catboost.Pool.