In [None]:
import pandas as pd
import numpy as np
np.random.seed(42)
import seaborn as sns
import scipy

import matplotlib.pyplot as plt

In [None]:
# Read both competition splits from the working directory.
train_data, test_data = (pd.read_csv(csv_path) for csv_path in ("./train.csv", "./test.csv"))

# Keep the test passenger ids in their own one-column frame for the submission.
sub_ids = test_data["PassengerId"].to_frame()


In [None]:
# Count the rows that contain at least one null value.  A vectorised boolean
# mask replaces the Python-level iterrows() loop and gives the same counts.
missing_in_train = int(train_data.isnull().any(axis=1).sum())
missing_in_test = int(test_data.isnull().any(axis=1).sum())

f"There are {missing_in_train} missing rows in train and {missing_in_test} missing rows in test"

In [None]:
def make_frame_ready(source_frame: pd.DataFrame) -> pd.DataFrame:
    """Add engineered feature columns to a passenger frame and impute some gaps.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    source_frame : pd.DataFrame
        Must provide the columns ``PassengerId``, ``Name``, ``Cabin``, ``Age``,
        ``CryoSleep``, ``VIP``, ``RoomService``, ``FoodCourt``,
        ``ShoppingMall``, ``Spa`` and ``VRDeck``.

    Returns
    -------
    pd.DataFrame
        ``source_frame`` itself, with the extra columns ``Group``, ``Family``,
        ``Deck``, ``Num``, ``shipSide``, ``NumGroup``, ``totalSpent``,
        ``SpendingGroup`` and ``AgeGroup``.
    """
    spending_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]

    def _first_mode(values: pd.Series):
        """Return the most frequent non-null value, or NaN if every value is null."""
        modes = values.mode(dropna=True)
        return modes.iloc[0] if len(modes) else np.nan

    def _fill_from_group(column: str) -> pd.Series:
        """Fill nulls in ``column`` with the most common value of the same Group."""
        per_group = source_frame.groupby("Group")[column].agg(_first_mode)
        # Map the per-group value back onto the rows so fillna aligns on the
        # row index; a Series indexed by Group would never match any row.
        return source_frame[column].fillna(source_frame["Group"].map(per_group))

    # make new column group from the passengerId (the part before "_")
    source_frame["Group"] = source_frame["PassengerId"].str.split("_").str[0]

    # make new column family from the last word of the passenger name; the
    # .str accessor keeps missing names as NaN so they can be imputed below
    source_frame["Family"] = source_frame["Name"].str.split(" ").str[-1]

    # impute missing family from group
    source_frame["Family"] = _fill_from_group("Family")

    # TODO impute missing cabins from families
    source_frame["Cabin"] = _fill_from_group("Cabin")
    # Cabins still missing take the value of the preceding row.  Assign the
    # result instead of calling ffill(inplace=True) on a column selection.
    source_frame["Cabin"] = source_frame["Cabin"].ffill()

    # split cabin info into three parts
    source_frame[["Deck", "Num", "shipSide"]] = source_frame["Cabin"].str.split("/", expand=True)
    source_frame["Num"] = source_frame["Num"].astype(np.float64)

    # Put cabin number into bins
    # NOTE(review): pd.cut derives equal-width edges from this frame's own
    # min/max, so codes from two different frames need not share bin edges.
    source_frame["NumGroup"] = pd.cut(source_frame["Num"], bins=12).cat.codes

    # set spending for cryosleepers
    source_frame.loc[source_frame["CryoSleep"] == True, spending_cols] = 0.0

    # set spending of all kids to zero
    source_frame.loc[source_frame["Age"] <= 12, spending_cols] = 0.0

    # create totalSpending column (NaN spending counts as 0 in the sum)
    source_frame["totalSpent"] = source_frame[spending_cols].sum(axis=1)

    source_frame["SpendingGroup"] = pd.cut(source_frame["totalSpent"], bins=10).cat.codes

    # awake passengers with unknown age and no spending get age 5
    # (the original comment describes this as the average age of people 12 and under)
    unknown_young = (
        (source_frame["CryoSleep"] == False)
        & source_frame["Age"].isna()
        & (source_frame["totalSpent"] == 0)
    )
    source_frame.loc[unknown_young, "Age"] = 5

    # create age bins -- done after the age imputation above so the imputed
    # rows land in a real bin instead of the missing-value code -1
    source_frame["AgeGroup"] = pd.cut(source_frame["Age"], bins=8).cat.codes

    # impute VIP status by spending
    source_frame.loc[source_frame["VIP"].isnull() & (source_frame["totalSpent"] > 3500), "VIP"] = True
    # Assign the filled column back rather than filling a selection in place.
    source_frame["VIP"] = source_frame["VIP"].fillna(False)

    return source_frame

In [None]:
# Run the feature engineering on the training split and rebind the name to the result.
train_data = train_data.pipe(make_frame_ready)

In [None]:
# Run the same feature engineering on the test split.
test_data = test_data.pipe(make_frame_ready)

In [None]:
# Keep only the modelling columns plus the target (last).  ``.copy()`` makes the
# selection an independent frame, so the positional assignments done in later
# cells do not raise SettingWithCopyWarning on a derived frame.
train_columns = [
    "HomePlanet", "CryoSleep", "Destination", "Age", "VIP",
    "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck",
    "Deck", "shipSide", "totalSpent", "NumGroup", "AgeGroup", "SpendingGroup",
    "Transported",
]
train_data = train_data[train_columns].copy()

In [None]:
# Keep the same modelling columns, in the same order, as the training frame
# (without the target).  ``.copy()`` makes the selection an independent frame,
# so the positional assignments done later do not raise SettingWithCopyWarning.
test_columns = [
    "HomePlanet", "CryoSleep", "Destination", "Age", "VIP",
    "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck",
    "Deck", "shipSide", "totalSpent", "NumGroup", "AgeGroup", "SpendingGroup",
]
test_data = test_data[test_columns].copy()

In [None]:
# Positions of all non-float columns, minus the last such position
# (presumably the `Transported` target, which is selected last -- verify dtype).
non_float_positions = np.flatnonzero(train_data.dtypes != float)
cat_cols = non_float_positions[:-1]
cat_cols


In [None]:
# Show the distinct HomePlanet values (including NaN, if present).
train_data["HomePlanet"].unique()

In [None]:
# Missing values in the categorical (non-float) columns become the placeholder 0.
train_data.iloc[:, cat_cols] = train_data.iloc[:, cat_cols].fillna(0)

# Remaining gaps are in numeric columns and are filled with the column mean.
# numeric_only=True is required: on pandas >= 2.0 a plain .mean() raises
# TypeError as soon as the frame contains string columns such as HomePlanet.
train_data = train_data.fillna(train_data.mean(numeric_only=True))

In [None]:
# cat_cols holds column positions computed on train_data; test_data was selected
# with the same column order (without the target), so the positions line up.
test_data.iloc[:, cat_cols] = test_data.iloc[:, cat_cols].fillna(0)

# Fill the remaining numeric gaps with the column mean.  numeric_only=True is
# required: on pandas >= 2.0 a plain .mean() raises TypeError on string columns.
test_data = test_data.fillna(test_data.mean(numeric_only=True))

In [None]:
# Separate the target column from the feature matrix.
target_name = "Transported"
y = train_data[target_name]
X = train_data.drop(columns=[target_name])

In [None]:
# Recompute the categorical positions on the feature matrix: every non-float column.
cat_cols = np.flatnonzero(X.dtypes != float)
cat_cols

In [None]:
from sklearn.model_selection import train_test_split

# Pin the split seed so the hold-out set is the same on every run of this cell,
# instead of depending on how much of NumPy's global random state has already
# been consumed by earlier cells.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

In [None]:
from catboost import CatBoostClassifier, Pool, cv, metrics
from sklearn.metrics import accuracy_score

In [None]:
# Wrap the train and validation splits in CatBoost Pools that share the same
# categorical column positions.
train_pool, val_pool = (
    Pool(features, labels, cat_features=cat_cols)
    for features, labels in ((X_train, y_train), (X_val, y_val))
)

In [None]:
# Baseline classifier: default hyper-parameters, accuracy tracked as an extra
# metric, fixed seed, no training log output.
baseline_params = {
    "custom_loss": [metrics.Accuracy()],
    "random_seed": 42,
    "logging_level": "Silent",
}
model = CatBoostClassifier(**baseline_params)

In [None]:
# Train the baseline on the training split and monitor it on the validation split.
model.fit(
    X=X_train,
    y=y_train,
    cat_features=cat_cols,
    eval_set=(X_val, y_val),
    plot=True,
)

In [None]:
# Reuse the baseline's constructor parameters for cross-validation, with the
# loss function set explicitly.
cv_params = {**model.get_params(), "loss_function": metrics.Logloss()}

# Cross-validate on the full training set.
cv_data = cv(Pool(X, y, cat_features=cat_cols), cv_params, plot=True)

In [None]:
# Find the boosting step with the highest mean test accuracy and report it
# together with the standard deviation at that step.
accuracy_mean = cv_data['test-Accuracy-mean']
best_step = np.argmax(accuracy_mean)
print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.format(
    np.max(accuracy_mean),
    cv_data['test-Accuracy-std'][best_step],
    best_step,
))

In [None]:
# Same parameters as the cross-validation run, plus the overfitting detector:
# type "Iter" with a wait of 40 iterations.
earlystop_params = {**cv_params, "od_type": "Iter", "od_wait": 40}

In [None]:
# Build a second classifier from the early-stopping parameter set.
early_model = CatBoostClassifier(**earlystop_params)

In [None]:
# Train on the training pool; the validation pool is the evaluation set.
early_model.fit(X=train_pool, eval_set=val_pool, plot=True)

In [None]:
import optuna

def objective(trial):
    """Optuna objective: cross-validated accuracy of one CatBoost configuration.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``l2_leaf_reg``, ``max_depth``, ``learning_rate``,
        ``random_strength`` and ``iterations``.

    Returns
    -------
    float
        The highest mean test accuracy reached at any boosting step of the
        cross-validation run on the full training set (``X``, ``y``).
    """
    param = {
        "l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 1, 10),
        "max_depth": trial.suggest_int("max_depth", 6, 12),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.2),
        "random_strength": trial.suggest_int("random_strength", 1, 6),
        # `step` is keyword-only in current Optuna; passing it positionally is deprecated.
        "iterations": trial.suggest_int("iterations", 500, 1000, step=25),
        "eval_metric": metrics.Accuracy(),
        "loss_function": metrics.Logloss(),
        "random_seed": 42,
        "verbose": False
    }

    cv_data = cv(Pool(X, y, cat_features=cat_cols), param, logging_level="Silent")

    # Score the trial by its best step, as the CV report cell does.  Averaging
    # the accuracy curve over every iteration would mostly reward how quickly
    # a configuration converges rather than how accurate it becomes.
    return float(np.max(cv_data['test-Accuracy-mean']))

In [None]:
# Search budget: at most 300 trials or 1800 seconds, with n_jobs=-1.
search_budget = {"n_trials": 300, "timeout": 1800, "n_jobs": -1}

study = optuna.create_study(direction="maximize")
study.optimize(objective, **search_budget)

In [None]:
# Take the best hyper-parameters found by the study and track accuracy on the
# evaluation set.
best_par = {**study.best_params, "eval_metric": metrics.Accuracy()}

# Refit with those parameters, stopping early after 40 rounds without improvement.
best_mod = CatBoostClassifier(cat_features=cat_cols, **best_par)
best_mod.fit(train_pool, eval_set=val_pool, early_stopping_rounds=40, plot=True)

In [None]:
# Pair every test passenger id with the tuned model's prediction.
submission = pd.DataFrame(
    {
        "PassengerId": sub_ids["PassengerId"],
        "Transported": best_mod.predict(test_data),
    }
)

In [None]:
# Write the submission file without the index column.
submission.to_csv(path_or_buf="./submission.csv", index=False)