# Data

In [5]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

from SupportFunctions import preprocess_impute_spaceship_titanic

In [11]:
# Load the raw train / test splits.
train_raw = pd.read_csv("./spaceship-titanic/train.csv")
test_raw = pd.read_csv("./spaceship-titanic/test.csv")

# Parameters for preprocessing / imputation (shared by both splits).
log_transform_exp = False
proba_imp = True
expense_strat = "group_median"
age_strat = "group_mean"
drop_outliers = False

# Run the same preprocessing call on train first, then on test.
train, test = (
    preprocess_impute_spaceship_titanic(
        raw_split,
        log_transform_exp=log_transform_exp,
        proba_imp=proba_imp,
        expense_strat=expense_strat,
        age_strat=age_strat,
        drop_outliers=drop_outliers,
    )
    for raw_split in (train_raw, test_raw)
)

In [12]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Feature groups used when preparing the model inputs.
cat_features = ['HomePlanet', 'Destination', 'Deck', 'Side']
bool_features = ['CryoSleep', 'VIP', 'NoExpenses', 'Alone']
drop_features = ['PassengerId', 'Name', 'GroupID', 'CabinNum']
num_features = ['Age', 'TotalExp', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'GroupPos']


def _one_hot_and_drop(frame, categorical, unused):
    """Return a copy of ``frame`` with one-hot dummies appended for
    ``categorical`` and the ``categorical`` + ``unused`` columns removed.
    """
    dummies = pd.get_dummies(frame[categorical])
    encoded = pd.concat([frame, dummies], axis=1)
    return encoded.drop(columns=list(categorical) + list(unused))


# Apply the identical encoding to both splits.
train = _one_hot_and_drop(train, cat_features, drop_features)
test = _one_hot_and_drop(test, cat_features, drop_features)

y = train.Transported.copy()
X = train.drop("Transported", axis=1).copy()

# get_dummies() only creates columns for categories present in the frame it is
# given, so train and test can end up with different dummy columns. Align test
# to the training feature set: dummy columns absent from test are added as
# all-zero, columns the model never saw are dropped, and the column order is
# made identical to X.
dummy_prefixes = tuple(f"{feature}_" for feature in cat_features)
missing_in_test = [col for col in X.columns if col not in test.columns]
missing_non_dummy = [col for col in missing_in_test if not col.startswith(dummy_prefixes)]
if missing_non_dummy:
    # Zero-filling a real feature would silently corrupt predictions; fail loudly.
    raise ValueError(f"test is missing non-dummy feature columns: {missing_non_dummy}")
test = test.reindex(columns=X.columns, fill_value=0)

# Decision Tree

In [13]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier


# Hyper-parameter grid for the decision tree:
# 3 * 4 * 5 * 2 * 4 * 4 = 1920 combinations, each evaluated with 5-fold CV.
param_grid = dict(
    max_features=[None, 'sqrt', 'log2'],
    ccp_alpha=[0, 0.1, .01, .001],
    max_depth=[5, 6, 7, 8, 9],
    criterion=['gini', 'entropy'],
    min_samples_split=[2, 10, 100, 500],
    min_samples_leaf=[1, 10, 100, 500],
)

# Fixed seed so the fitted trees are reproducible.
decTree = DecisionTreeClassifier(random_state=123)
grid_search_DT = GridSearchCV(
    estimator=decTree,
    param_grid=param_grid,
    cv=5,
    verbose=1,
    n_jobs=-1,  # use all available cores
)
grid_search_DT.fit(X, y)

Fitting 5 folds for each of 1920 candidates, totalling 9600 fits


In [14]:
# Mean cross-validated score of the best decision-tree parameter combination.
grid_search_DT.best_score_

0.7878772438427705

In [None]:
# Predict the test split with the best estimator found by the grid search.
decTreePred = grid_search_DT.best_estimator_.predict(test)

# Submission frame: PassengerId from the raw test file plus the predictions.
decTreeSub = test_raw[["PassengerId"]].assign(Transported=decTreePred)
decTreeSub.to_csv("./spaceship-titanic/DecTree.csv", index=False)

# Random Forest

In [None]:
# NOTE(review): stray fragment of a parameter grid. On its own it is not valid
# Python (the cell raised a SyntaxError), so it is kept only as a comment.
# 'criterion' :['gini', 'entropy'],
#     'min_samples_split': [2, 10, 100, 500],
#     'min_samples_leaf': [1, 10, 100, 500]

In [None]:
# Broad random-forest search:
# 2 criteria * 4 forest sizes * 2 max_features settings * 5 depths = 80 combinations.
param_grid = dict(
    criterion=['gini', 'entropy'],
    n_estimators=[200, 400, 500, 600],
    max_features=[None, 'sqrt'],
    max_depth=[5, 6, 7, 8, 9],
)

# Fixed seed so the fitted forests are reproducible.
RF = RandomForestClassifier(random_state=123)
grid_search_RF = GridSearchCV(
    estimator=RF,
    param_grid=param_grid,
    cv=5,
    verbose=1,
    n_jobs=-1,  # use all available cores
)
grid_search_RF.fit(X, y)

In [None]:
# Narrower random-forest search: entropy only, all features per split,
# larger forests and deeper trees (1 * 3 * 1 * 4 = 12 combinations).
# Rebinds param_grid, RF and grid_search_RF.
param_grid = dict(
    criterion=['entropy'],
    n_estimators=[600, 700, 800],
    max_features=[None],
    max_depth=[9, 10, 12, 16],
)

# Fixed seed so the fitted forests are reproducible.
RF = RandomForestClassifier(random_state=123)
grid_search_RF = GridSearchCV(
    estimator=RF,
    param_grid=param_grid,
    cv=5,
    verbose=1,
    n_jobs=-1,  # use all available cores
)
grid_search_RF.fit(X, y)

In [None]:
# Mean cross-validated score of the best random-forest parameter combination.
grid_search_RF.best_score_

In [None]:
# Predict the test split with the best random forest found by the grid search.
RF_pred = grid_search_RF.best_estimator_.predict(test)

# Submission frame: PassengerId from the raw test file plus the predictions.
RF_sub = test_raw[["PassengerId"]].assign(Transported=RF_pred)

# NOTE(review): the file name says "log" although log_transform_exp is set to
# False in the preprocessing cell; confirm the name is still intended.
RF_sub.to_csv("./spaceship-titanic/RF_log.csv", index=False)

In [None]:
# Horizontal bar chart of the tuned random forest's feature importances.
# Sorting ascending puts the most important feature at the top of the chart,
# because barh draws the first row at the bottom.
best_rf = grid_search_RF.best_estimator_
importances = best_rf.feature_importances_
names = best_rf.feature_names_in_
feat_importances = (
    pd.DataFrame({"Name": names, "Importance": importances})
    .sort_values(by="Importance")
)

# Explicit figure/axes interface; title and labels let the figure stand alone.
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(y=feat_importances.Name, width=feat_importances.Importance)
ax.set(
    title="Random forest feature importances",
    xlabel="Importance",
    ylabel="Feature",
);

# AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC

# AdaBoost search: 4 ensemble sizes * 8 learning rates = 32 combinations.
param_grid = dict(
    n_estimators=[400, 500, 600, 700],
    learning_rate=[0.2, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
)

# Fixed seed so the fitted ensembles are reproducible.
AdaBoost = AdaBoostClassifier(random_state=123)
grid_search_AB = GridSearchCV(
    estimator=AdaBoost,
    param_grid=param_grid,
    cv=5,
    verbose=1,
    n_jobs=-1,  # use all available cores
)
grid_search_AB.fit(X, y)

In [None]:
# Mean cross-validated score of the best AdaBoost parameter combination.
grid_search_AB.best_score_

In [None]:
# Predict the test split with the best AdaBoost model found by the grid search.
AB_pred = grid_search_AB.best_estimator_.predict(test)

# Submission frame: PassengerId from the raw test file plus the predictions.
AB_sub = test_raw[["PassengerId"]].assign(Transported=AB_pred)
AB_sub.to_csv("./spaceship-titanic/AB.csv", index=False)