In [None]:
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt

from sklearn import tree
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# Process data
### 1/ Load datasets

In [None]:
# Read the competition train / test files.
df_train = pd.read_csv('../input/spaceship-titanic/train.csv')
df_test = pd.read_csv('../input/spaceship-titanic/test.csv')

# Detach the target from the training features and store it as 0/1 integers.
df_label = df_train.pop('Transported').astype(int)

# Report the size of each frame (after the target column has been removed from train).
for prefix, frame in (('train: ', df_train), ('test: ', df_test)):
    n_rows, n_cols = frame.shape
    print(prefix, n_rows, ' rows & ', n_cols, ' columns')
df_train.head()

### 2/ Useful functions

In [None]:
def get_values(col):
    """Return a frame comparing the value counts of `col` in train and test.

    Reads the module-level `df_train` and `df_test` frames. The result is
    indexed by the distinct values of `col` and has one column per dataset
    ('train', 'test').
    """
    counts = {
        label: frame[col].value_counts()
        for label, frame in (('train', df_train), ('test', df_test))
    }
    return pd.DataFrame(counts)

In [None]:
def process_data(df):
    """Encode the passenger features of `df` in place and return its ids.

    The frame is modified directly:
    - 'Name' is dropped;
    - 'CryoSleep' and 'VIP' are cast to int;
    - 'HomePlanet' and 'Destination' are mapped to integer codes
      (values missing from the mappings become NaN);
    - 'Spent' is added as the sum of the five spending columns;
    - 'Age' is replaced by an age-group label (0-8) produced by `pd.cut`.

    Returns the 'PassengerId' column, which is removed from `df`.
    """
    planet_codes = {'Earth': 0, 'Europa': 1, 'Mars': 2}
    destination_codes = {'TRAPPIST-1e': 0, '55 Cancri e': 1, 'PSO J318.5-22': 2}
    spending_columns = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
    age_edges = [0, 11, 21, 31, 41, 51, 61, 71, 81, 120]

    df.drop(columns='Name', inplace=True)

    df['CryoSleep'] = df['CryoSleep'].astype(int)
    df['HomePlanet'] = df['HomePlanet'].map(planet_codes)
    df['Destination'] = df['Destination'].map(destination_codes)
    df['VIP'] = df['VIP'].astype(int)

    # Accumulate left to right so the result matches a chained `a + b + c + ...`.
    total_spent = df[spending_columns[0]]
    for column in spending_columns[1:]:
        total_spent = total_spent + df[column]
    df["Spent"] = total_spent

    # One label per interval between consecutive edges; the first interval includes 0.
    group_labels = list(range(len(age_edges) - 1))
    df['Age'] = pd.cut(df['Age'], bins=age_edges, labels=group_labels, include_lowest=True)

    return df.pop('PassengerId')

In [None]:
def process_cabin(df):
    """Replace `Cabin` ("deck/num/side") with encoded `Deck`, `Num`, `Side` columns.

    The frame is modified in place and nothing is returned:
    - 'Deck' is the deck letter mapped to an integer code;
    - 'Num' is the cabin number converted to a numeric dtype;
    - 'Side' is 0 for 'P' and 1 for 'S';
    - 'Cabin' itself is dropped.

    Any piece that is absent or not recognised becomes NaN for that row only.
    """
    deck_codes = {'F': 0, 'G': 1, 'E': 2, 'B': 3, 'C': 4, 'D': 5, 'A': 6, 'T': 7}
    side_codes = {'P': 0, 'S': 1}

    try:
        parts = df['Cabin'].str.split('/', expand=True)
    except AttributeError:
        # The `.str` accessor is unavailable when the column holds no strings
        # at all (e.g. an entirely-NaN float column): treat every piece as missing.
        parts = pd.DataFrame(index=df.index)

    # Guarantee exactly three positional columns; absent pieces become NaN
    # instead of aborting the split for the whole frame.
    parts = parts.reindex(columns=[0, 1, 2])

    df['Deck'] = parts[0].map(deck_codes)
    # Keep the cabin number numeric rather than a string; bad values become NaN.
    df['Num'] = pd.to_numeric(parts[1], errors='coerce')
    df['Side'] = parts[2].map(side_codes)
    df.drop('Cabin', inplace=True, axis=1)

### 3/ Missing values

In [None]:
# Number of missing values per column, side by side for train and test.
missing_counts = {
    'train': df_train.isna().sum(),
    'test': df_test.isna().sum(),
}
pd.DataFrame(missing_counts)

In [None]:
def fill_by_mean():
    """Impute missing values in the module-level `df_train` and `df_test`.

    Every column of `df_train` except 'PassengerId' and 'Name' is processed.
    'Age' is filled with the training mean; every other column is filled with
    its most frequent training value (despite the function's name). The fill
    value is always computed from `df_train` and applied to both frames.

    Raises ValueError if 'PassengerId' or 'Name' is not a column of `df_train`.
    """
    features = df_train.columns.tolist()
    for skipped in ("PassengerId", "Name"):
        features.remove(skipped)

    for name in features:
        train_column = df_train[name]
        if name == 'Age':
            replacement = train_column.mean()
        else:
            # value_counts() sorts by frequency, so the first index entry is the mode.
            replacement = train_column.value_counts().index[0]
        for frame in (df_train, df_test):
            frame[name] = frame[name].fillna(replacement)

In [None]:
# Impute missing values; fill_by_mean() modifies the global df_train and df_test.
# info() then lists dtypes and non-null counts so the result can be checked.
fill_by_mean()
df_train.info()

### 4/ Encode values

In [None]:
# Distinct HomePlanet values and their counts in train vs test.
get_values('HomePlanet')

In [None]:
# Distinct Destination values and their counts in train vs test.
get_values('Destination')

In [None]:
# process_data encodes each frame in place and returns its popped 'PassengerId' column.
# Not re-runnable on the same frames: 'Name' and 'PassengerId' are removed by the first call.
id_train = process_data(df_train)
id_test = process_data(df_test)
df_train.head()

In [None]:
# Replace 'Cabin' with the derived Deck / Num / Side columns in both frames (in place).
process_cabin(df_train)
process_cabin(df_test)
df_train.head()

### 5/ Normalize data

In [None]:
# Standardise the spending columns only; all other columns are passed through unchanged.
scaled_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Spent']
ct = ColumnTransformer(
    [('normalize', StandardScaler(), scaled_columns)],
    remainder='passthrough',
)
# The scaler is fitted on the training data only and then re-used for the test data.
# NOTE: df_train / df_test are rebound to the transformer output from here on
# (presumably NumPy arrays rather than DataFrames - confirm if column names are needed later).
df_train = ct.fit_transform(df_train)
df_test = ct.transform(df_test)

# Models

In [None]:
# Hold out 30% of the labelled rows for validation; the seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    df_train,
    df_label,
    test_size=0.3,
    random_state=42,
)

### 1/ Random Forest Classifier

In [None]:
# Candidate hyper-parameters sampled by the randomized search.
max_depth = [None, 5, 10, 15, 20, 25, 30, 45, 50]
n_estimators = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
parameters = dict(max_depth=max_depth, n_estimators=n_estimators)

# Seed both the forest and the search so that Restart & Run All selects the
# same parameters every time.
random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=1),
    parameters,
    random_state=1,
)
random_search.fit(X_train, y_train)
best_params = random_search.best_params_
best_score = random_search.best_score_
print('Best parameters : ' + str(best_params))

# Refit a forest with the selected parameters and time the training.
start = time.time()
rfc = RandomForestClassifier(criterion='gini',
                             n_estimators=best_params["n_estimators"],
                             max_depth=best_params["max_depth"],
                             min_samples_split=10,
                             min_samples_leaf=1,
                             # 'auto' was removed from scikit-learn; for classifiers
                             # it meant 'sqrt', which every version accepts.
                             max_features='sqrt',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rfc.fit(X_train, y_train)
end = time.time()
print('Execution time : ' + str(round(end - start, 3)) + 's')
# oob_score_ is the out-of-bag estimate, not the accuracy on the training set.
print('OOB score: %.4f' % rfc.oob_score_)
print('Validation score: %.4f' % rfc.score(X_val, y_val))

### 2/ AdaBoost

In [None]:
def plot_accuracy(accuracy_train, accuracy_val, n_estimators):
    """Plot train and validation accuracy against the number of estimators.

    `accuracy_train` and `accuracy_val` are drawn against the x values
    1 .. n_estimators - 1, so each is expected to hold n_estimators - 1 scores.
    """
    estimator_counts = np.arange(1, n_estimators)
    for scores in (accuracy_train, accuracy_val):
        plt.plot(estimator_counts, scores)
    plt.legend(['train', 'val'])
    plt.title('Train & Val accuracy')
    plt.show()

def get_train_test_accuracy(ensemble, modele, X_train, X_val, y_train, y_val, n_estimators):
    """Fit `ensemble` for 1 .. n_estimators - 1 base learners and plot the accuracies.

    Parameters
    ----------
    ensemble : class
        Ensemble estimator class (e.g. AdaBoostClassifier), instantiated once
        per ensemble size.
    modele : estimator
        Base learner handed to the ensemble.
    X_train, y_train : training features / labels used to fit each model.
    X_val, y_val : validation features / labels used only for scoring.
    n_estimators : int
        Exclusive upper bound on the ensemble sizes that are evaluated.

    Prints the total elapsed time and delegates the plot to `plot_accuracy`.
    Returns None.
    """
    import inspect  # local import: only needed to pick the constructor keyword

    # scikit-learn renamed `base_estimator` to `estimator` and later removed
    # the old keyword; choose whichever one this ensemble class accepts.
    try:
        init_params = inspect.signature(ensemble).parameters
    except (TypeError, ValueError):
        init_params = {}
    estimator_kw = 'estimator' if 'estimator' in init_params else 'base_estimator'

    accuracy_train = []
    accuracy_val = []
    start = time.time()
    for i in range(1, n_estimators):
        model = ensemble(**{estimator_kw: modele}, n_estimators=i, random_state=0)
        model.fit(X_train, y_train)
        accuracy_train.append(model.score(X_train, y_train))
        accuracy_val.append(model.score(X_val, y_val))
    end = time.time()
    print('Temps d\'éxecution : ' + str(round(end - start, 3)) + 's')
    plot_accuracy(accuracy_train, accuracy_val, n_estimators)

In [None]:
# Weak learner for boosting: a decision tree limited to depth 2.
dtc = tree.DecisionTreeClassifier(max_depth=2)
# Passing 40 evaluates ensembles of 1 to 39 estimators (the helper loops over range(1, 40)).
get_train_test_accuracy(AdaBoostClassifier, dtc, X_train, X_val, y_train, y_val, 40)