<a href="https://colab.research.google.com/github/Tony6512/Kaggle-Datasets/blob/main/spaceship_titanic_DL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

https://www.kaggle.com/competitions/spaceship-titanic

# Intro / Get Data

In [None]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
sns.set(style="darkgrid")
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import KFold
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier,
                              GradientBoostingClassifier, ExtraTreesClassifier)

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold
from sklearn.compose import ColumnTransformer

SEED = 0


In [None]:
df_train = pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")
df_train.head()

In [None]:
df_test = pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")
df_test.head()

In [None]:
df_train.describe()

In [None]:
# df_test.describe()

Let us separate the categorical features from the continuous features.

In [None]:
cts_features = df_train.describe().columns
cts_features

In [None]:
df_train.describe(exclude = [np.number])  ## non numeric

In [None]:
cat_features = df_train.drop(['PassengerId', 'Transported'], axis = 1).describe(exclude = [np.number]).columns
cat_features

In [None]:
# df_all = concat_df(df_train, df_test)
df_train.name = 'Training Set'
df_test.name = 'Test Set'
# df_all.name = 'All Set'
PassengerId_test = df_test['PassengerId'] ## for submission
dfs = [df_train, df_test]

print(f'Training X Shape = {df_train.shape}')
print(f'Training y Shape = {df_train.shape[0]}')
print(f'Test X Shape = {df_test.shape}')
print(f'Test y Shape = {df_test.shape[0]}')
print(df_train.columns)
print(df_test.columns)

In [None]:
for df in dfs:
    df.info()
    print('-'*50)

In [None]:
def display_missing(df):
    """Print the number of missing (null) values for every column of ``df``, then a blank line."""
    # One vectorised null count for the whole frame instead of one lookup per column.
    for col, n_missing in df.isnull().sum().items():
        print(f'{col} column missing values: {n_missing}')
    print('\n')

In [None]:
for df in dfs:
    print(f'{df.name}')
    display_missing(df)

# Visualize Data

In [None]:
fig, axs = plt.subplots(nrows=2, figsize=(15, 15))

# Correlation is only defined for numeric/bool columns. Select them explicitly:
# DataFrame.corr() raises on string columns (HomePlanet, Cabin, ...) in pandas >= 2.0.
for ax, frame, title in zip(axs, (df_train, df_test), ('Training Set', 'Test Set')):
    corr = frame.select_dtypes(include=[np.number, 'bool']).corr()
    sns.heatmap(corr, ax=ax, annot=True, square=True, cmap='coolwarm', annot_kws={'size': 14})
    ax.set_title(f'{title} feature correlations')
plt.show()

Let us look at the target feature against some of the categorical and continuous features.

In [None]:
sns.histplot(x='Age', hue='Transported', data=df_train, kde = True)
None

In [None]:
# `shade=` is deprecated in seaborn in favour of `fill=`.
sns.kdeplot(x='Age', hue='Transported', data=df_train, fill = True)
None

In [None]:
for feat in cts_features:
    if feat != 'Age':
        sns.histplot(x= feat, hue='Transported', data=df_train, bins = 10, multiple = 'dodge', shrink = 0.8)
    else:
        sns.histplot(x= feat, hue='Transported', data=df_train, kde = True)
    plt.show()

We might want to bin most of the numerical data since the majority of it is just 0.

In [None]:
for feat in cts_features:
    if feat == 'Age':
        pass
    else:
        sns.scatterplot(data=df_train, x= feat, y = 'Transported' , hue= 'Transported')
        plt.show()


In [None]:
sns.countplot(x='Transported', data=df_train)
None

In [None]:
# cat_features
for feat in ['HomePlanet', 'Destination', 'CryoSleep']:
    sns.countplot(x= feat, hue='Transported', data=df_train)
    plt.show()

Many of the passengers who were not transported (to another dimension) come from Earth (home planet). There is also a clear relationship between CryoSleep and Transported.

# Clean Data

## Age

In [None]:
sns.histplot(x='Age', data=df_train, kde = True)
None

In [None]:
df_train['Age'].mean()

In [None]:
df_train['Age'].median()

In [None]:
## missing values are filled in by the median of the *training* set.
## The same statistic is applied to the test set so both frames are preprocessed identically
## (no statistics are learned from the test data).
age_median = df_train['Age'].median()
for df in dfs:
    df['Age'] = df['Age'].fillna(age_median)

## Other cts features

In [None]:
## given that 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck' are mostly zero, that is the value we use to fill
df_train.describe()

Most of the continuous data is 0, so filling in the median is equivalent to filling in 0.

In [None]:
## given that 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck' are mostly zero, that is the value we use to fill
## (the medians are computed on the training set only and reused for the test set)
spend_features = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
spend_medians = df_train[spend_features].median()
for df in dfs:
    # fillna with a Series fills each column with the value stored under that column's name
    df[spend_features] = df[spend_features].fillna(spend_medians)

In [None]:
## check features with missing values
for df in dfs:
    print(f'{df.name}')
    display_missing(df)

## Categorical features

In [None]:
cat_features

In [None]:
df_train.describe(exclude = [np.number])  ## non numeric

In [None]:
# Fill missing VIP flags with the most frequent value of the training set (reused for the test set).
vip_mode = df_train['VIP'].mode()[0]
for df in dfs:
    df['VIP'] = df['VIP'].fillna(vip_mode)

In [None]:
for df in dfs:
    print(f'{df.name}')
    display_missing(df)

In [None]:
sns.countplot(data = df_train, x = 'HomePlanet', hue = 'Destination')
None

In [None]:
sns.countplot(data = df_train, x = 'HomePlanet', hue = 'CryoSleep')
None

In [None]:
sns.countplot(data = df_train, hue = 'CryoSleep', x = 'Destination')
None

In [None]:
# sns.heatmap(data = df_train, annot = True)

In [None]:
df_train.groupby(['HomePlanet', 'Destination']).count()

In [None]:
df_train.groupby(['HomePlanet', 'Destination']).CryoSleep.count()['Earth']['TRAPPIST-1e']

In [None]:
df_train.groupby(['HomePlanet', 'Destination', 'CryoSleep']).count()

In [None]:
df_train.groupby(['HomePlanet', 'CryoSleep', 'Destination']).count()

In [None]:
sns.catplot(data = df_train, x = 'HomePlanet', hue = 'Destination', col = 'CryoSleep', kind = 'count')
None

### Attempt to fill in missing data by percentages later

In [None]:
## make distributions of situations and get percentages

In [None]:
# Use df_train explicitly: `df` was only a loop variable leaked from an earlier cell (hidden state).
# dropna() keeps NaN out of the list of planets regardless of where the first missing value appears.
df_train['HomePlanet'].dropna().unique()

In [None]:
# for df in dfs:
#     for home in df['HomePlanet'].unique()[0:3]:
#         for destin df['Destination'].unique()[0:3]:
#             null_count = df[]

In [None]:
df_train.groupby(['HomePlanet', 'Destination']).count()

In [None]:
df_train.loc[(df_train.HomePlanet == 'Earth') & (df_train.Destination == 'TRAPPIST-1e')].CryoSleep

In [None]:
df_train.loc[(df_train.HomePlanet == 'Earth') & (df_train.Destination == 'TRAPPIST-1e') & (df_train.CryoSleep == False)]

In [None]:
df_train.loc[(df_train.HomePlanet == 'Earth') & (df_train.Destination == 'TRAPPIST-1e') & (df_train.CryoSleep == False)].shape[0]

In [None]:
df_train.loc[(df_train.HomePlanet == 'Earth') & (df_train.Destination == 'TRAPPIST-1e')].CryoSleep

In [None]:
def fill_in(df, Home, Dest, Cryo):
    """
    Count the rows of ``df`` whose HomePlanet, Destination and CryoSleep equal ``Home``, ``Dest`` and ``Cryo``.

    Despite its name this helper does not fill anything; it only returns the number of matching rows (int).
    Rows where any of the three columns is missing never match.
    """
    same_home = df['HomePlanet'] == Home
    same_dest = df['Destination'] == Dest
    same_cryo = df['CryoSleep'] == Cryo
    return int((same_home & same_dest & same_cryo).sum())

In [None]:
fill_in(df = df_train, Home = 'Earth', Dest = 'TRAPPIST-1e', Cryo = False)

### Fill in missing data with mode (easier method)

In [None]:
# Most frequent value of each categorical feature, learned from the training set only
# and applied to both frames so train and test are imputed consistently.
train_modes = {feat: df_train[feat].mode()[0] for feat in ['HomePlanet', 'Destination', 'CryoSleep']}
for df in dfs:
    for feat, mode_value in train_modes.items():
        df[feat] = df[feat].fillna(mode_value)

In [None]:
for df in dfs: ## drop features
    # errors='ignore' keeps the cell idempotent: re-running it after the columns are gone is a no-op
    df.drop(columns=['Name', 'Cabin'], inplace = True, errors='ignore')

In [None]:
for df in dfs: ## drop features
    # errors='ignore' keeps the cell idempotent: re-running it after the column is gone is a no-op
    df.drop(columns=['PassengerId'], inplace = True, errors='ignore')

In [None]:
for df in dfs:
    print(f'{df.name}')
    display_missing(df)

## Feature Transformation

In [None]:
df_train.head(5)

In [None]:
cts_features = df_train.describe().columns
cts_features

In [None]:
cat_features = df_train.drop(['Transported'], axis = 1).describe(exclude = [np.number]).columns
cat_features

In [None]:
y_train = df_train['Transported'].values

In [None]:
y_train

In [None]:
# X_train = df_train.drop(['Transported'], axis=1)
# X_test = df_test.copy()

In [None]:
df_train.shape

In [None]:
# Standardise the continuous features and one-hot encode the categorical ones.
# handle_unknown='ignore' encodes a category that was not seen during fit as all zeros
# instead of raising when the test set is transformed.
ct = ColumnTransformer(
    [("scaling", StandardScaler(), cts_features),
     ("onehot", OneHotEncoder(handle_unknown='ignore'), cat_features)])
# ct = ColumnTransformer(
#     [("scaling", MinMaxScaler(), cts_features),
#      ("onehot", OneHotEncoder(), cat_features)])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train_full, y_train_full = df_train, y_train
# X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, shuffle = False)

In [None]:
# print(X_train_full.shape)
# print(y_train_full.shape)
# print(X_train.shape)
# print(X_val.shape)
# print(y_train.shape)
# print(y_val.shape)

In [None]:
# Always start from df_train (not from X_train_full itself) so this cell can be re-run:
# after the first run X_train_full is a transformed array and no longer has a 'Transported' column.
X_train_full = ct.fit_transform(df_train.drop(columns=['Transported']))

In [None]:
ct.get_feature_names_out()

In [None]:
# X_val = ct.transform(X_val.drop(['Transported'], axis=1))

In [None]:
X_test = ct.transform(df_test)

In [None]:
X_test.shape

## Modeling (Neural Network)

In [None]:
import time

In [None]:
# tic = time.perf_counter()
# toc = time.perf_counter()
# print(("Elapsed time: %.2f [sec]" % ((toc-tic))))
# print(("Elapsed time: %.2f [min]" % ((toc-tic)/60)))

In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.ensemble import BaggingClassifier

from tensorflow import keras
from tensorflow.keras import layers
import keras_tuner

In [None]:
# def build_model(hp):
#     """
#     sequential model with relu act. fct. and Batch norm.
#     """
# #     input_shape = [X_train.shape[1]]
#     model = keras.Sequential([
# #         layers.BatchNormalization(input_shape = input_shape),
#         layers.BatchNormalization(),

#         layers.Dense(256, activation = 'relu', kernel_initializer = keras.initializers.HeNormal()),## elu or relu act fct
#         layers.BatchNormalization(),
#         layers.Dropout(0.3),

#         layers.Dense(256, activation = 'relu', kernel_initializer = keras.initializers.HeNormal()),
#         layers.BatchNormalization(),
#         layers.Dropout(0.3),

#         layers.Dense(1, activation = 'sigmoid')

#     ])
#     lr = hp.Float('lr', min_value = 1e-4, max_value = 1e-2, sampling = 'log')
#     learning_sched = keras.optimizers.schedules.ExponentialDecay(
#         initial_learning_rate = lr,
#         decay_steps=100000,
#         decay_rate=0.96,
#         staircase=True)
#     model.compile(
#         optimizer=keras.optimizers.Nadam(learning_rate= learning_sched),#'Nadam',
#         loss='binary_crossentropy',
#         metrics=['binary_accuracy'],
#     )

#     return model
# # build_model(keras_tuner.HyperParameters())
# # model.summary()

In [None]:
# def build_model(hp):
#     """
#     sequential model w/ self-norm layers via selu act. fct. w/ alpha dropout
#     """
# #     input_shape = [X_train.shape[1]]
#     model = keras.Sequential([
#         layers.BatchNormalization(),

#         layers.Dense(256, activation = 'selu', kernel_initializer = keras.initializers.lecun_normal()),
#         layers.AlphaDropout(0.3),

#         layers.Dense(256, activation = 'selu', kernel_initializer = keras.initializers.lecun_normal()),
#         layers.AlphaDropout(0.3),

#         layers.Dense(1, activation = 'sigmoid')

#     ])
#     lr = hp.Float('lr', min_value = 1e-4, max_value = 1e-2, sampling = 'log')
#     learning_sched = keras.optimizers.schedules.ExponentialDecay(
#         initial_learning_rate = lr,
#         decay_steps=100000,
#         decay_rate=0.96,
#         staircase=True)
#     model.compile(
#         optimizer=keras.optimizers.Nadam(learning_rate= learning_sched),#'Nadam',
#         loss='binary_crossentropy',
#         metrics=['binary_accuracy'],
#     )

#     return model
# # build_model(keras_tuner.HyperParameters())
# # model.summary()

In [None]:
# def build_model(hp):
#     """
#     functional (API) model that is wide (simple rules) and deep (complex rules). Note: cannot use selu b/c network contains skip
#     connections
#     """
#     input = keras.layers.Input(shape=X_train_full.shape[1])

#     BN0 = layers.BatchNormalization()(input)

#     D1 = layers.Dense(256, activation = 'relu', kernel_initializer = keras.initializers.HeNormal())(BN0)## elu or relu act fct
#     BN1 = layers.BatchNormalization()(D1)
#     DO1 = layers.Dropout(0.3)(BN1)

#     D2 = layers.Dense(256, activation = 'relu', kernel_initializer = keras.initializers.HeNormal())(DO1)
#     BN2 = layers.BatchNormalization()(D2)
#     DO2 = layers.Dropout(0.3)(BN2)

#     concat = keras.layers.Concatenate()([input, DO2])
#     output = layers.Dense(1, activation = 'sigmoid')(concat)
#     model = keras.models.Model(inputs=[input], outputs=[output])

#     lr = hp.Float('lr', min_value = 1e-4, max_value = 1e-2, sampling = 'log')
#     learning_sched = keras.optimizers.schedules.ExponentialDecay(
#         initial_learning_rate = lr,
#         decay_steps=100000,
#         decay_rate=0.96,
#         staircase=True)
#     model.compile(
#         optimizer=keras.optimizers.Nadam(learning_rate= learning_sched),#'Nadam',
#         loss='binary_crossentropy',
#         metrics=['binary_accuracy'],
#     )

#     return model
# # build_model(keras_tuner.HyperParameters())
# # model.summary()

In [None]:
def build_model(hp):
    """
    Hypermodel for keras-tuner: functional (API) model that is wide (simple rules) and deep (complex rules).

    The raw input is concatenated with the output of the last hidden block before the sigmoid output,
    so the network contains a skip connection. Note: cannot use selu b/c network contains skip connections.

    Tuned hyperparameters
    ---------------------
    lr     : initial learning rate, log-sampled in [1e-4, 1e-2]
    units  : width of every hidden Dense layer, 32..512 in steps of 32
    layers : number of hidden blocks (Dense -> BatchNorm -> Dropout), 1..4
    do     : dropout rate, 0.2..0.5 in steps of 0.05

    Parameters
    ----------
    hp : keras_tuner.HyperParameters
        Container the tuner uses to sample the values above.

    Returns
    -------
    A compiled keras Model (Nadam optimizer, binary cross-entropy loss, 'binary_accuracy' metric).
    The input width is read from the global X_train_full.
    """
    lr = hp.Float('lr', min_value = 1e-4, max_value = 1e-2, sampling = 'log')
    units = hp.Int("units", min_value=32, max_value=512, step=32, default = 64)
    hidden_layers = hp.Int("layers", min_value=1, max_value=4, step=1, default = 2)
    dropout = hp.Float('do', min_value = 0.2, max_value = 0.5, step = 0.05, default = 0.3)

    # shape must be a tuple: newer Keras versions reject a bare int
    inputs = keras.layers.Input(shape=(X_train_full.shape[1],))

    # `x` always holds the output of the most recent layer of the deep path
    x = layers.BatchNormalization()(inputs)
    for _ in range(hidden_layers):
        x = layers.Dense(units, activation = 'relu', kernel_initializer = keras.initializers.HeNormal())(x)
        x = layers.BatchNormalization()(x)
        x = layers.Dropout(dropout)(x)

    # wide path (raw input) + deep path (last hidden block)
    concat = keras.layers.Concatenate()([inputs, x])
    output = layers.Dense(1, activation = 'sigmoid')(concat)
    model = keras.models.Model(inputs=[inputs], outputs=[output])

    # With staircase=True the rate is multiplied by 0.96 only once every 100000 optimizer steps.
    learning_sched = keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate = lr,
        decay_steps=100000,
        decay_rate=0.96,
        staircase=True)
    model.compile(
        optimizer=keras.optimizers.Nadam(learning_rate= learning_sched),#'Nadam',
        loss='binary_crossentropy',
        metrics=['binary_accuracy'],
    )

    return model
# build_model(keras_tuner.HyperParameters())
# model.summary()

In [None]:
tuner = keras_tuner.BayesianOptimization(
    hypermodel= build_model,
    objective='val_binary_accuracy',
    max_trials=20,
    # fixed seed so the sequence of sampled hyperparameters is reproducible across runs
    seed = SEED,
    # if True, overwrite above directory if search is run again - i.e. don't resume
    overwrite = True)

In [None]:
tuner.search_space_summary()

In [None]:
early_stopping = keras.callbacks.EarlyStopping(
    monitor = 'val_loss', ## validation loss
    patience=5,
    min_delta=0.001,
    restore_best_weights=True,
    start_from_epoch = 20,
)

In [None]:
# The keyword arguments are forwarded to model.fit for every trial. Without `epochs` each trial
# trains for a single epoch (the Keras default), so early stopping (start_from_epoch=20) never acts
# and hyperparameters are ranked on barely trained models. Use the same budget as the final fit.
tuner.search(
    X_train_full, y_train_full,
    validation_split=0.2,
    batch_size=32,
    epochs=100,
    callbacks=[early_stopping],
)

In [None]:
# Get the top  models.
models = tuner.get_best_models(num_models=5)
# for model in models:
#     model.build(input_shape = [None, X_train_full.shape[1]])
#     model.summary()
best_model = models[0]
# Build the model.
# Needed for `Sequential` without specified `input_shape`.
# input_shape = [None, X_train_full.shape[1]]
best_model.build(input_shape = [None, X_train_full.shape[1]])
best_model.summary()

In [None]:
tuner.results_summary()

In [None]:
# Get the top 5 hyperparameters.
best_hps = tuner.get_best_hyperparameters(5)
# Build the model with the best hp.
model = build_model(best_hps[0])
# Fit with the entire dataset.
# model.fit(X_train_full, y_train_full, epochs=1)
# tensorboard_callback = keras.callbacks.TensorBoard(histogram_freq=1)
history = model.fit(
    X_train_full, y_train_full,
#     validation_data=(X_val, y_val),
    validation_split = 0.2,
    batch_size=32,
    epochs=100,
    callbacks=[early_stopping]#, tensorboard_callback],
)

In [None]:
# %load_ext tensorboard
# %reload_ext tensorboard
# # %tensorboard --logdir logs
# %tensorboard --logdir {logs_base_dir}  --host localhost

In [None]:
# learning_sched = keras.optimizers.schedules.ExponentialDecay(
#     initial_learning_rate = 0.001,
#     decay_steps=100000,
#     decay_rate=0.96,
#     staircase=True)
# model.compile(
#     optimizer=keras.optimizers.Nadam(learning_rate= learning_sched),#'Nadam',
#     loss='binary_crossentropy',
#     metrics=['binary_accuracy'],
# )

In [None]:
# print(X_train_full.shape)
# print(y_train_full.shape)
# print(X_train.shape)
# print(X_val.shape)
# print(y_train.shape)
# print(y_val.shape)

In [None]:
# early_stopping = keras.callbacks.EarlyStopping(
#     monitor = 'val_loss', ## validation loss
#     patience=5,
#     min_delta=0.01,
#     restore_best_weights=True,
#     start_from_epoch = 30,
# )

# history = model.fit(
#     X_train, y_train,
#     validation_data=(X_val, y_val),
#     batch_size=32,
#     epochs=100,
#     callbacks=[early_stopping],
# )

In [None]:
# history.history

In [None]:
## regularization only on training data => makes training lc larger than val lc
history_df = pd.DataFrame(history.history)
history_df.loc[:, ['loss', 'val_loss']].plot(title="Cross-entropy")
history_df.loc[:, ['binary_accuracy', 'val_binary_accuracy']].plot(title="Accuracy")
# history_df.loc[:, ['loss']].plot(title="Cross-entropy")
# history_df.loc[:, ['binary_accuracy']].plot(title="Accuracy")

In [None]:
pred_NN = (model.predict(X_test) > 0.5) ## makes values True or False

In [None]:
# ## saving and loading the model (work for sequencial and functional)
# model.save("my_keras_model.h5")
# model = keras.models.load_model("my_keras_model.h5")

# Stacking Method

In [None]:
## building multiple (simple) neural networks for the first layer; their predictions are the input features of a meta-learner (the fixed, non-tuned sequential NN defined below)

In [None]:
def build_model_layer1(lr = 1e-3, units = 256, hidden_layers = 2, dropout = 0.3):
    """
    functional (API) model that is wide (simple rules) and deep (complex rules). Note: cannot use selu b/c network contains skip
    connections. Used to build multiple models quickly w/o hypertuning for the stacking ensemble.

    Parameters
    ----------
    lr : float
        Initial learning rate of the Nadam optimizer.
    units : int
        Width of every hidden Dense layer.
    hidden_layers : int
        Number of hidden blocks (Dense -> BatchNorm -> Dropout). Any non-negative value is accepted;
        with 0 the deep path consists of the input BatchNorm only.
    dropout : float
        Dropout rate applied at the end of every hidden block.

    Returns
    -------
    A compiled keras Model (Nadam optimizer, binary cross-entropy loss, 'binary_accuracy' metric).
    The input width is read from the global X_train_full.
    """
    # shape must be a tuple: newer Keras versions reject a bare int
    inputs = keras.layers.Input(shape=(X_train_full.shape[1],))

    # `x` always holds the output of the most recent layer of the deep path
    x = layers.BatchNormalization()(inputs)
    for _ in range(hidden_layers):
        x = layers.Dense(units, activation = 'relu', kernel_initializer = keras.initializers.HeNormal())(x)
        x = layers.BatchNormalization()(x)
        x = layers.Dropout(dropout)(x)

    # wide path (raw input) + deep path (last hidden block)
    concat = keras.layers.Concatenate()([inputs, x])
    output = layers.Dense(1, activation = 'sigmoid')(concat)
    model = keras.models.Model(inputs=[inputs], outputs=[output])

    # With staircase=True the rate is multiplied by 0.96 only once every 100000 optimizer steps.
    learning_sched = keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate = lr,
        decay_steps=100000,
        decay_rate=0.96,
        staircase=True)
    model.compile(
        optimizer=keras.optimizers.Nadam(learning_rate= learning_sched),#'Nadam',
        loss='binary_crossentropy',
        metrics=['binary_accuracy'],
    )

    return model
# build_model(keras_tuner.HyperParameters())
# model.summary()

In [None]:
def aggregate(y1, y2):
    """
    Stack two 2-D prediction arrays side by side (column-wise).

    Parameters
    ----------
    y1, y2 : array-like of shape (n_samples, k) or None
        None means "nothing accumulated yet".

    Returns
    -------
    The other argument unchanged when one of them is None, None when both are None
    (previously this case crashed inside np.concatenate), otherwise the column-wise concatenation.
    """
    if y1 is None:
        return y2
    if y2 is None:
        return y1
    return np.concatenate((y1, y2), axis = 1)

In [None]:
def prepare_layer2(num_of_models = 3, seed = None):
    """
    Train the first-layer models of the stacking ensemble and collect their predictions.

    Each model is created by build_model_layer1 with randomly drawn hyperparameters
    (lr in [1e-4, 1e-2], units in 32..480 step 32, 2..4 hidden blocks, dropout in [0.2, 0.5]),
    fitted on the globals X_train_full / y_train_full with the shared early_stopping callback,
    and then asked to predict X_train_full.

    Parameters
    ----------
    num_of_models : int
        Number of first-layer models to train.
    seed : int or None
        Seed for the hyperparameter draws only (Keras weight initialisation is not affected).
        None keeps the previous behaviour of drawing from the global `random` state.

    Returns
    -------
    X_layer2 : array of shape (n_samples, num_of_models), or None when num_of_models is 0
        One column of predicted probabilities per model; the features for the meta-learner.
    all_models : list
        The fitted first-layer models, in the same order as the columns of X_layer2.

    NOTE(review): the meta-features are predictions on rows the base models were fitted on
    (the part of X_train_full not held out by validation_split), so they are optimistic;
    out-of-fold predictions would be the standard way to avoid that.
    """
    rng = rnd if seed is None else rnd.Random(seed)
    all_models = []
    layer1_preds = []
    for _ in range(num_of_models):
        model = build_model_layer1(
            lr = rng.uniform(1e-4, 1e-2),
            units = rng.randrange(32, 512, 32),
            hidden_layers = rng.randrange(2, 5, 1),
            dropout = rng.uniform(0.2,0.5),
        )

        model.fit(
            X_train_full, y_train_full,
            validation_split = 0.2,
            batch_size=32,
            epochs=100,
            callbacks=[early_stopping]#, tensorboard_callback],
        )
        all_models.append(model)
        layer1_preds.append(model.predict(X_train_full))

    # one concatenation at the end instead of growing the array model by model
    X_layer2 = np.concatenate(layer1_preds, axis = 1) if layer1_preds else None
    return X_layer2, all_models


In [None]:
X_layer2, all_models = prepare_layer2()

In [None]:
scaler = StandardScaler()
X_process_layer2  = scaler.fit_transform(pd.DataFrame(X_layer2))

In [None]:
# X_process_layer2 = preprocess_layer2(pd.DataFrame(X_layer2), training = True)

In [None]:
def build_model():
    """
    Meta-learner of the stacking ensemble: sequential MLP with relu activations and batch norm.

    Architecture: BatchNorm -> 2 x [Dense(256, relu, He-normal init) -> BatchNorm -> Dropout(0.3)] -> Dense(1, sigmoid).
    No input shape is declared here, so the caller has to build the model (or fit it) before summary().

    NOTE: this definition reuses the name of the tuner hypermodel ``build_model(hp)`` defined earlier
    in the notebook and shadows it from this cell onwards.

    Returns
    -------
    A compiled keras Sequential model (Nadam optimizer starting at 1e-3, binary cross-entropy loss,
    'binary_accuracy' metric).
    """
    model = keras.Sequential()
    model.add(layers.BatchNormalization())
    for _ in range(2):  # two identical hidden blocks
        model.add(layers.Dense(256, activation = 'relu', kernel_initializer = keras.initializers.HeNormal()))
        model.add(layers.BatchNormalization())
        model.add(layers.Dropout(0.3))
    model.add(layers.Dense(1, activation = 'sigmoid'))

    optimizer = keras.optimizers.Nadam(
        learning_rate = keras.optimizers.schedules.ExponentialDecay(
            initial_learning_rate = 1e-3,
            decay_steps = 100000,
            decay_rate = 0.96,
            staircase = True,
        )
    )
    model.compile(optimizer = optimizer, loss = 'binary_crossentropy', metrics = ['binary_accuracy'])
    return model
# build_model(keras_tuner.HyperParameters())
# model.summary()

In [None]:
best_model = build_model()
# Build the model.
# Needed for `Sequential` without specified `input_shape`.
# input_shape = [None, X_train_full.shape[1]]
best_model.build(input_shape = [None, X_process_layer2.shape[1]])
best_model.summary()

In [None]:
history = best_model.fit(
    X_process_layer2, y_train_full,
#     validation_data=(X_val, y_val),
    validation_split = 0.2,
    batch_size=32,
    epochs=100,
    callbacks=[early_stopping]#, tensorboard_callback],
)

In [None]:
## regularization only on training data => makes training lc larger than val lc
history_df = pd.DataFrame(history.history)
history_df.loc[:, ['loss', 'val_loss']].plot(title="Cross-entropy")
history_df.loc[:, ['binary_accuracy', 'val_binary_accuracy']].plot(title="Accuracy")
# history_df.loc[:, ['loss']].plot(title="Cross-entropy")
# history_df.loc[:, ['binary_accuracy']].plot(title="Accuracy")

In [None]:
# def predict_test(X_test)
X_test_layer2 = None
for mod in all_models:
    y_pred = mod.predict(X_test)
    X_test_layer2 = aggregate(X_test_layer2, y_pred)

X_test_layer2 = scaler.transform(pd.DataFrame(X_test_layer2))

In [None]:
pred_NN = (best_model.predict(X_test_layer2) > 0.5) ## makes values True or False

# Generate a submit file


In [None]:
# Keras returns predictions with shape (n_samples, 1); flatten to one boolean per passenger.
y_pred = np.asarray(pred_NN).ravel().astype(bool)
assert len(y_pred) == len(PassengerId_test), "number of predictions does not match number of test passengers"

submission_df = pd.DataFrame({
    'PassengerId': PassengerId_test.values,
    'Transported': y_pred,
})
submission_df.to_csv('submissions.csv', header=True, index=False)
print("Your submission was successfully saved!")
submission_df.head(10)

In [None]:
## under 4% off from top spot on leaderboard
## 0.79191, 0.82182

# End