<a href="https://colab.research.google.com/github/Tony6512/Kaggle-Datasets/blob/main/spaceship_titanic_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

https://www.kaggle.com/competitions/spaceship-titanic

# Intro / Get Data

In [None]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
sns.set(style="darkgrid")
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import KFold
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier,
                              GradientBoostingClassifier, ExtraTreesClassifier)

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold
from sklearn.compose import ColumnTransformer

SEED = 0


In [None]:
df_train = pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")
df_train.head()

In [None]:
df_test = pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")
df_test.head()

In [None]:
df_train.describe()

In [None]:
# df_test.describe()

Let us separate the categorical features from the continuous features.

In [None]:
cts_features = df_train.describe().columns
cts_features

In [None]:
df_train.describe(exclude = [np.number])  ## non numeric

In [None]:
cat_features = df_train.drop(['PassengerId', 'Transported'], axis = 1).describe(exclude = [np.number]).columns
cat_features

In [None]:
# df_all = concat_df(df_train, df_test)
# Label each frame by setting a `name` attribute on it; later cells print `df.name` as a heading.
df_train.name = 'Training Set'
df_test.name = 'Test Set'
# df_all.name = 'All Set'
PassengerId_test = df_test['PassengerId'] ## kept aside for the submission file, because the column is dropped from df_test later
dfs = [df_train, df_test] ## both frames, so the cleaning cells can loop over them

# NOTE: the "y Shape" lines print each frame's row count, not the shape of a separate target array.
print(f'Training X Shape = {df_train.shape}')
print(f'Training y Shape = {df_train.shape[0]}')
print(f'Test X Shape = {df_test.shape}')
print(f'Test y Shape = {df_test.shape[0]}')
# Column listings of both frames, printed one after the other for comparison.
print(df_train.columns)
print(df_test.columns)

In [None]:
for df in dfs:
    df.info()
    print('-'*50)

In [None]:
def display_missing(df):
    """Print how many values are missing in each column of ``df``.

    Writes one ``'<col> column missing values: <count>'`` line per column to
    stdout, in column order, then prints a trailing newline character as a
    separator. Returns nothing.
    """
    # Count the nulls of every column in one pass, then report them in order.
    null_counts = df.isnull().sum()
    for col, count in zip(df.columns.tolist(), null_counts):
        print(f'{col} column missing values: {count}')
    print('\n')

In [None]:
for df in dfs:
    print(f'{df.name}')
    display_missing(df)

# Visualize Data

In [None]:
# Correlation heatmaps: training set on the top axis, test set on the bottom one.
fig, axs = plt.subplots(nrows=2, figsize=(15, 15))

# Both frames still contain string columns at this point. Recent pandas
# versions raise when corr() meets non-numeric data instead of silently
# dropping it, so keep only the numeric and boolean columns first.
corr_dtypes = ['number', 'bool']
sns.heatmap(df_train.select_dtypes(include=corr_dtypes).corr(), ax=axs[0], annot=True, square=True, cmap='coolwarm', annot_kws={'size': 14})
sns.heatmap(df_test.select_dtypes(include=corr_dtypes).corr(), ax=axs[1], annot=True, square=True, cmap='coolwarm', annot_kws={'size': 14})
plt.show()

Let us look at the target feature against some of the categorical and continuous features.

In [None]:
sns.histplot(x='Age', hue='Transported', data=df_train, kde = True)
None

In [None]:
# Age density split by the target. `fill` is the current name of the
# deprecated `shade` argument and produces the same shaded curves.
sns.kdeplot(x='Age', hue='Transported', data=df_train, fill = True)
None

In [None]:
for feat in cts_features:
    if feat != 'Age':
        sns.histplot(x= feat, hue='Transported', data=df_train, bins = 10, multiple = 'dodge', shrink = 0.8)
    else:
        sns.histplot(x= feat, hue='Transported', data=df_train, kde = True)
    plt.show()

We might want to bin most of the numerical data since the majority of it is just 0.

In [None]:
# One scatter plot of each continuous feature against the target; 'Age' is
# left out of this loop.
for feat in cts_features:
    if feat != 'Age':
        sns.scatterplot(data=df_train, x= feat, y = 'Transported' , hue= 'Transported')
        plt.show()


In [None]:
sns.countplot(x='Transported', data=df_train)
None

In [None]:
# cat_features
for feat in ['HomePlanet', 'Destination', 'CryoSleep']:
    sns.countplot(x= feat, hue='Transported', data=df_train)
    plt.show()

Many of the passengers who were not transported (to another dimension) come from Earth (their home planet). There is also a clear relationship between CryoSleep and Transported.

# Clean Data

## Age

In [None]:
sns.histplot(x='Age', data=df_train, kde = True)
None

In [None]:
df_train['Age'].mean()

In [None]:
df_train['Age'].median()

In [None]:
## missing ages are replaced by the median age of the frame they belong to
for df in dfs:
    age_median = df['Age'].median()
    df['Age'] = df['Age'].fillna(age_median)

## Other cts features

In [None]:
## given that 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck' are mostly zero, that is the value we use to fill
df_train.describe()

Most of the continuous data is 0, so filling in the median is equivalent to filling in 0.

In [None]:
## the spending columns are filled with their per-frame median (noted above to be 0 for most of them)
spend_features = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for df in dfs:
    # fillna with a Series of medians fills each column with its own median
    df[spend_features] = df[spend_features].fillna(df[spend_features].median())

In [None]:
## check features with missing values
for df in dfs:
    print(f'{df.name}')
    display_missing(df)

## Categorical features

In [None]:
cat_features

In [None]:
df_train.describe(exclude = [np.number])  ## non numeric

In [None]:
for df in dfs:
    df['VIP'] = df['VIP'].fillna( df['VIP'].mode()[0])

In [None]:
for df in dfs:
    print(f'{df.name}')
    display_missing(df)

In [None]:
sns.countplot(data = df_train, x = 'HomePlanet', hue = 'Destination')
None

In [None]:
sns.countplot(data = df_train, x = 'HomePlanet', hue = 'CryoSleep')
None

In [None]:
sns.countplot(data = df_train, hue = 'CryoSleep', x = 'Destination')
None

In [None]:
# sns.heatmap(data = df_train, annot = True)

In [None]:
df_train.groupby(['HomePlanet', 'Destination']).count()

In [None]:
df_train.groupby(['HomePlanet', 'Destination']).CryoSleep.count()['Earth']['TRAPPIST-1e']

In [None]:
df_train.groupby(['HomePlanet', 'Destination', 'CryoSleep']).count()

In [None]:
df_train.groupby(['HomePlanet', 'CryoSleep', 'Destination']).count()

In [None]:
sns.catplot(data = df_train, x = 'HomePlanet', hue = 'Destination', col = 'CryoSleep', kind = 'count')
None

### Attempt to fill in missing data by percentages later

In [None]:
## make distributions of situations and get percentages

In [None]:
# Distinct home planets in the training data. The frame is named explicitly
# instead of relying on whatever `df` was left over from an earlier loop, and
# missing values are removed with dropna() rather than by slicing position.
df_train['HomePlanet'].dropna().unique()

In [None]:
# for df in dfs:
#     for home in df['HomePlanet'].unique()[0:3]:
#         for destin df['Destination'].unique()[0:3]:
#             null_count = df[]

In [None]:
df_train.groupby(['HomePlanet', 'Destination']).count()

In [None]:
df_train.loc[(df_train.HomePlanet == 'Earth') & (df_train.Destination == 'TRAPPIST-1e')].CryoSleep

In [None]:
df_train.loc[(df_train.HomePlanet == 'Earth') & (df_train.Destination == 'TRAPPIST-1e') & (df_train.CryoSleep == False)]

In [None]:
df_train.loc[(df_train.HomePlanet == 'Earth') & (df_train.Destination == 'TRAPPIST-1e') & (df_train.CryoSleep == False)].shape[0]

In [None]:
df_train.loc[(df_train.HomePlanet == 'Earth') & (df_train.Destination == 'TRAPPIST-1e')].CryoSleep

In [None]:
def fill_in(df, Home, Dest, Cryo):
    """Count the rows of ``df`` matching a home planet, destination and cryosleep flag.

    Args:
        df: frame with ``HomePlanet``, ``Destination`` and ``CryoSleep`` columns.
        Home: value compared for equality against ``HomePlanet``.
        Dest: value compared for equality against ``Destination``.
        Cryo: value compared for equality against ``CryoSleep``.

    Returns:
        The number of rows where all three comparisons hold, as an int.
    """
    from_home = df.HomePlanet == Home
    to_dest = df.Destination == Dest
    in_cryo = df.CryoSleep == Cryo
    return int((from_home & to_dest & in_cryo).sum())

In [None]:
fill_in(df = df_train, Home = 'Earth', Dest = 'TRAPPIST-1e', Cryo = False)

### Fill in missing data with mode (easier method)

In [None]:
# Fill the remaining categorical gaps with each frame's most frequent value.
for df in dfs:
    for feat in ['HomePlanet', 'Destination', 'CryoSleep']:
        most_common = df[feat].mode()[0]
        df[feat] = df[feat].fillna(most_common)

In [None]:
for df in dfs: ## drop features
    # errors='ignore' keeps this cell re-runnable: a second run would otherwise
    # raise KeyError because the columns are already gone.
    df.drop(['Name', 'Cabin'], inplace = True, axis=1, errors='ignore')

In [None]:
for df in dfs: ## drop features
    # errors='ignore' keeps this cell re-runnable: a second run would otherwise
    # raise KeyError because the column is already gone.
    df.drop(['PassengerId'], inplace = True, axis=1, errors='ignore')

In [None]:
for df in dfs:
    print(f'{df.name}')
    display_missing(df)

## Feature Transformation

In [None]:
df_train.head(5)

In [None]:
cts_features = df_train.describe().columns
cts_features

In [None]:
cat_features = df_train.drop(['Transported'], axis = 1).describe(exclude = [np.number]).columns
cat_features

In [None]:
y_train = df_train['Transported'].values

In [None]:
y_train

In [None]:
# X_train = df_train.drop(['Transported'], axis=1)
# X_test = df_test.copy()

In [None]:
# ct = ColumnTransformer(
#     [("scaling", StandardScaler(), cts_features),
#      ("onehot", OneHotEncoder(), cat_features)])
# Preprocessing: min-max scale the continuous columns and one-hot encode the
# categorical ones. The step names become prefixes of the output feature names.
transformers = [
    ("scaling", MinMaxScaler(), cts_features),
    ("onehot", OneHotEncoder(), cat_features),
]
ct = ColumnTransformer(transformers)

In [None]:
X_train = ct.fit_transform(df_train.drop(['Transported'], axis=1))

In [None]:
ct.get_feature_names_out()

In [None]:
X_train

In [None]:
X_train

In [None]:
X_test = ct.transform(df_test)

In [None]:
X_test

In [None]:
X_train[0,:]

In [None]:
X_train[:,1]

## Modeling

In [None]:
import time

In [None]:
# tic = time.perf_counter()
# toc = time.perf_counter()
# print(("Elapsed time: %.2f [sec]" % ((toc-tic))))
# print(("Elapsed time: %.2f [min]" % ((toc-tic)/60)))

In [None]:
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.inspection import permutation_importance
# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
# from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import KFold
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier,
                              HistGradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier, StackingClassifier)
import xgboost as xgb

### Voting

In [None]:
# # estimators_voting = [('lr', LogisticRegression()),('dt', DecisionTreeClassifier()), ('svc', SVC()), ('lsvc', LinearSVC()),
# #                      ('knn', KNeighborsClassifier()), ('gnb', GaussianNB()), ('sgd', SGDClassifier())]

# estimators_voting = [('lr', LogisticRegression()),('dt', DecisionTreeClassifier()), ('knn', KNeighborsClassifier())]
# parameters = {'lr': {'C': [1,10,100]}, 'dt':{'max_depth': [3, 6, 9, 12]}, 'knn':{'n_neighbors': [5,25,50,100]} }

In [None]:
# estimators_voting_para = [0]*len(estimators_voting)
# for i, (name, clf) in enumerate(estimators_voting):
#     clf_search = RandomizedSearchCV(clf, parameters[name], cv = 5).fit(X_train, y_train)
#     estimators_voting_para[i] = (name, clf_search.best_estimator_)

In [None]:
# estimators_voting_para

In [None]:
# clf_voting = VotingClassifier(estimators = estimators_voting_para)

In [None]:
# clf_voting.fit(X_train, y_train)

In [None]:
# pred_voting = clf_voting.predict(X_test)

In [None]:
# clf_voting.score(X_train, y_train)

### Stacking

In [None]:
# Base learners of the stacking ensemble, as (name, estimator) pairs. The names
# are the keys used to look up each model's search grid. Every estimator that
# takes a random_state is seeded with the notebook-wide SEED so repeated runs
# produce the same fits.
estimators_stacking = [
    ('rf', RandomForestClassifier(random_state=SEED)),
    ('ada', AdaBoostClassifier(random_state=SEED)),
    ('gb', HistGradientBoostingClassifier(random_state=SEED)),
    ('et', ExtraTreesClassifier(random_state=SEED)),
    ('lr', LogisticRegression()),
    ('dt', DecisionTreeClassifier(random_state=SEED)),
    ('knn', KNeighborsClassifier()),
    ('xgboost', xgb.XGBClassifier(random_state=SEED)),
    ('MLP', MLPClassifier(random_state=SEED)),
]

In [None]:
# Hyper-parameter candidates for each base learner, keyed by estimator name.
# The AdaBoost learning rates are 1 and 0.75: the previous `[1, 0,75]` was a
# decimal-comma typo that produced the three candidates 1, 0 and 75.
parameters = {
    'rf': {'n_estimators': [100, 250, 500], 'max_depth': [3, 6, 9, 12]},
    'ada': {'n_estimators': [100, 250, 500], 'learning_rate': [1, 0.75]},
    'gb': {'max_iter': [100, 150, 200]},
    'et': {'n_estimators': [100, 250, 500], 'max_depth': [3, 6, 9, 12]},
    'lr': {'C': [1, 10, 100]},
    'dt': {'max_depth': [3, 6, 9, 12]},
    'knn': {'n_neighbors': [5, 25, 50, 100]},
    'xgboost': {'n_estimators': [100, 250, 500]},
    'MLP': {'hidden_layer_sizes': [(100, 100, 100)]},
}

In [None]:
# Tune each base learner with a randomized search over its grid and keep the
# best estimator under the original name. An estimator with no entry in
# `parameters` is passed through unchanged; this replaces the old check against
# the name 'v', which no estimator in the list carries. The search is seeded
# with SEED so the sampled candidates are repeatable.
estimators_stacking_para = []
for name, clf in estimators_stacking:
    if name in parameters:
        clf_search = RandomizedSearchCV(clf, parameters[name], cv = 5, random_state = SEED).fit(X_train, y_train)
        estimators_stacking_para.append((name, clf_search.best_estimator_))
    else:
        estimators_stacking_para.append((name, clf))

In [None]:
clf_stacking = StackingClassifier(estimators = estimators_stacking_para, final_estimator = None, cv = 5)

In [None]:
clf_stacking.fit(X_train, y_train)

In [None]:
clf_stacking.score(X_train, y_train)

In [None]:
clf_stacking.predict_proba(X_train)

In [None]:
pred_stacking = clf_stacking.predict(X_test)

# Explainability

In [None]:
X_train.shape

In [None]:
# Permutation importance of the stacking model on the first 2000 training rows,
# with 5 shuffles per feature; seeded with SEED so the shuffles are repeatable.
importance_stacking = permutation_importance(estimator = clf_stacking, X = X_train[0:2000], y = y_train[0:2000], n_repeats = 5, random_state = SEED)

In [None]:
importance_stacking

In [None]:
importance_stacking.importances_mean

In [None]:
importance_stacking.importances_std

In [None]:
ct.get_feature_names_out()

In [None]:
plt.barh(ct.get_feature_names_out(), importance_stacking.importances_mean, xerr = importance_stacking.importances_std)
# plt.xticks(rotation='vertical')
None

In [None]:
# pred_stacking = clf_stacking.predict(X_test)

In [None]:
from sklearn.inspection import PartialDependenceDisplay

In [None]:
X_train_df = pd.DataFrame(X_train, columns = ct.get_feature_names_out())

In [None]:
X_train_df

In [None]:
feature_dependence = 'scaling__Spa'
disp1 = PartialDependenceDisplay.from_estimator(clf_stacking, X_train_df.iloc[0:2000], [feature_dependence])
plt.show()

In [None]:
# clf_stacking.predict_proba(X_train)

In [None]:
pip install shap

In [None]:
import shap

In [None]:
# {'xgboost':{'n_estimators':[100,250,500]}}
# ('xgboost', xgb.XGBClassifier())

In [None]:
# Stand-alone XGBoost model for the SHAP analysis: search over the number of
# trees with 5-fold CV and keep the best estimator. Both the model and the
# search are seeded with SEED so the result is repeatable.
clf_search = RandomizedSearchCV(xgb.XGBClassifier(random_state = SEED), {'n_estimators':[100,250,500]}, cv = 5, random_state = SEED).fit(X_train, y_train)
XGBoost_clf = clf_search.best_estimator_

In [None]:
XGBoost_clf.score(X_train, y_train)

In [None]:
XGBoost_clf.predict_proba(X_train)

In [None]:
X_train_df = pd.DataFrame(X_train , columns = ct.get_feature_names_out() )

In [None]:
X_train_df

In [None]:
# Permutation importance of the XGBoost model on the first 2000 training rows,
# with 5 shuffles per feature; seeded with SEED so the shuffles are repeatable.
importance_xgb = permutation_importance(estimator = XGBoost_clf , X = X_train[0:2000], y = y_train[0:2000], n_repeats = 5, random_state = SEED)

In [None]:
plt.barh(ct.get_feature_names_out(), importance_xgb.importances_mean, xerr = importance_xgb.importances_std)
# plt.xticks(rotation='vertical')
None

In [None]:
explainer = shap.Explainer(XGBoost_clf)
shap_values = explainer(X_train_df)

In [None]:
# explainer(X_train)

In [None]:
shap.plots.waterfall(shap_values[0])
shap.initjs()
shap.plots.force(shap_values[0])

In [None]:
shap.plots.waterfall(shap_values[1])
shap.initjs()
shap.plots.force(shap_values[1])

In [None]:
## multiple force plots rotated 90 degrees and stacked on top of each other
## interactive plot
shap.plots.force(shap_values[:500])

In [None]:
# shap.plots.violin(shap_values.values, X_train_df)

In [None]:
shap.plots.beeswarm(shap_values)

In [None]:
shap.plots.bar(shap_values)

In [None]:
# shap.decision_plot(explainer.expected_value, shap_values, features_display)
shap.decision_plot(explainer.expected_value, shap_values.values[0], X_train_df)

In [None]:
shap.decision_plot(explainer.expected_value, shap_values.values[0:20], X_train_df)

In [None]:
# help(shap.decision_plot)

In [None]:
shap_values_short = explainer(X_train_df[0:1000])

In [None]:
shap.plots.heatmap(shap_values_short)

#### Try final estimator as xgboost

In [None]:
# import xgboost as xgb

In [None]:
# help(xgb.XGBClassifier)

In [None]:
# clf_stacking_xgb = StackingClassifier(estimators = estimators_stacking_para, final_estimator = xgb.XGBClassifier(), cv = 5)

In [None]:
# clf_stacking_xgb.fit(X_train, y_train)

In [None]:
# clf_stacking_xgb.score(X_train, y_train)

In [None]:
# pred_stacking_xgb = clf_stacking_xgb.predict(X_test)

# Generate a submit file


In [None]:
# # Store our passenger ID for easy access
# PassengerId = test_df['PassengerId']

In [None]:
# # Generate Submission File
# StackingSubmission = pd.DataFrame({ 'PassengerId': PassengerId,
#                             'Survived': predictions })
# StackingSubmission.to_csv("StackingSubmission.csv", index=False

In [None]:
# Build the submission table (test-set passenger ids next to the stacking
# model's predictions), write it to CSV and preview the first rows.
y_pred = pred_stacking

submission_df = pd.DataFrame({
    'PassengerId': PassengerId_test.values,
    'Transported': y_pred,
})
submission_df.to_csv('submissions.csv', header=True, index=False)
print("Your submission was successfully saved!")
submission_df.head(10)

In [None]:
## under 4% off from top of leaderboard
## 0.79261, 0.82183

# End