In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [12]:
train_df = pd.read_csv('cleaned_train_df.csv')
test_df = pd.read_csv('cleaned_test_df.csv')

# Seperate the train and test data into 'X' and 'y'

In [18]:
X_train = train_df.drop('Transported', axis=1)
y_train = train_df['Transported']

X_test = test_df

In [23]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

cat_features = X_train.select_dtypes(include='object').columns
num_features = X_train.select_dtypes(exclude='object').columns

ohe = OneHotEncoder()
scale = StandardScaler()

preprocessing = ColumnTransformer(
    [
        ('OneHotEncoder', ohe, cat_features),
        ('StandardScaler', scale, num_features)
    ]
)

In [24]:
preprocessing.fit(X_train, y_train)

In [27]:
X_train = preprocessing.transform(X_train)
X_test = preprocessing.transform(X_test)

In [29]:
col_names = [name.split('__')[-1] for name in preprocessing.get_feature_names_out()]
X_train = pd.DataFrame(X_train, columns=col_names)
X_test = pd.DataFrame(X_test, columns=col_names)

# Train the model

In [34]:
from sklearn.model_selection import train_test_split

X_train_val, X_test_val, y_train_val, y_test_val = train_test_split(X_train, y_train,
                                                                  test_size=0.3,
                                                                  random_state=42)

# Model Creating

In [39]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, f1_score, recall_score
from imblearn.over_sampling import SMOTE

In [41]:
models = {
    'Logistic': LogisticRegression(),
    'SVC': SVC(),
    'Naive Bayes': GaussianNB(),
    'Knn': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Ada Boost': AdaBoostClassifier(),
    'Gradient Boost': GradientBoostingClassifier(),
    'XG Boost': XGBClassifier()
}

In [42]:
for i in range(len(models)):
    model = list(models.values())[i]
    model.fit(X_train_val, y_train_val)

    # Make Prediction
    y_train_pred = model.predict(X_train_val)
    y_test_pred = model.predict(X_test_val)

    # Training set performance
    train_as = accuracy_score(y_train_val, y_train_pred)
    train_f1 = f1_score(y_train_val, y_train_pred)
    train_recall = recall_score(y_train_val, y_train_pred)

    # Testing set performance
    test_as = accuracy_score(y_test_val, y_test_pred)
    test_f1 = f1_score(y_test_val, y_test_pred)
    test_recall = recall_score(y_test_val, y_test_pred)

    print(list(models.keys())[i])
    print('\t-Train accuracy score: ', train_as)
    print('\t-Train f1 score: ', train_f1)
    print('\t-Train recall score: ', train_recall, '\n')

    print('\t-Test accuracy score: ', test_as)
    print('\t-Test f1 score: ', test_f1)
    print('\t-Test recall score: ', test_recall)

    print('*'*15)

Logistic
	-Train accuracy score:  0.7700903861955629
	-Train f1 score:  0.7728527358337393
	-Train recall score:  0.7780320366132724 

	-Test accuracy score:  0.7641871165644172
	-Test f1 score:  0.7726432532347505
	-Test recall score:  0.7922668688400303
***************
SVC
	-Train accuracy score:  0.7804437140509449
	-Train f1 score:  0.7857601026298909
	-Train recall score:  0.8009153318077803 

	-Test accuracy score:  0.772239263803681
	-Test f1 score:  0.7830533235938641
	-Test recall score:  0.8127369219105383
***************
Naive Bayes
	-Train accuracy score:  0.7462612982744453
	-Train f1 score:  0.734525447042641
	-Train recall score:  0.6982674076495586 

	-Test accuracy score:  0.7538343558282209
	-Test f1 score:  0.7494145199063232
	-Test recall score:  0.7278241091736164
***************
Knn
	-Train accuracy score:  0.8060805258833197
	-Train f1 score:  0.8070003271180898
	-Train recall score:  0.8064727034978751 

	-Test accuracy score:  0.7584355828220859
	-Test f1 score

# Hyperparameter tuning

In [48]:
forest_params = {
    'n_estimators': [100, 200, 500, 1000],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False]
}

gradient_params = {
    'n_estimators': [100, 200, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'subsample': [0.8, 1.0]
}

xgboost_params = {
    'n_estimators': [100, 200, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7, 10],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 0.1, 0.3],
    'reg_alpha': [0, 0.01, 0.1],
    'reg_lambda': [1, 2, 5]
}

In [49]:
random_cv_model = [
    ('Random Forest', RandomForestClassifier(), forest_params),
    ('Gradient Boosting', GradientBoostingClassifier(), gradient_params),
    ('XGBoost Classifier', XGBClassifier(), xgboost_params)
]

In [50]:
from sklearn.model_selection import RandomizedSearchCV

model_param = {}
for name, model, param in random_cv_model:
    randomCV = RandomizedSearchCV(estimator=model, param_distributions=param,
                                  n_iter=100,
                                  cv=3, 
                                  verbose=2,
                                  n_jobs=1)
    randomCV.fit(X_train, y_train)
    model_param[name] =randomCV.best_params_

for model_name in model_param:
    print(f"---------- Best params for {model_name} ----------")
    print(model_param[model_name])

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END bootstrap=False, max_depth=None, max_features=None, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   0.9s
[CV] END bootstrap=False, max_depth=None, max_features=None, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   0.9s
[CV] END bootstrap=False, max_depth=None, max_features=None, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   0.9s
[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.7s
[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.7s
[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.7s
[CV] END bootstrap=True, max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimato

In [76]:
models = {
    'Random Forest': RandomForestClassifier(n_estimators=200, min_samples_split=5, min_samples_leaf=2, max_features='sqrt',
                                           max_depth=10, 
                                           bootstrap=False),
    'Gradient Boost': GradientBoostingClassifier(subsample= 0.8, n_estimators= 500, min_samples_split= 10, min_samples_leaf= 4,
                                                 max_depth= 5, learning_rate= 0.01),
    'XG Boost': XGBClassifier(subsample= 0.8, reg_lambda= 1, reg_alpha= 0, n_estimators= 100,
                              min_child_weight= 1, max_depth= 7, learning_rate= 0.01, gamma= 0.1, colsample_bytree= 0.8)
}

In [78]:
for i in range(len(models)):
    model = list(models.values())[i]
    model.fit(X_train_val, y_train_val)

    # Make Prediction
    y_train_pred = model.predict(X_train_val)
    y_test_pred = model.predict(X_test_val)

    # Training set performance
    train_as = accuracy_score(y_train_val, y_train_pred)
    train_f1 = f1_score(y_train_val, y_train_pred)
    train_recall = recall_score(y_train_val, y_train_pred)

    # Testing set performance
    test_as = accuracy_score(y_test_val, y_test_pred)
    test_f1 = f1_score(y_test_val, y_test_pred)
    test_recall = recall_score(y_test_val, y_test_pred)

    print(list(models.keys())[i])
    print('\t-Train accuracy score: ', train_as)
    print('\t-Train f1 score: ', train_f1)
    print('\t-Train recall score: ', train_recall, '\n')

    print('\t-Test accuracy score: ', test_as)
    print('\t-Test f1 score: ', test_f1)
    print('\t-Test recall score: ', test_recall)

    print('*'*15)

Random Forest
	-Train accuracy score:  0.8407559572719803
	-Train f1 score:  0.8421567030460987
	-Train recall score:  0.8450474011114744 

	-Test accuracy score:  0.7718558282208589
	-Test f1 score:  0.7787281517292675
	-Test recall score:  0.7937831690674754
***************
Gradient Boost
	-Train accuracy score:  0.8101889893179951
	-Train f1 score:  0.8134991119005328
	-Train recall score:  0.823471722785224 

	-Test accuracy score:  0.7714723926380368
	-Test f1 score:  0.780559646539028
	-Test recall score:  0.8036391205458681
***************
XG Boost
	-Train accuracy score:  0.8029580936729663
	-Train f1 score:  0.8051990251827782
	-Train recall score:  0.8100686498855835 

	-Test accuracy score:  0.7730061349693251
	-Test f1 score:  0.7799256505576208
	-Test recall score:  0.7952994692949203
***************


# Select "RandomForestClassifier" as final model.

In [83]:
random_forest =  RandomForestClassifier(n_estimators=500, min_samples_split=2, min_samples_leaf=2, max_features='log2',
                                           max_depth=10, 
                                           bootstrap=False)

random_forest.fit(X_train, y_train)
prediction = random_forest.predict(X_test)

# Create 'target_df'

In [88]:
target_df = pd.DataFrame(prediction, columns=['Transported'])

In [90]:
target_df['Transported'] = target_df['Transported'].astype(bool)

In [92]:
old_test = pd.read_csv('test.csv')
old_test.head(2)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers


In [95]:
target_df['PassengerId'] = old_test['PassengerId']

In [97]:
target_df = target_df[['PassengerId', 'Transported']]

# Save the Target Dataset

In [102]:
target_df.to_csv('Spaceship_Titanic_Final.csv', index=False)