## Import

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import set_config; set_config(display='diagram')

In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier, StackingClassifier
from xgboost import XGBClassifier


In [4]:
df = pd.read_csv('Data/train.csv')

## Data Exploration

In [3]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
df.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [5]:
df.shape

(8693, 14)

In [6]:
df.dtypes

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

In [7]:
df.nunique()

PassengerId     8693
HomePlanet         3
CryoSleep          2
Cabin           6560
Destination        3
Age               80
VIP                2
RoomService     1273
FoodCourt       1507
ShoppingMall    1115
Spa             1327
VRDeck          1306
Name            8473
Transported        2
dtype: int64

In [9]:
df.isnull().sum().sort_values(ascending=False)/len(df)

CryoSleep       0.024963
ShoppingMall    0.023927
VIP             0.023352
HomePlanet      0.023122
Name            0.023007
Cabin           0.022892
VRDeck          0.021627
FoodCourt       0.021051
Spa             0.021051
Destination     0.020936
RoomService     0.020821
Age             0.020591
PassengerId     0.000000
Transported     0.000000
dtype: float64

## Data Preprocessing

In [None]:
# df.duplicated().sum()

In [5]:
df['total_spent'] = df['RoomService'] + df['FoodCourt'] + df['ShoppingMall'] + df['Spa'] + df['VRDeck']

In [6]:
df[['cabin_deck', 'cabin_num', 'cabin_side']] = df.Cabin.str.split("/", expand = True)

In [7]:
X = df.drop(columns=['PassengerId', 'Transported', 'Name', 'Cabin', 'cabin_num', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'])
y = df.Transported
X.shape

(8693, 8)

In [8]:
X.select_dtypes(include='object')

Unnamed: 0,HomePlanet,CryoSleep,Destination,VIP,cabin_deck,cabin_side
0,Europa,False,TRAPPIST-1e,False,B,P
1,Earth,False,TRAPPIST-1e,False,F,S
2,Europa,False,TRAPPIST-1e,True,A,S
3,Europa,False,TRAPPIST-1e,False,A,S
4,Earth,False,TRAPPIST-1e,False,F,S
...,...,...,...,...,...,...
8688,Europa,False,55 Cancri e,True,A,P
8689,Earth,True,PSO J318.5-22,False,G,S
8690,Earth,False,TRAPPIST-1e,False,G,S
8691,Europa,False,55 Cancri e,False,E,S


In [9]:
# X.select_dtypes(include='object').nunique()

In [10]:
# X.select_dtypes(include="float64")

In [11]:
feat_categorical = X.select_dtypes(include='object').columns

In [12]:
feat_numerical = X.select_dtypes(include="float64").columns

In [13]:
label_encoder = LabelEncoder()

y_encoded = label_encoder.fit_transform(y)

## Feature Correlation

## Baseline Pipe

In [14]:
preproc_numerical_baseline = make_pipeline(
    SimpleImputer(), MinMaxScaler())

preproc_categorical_baseline = make_pipeline(
    OneHotEncoder(handle_unknown="ignore"))

preproc_baseline = make_column_transformer(
    (preproc_numerical_baseline, feat_numerical),
    (preproc_categorical_baseline, feat_categorical),
    remainder="drop")

preproc_baseline

In [20]:
pipe_baseline = make_pipeline(preproc_baseline, LogisticRegression())
pipe_baseline

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('pipeline-1',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer()),
                                                                  ('minmaxscaler',
                                                                   MinMaxScaler())]),
                                                  Index(['Age', 'total_spent'], dtype='object')),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('onehotencoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  Index(['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'cabin_deck',
       'cabin_side'],
      dtype='object'))])),
                ('logisticregressio

In [21]:
score_baseline = cross_val_score(pipe_baseline, X, y_encoded, cv=5, scoring='accuracy').mean()
score_baseline

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7259879128449018

In [22]:
X_test = pd.read_csv("Data/test.csv")
X_test_ids = X_test['PassengerId']
X_test['total_spent'] = X_test['RoomService'] + X_test['FoodCourt'] + X_test['ShoppingMall'] + X_test['Spa'] + X_test['VRDeck']
X_test[['cabin_deck', 'cabin_num', 'cabin_side']] = X_test.Cabin.str.split("/", expand = True)
X_test = X_test.drop(columns=['PassengerId', 'Name', 'Cabin', 'cabin_num', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'])

pipe_baseline.fit(X,y_encoded)
y_pred_baseline = pipe_baseline.predict(X_test)
y_pred_baseline

array([1, 0, 1, ..., 1, 0, 1])

## Model Iteration

In [29]:
preproc_numerical = make_pipeline(
    SimpleImputer(strategy="median"), MinMaxScaler())

preproc_categorical = make_pipeline(
    SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore"))

preproc = make_column_transformer(
    (preproc_numerical, feat_numerical),
    (preproc_categorical, feat_categorical),
    remainder="drop")

preproc

### Decision Tree

In [18]:
model = DecisionTreeClassifier()
pipe_decision_tree = make_pipeline(preproc, model) 
cross_val_score(pipe_decision_tree, X, y_encoded, cv=5, scoring='accuracy').mean()

0.6747941193403084

### SVC

In [19]:
model = SVC()
pipe_decision_tree = make_pipeline(preproc, model) 
cross_val_score(pipe_decision_tree, X, y_encoded, cv=5, scoring='accuracy').mean()

0.724031972133238

### Random Forest Classifier

In [18]:
model = RandomForestClassifier()
pipe_RandomForest = make_pipeline(preproc, model) 
cross_val_score(pipe_RandomForest, X, y_encoded, cv=5, scoring='accuracy').mean()

0.7045902867341057

### AdaBoost

In [19]:
model = AdaBoostClassifier()
pipe_AdaBoost = make_pipeline(preproc, model) 
cross_val_score(pipe_AdaBoost, X, y_encoded, cv=5, scoring='accuracy').mean()

0.7311644259395405

### KNN

In [20]:
model = KNeighborsClassifier(n_neighbors=3)
pipe_KNN = make_pipeline(preproc, model) 
cross_val_score(pipe_KNN, X, y_encoded, cv=5, scoring='accuracy').mean()

0.6808911646509276

### XGBoost

In [34]:
model = XGBClassifier(learning_rate=0.1, max_depth=5, n_estimators=100, reg_lambda=1, reg_alpha=1, use_label_encoder=False,
                subsample=0.8, colsample_bytree=0.8, gamma=0, min_child_weight=1, booster='gbtree', eval_metric='logloss')
                                        
pipe_XGBoost = make_pipeline(preproc, model) 
cross_val_score(pipe_XGBoost, X, y_encoded, cv=5, scoring='accuracy', n_jobs=-1).mean()

0.7494553633524815

In [28]:
XGBClassifier().get_params()

{'objective': 'binary:logistic',
 'use_label_encoder': True,
 'base_score': None,
 'booster': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'enable_categorical': False,
 'gamma': None,
 'gpu_id': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_delta_step': None,
 'max_depth': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'predictor': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [None]:
allow_grid_searching = True

if allow_grid_searching:
    param_grid =  {'xgbclassifier__max_depth': [3, 5, 7],
                  'xgbclassifier__n_estimators': [10, 50, 100],
                   'xgbclassifier__learning_rate': [0.01, 0.1, 1],
                  }
    search_XGBoost = GridSearchCV(pipe_XGBoost, param_grid=param_grid, 
                              cv=5, n_jobs=-1, verbose=2, scoring='roc_auc')

    search_XGBoost.fit(X, y_encoded);
    svm_XGBoost_best = search_XGBoost.best_estimator_
    print(search_XGBoost.best_params_)
    search_XGBoost.best_score_

In [None]:
allow_grid_searching = True

if allow_grid_searching:
    param_grid =  {'xgbclassifier__n_estimators': [10, 50, 100],
                   'xgbclassifier__learning_rate': [0.1, 0.3, 0.5],
                  }
    search_XGBoost = GridSearchCV(pipe_XGBoost, param_grid=param_grid, 
                              cv=5, n_jobs=-1, verbose=2, scoring='roc_auc')

    search_XGBoost.fit(X, y_encoded);
    svm_XGBoost_best = search_XGBoost.best_estimator_
    print(search_XGBoost.best_params_)
    search_XGBoost.best_score_

## Feature Selection

In [22]:
from sklearn.inspection import permutation_importance

model = pipe_XGBoost.fit(X, y_encoded)

permutation_score = permutation_importance(model, X, y, n_repeats=10)

importance_df = pd.DataFrame(np.vstack((X.columns,
                                        permutation_score.importances_mean)).T)
importance_df.columns=['feature','score decrease']

importance_df.sort_values(by="score decrease", ascending = False)





Unnamed: 0,feature,score decrease
1,CryoSleep,0.124905
5,total_spent,0.05027
3,Age,0.035339
6,cabin_deck,0.032923
0,HomePlanet,0.020131
7,cabin_side,0.014667
2,Destination,0.004854
4,VIP,0.000978


In [27]:
X = X.drop(columns=['Mars'])

## Train and Tune the Model

In [None]:
from sklearn.model_selection import GridSearchCV
if allow_grid_searching:
    param_grid =  {'n_estimators': [100, 150, 200],
                   'n_jobs': [3,4,5],
                   'max_depth': [10,15,20]
                   }
    search_ridge = GridSearchCV(model, param_grid=param_grid, 
                              cv=5, n_jobs=-1, verbose=2, scoring='accuracy')

    search_ridge.fit(X, y);
    print(search_ridge.best_params_)
    search_ridge.best_score_