## Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import set_config; set_config(display='diagram')

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier, StackingClassifier
from xgboost import XGBClassifier


In [3]:
df = pd.read_csv('Data/train.csv')

## Data Exploration

In [11]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,total_spent,total_spent_log,cabin_deck,cabin_num,cabin_side,AgeBinary
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,0.0,0.0,B,0,P,Adult
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,736.0,6.602588,F,0,S,Adult
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,10383.0,9.248021,A,0,S,Adult
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,5176.0,8.551981,A,0,S,Adult
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,1091.0,6.995766,F,1,S,Children


In [None]:
df.describe()

In [5]:
df.shape

(8693, 14)

In [12]:
df.dtypes

PassengerId          object
HomePlanet           object
CryoSleep            object
Cabin                object
Destination          object
Age                 float64
VIP                  object
RoomService         float64
FoodCourt           float64
ShoppingMall        float64
Spa                 float64
VRDeck              float64
Name                 object
Transported            bool
total_spent         float64
total_spent_log     float64
cabin_deck           object
cabin_num            object
cabin_side           object
AgeBinary          category
dtype: object

In [7]:
df.nunique()

PassengerId     8693
HomePlanet         3
CryoSleep          2
Cabin           6560
Destination        3
Age               80
VIP                2
RoomService     1273
FoodCourt       1507
ShoppingMall    1115
Spa             1327
VRDeck          1306
Name            8473
Transported        2
dtype: int64

In [9]:
df.isnull().sum().sort_values(ascending=False)/len(df)

CryoSleep       0.024963
ShoppingMall    0.023927
VIP             0.023352
HomePlanet      0.023122
Name            0.023007
Cabin           0.022892
VRDeck          0.021627
FoodCourt       0.021051
Spa             0.021051
Destination     0.020936
RoomService     0.020821
Age             0.020591
PassengerId     0.000000
Transported     0.000000
dtype: float64

In [12]:
df.Age.nunique()

80

## Data Preprocessing

In [None]:
# df.duplicated().sum()

In [4]:
df[['RoomService', 'FoodCourt','ShoppingMall','Spa','VRDeck']]=\
df[['RoomService', 'FoodCourt','ShoppingMall','Spa','VRDeck']].fillna(0.0);
df['total_spend'] = df['RoomService'] + df['FoodCourt'] + df['ShoppingMall'] + \
df['Spa'] + df['VRDeck'];
df['total_spent_log'] = np.log1p(df['total_spend'])

In [6]:
df['CryoSleep'] = np.float16(df['CryoSleep']*1.0);
df.loc[(df.CryoSleep.isna()==True) & (df['total_spend'] == 0.0), ['CryoSleep']] = 1.0;
df.loc[(df.CryoSleep.isna()==True) & (df['total_spend'] > 0.0), ['CryoSleep']] = 0.0;

In [7]:
df[['cabin_deck', 'cabin_num', 'cabin_side']] = df.Cabin.str.split("/", expand = True)

In [8]:
df['Age'] = df['Age'].fillna(df.Age.median())
df['AgeCategory'] = pd.cut(x = df['Age'],
                             bins= [df['Age'].min()-1,
                                    5,
                                    18,
                                    65,
                                    df['Age'].max()+1],
                             labels=['Baby','Children','Adult','Senior'])

In [9]:
# df[['passenger_group', 'passenger_number']] = df.PassengerId.str.split("_", expand = True)

In [9]:
X = df.drop(columns=['PassengerId', 'Transported', 'Name', 'Cabin', 'cabin_num', 'total_spend', 'Age'])
y = df.Transported
X.shape

(8693, 13)

In [10]:
X.isnull().sum().sort_values(ascending=False)/len(df)

VIP                0.023352
HomePlanet         0.023122
cabin_deck         0.022892
cabin_side         0.022892
Destination        0.020936
CryoSleep          0.000000
RoomService        0.000000
FoodCourt          0.000000
ShoppingMall       0.000000
Spa                0.000000
VRDeck             0.000000
total_spent_log    0.000000
AgeCategory        0.000000
dtype: float64

In [None]:
X.head()

In [None]:
X.dtypes

In [None]:
X.select_dtypes(include='object').nunique()

In [None]:
X.select_dtypes(include="float64")

In [11]:
feat_categorical = X.select_dtypes(include=['object', 'category']).columns

In [12]:
feat_numerical = X.select_dtypes(include="float64").columns

In [13]:
label_encoder = LabelEncoder()

y_encoded = label_encoder.fit_transform(y)

## Feature Correlation

## Baseline Pipe

In [11]:
preproc_numerical_baseline = make_pipeline(
    SimpleImputer(), MinMaxScaler())

preproc_categorical_baseline = make_pipeline(
    OneHotEncoder(handle_unknown="ignore"))

preproc_baseline = make_column_transformer(
    (preproc_numerical_baseline, feat_numerical),
    (preproc_categorical_baseline, feat_categorical),
    remainder="drop")

preproc_baseline

In [20]:
pipe_baseline = make_pipeline(preproc_baseline, LogisticRegression())
pipe_baseline

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('pipeline-1',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer()),
                                                                  ('minmaxscaler',
                                                                   MinMaxScaler())]),
                                                  Index(['Age', 'total_spent'], dtype='object')),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('onehotencoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  Index(['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'cabin_deck',
       'cabin_side'],
      dtype='object'))])),
                ('logisticregressio

In [21]:
score_baseline = cross_val_score(pipe_baseline, X, y_encoded, cv=5, scoring='accuracy').mean()
score_baseline

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7259879128449018

In [22]:
X_test = pd.read_csv("Data/test.csv")
X_test_ids = X_test['PassengerId']
X_test['total_spent'] = X_test['RoomService'] + X_test['FoodCourt'] + X_test['ShoppingMall'] + X_test['Spa'] + X_test['VRDeck']
X_test[['cabin_deck', 'cabin_num', 'cabin_side']] = X_test.Cabin.str.split("/", expand = True)
X_test = X_test.drop(columns=['PassengerId', 'Name', 'Cabin', 'cabin_num', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'])

pipe_baseline.fit(X,y_encoded)
y_pred_baseline = pipe_baseline.predict(X_test)
y_pred_baseline

array([1, 0, 1, ..., 1, 0, 1])

## Model Iteration

In [14]:
preproc_numerical = make_pipeline(
    SimpleImputer(strategy="median"), StandardScaler())

preproc_categorical = make_pipeline(
    SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown='ignore', sparse=False))

preproc = make_column_transformer(
    (preproc_numerical, feat_numerical),
    (preproc_categorical, feat_categorical),
    remainder="drop")

preproc

### Decision Tree

In [12]:
model = DecisionTreeClassifier()
pipe_decision_tree = make_pipeline(preproc, model) 
cross_val_score(pipe_decision_tree, X, y_encoded, cv=5, scoring='accuracy').mean()

0.6749100543875659

### SVC

In [19]:
model = SVC()
pipe_decision_tree = make_pipeline(preproc, model) 
cross_val_score(pipe_decision_tree, X, y_encoded, cv=5, scoring='accuracy').mean()

0.724031972133238

### Random Forest Classifier

In [18]:
model = RandomForestClassifier()
pipe_RandomForest = make_pipeline(preproc, model) 
cross_val_score(pipe_RandomForest, X, y_encoded, cv=5, scoring='accuracy').mean()

0.7045902867341057

### AdaBoost

In [19]:
model = AdaBoostClassifier()
pipe_AdaBoost = make_pipeline(preproc, model) 
cross_val_score(pipe_AdaBoost, X, y_encoded, cv=5, scoring='accuracy').mean()

0.7311644259395405

### KNN

In [20]:
model = KNeighborsClassifier(n_neighbors=3)
pipe_KNN = make_pipeline(preproc, model) 
cross_val_score(pipe_KNN, X, y_encoded, cv=5, scoring='accuracy').mean()

0.6808911646509276

### XGBoost

In [23]:
model = XGBClassifier(learning_rate=0.065, max_depth=4, n_estimators=300, reg_lambda=1, reg_alpha=1, use_label_encoder=False,
                subsample=0.8, colsample_bytree=0.8, gamma=10, min_child_weight=1, booster='gbtree', eval_metric='logloss')
                                        
pipe_XGBoost = make_pipeline(preproc, model) 
cross_val_score(pipe_XGBoost, X, y_encoded, cv=5, scoring='accuracy', n_jobs=-1).mean()

0.796965373668848

### Voting

In [None]:
Xgboost = XGBClassifier(learning_rate=0.1, max_depth=5, n_estimators=100, reg_lambda=1, reg_alpha=1, use_label_encoder=False,
                subsample=0.8, colsample_bytree=0.8, gamma=0, min_child_weight=1, booster='gbtree', eval_metric='logloss')
Logreg = LogisticRegression()
Svc = SVC()
Adaboost = AdaBoostClassifier()


model = VotingClassifier(
    estimators = [("Xgboost", Xgboost),("Logreg", Logreg),("Svc", Svc), ("Adaboost", Adaboost)],
    weights = [1,1,1,1], # to equally weight the two models
    n_jobs=-1
)

pipe_ensemble = make_pipeline(preproc, model)

score = cross_val_score(pipe_ensemble, X, y_encoded, cv=5, scoring='accuracy', n_jobs=-1)
print(score.std())
score.mean()

### Stacking

In [22]:
Xgboost = XGBClassifier(learning_rate=0.1, max_depth=5, n_estimators=100, reg_lambda=1, reg_alpha=1, use_label_encoder=False,
                subsample=0.8, colsample_bytree=0.8, gamma=0, min_child_weight=1, booster='gbtree', eval_metric='logloss')
Logreg = LogisticRegression()
Svc = SVC()
Adaboost = AdaBoostClassifier()
Gnb = GaussianNB()

model = StackingClassifier(
    estimators=[("Xgboost", Xgboost),("Svc", Svc),("Adaboost", Adaboost), ("Gnb", Gnb)],
    
    final_estimator=LogisticRegression(),
    cv=5,
    n_jobs=-1
)

pipe_stacking = make_pipeline(preproc, model)
score = cross_val_score(pipe_stacking, X, y_encoded, cv=5, scoring='accuracy', n_jobs=-1)
print(score.std())
score.mean()

0.008769707095997229


0.7468091061950475

## Train and Tune the Model

In [None]:
pipe_XGBoost.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(transformers=[('pipeline-1',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer(strategy='median')),
                                                    ('standardscaler',
                                                     StandardScaler())]),
                                    Index(['Age', 'total_spent'], dtype='object')),
                                   ('pipeline-2',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer(strategy='most_frequent')),
                                                    ('onehotencoder',
                                                     OneHotEncoder(handle_unknown='ignore'))]),
                                    Index(['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'cabin_deck',
          'cabin_side'],
 

In [None]:
allow_grid_searching = True

if allow_grid_searching:
    param_grid =  {'xgbclassifier__max_depth': [4, 5, 6],
                  'xgbclassifier__n_estimators': [100, 150, 200],
                   'xgbclassifier__learning_rate': [0.07, 0.1, 0.13],
                  }
    search_XGBoost = GridSearchCV(pipe_XGBoost, param_grid=param_grid, 
                              cv=5, n_jobs=-1, verbose=2, scoring='accuracy')

    search_XGBoost.fit(X, y_encoded);
    svm_XGBoost_best = search_XGBoost.best_estimator_
    print(search_XGBoost.best_params_)
    search_XGBoost.best_score_

## Feature Selection

In [19]:
from sklearn.inspection import permutation_importance

model = pipe_XGBoost.fit(X, y_encoded)

permutation_score = permutation_importance(model, X, y_encoded, n_repeats=10)

importance_df = pd.DataFrame(np.vstack((X.columns,
                                        permutation_score.importances_mean)).T)
importance_df.columns=['feature','score decrease']

importance_df.sort_values(by="score decrease", ascending = False)

Unnamed: 0,feature,score decrease
9,total_spent_log,0.089555
7,Spa,0.051501
8,VRDeck,0.046854
5,FoodCourt,0.04621
4,RoomService,0.031531
10,cabin_deck,0.028322
6,ShoppingMall,0.021834
11,cabin_side,0.015208
2,Destination,0.009157
0,HomePlanet,0.008846


In [27]:
X = X.drop(columns=[''])

## Final Result

In [29]:
X_test = pd.read_csv('Data/test.csv')
X_test_ids = X_test['PassengerId']

X_test[['RoomService', 'FoodCourt','ShoppingMall','Spa','VRDeck']]=\
X_test[['RoomService', 'FoodCourt','ShoppingMall','Spa','VRDeck']].fillna(0.0);
X_test['total_spend'] = X_test['RoomService'] + X_test['FoodCourt'] + X_test['ShoppingMall'] + \
X_test['Spa'] + X_test['VRDeck'];
X_test['total_spent_log'] = np.log1p(X_test['total_spend'])

X_test['CryoSleep'] = np.float16(X_test['CryoSleep']*1.0);
X_test.loc[(X_test.CryoSleep.isna()==True) & (X_test['total_spend'] == 0.0), ['CryoSleep']] = 1.0;
X_test.loc[(X_test.CryoSleep.isna()==True) & (X_test['total_spend'] > 0.0), ['CryoSleep']] = 0.0;

X_test[['cabin_deck', 'cabin_num', 'cabin_side']] = X_test.Cabin.str.split("/", expand = True)

X_test['Age'] = X_test['Age'].fillna(X_test.Age.median())
X_test['AgeCategory'] = pd.cut(x = X_test['Age'],
                             bins= [X_test['Age'].min()-1,
                                    5,
                                    18,
                                    65,
                                    X_test['Age'].max()+1],
                             labels=['Baby','Children','Adult','Senior'])

X_test = X_test.drop(columns=['PassengerId', 'Name', 'Cabin', 'cabin_num', 'total_spend', 'Age'])

pipe_XGBoost.fit(X,y_encoded)
y_pred_encoded = pipe_XGBoost.predict(X_test)
y_pred = label_encoder.inverse_transform(y_pred_encoded)

In [30]:
results = pd.concat([X_test_ids, pd.Series(y_pred, name="Transported")], axis=1)
results

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True


In [31]:
results.to_csv("Data/submission_final.csv", header=True, index=False)