In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df_Train = pd.read_csv("train_spaceship.csv")
df_Test = pd.read_csv("test_spaceship.csv")

In [3]:
df_Train.head(10)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
5,0005_01,Earth,False,F/0/P,PSO J318.5-22,44.0,False,0.0,483.0,0.0,291.0,0.0,Sandie Hinetthews,True
6,0006_01,Earth,False,F/2/S,TRAPPIST-1e,26.0,False,42.0,1539.0,3.0,0.0,0.0,Billex Jacostaffey,True
7,0006_02,Earth,True,G/0/S,TRAPPIST-1e,28.0,False,0.0,0.0,0.0,0.0,,Candra Jacostaffey,True
8,0007_01,Earth,False,F/3/S,TRAPPIST-1e,35.0,False,0.0,785.0,17.0,216.0,0.0,Andona Beston,True
9,0008_01,Europa,True,B/1/P,55 Cancri e,14.0,False,0.0,0.0,0.0,0.0,0.0,Erraiam Flatic,True


In [4]:
df_Train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [5]:
df_Test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   4277 non-null   object 
 1   HomePlanet    4190 non-null   object 
 2   CryoSleep     4184 non-null   object 
 3   Cabin         4177 non-null   object 
 4   Destination   4185 non-null   object 
 5   Age           4186 non-null   float64
 6   VIP           4184 non-null   object 
 7   RoomService   4195 non-null   float64
 8   FoodCourt     4171 non-null   float64
 9   ShoppingMall  4179 non-null   float64
 10  Spa           4176 non-null   float64
 11  VRDeck        4197 non-null   float64
 12  Name          4183 non-null   object 
dtypes: float64(6), object(7)
memory usage: 434.5+ KB


In [6]:
df_Train.corr()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
Age,1.0,0.068723,0.130421,0.033133,0.12397,0.101007,-0.075026
RoomService,0.068723,1.0,-0.015889,0.05448,0.01008,-0.019581,-0.244611
FoodCourt,0.130421,-0.015889,1.0,-0.014228,0.221891,0.227995,0.046566
ShoppingMall,0.033133,0.05448,-0.014228,1.0,0.013879,-0.007322,0.010141
Spa,0.12397,0.01008,0.221891,0.013879,1.0,0.153821,-0.221131
VRDeck,0.101007,-0.019581,0.227995,-0.007322,0.153821,1.0,-0.207075
Transported,-0.075026,-0.244611,0.046566,0.010141,-0.221131,-0.207075,1.0


In [7]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

num_pipelines = Pipeline([
    ("imputer",SimpleImputer(strategy='median')), 
    ("Scaler",StandardScaler()),
])

cat_pipelines = Pipeline([
    ("imputer",SimpleImputer(strategy='most_frequent')),
    ("cat_encoder",OrdinalEncoder())
])

attribs = ['HomePlanet','CryoSleep','Cabin','Destination','Age','VIP','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
num_attribs = ['Age','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
cat_attribs = ['HomePlanet','CryoSleep','Cabin','Destination','VIP']

preprocess_pipeline = ColumnTransformer([
    ("num",num_pipelines,num_attribs),
    ("cat",cat_pipelines,cat_attribs)
])

In [8]:
X_Train = preprocess_pipeline.fit_transform(df_Train[num_attribs + cat_attribs])
X_Test = preprocess_pipeline.fit_transform(df_Test[num_attribs + cat_attribs])

X_Train = pd.DataFrame(X_Train, columns=num_attribs + cat_attribs, index=df_Train.index)
X_Test = pd.DataFrame(X_Test, columns=num_attribs + cat_attribs, index=df_Test.index)

In [9]:
y_Train = df_Train['Transported']

In [10]:
X_Train

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet,CryoSleep,Cabin,Destination,VIP
0,0.711945,-0.333105,-0.281027,-0.283579,-0.270626,-0.263003,1.0,0.0,149.0,2.0,0.0
1,-0.334037,-0.168073,-0.275387,-0.241771,0.217158,-0.224205,0.0,0.0,2184.0,2.0,0.0
2,2.036857,-0.268001,1.959998,-0.283579,5.695623,-0.219796,1.0,0.0,1.0,2.0,1.0
3,0.293552,-0.333105,0.523010,0.336851,2.687176,-0.092818,1.0,0.0,1.0,2.0,0.0
4,-0.891895,0.125652,-0.237159,-0.031059,0.231374,-0.261240,0.0,0.0,2186.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
8688,0.851410,-0.333105,3.992336,-0.283579,1.189173,-0.197751,1.0,0.0,146.0,0.0,1.0
8689,-0.752431,-0.333105,-0.281027,-0.283579,-0.270626,-0.263003,0.0,1.0,5280.0,1.0,0.0
8690,-0.194573,-0.333105,-0.281027,2.846999,-0.269737,-0.263003,0.0,0.0,5285.0,2.0,0.0
8691,0.223820,-0.333105,0.376365,-0.283579,0.043013,2.589576,1.0,0.0,2131.0,0.0,0.0


In [12]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, cv_index in split.split(X_Train, y_Train):
    X_train, X_cv = X_Train.iloc[train_index], X_Train.iloc[cv_index]
    y_train, y_cv = y_Train.iloc[train_index], y_Train.iloc[cv_index]

In [22]:
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier,ExtraTreesClassifier , AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC
from xgboost.sklearn import XGBClassifier

from sklearn.metrics import accuracy_score

In [105]:
def model_selection(X_train, X_cv, y_train, y_cv):
    
#     sgd_clf = SGDClassifier(random_state=42)
#     sgd_clf.fit(X_train, y_train)
#     y_pred_train = sgd_clf.predict(X_train)
#     sgd_clf_train_score = accuracy_score(y_train, y_pred_train) ###
#     y_pred_cv = sgd_clf.predict(X_cv)
#     sgd_clf_cv_score = accuracy_score(y_cv, y_pred_cv) ###
    
    log_clf = LogisticRegression(max_iter=10000,solver='sag',random_state=42)
    log_clf.fit(X_train, y_train)
    y_pred_train = log_clf.predict(X_train)
    log_clf_train_score = accuracy_score(y_train, y_pred_train) ###
    y_pred_cv = log_clf.predict(X_cv)
    log_clf_cv_score = accuracy_score(y_cv, y_pred_cv)###
    
    rnd_clf = RandomForestClassifier(random_state=42)
    rnd_clf.fit(X_train, y_train)
    y_pred_train = rnd_clf.predict(X_train)
    rnd_clf_train_score = accuracy_score(y_train, y_pred_train)###
    y_pred_cv = rnd_clf.predict(X_cv)
    rnd_clf_cv_score = accuracy_score(y_cv, y_pred_cv)###
    
    ext_rnd_clf = ExtraTreesClassifier(random_state=42)
    ext_rnd_clf.fit(X_train, y_train)
    y_pred_train = ext_rnd_clf.predict(X_train)
    ext_rnd_clf_train_score = accuracy_score(y_train, y_pred_train)###
    y_pred_cv = ext_rnd_clf.predict(X_cv)
    ext_rnd_clf_cv_score = accuracy_score(y_cv, y_pred_cv)###
    
    ada_clf = AdaBoostClassifier(random_state=42)
    ada_clf.fit(X_train, y_train)
    y_pred_train = ada_clf.predict(X_train)
    ada_clf_train_score = accuracy_score(y_train, y_pred_train)###
    y_pred_cv = ada_clf.predict(X_cv)
    ada_clf_cv_score = accuracy_score(y_cv, y_pred_cv)###
    
    tree_clf = DecisionTreeClassifier(random_state=42)
    tree_clf.fit(X_train, y_train)
    y_pred_train = tree_clf.predict(X_train)
    tree_clf_train_score = accuracy_score(y_train, y_pred_train)###
    y_pred_cv = tree_clf.predict(X_cv)
    tree_clf_cv_score = accuracy_score(y_cv, y_pred_cv)###
    
    
#     svm_clf = SVC()
#     svm_clf.fit(X_train, y_train)
#     y_pred_train = svm_clf.predict(X_train)
#     svm_clf_train_score = accuracy_score(y_train, y_pred_train)###
#     y_pred_cv = svm_clf.predict(X_cv)
#     svm_clf_cv_score = accuracy_score(y_cv, y_pred_cv)###
    
#     lin_svm_clf = LinearSVC()
#     lin_svm_clf.fit(X_train, y_train)
#     y_pred_train = lin_svm_clf.predict(X_train)
#     lin_svm_clf_train_score = accuracy_score(y_train, y_pred_train)###
#     y_pred_cv = lin_svm_clf.predict(X_cv)
#     lin_svm_clf_cv_score = accuracy_score(y_cv, y_pred_cv)###
    
    xgb_clf = XGBClassifier(random_state=42)
    xgb_clf.fit(X_train, y_train)
    y_pred_train = xgb_clf.predict(X_train)
    xgb_clf_train_score = accuracy_score(y_train, y_pred_train)###
    y_pred_cv = xgb_clf.predict(X_cv)
    xgb_clf_cv_score = accuracy_score(y_cv, y_pred_cv)###
    
    grd_bst_clf = GradientBoostingClassifier(random_state=42)
    grd_bst_clf.fit(X_train, y_train)
    y_pred_train = grd_bst_clf.predict(X_train)
    grd_bst_clf_train_score = accuracy_score(y_train, y_pred_train)###
    y_pred_cv = grd_bst_clf.predict(X_cv)
    grd_bst_clf_cv_score = accuracy_score(y_cv, y_pred_cv)###
    
    estimators = [ ('lr', log_clf), ('rf', rnd_clf), ('ext', ext_rnd_clf), ('ada', ada_clf),
                  ('tr', tree_clf), ('xgb', xgb_clf), ('grd', grd_bst_clf)]
    vot_clf = VotingClassifier(estimators=estimators, voting='hard')
    vot_clf.fit(X_train, y_train)
    y_pred_train = vot_clf.predict(X_train)
    vot_clf_train_score = accuracy_score(y_train, y_pred_train)###
    y_pred_cv = vot_clf.predict(X_cv)
    vot_clf_cv_score = accuracy_score(y_cv, y_pred_cv)###
    
#     bag_clf = BaggingClassifier(vot_clf, n_estimators=100, max_samples=0.7, bootstrap=True, verbose=2, oob_score=True, n_jobs=-1)
#     bag_clf.fit(X_train, y_train)
#     y_pred_train = bag_clf.predict(X_train)
#     bag_clf_train_score = accuracy_score(y_train, y_pred_train)
#     y_pred_cv = bag_clf.predict(X_cv)
#     bag_clf_cv_score = accuracy_score(y_cv, y_pred_cv)
    
    models = ['LogisticRegression', 'RandomForestClassifier', 'ExtraTreesClassifier', 'AdaBoostClassifier',
              'DecisionTreeClassifier','XGBClassifier', 'GradientBoostingClassifier', 'VotingClassifier']
    train_scores = [ log_clf_train_score, rnd_clf_train_score, ext_rnd_clf_train_score, ada_clf_train_score,
                   tree_clf_train_score,  xgb_clf_train_score, grd_bst_clf_train_score,
                   vot_clf_train_score]
    cv_scores = [ log_clf_cv_score, rnd_clf_cv_score, ext_rnd_clf_cv_score, ada_clf_cv_score,
                   tree_clf_cv_score, xgb_clf_cv_score, grd_bst_clf_cv_score,
                   vot_clf_cv_score]
    
    dic = {'Models': models, 'train_scores':train_scores, 'cv_scores':cv_scores}
    table = pd.DataFrame(dic)
    print(table)
    
    predictions = xgb_clf.predict(X_Test)
    predictions = predictions.astype('bool')

    output = pd.DataFrame({'PassengerId': df_Test.PassengerId,
                          'Transported': predictions})

    output.to_csv('spaceship3.csv', index= False)

In [106]:
model_selection(X_train, X_cv, y_train, y_cv)

                       Models  train_scores  cv_scores
0          LogisticRegression      0.770923   0.758482
1      RandomForestClassifier      0.999425   0.795860
2        ExtraTreesClassifier      0.999425   0.787234
3          AdaBoostClassifier      0.796232   0.797585
4      DecisionTreeClassifier      0.999425   0.749856
5               XGBClassifier      0.914294   0.796435
6  GradientBoostingClassifier      0.816796   0.803910
7            VotingClassifier      0.931982   0.803910


In [51]:
sgd_clf = SGDClassifier(random_state=42)
log_clf = LogisticRegression(max_iter=10000, random_state=42)
rnd_clf = RandomForestClassifier(random_state=42)
ext_rnd_clf = ExtraTreesClassifier(random_state=42)
ada_clf = AdaBoostClassifier(random_state=42)
tree_clf = DecisionTreeClassifier(random_state=42)
svm_clf = SVC()
lin_svm_clf = LinearSVC()
xgb_clf = XGBClassifier(random_state=42)
grd_bst_clf = GradientBoostingClassifier(random_state=42)

for clf in (sgd_clf, log_clf,rnd_clf, ext_rnd_clf, ada_clf, tree_clf,svm_clf,lin_svm_clf, xgb_clf, grd_bst_clf):
    bag_clf = BaggingClassifier(clf, n_estimators=50, max_samples=1.0, bootstrap=True, oob_score=True, n_jobs=-1)
    bag_clf.fit(X_train, y_train)
    y_pred_train = bag_clf.predict(X_train)
    bag_clf_train_score = accuracy_score(y_train, y_pred_train)
    y_pred_cv = bag_clf.predict(X_cv)
    bag_clf_cv_score = accuracy_score(y_cv, y_pred_cv)
    table = pd.DataFrame({'Model': clf.__class__.__name__ , 'train_score': bag_clf_train_score, 'cv_score': bag_clf_cv_score}, index=[0])
    print(table)

           Model  train_score  cv_score
0  SGDClassifier     0.522002  0.512363
                Model  train_score  cv_score
0  LogisticRegression     0.780846  0.776308
                    Model  train_score  cv_score
0  RandomForestClassifier     0.964912   0.80161
                  Model  train_score  cv_score
0  ExtraTreesClassifier     0.999425  0.798735
                Model  train_score  cv_score
0  AdaBoostClassifier     0.796232   0.79701
                    Model  train_score  cv_score
0  DecisionTreeClassifier     0.999281   0.79356
  Model  train_score  cv_score
0   SVC     0.576503  0.571593
       Model  train_score  cv_score
0  LinearSVC     0.740293  0.751006
           Model  train_score  cv_score
0  XGBClassifier     0.907104  0.798735
                        Model  train_score  cv_score
0  GradientBoostingClassifier      0.81277  0.799885


In [62]:
y_train

3600     True
1262     True
8612    False
5075     True
4758    False
        ...  
4087     True
4406    False
7111     True
426      True
7925     True
Name: Transported, Length: 6954, dtype: bool

In [63]:
y_cv

3586     True
7173    False
8559    False
6528     True
7934    False
        ...  
3749     True
1637    False
5820     True
5757    False
4135     True
Name: Transported, Length: 1739, dtype: bool

In [65]:
X_Test

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet,CryoSleep,Cabin,Destination,VIP
0,-0.114147,-0.357339,-0.283840,-0.312173,-0.267841,-0.246712,0.0,1.0,2784.0,2.0,0.0
1,-0.684313,-0.357339,-0.277879,-0.312173,2.287504,-0.246712,0.0,0.0,1867.0,2.0,0.0
2,0.170937,-0.357339,-0.283840,-0.312173,-0.267841,-0.246712,1.0,1.0,257.0,0.0,0.0
3,0.669832,-0.357339,4.121518,-0.312173,-0.104002,0.226648,1.0,0.0,259.0,2.0,0.0
4,-0.613042,-0.340723,-0.283840,0.832122,-0.267841,-0.246712,0.0,0.0,1940.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
4272,0.384749,-0.357339,-0.283840,-0.312173,-0.267841,-0.246712,0.0,1.0,2679.0,2.0,0.0
4273,0.954916,-0.357339,0.277095,-0.281538,-0.258790,-0.130193,0.0,0.0,2691.0,2.0,0.0
4274,-0.185417,-0.357339,-0.283840,-0.312173,-0.267841,-0.246712,2.0,1.0,602.0,0.0,0.0
4275,-0.185417,-0.357339,1.491019,-0.312173,-0.267841,0.176479,1.0,0.0,603.0,2.0,0.0


In [111]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

gbrt = GradientBoostingClassifier(random_state = 42)
parameters  = {
    'learning_rate':[0.1,0.09,0.08,0.07,0.06,0.05,0.04,0.03,0.02],
    'min_samples_split':[2,3,4,5,6,7,8,9],
    'min_samples_leaf':[1,2,3,4,5],
    'max_depth':[1,2,3],
    'min_weight_fraction_leaf':[0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
              }
grid_search = GridSearchCV(estimator = gbrt,
                           param_grid = parameters,
                           cv = 3,
                           verbose=1,
                           n_jobs = -1)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 6480 candidates, totalling 19440 fits


In [112]:
grid_search.best_estimator_ #learning_rate=0.1, min_samples_split=2, min_samples_leaf=1 => default

In [113]:
grid_search.best_params_

{'learning_rate': 0.09,
 'max_depth': 3,
 'min_samples_leaf': 2,
 'min_samples_split': 6,
 'min_weight_fraction_leaf': 0.0}

In [69]:
grd_bst_clf = GradientBoostingClassifier(learning_rate=0.09, min_samples_leaf=2,
                           min_samples_split=5, random_state=42)
grd_bst_clf.fit(X_train, y_train)
y_pred_train = grd_bst_clf.predict(X_train)
grd_bst_clf_train_score = accuracy_score(y_train, y_pred_train)###
y_pred_cv = grd_bst_clf.predict(X_cv)
grd_bst_clf_cv_score = accuracy_score(y_cv, y_pred_cv)###

In [71]:
dic = {'Models': 'GradientBoostingClassifier', 'train_scores':grd_bst_clf_train_score, 'cv_scores':grd_bst_clf_cv_score}
table = pd.DataFrame(dic, index=[0])
print(table)

                       Models  train_scores  cv_scores
0  GradientBoostingClassifier      0.813345    0.80276


In [72]:
predictions = grd_bst_clf.predict(X_Test)

output = pd.DataFrame({'PassengerId': df_Test.PassengerId,
                      'Transported': predictions})

output.to_csv('spaceship4.csv', index= False)

In [104]:
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans

clf = Pipeline([
    ('kmeans', KMeans(n_clusters=50)),
    ('clf_',  AdaBoostClassifier(random_state=42))
])

clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train)
clf_train_score = accuracy_score(y_train, y_pred_train)###
y_pred_cv = clf.predict(X_cv)
clf_cv_score = accuracy_score(y_cv, y_pred_cv)###

dic = {'Models': 'clf', 'train_scores': clf_train_score, 'cv_scores': clf_cv_score}
table = pd.DataFrame(dic, index=[0])
print(table)

  Models  train_scores  cv_scores
0    clf       0.62367   0.610121


In [114]:
grd_bst_clf = GradientBoostingClassifier(learning_rate=0.09, min_samples_leaf=2,
                           min_samples_split=6, random_state=42)
grd_bst_clf.fit(X_train, y_train)
y_pred_train = grd_bst_clf.predict(X_train)
grd_bst_clf_train_score = accuracy_score(y_train, y_pred_train)###
y_pred_cv = grd_bst_clf.predict(X_cv)
grd_bst_clf_cv_score = accuracy_score(y_cv, y_pred_cv)###

# {'learning_rate': 0.09,#
#  'max_depth': 3,
#  'min_samples_leaf': 2,#
#  'min_samples_split': 6,
#  'min_weight_fraction_leaf': 0.0}
dic = {'Models': 'GradientBoostingClassifier', 'train_scores':grd_bst_clf_train_score, 'cv_scores':grd_bst_clf_cv_score}
table = pd.DataFrame(dic, index=[0])
print(table)

                       Models  train_scores  cv_scores
0  GradientBoostingClassifier      0.812482   0.801035


In [115]:
predictions = grd_bst_clf.predict(X_Test)

output = pd.DataFrame({'PassengerId': df_Test.PassengerId,
                      'Transported': predictions})

output.to_csv('spaceship5.csv', index= False)