In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, MaxAbsScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

In [10]:
train_data = pd.read_csv('/Users/yukihuang/Documents/Grinnell College/Spring2024/STA395/final project/Spaceship_Titanic/spaceship-titanic/train_data_knn.csv')
train_data_mode = pd.read_csv('/Users/yukihuang/Documents/Grinnell College/Spring2024/STA395/final project/Spaceship_Titanic/spaceship-titanic/train_data_mode.csv')

In [11]:
train_data_mode

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0.0,1.0,0.0,149.0,2.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,5252.0,0.0
1,1.0,0.0,0.0,2184.0,2.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,4502.0,1.0
2,2.0,1.0,0.0,1.0,2.0,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,457.0,0.0
3,3.0,1.0,0.0,1.0,2.0,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,7149.0,0.0
4,4.0,0.0,0.0,2186.0,2.0,16.0,0.0,303.0,70.0,151.0,565.0,2.0,8319.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,8688.0,1.0,0.0,146.0,0.0,41.0,1.0,0.0,6819.0,0.0,1643.0,74.0,3524.0,0.0
8689,8689.0,0.0,1.0,5280.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,4780.0,0.0
8690,8690.0,0.0,0.0,5285.0,2.0,26.0,0.0,0.0,0.0,1872.0,1.0,0.0,3002.0,1.0
8691,8691.0,1.0,0.0,2131.0,0.0,32.0,0.0,0.0,1049.0,0.0,353.0,3235.0,1596.0,0.0


In [14]:
train_y = train_data['Transported']
train_X = train_data.drop(['Transported'], axis = 1)

In [25]:
# knn

pipe = Pipeline([
    ('transformer',  PowerTransformer(method = 'yeo-johnson')),
    ('scaler', StandardScaler()),
    ('reducer', PCA()),
    ('classifier', None)
])

parms_knn = [
    {'classifier': [KNeighborsClassifier()],
    'scaler': [StandardScaler(), RobustScaler(), MaxAbsScaler()],
    'reducer': ['passthrough', PCA(n_components=4), PCA(n_components=8)],
    'classifier__n_neighbors': [5,10,15,20],
    'classifier__weights': ['distance', 'uniform'],
    'classifier__p': [1, 2]
}]

grid_res_knn = GridSearchCV(pipe, parms_knn, cv=5, scoring = 'accuracy').fit(train_X, train_y)
print(grid_res_knn.best_estimator_)
print(grid_res_knn.best_score_)


Pipeline(steps=[('transformer', PowerTransformer()), ('scaler', MaxAbsScaler()),
                ('reducer', 'passthrough'),
                ('classifier',
                 KNeighborsClassifier(n_neighbors=20, p=1,
                                      weights='distance'))])
0.7593474947905328


In [26]:
# decision tree

parms_dt = [
    {'classifier': [DecisionTreeClassifier()],
    'scaler': [StandardScaler(), RobustScaler(), MaxAbsScaler()],
    'reducer': ['passthrough', PCA(n_components=4), PCA(n_components=8)],
    'classifier__max_depth': range(3, 10),  
    'classifier__min_samples_split': range(2, 11), 
    'classifier__min_samples_leaf': range(1, 6)
}]

grid_res_dt = GridSearchCV(pipe, parms_dt, cv=5, scoring = 'accuracy').fit(train_X, train_y)
print(grid_res_dt.best_estimator_)
print(grid_res_dt.best_score_)


Pipeline(steps=[('transformer', PowerTransformer()), ('scaler', RobustScaler()),
                ('reducer', 'passthrough'),
                ('classifier',
                 DecisionTreeClassifier(max_depth=7, min_samples_leaf=5,
                                        min_samples_split=3))])
0.7793661423340927


In [36]:
# Support vector machine

parms_svm = [{
    'classifier': [SVC()],
    'scaler': [StandardScaler(), RobustScaler(), MaxAbsScaler()],
    'reducer': ['passthrough', PCA(n_components=4), PCA(n_components=8)],
    'classifier__kernel': ['poly', 'linear', 'rbf'],
    'classifier__C': [0.1, 1, 5],
    'classifier__class_weight': ['balanced', None]
}]

grid_res_svm = GridSearchCV(pipe, parms_svm, cv=5, scoring = 'accuracy', n_jobs = -1).fit(train_X, train_y)
print(grid_res_svm.best_estimator_)
print(grid_res_svm.best_score_)

Pipeline(steps=[('transformer', PowerTransformer()), ('scaler', MaxAbsScaler()),
                ('reducer', 'passthrough'),
                ('classifier',
                 SVC(C=5, class_weight='balanced', kernel='poly'))])
0.7757989559228451


In [31]:
# random forest 

parms_rf = [
    {
        'classifier': [RandomForestRegressor()],
        'classifier__max_depth': [10, 15, 20],
        'classifier__max_features': ['sqrt'],
        'classifier__min_samples_split': [3, 5, 10],
        'classifier__min_samples_leaf' : [1, 2, 4, 6],
        'classifier__n_estimators': [100, 200, 400]
    }
]

grid_res_rf = GridSearchCV(pipe, parms_rf, cv = 5, scoring = 'accuracy',n_jobs = -1).fit(train_X, train_y)

print(grid_res_rf.best_params_)
print(grid_res_rf.best_score_)

Traceback (most recent call last):
  File "/Users/yukihuang/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/yukihuang/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/yukihuang/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 355, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/yukihuang/anaconda3/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 211, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/U

{'classifier': RandomForestRegressor(max_depth=10, max_features='sqrt', min_samples_split=3), 'classifier__max_depth': 10, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 3, 'classifier__n_estimators': 100}
nan


In [34]:
# Xgboost
parms_xg = [
    {
    'classifier': [XGBClassifier()],
    'classifier__n_estimators': [25, 50, 75, 100],            
    'classifier__learning_rate':[0.01, 0.05, 0.1, 0.15],       
    'classifier__max_depth': [2, 4, 6, 8],     
    'classifier__gamma': [0, 0.05, 0.1, 0.15],         
    'classifier__colsample_bytree': [0.5, 0.7, 0.8], 
    'classifier__random_state': [20, 30, 40, 50]         
}]

grid_res_xg = GridSearchCV(pipe, parms_xg, cv = 5, scoring = 'accuracy',n_jobs = -1).fit(train_X, train_y)

print(grid_res_xg.best_params_)
print(grid_res_xg.best_score_) 


{'classifier': XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.7, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=0, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.05, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=6, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=75, n_jobs=None,
              num_parallel_tree=None, random_state=50, ...), 'classifier__colsample_bytree': 0.7, 'classifier__gamma': 0, 'classifier__learning_rate': 0.05, 'classifier__max_depth': 6, 'classifier__n_estimators': 75, 'classifier__random_state': 50}
0.7706205238120132
