In [55]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, MaxAbsScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV


In [36]:
test_data = pd.read_csv('/Users/yukihuang/Documents/Grinnell College/Spring2024/STA395/final project/Spaceship_Titanic/spaceship-titanic/test_data_mode.csv')
train_data = pd.read_csv('/Users/yukihuang/Documents/Grinnell College/Spring2024/STA395/final project/Spaceship_Titanic/spaceship-titanic/train_data_mode.csv')

test_passenger_id = test_data['PassengerId'].copy()
train_data.drop(['PassengerId', 'Name', 'Cabin'], axis=1, inplace=True)
test_data.drop(['PassengerId', 'Name', 'Cabin'], axis=1, inplace=True)

# Convert Boolean Columns to Floats
train_data.replace({False: 0, True: 1}, inplace=True)
test_data.replace({False: 0, True: 1}, inplace=True)

# Splitting the features and target
train_y = train_data['Transported'].astype(float)
train_X = train_data.drop('Transported', axis=1).astype(float)
test_X = test_data.astype(float)

In [37]:
# k-nearest neighbors

pipe = Pipeline([
    ('transformer',  PowerTransformer(method = 'yeo-johnson')),
    ('scaler', StandardScaler()),
    ('reducer', PCA()),
    ('classifier', None)
])

parms_knn = [
    {'classifier': [KNeighborsClassifier()],
     'scaler': [StandardScaler(), RobustScaler(), MaxAbsScaler()],
    'reducer': ['passthrough', PCA(n_components=4), PCA(n_components=8)],
    'classifier__n_neighbors': [20, 30,35, 40,45, 55],
    'classifier__weights': ['distance', 'uniform'],
    'classifier__p': [1, 2]
}]

grid_res_knn = GridSearchCV(pipe, parms_knn, cv=5, scoring = 'accuracy').fit(train_X, train_y)
print(grid_res_knn.best_estimator_)
print(grid_res_knn.best_score_)


Pipeline(steps=[('transformer', PowerTransformer()), ('scaler', MaxAbsScaler()),
                ('reducer', 'passthrough'),
                ('classifier',
                 KNeighborsClassifier(n_neighbors=35, p=1,
                                      weights='distance'))])
0.7617624112372294


In [38]:
test_predictions = grid_res_knn.best_estimator_.predict(test_X)
submission_df = pd.DataFrame({
    "PassengerId": test_passenger_id,
    "Transported": test_predictions.flatten().astype(bool)
})
submission_df.to_csv('/Users/yukihuang/Documents/Grinnell College/Spring2024/STA395/final project/Spaceship_Titanic/spaceship-titanic/submission_knn.csv', index=False)

In [40]:
# decision tree

parms_dt = [
    {'classifier': [DecisionTreeClassifier()],
    'scaler': [StandardScaler(), RobustScaler(), MaxAbsScaler()],
    'reducer': ['passthrough', PCA(n_components=4), PCA(n_components=8)],
    'classifier__max_depth': range(3, 10),  
    'classifier__min_samples_split': range(2, 11), 
    'classifier__min_samples_leaf': range(1, 6)
}]

grid_res_dt = GridSearchCV(pipe, parms_dt, cv=5, scoring = 'accuracy',n_jobs = -1).fit(train_X, train_y)
print(grid_res_dt.best_estimator_)
print(grid_res_dt.best_score_)


Pipeline(steps=[('transformer', PowerTransformer()), ('scaler', RobustScaler()),
                ('reducer', 'passthrough'),
                ('classifier',
                 DecisionTreeClassifier(max_depth=8, min_samples_leaf=3,
                                        min_samples_split=4))])
0.777638829241307


In [41]:
test_predictions_dt = grid_res_dt.best_estimator_.predict(test_X)
submission_dt = pd.DataFrame({
    "PassengerId": test_passenger_id,
    "Transported": test_predictions_dt.flatten().astype(bool)
})
submission_dt.to_csv('/Users/yukihuang/Documents/Grinnell College/Spring2024/STA395/final project/Spaceship_Titanic/spaceship-titanic/submission_dt.csv', index=False)


In [66]:
# Support vector machine

parms_svm = [{
    'classifier': [SVC()],
    'scaler': [StandardScaler(), RobustScaler(), MaxAbsScaler()],
    'reducer': ['passthrough', PCA(n_components= 12), PCA(n_components= 16)],
    'classifier__kernel': ['poly', 'linear', 'rbf'],
    'classifier__C': [15, 20, 25],
    'classifier__class_weight': ['balanced', None]
}]

grid_res_svm = GridSearchCV(pipe, parms_svm, cv=5, scoring = 'accuracy', n_jobs = -1).fit(train_X, train_y)
print(grid_res_svm.best_estimator_)
print(grid_res_svm.best_score_)

Pipeline(steps=[('transformer', PowerTransformer()), ('scaler', MaxAbsScaler()),
                ('reducer', PCA(n_components=12)),
                ('classifier',
                 SVC(C=25, class_weight='balanced', kernel='poly'))])
0.7776398880088619


In [65]:
test_predictions_svm = grid_res_svm.best_estimator_.predict(test_X)
submission_svm = pd.DataFrame({
    "PassengerId": test_passenger_id,
    "Transported": test_predictions_svm.flatten().astype(bool)
})
submission_svm.to_csv('/Users/yukihuang/Documents/Grinnell College/Spring2024/STA395/final project/Spaceship_Titanic/spaceship-titanic/submission_svm.csv', index=False)


In [51]:
# random forest 

parms_rf = [
    {
        'classifier': [RandomForestClassifier()],
        'classifier__max_depth': [6, 8, 10, 15, 20],
        'classifier__max_features': ['sqrt'],
        'classifier__min_samples_split': [3, 5, 6, 8, 10],
        'classifier__min_samples_leaf' : [1, 2, 4, 6],
        'classifier__n_estimators': [100, 200, 400]
    }
]

grid_res_rf = GridSearchCV(pipe, parms_rf, cv = 5, scoring = 'accuracy',n_jobs = -1).fit(train_X, train_y)

print(grid_res_rf.best_params_)
print(grid_res_rf.best_score_)

{'classifier': RandomForestClassifier(max_depth=10, min_samples_leaf=2, min_samples_split=3,
                       n_estimators=400), 'classifier__max_depth': 10, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 3, 'classifier__n_estimators': 400}
0.7644078081460252


In [53]:
test_predictions_rf = grid_res_rf.best_estimator_.predict(test_X)
submission_rf = pd.DataFrame({
    "PassengerId": test_passenger_id,
    "Transported": test_predictions_rf.flatten().astype(bool)
})
submission_rf.to_csv('/Users/yukihuang/Documents/Grinnell College/Spring2024/STA395/final project/Spaceship_Titanic/spaceship-titanic/submission_rf.csv', index=False)


In [45]:
# Xgboost
parms_xg = [
    {
    'classifier': [XGBClassifier()],
    'classifier__n_estimators': [25, 50, 75, 100],            
    'classifier__learning_rate':[0.01, 0.05, 0.1, 0.15],       
    'classifier__max_depth': [2, 4, 6, 8],     
    'classifier__gamma': [0, 0.05, 0.1, 0.15],         
    'classifier__colsample_bytree': [0.5, 0.7, 0.8], 
    'classifier__random_state': [20, 30, 40, 50]         
}]

grid_res_xg = GridSearchCV(pipe, parms_xg, cv = 5, scoring = 'accuracy',n_jobs = -1).fit(train_X, train_y)

print(grid_res_xg.best_params_)
print(grid_res_xg.best_score_) 


{'classifier': XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=0.15, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=6, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=75, n_jobs=None,
              num_parallel_tree=None, random_state=30, ...), 'classifier__colsample_bytree': 0.8, 'classifier__gamma': 0.15, 'classifier__learning_rate': 0.1, 'classifier__max_depth': 6, 'classifier__n_estimators': 75, 'classifier__random_state': 30}
0.7618747067710171


In [52]:
test_predictions_xgb = grid_res_xg.best_estimator_.predict(test_X)
submission_xgb = pd.DataFrame({
    "PassengerId": test_passenger_id,
    "Transported": test_predictions_xgb.flatten().astype(bool)
})
submission_xgb.to_csv('/Users/yukihuang/Documents/Grinnell College/Spring2024/STA395/final project/Spaceship_Titanic/spaceship-titanic/submission_xgb.csv', index=False)


In [62]:
# logistic regression 
parms_lg =[
    {
        'classifier': [LogisticRegression()],
        'classifier__penalty': ['l1','l2'],
        'classifier__C': [0.1, 0.5, 0.8, 1.0],
        'classifier__solver': ['liblinear', 'saga'],
        'classifier__random_state': [0, 1, 2],
        'classifier__max_iter': [1000]
    },
    {
        'classifier': [LogisticRegression()],
        'classifier__penalty': ['l2'],
        'classifier__C': [0.1, 0.5, 0.8, 1.0],
        'classifier__solver': ['lbfgs', 'newton-cg', 'saga'],
        'classifier__random_state': [0, 1, 2],
        'classifier__max_iter': [1000]
    },
    {
        'classifier': [LogisticRegression()],
        'classifier__penalty': [None],
        'classifier__solver': ['lbfgs', 'newton-cg', 'saga'],
        'classifier__random_state': [0, 1, 2],
        'classifier__max_iter': [1000]
    },
    {
        'classifier': [LogisticRegression()],
        'classifier__penalty': ['elasticnet'],
        'classifier__C': [0.1, 0.5, 0.8, 1.0],
        'classifier__solver': ['saga'],
        'classifier__l1_ratio': [0.1, 0.15, 0.4, 0.6, 0.8],
        'classifier__random_state': [0, 1, 2],
        'classifier__max_iter': [1000]
    }]

grid_res_lg = GridSearchCV(pipe, parms_lg, cv = 5, scoring = 'accuracy',n_jobs = -1).fit(train_X, train_y)

print(grid_res_lg.best_params_)
print(grid_res_lg.best_score_) 


{'classifier': LogisticRegression(C=0.1, max_iter=1000, penalty='l1', random_state=0,
                   solver='liblinear'), 'classifier__C': 0.1, 'classifier__max_iter': 1000, 'classifier__penalty': 'l1', 'classifier__random_state': 0, 'classifier__solver': 'liblinear'}
0.7501429005334204


In [63]:
test_predictions_lg = grid_res_lg.best_estimator_.predict(test_X)
submission_lg = pd.DataFrame({
    "PassengerId": test_passenger_id,
    "Transported": test_predictions_lg.flatten().astype(bool)
})
submission_lg.to_csv('/Users/yukihuang/Documents/Grinnell College/Spring2024/STA395/final project/Spaceship_Titanic/spaceship-titanic/submission_lg.csv', index=False)
