In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.pipeline import Pipeline
from sklearn.base import ClassifierMixin, BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
data = pd.read_csv('train.csv')
data.fillna(value=0,inplace=True)
#columns_to_drop = ["Name","PassengerId"] #get rid of categorical values
#data = data.drop(columns = columns_to_drop)
#data = pd.get_dummies(data)
columns_to_drop = ["Name", "PassengerId"]
data_temp = data.drop(columns=columns_to_drop)


X_copy = data_temp.copy()
#extract the cabin info, deck, number, and side

cabin_info = X_copy['Cabin'].str.extract(r'(?P<Deck>[A-Za-z])/(?P<Number>\d+)/(?P<Side>[PS])')
print(cabin_info['Deck'])
X_copy = pd.concat([X_copy, cabin_info['Deck']], axis=1)
X_copy = pd.concat([X_copy, cabin_info['Side']], axis=1)

X_copy = X_copy.drop(columns="Cabin")  
data_temp = X_copy;
data_temp = pd.get_dummies(data_temp)
data_temp = pd.concat([data_temp, cabin_info['Number'].astype('float')], axis=1)

data_temp['PassengerId'] = data['PassengerId']

print(data_temp.columns.tolist())

train = data_temp.sample(frac = 0.70)

test = data.drop(train.index)
train_xs = train.drop(columns = "Transported")
train_ys = train['Transported']
test_xs = test.drop(columns = "Transported")
test_ys = test['Transported']


train_xs.dtypes
print(train_xs)

0       B
1       F
2       A
3       A
4       F
       ..
8688    A
8689    G
8690    G
8691    E
8692    E
Name: Deck, Length: 8693, dtype: object
['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Transported', 'HomePlanet_0', 'HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars', 'CryoSleep_False', 'CryoSleep_True', 'Destination_0', 'Destination_55 Cancri e', 'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'VIP_False', 'VIP_True', 'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_T', 'Side_P', 'Side_S', 'Number', 'PassengerId']
       Age  RoomService  FoodCourt  ShoppingMall   Spa  VRDeck  HomePlanet_0  \
3530  55.0          0.0        0.0           0.0   0.0     0.0         False   
1531   4.0          0.0        0.0           0.0   0.0     0.0         False   
5148  28.0          0.0        0.0           0.0   0.0     0.0         False   
5401  54.0          0.0        0.0           0.0   0.0     0.0         False   
203

In [3]:
class ExtractGroupNumber(BaseEstimator, ClassifierMixin):
    def __init__(self, column_name='PassengerId'):
        self.column_name = column_name

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if self.column_name in X.columns:
            X_copy = X.copy()
            X_copy['GroupNumber'] = X_copy[self.column_name].apply(lambda x: int(x.split('_')[0]))
            #print(X_copy['GroupNumber'].dtypes)
            return X_copy[['GroupNumber']]
        else:
            return X


gradientboosting_pipeline = Pipeline([
    ('extract_group', ExtractGroupNumber()),
    ('scaler',MinMaxScaler()),
    ('gradient_boosting', GradientBoostingClassifier())
])

gradientboosting_grid = {
    'gradient_boosting__n_estimators': [50,100,200],  #number of boosting stages, larger number tends to do better
    'gradient_boosting__max_depth': [3,5,7],  #limits number of nodes in tree
}
gradientboosting_search = GridSearchCV(gradientboosting_pipeline, gradientboosting_grid, scoring='accuracy', n_jobs=-1)
gradientboosting_search.fit(train_xs, train_ys)

In [4]:
gradientboosting_params = gradientboosting_search.best_params_
gradientboosting_score = gradientboosting_search.best_score_
print(f"Accuracy: {gradientboosting_score}")
print(f"Best params: {gradientboosting_params}\n")

Accuracy: 0.5467543138866064
Best params: {'gradient_boosting__max_depth': 3, 'gradient_boosting__n_estimators': 50}

