<a href="https://www.kaggle.com/code/noibl0/titanic-ml?scriptVersionId=131705288" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

In [2]:
def convert_columns(df):
    df['Cabin'].fillna(0, inplace=True)
    df['Cabin'] = np.where(df['Cabin'] == 0, 0, 1)
    return df

In [3]:
def select_drop_features(df, cardinality=6):
    # over 30 cardinality column will be dropped.
    
    columns_to_drop = df.apply(lambda x: x.nunique() > cardinality)
    return columns_to_drop

In [4]:
def feature_engineering(df, drop_features):
    X = df.drop(columns=drop_features[drop_features].index)
    return X

In [5]:
def split_X_y(df):
    target = 'Survived'
    X = df.drop(columns=target)
    y = df[target]
    return X, y

In [6]:
df = pd.read_csv('/kaggle/input/titanic/train.csv')
df_t = pd.read_csv('/kaggle/input/titanic/test.csv')

In [7]:
df = convert_columns(df)
df_t = convert_columns(df_t)
drop_features = select_drop_features(df)
df = feature_engineering(df, drop_features)
X_test = feature_engineering(df_t, drop_features)
X_train, y_train = split_X_y(df)

Comparing the train data with the test data, Survived feature is the target to be predicted.

Select the features to predict target using in ML.

In [8]:
pipe = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(),
    XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='binary:logistic')
)

pipe.fit(X_train, y_train)

In [9]:
params = {
    "xgbclassifier__max_depth": [2, 4, 6],
    "xgbclassifier__min_child_weight": [2, 4, 8],
    "xgbclassifier__colsample_bytree": [0.6, 0.8, 1.0],
}

In [10]:
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(pipe, param_grid=params, scoring="roc_auc", cv=3, verbose=3)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV 1/3] END xgbclassifier__colsample_bytree=0.6, xgbclassifier__max_depth=2, xgbclassifier__min_child_weight=2;, score=0.817 total time=   0.0s
[CV 2/3] END xgbclassifier__colsample_bytree=0.6, xgbclassifier__max_depth=2, xgbclassifier__min_child_weight=2;, score=0.852 total time=   0.0s
[CV 3/3] END xgbclassifier__colsample_bytree=0.6, xgbclassifier__max_depth=2, xgbclassifier__min_child_weight=2;, score=0.842 total time=   0.0s
[CV 1/3] END xgbclassifier__colsample_bytree=0.6, xgbclassifier__max_depth=2, xgbclassifier__min_child_weight=4;, score=0.817 total time=   0.0s
[CV 2/3] END xgbclassifier__colsample_bytree=0.6, xgbclassifier__max_depth=2, xgbclassifier__min_child_weight=4;, score=0.852 total time=   0.0s
[CV 3/3] END xgbclassifier__colsample_bytree=0.6, xgbclassifier__max_depth=2, xgbclassifier__min_child_weight=4;, score=0.842 total time=   0.0s
[CV 1/3] END xgbclassifier__colsample_bytree=0.6, xgbclassifier__max_

In [11]:
print("최적 하이퍼파라미터: ", grid_search.best_params_)
print("최적 AUC: ", grid_search.best_score_)

최적 하이퍼파라미터:  {'xgbclassifier__colsample_bytree': 1.0, 'xgbclassifier__max_depth': 6, 'xgbclassifier__min_child_weight': 4}
최적 AUC:  0.8428482408206307


In [12]:
y_pred = grid_search.predict(X_train)
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.95      0.86       549
           1       0.88      0.58      0.70       342

    accuracy                           0.81       891
   macro avg       0.84      0.77      0.78       891
weighted avg       0.82      0.81      0.80       891



In [13]:
y_pred = grid_search.predict(X_test)
submission = pd.DataFrame({'PassengerId' : df_t['PassengerId'], 'Survived' : y_pred})
submission.to_csv('submission.csv', index=False)