<a href="https://colab.research.google.com/github/SJP0308/MLTUTORIAL/blob/titanic/titanic2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
# 경고 메시지를 무시하도록 설정
warnings.filterwarnings('ignore')

# 플롯을 인라인으로 표시하도록 설정
%matplotlib inline

In [2]:
from google.colab import drive

drive.mount('/content/drive')
train_data = pd.read_csv('/content/drive/My Drive/titanic/train.csv')
test_data = pd.read_csv('/content/drive/My Drive/titanic/test.csv')

train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **특성 공학**

In [3]:
def fill_age_with_mean(age, title):
    if pd.isnull(age):
        mean_age = {'Master': 4, 'Miss': 21, 'Mr': 33, 'Mrs':35}
        return mean_age.get(title, age)
    return age

def extract_age_group(age):
    if pd.notnull(age):
        return int((age-0.01)//10)

def fill_fare_with_train_mean(fare, Pclass):
    if pd.isnull(fare):
        mean_fares = {1: 84.15, 2: 20.66, 3: 13.68}
        return mean_fares.get(Pclass, fare)
    return round(fare, 2)

def fill_fare_with_test_mean(fare, Pclass):
    if pd.isnull(fare):
        mean_fares = {1: 94.28, 2: 22.20, 3: 12.46}
        return mean_fares.get(Pclass, fare)
    return round(fare, 2)

title_mapping = {'Mlle': 'Miss', 'Mme': 'Mrs', 'Ms': 'Miss', 'Dr': 'Mr', 'Major': 'Mr', 'Lady': 'Mrs',
    'Countess': 'Mrs', 'Jonkheer': 'Mr', 'Col': 'Mr', 'Rev': 'Mr', 'Capt': 'Mr', 'Sir': 'Mr',
    'Don': 'Mr', 'Dona': 'Mrs'}

In [4]:
train_df['Title'] = train_df['Name'].str.extract(' ([A-Z][a-z]+)\.')
test_df['Title']  = test_df['Name'].str.extract(' ([A-Z][a-z]+)\.')
train_df['Title'].replace(title_mapping, inplace=True)
test_df['Title'].replace(title_mapping, inplace=True)

train_df['Age'] = train_df.apply(lambda row: fill_age_with_mean(row['Age'], row['Title']), axis=1)
test_df['Age'] = test_df.apply(lambda row: fill_age_with_mean(row['Age'], row['Title']), axis=1)
train_df['AgeGroup'] = train_df['Age'].apply(extract_age_group)
test_df['AgeGroup'] = test_df['Age'].apply(extract_age_group)

train_df['FamilySize'] = train_df['SibSp'] + train_df['Parch']
test_df['FamilySize'] = test_df['SibSp'] + test_df['Parch']
train_df['IsAlone'] = (train_df['FamilySize'] == 1).astype(int)
test_df['IsAlone'] = (test_df['FamilySize'] == 1).astype(int)

train_df['Fare'] = train_df.apply(lambda row: fill_fare_with_train_mean(row['Fare'], row['Pclass']), axis=1)
test_df['Fare'] = test_df.apply(lambda row: fill_fare_with_test_mean(row['Fare'], row['Pclass']), axis=1)
train_df['Fare'] = np.where(train_df['Fare'] > 0, np.log(train_df['Fare']), 0)
test_df['Fare'] = np.where(test_df['Fare'] > 0, np.log(test_df['Fare']), 0)

train_df['Embarked'] = train_df['Embarked'].fillna('S')
test_df['Embarked'] = test_df['Embarked'].fillna('S')

In [5]:
train_df.drop(['PassengerId', 'Name', 'Age', 'SibSp', 'Parch', 'Ticket', 'Cabin'], axis=1, inplace=True)
test_df.drop(['PassengerId', 'Name', 'Age', 'SibSp', 'Parch', 'Ticket', 'Cabin'], axis=1, inplace=True)

# **원-핫 인코딩**

In [6]:
train_df['Sex'] = train_df['Sex'].map({'female': 0, 'male': 1})
test_df['Sex'] = test_df['Sex'].map({'female': 0, 'male': 1})

train_df['Embarked'] = train_df['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})
test_df['Embarked'] = test_df['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})

train_df['Title'] = train_df['Title'].map({'Master': 0, 'Miss': 1, 'Mr': 2, 'Mrs': 3})
test_df['Title'] = test_df['Title'].map({'Master': 0, 'Miss': 1, 'Mr': 2, 'Mrs': 3})

In [7]:
for column in ['Pclass', 'Sex', 'Embarked', 'Title', 'AgeGroup', 'FamilySize', 'IsAlone']:
  train_df = pd.get_dummies(train_df, columns=[column], prefix=column)
  test_df = pd.get_dummies(test_df, columns=[column], prefix=column)

# **모델 학습 및 테스트 세트 예측**

In [8]:
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# 데이터 준비
X_train = train_df.drop('Survived', axis=1).values
target_label = train_df['Survived'].values
X_test = test_df.values

# 훈련 세트와 검증 세트로 분할
X_tr, X_vld, y_tr, y_vld = train_test_split(X_train, target_label, test_size=0.3, random_state=42)

# XGBoost 모델 초기화
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# 하이퍼파라미터 그리드 설정
param_grid = {
    'max_depth': [3, 4, 5],
    'min_child_weight': [1, 2, 3],
    'subsample': [0.6, 0.7, 0.8],
    'colsample_bytree': [0.6, 0.7, 0.8],
    'n_estimators': [100, 200]
}

# K-Fold 설정
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# GridSearchCV 설정
grid_search = GridSearchCV(estimator=model, param_grid=param_grid,
                           scoring='accuracy', cv=kf, verbose=1, n_jobs=-1)

# 모델 학습
grid_search.fit(X_tr, y_tr)

# 최적 하이퍼파라미터 출력
best_params = grid_search.best_params_
print("Best parameters:", best_params)

# 최적 모델로 검증 데이터에서 성능 평가
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_vld)
accuracy = accuracy_score(y_vld, y_pred)
print("Validation Accuracy: {:.2f}".format(accuracy))

Fitting 5 folds for each of 162 candidates, totalling 810 fits
Best parameters: {'colsample_bytree': 0.8, 'max_depth': 4, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 0.6}
Validation Accuracy: 0.81


In [9]:
submission = pd.read_csv('/content/drive/My Drive/titanic/gender_submission.csv')
prediction = best_model.predict(X_test)
submission['Survived'] = prediction
submission.to_csv('./answer.csv', index=False)