In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier

# Load the training data.
titanic_data = pd.read_csv('./titanic_train.csv')

# Drop the columns that are not used as model features.
titanic_data = titanic_data.drop(columns=['Cabin', 'Name', 'Ticket', 'PassengerId'])

# Fill missing 'Age' values with the column mean.
# Assign the result back instead of calling fillna(inplace=True) on the
# column selection: that chained in-place form is deprecated and does not
# update the DataFrame when pandas Copy-on-Write is active.
titanic_data['Age'] = titanic_data['Age'].fillna(titanic_data['Age'].mean())

# Fill missing 'Embarked' values with the most frequent value (first mode).
titanic_data['Embarked'] = titanic_data['Embarked'].fillna(
    titanic_data['Embarked'].mode()[0]
)

# Cast the 'Survived' target to string class labels.
titanic_data['Survived'] = titanic_data['Survived'].astype(str)

# Encode the categorical 'Sex' and 'Embarked' columns as integers.
# An explicit per-column map yields numeric columns without relying on
# DataFrame.replace's implicit object-to-int downcasting.
# NOTE: any value missing from a mapping becomes NaN.
titanic_data['Sex'] = titanic_data['Sex'].map({'male': 0, 'female': 1})
titanic_data['Embarked'] = titanic_data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

# Separate the features (X) from the target (Y).
X = titanic_data.drop(columns=['Survived'])
Y = titanic_data['Survived']

# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

# Create the LightGBM classifier and fit it on the training partition
# (fit() returns the fitted estimator itself).
lgbm_model = LGBMClassifier(random_state=2).fit(X_train, Y_train)

# Generate class predictions for both partitions with the fitted model.
X_train_prediction_lgbm = lgbm_model.predict(X_train)
X_test_prediction_lgbm = lgbm_model.predict(X_test)

# Score each set of predictions against its true labels.
training_data_accuracy_lgbm = accuracy_score(
    y_true=Y_train, y_pred=X_train_prediction_lgbm
)
test_data_accuracy_lgbm = accuracy_score(
    y_true=Y_test, y_pred=X_test_prediction_lgbm
)

# Report the training accuracy first, then the held-out test accuracy.
print('Accuracy score of training data (LightGBM): ', training_data_accuracy_lgbm)
print('Accuracy score of test data (LightGBM): ', test_data_accuracy_lgbm)


Accuracy score of training data (LightGBM):  0.949438202247191
Accuracy score of test data (LightGBM):  0.7821229050279329
