데이터 전처리 없이 머신러닝 돌리기

In [1]:
# 라이브러리
import os
import pandas as pd
import numpy as np

# 데이터 시각화
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

import torch
import random

def reset_seeds(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)    # 파이썬 환경변수 시드 고정
    np.random.seed(seed)
    torch.manual_seed(seed) # cpu 연산 무작위 고정
    torch.cuda.manual_seed(seed) # gpu 연산 무작위 고정
    torch.backends.cudnn.deterministic = True  # cuda 라이브러리에서 Deterministic(결정론적)으로 예측하기 (예측에 대한 불확실성 제거 )

In [2]:
# hotel폴더의 archive폴더에 있는 CSV로드
import easydict
args = easydict.EasyDict()

args.default_path = './archive/hotel_bookings.csv'

# 데이터 분석을 위한 변수들
args.random_state = 21

데이터 로드

In [3]:
plt.style.use('fivethirtyeight')
plt.ion()

import warnings
warnings.filterwarnings('ignore')

In [4]:
df = pd.read_csv(args.default_path)
df.shape

(119390, 32)

In [5]:
df["country"].mode()[0]

'PRT'

In [6]:
df.dtypes

hotel                              object
is_canceled                         int64
lead_time                           int64
arrival_date_year                   int64
arrival_date_month                 object
arrival_date_week_number            int64
arrival_date_day_of_month           int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
adults                              int64
children                          float64
babies                              int64
meal                               object
country                            object
market_segment                     object
distribution_channel               object
is_repeated_guest                   int64
previous_cancellations              int64
previous_bookings_not_canceled      int64
reserved_room_type                 object
assigned_room_type                 object
booking_changes                     int64
deposit_type                       object
agent                             

In [7]:
# company, agent, children → 0으로 채우기
df["company"] = df["company"].fillna(0)
df["agent"] = df["agent"].fillna(0)
df["children"] = df["children"].fillna(0)

# country → 최빈값으로 채우기
df["country"] = df["country"].fillna(df["country"].mode()[0])

train, test 분리

In [8]:
from sklearn.model_selection import train_test_split

# 전체 train/test 먼저
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['is_canceled'])

# train 안에서 다시 eval 분리
train_df, eval_df = train_test_split(train_df, test_size=0.2, random_state=42, stratify=train_df['is_canceled'])

print("Train:", train_df.shape)
print("Eval :", eval_df.shape)
print("Test :", test_df.shape)
# 과적합을 막기 위해 train, eval, test로 나눔

Train: (76409, 32)
Eval : (19103, 32)
Test : (23878, 32)


staking 기법 사용

In [9]:
from sklearn.ensemble import StackingClassifier

from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [10]:
# X, y 정의
X = df.drop(["is_canceled", "reservation_status", "reservation_status_date"], axis=1)
y = df["is_canceled"]

# train/test split
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [11]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# 수치형/범주형 컬럼 분리
num_cols = X_tr.select_dtypes(include=['int64','float64']).columns.tolist()
cat_cols = X_tr.select_dtypes(include=['object']).columns.tolist()

# 전처리 파이프라인
preprocessor = ColumnTransformer(
    transformers=[
        # ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
    ]
)


MLP

In [12]:
from sklearn.neural_network import MLPClassifier

# MLP 파이프라인 생성
mlp_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', MLPClassifier(max_iter=1000, random_state=42))
])

# 학습 및 평가
mlp_pipeline.fit(X_tr, y_tr)
print("MLPClassifier 훈련 점수:", mlp_pipeline.score(X_tr, y_tr))
print("MLPClassifier 테스트 점수:", mlp_pipeline.score(X_te, y_te))


MLPClassifier 훈련 점수: 0.8155624424156127
MLPClassifier 테스트 점수: 0.7944970265516375


Logistic

In [13]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression 파이프라인 생성
lr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))
])

# 학습 및 평가
lr_pipeline.fit(X_tr, y_tr)
print("LogisticRegression 훈련 점수:", lr_pipeline.score(X_tr, y_tr))
print("LogisticRegression 테스트 점수:", lr_pipeline.score(X_te, y_te))

LogisticRegression 훈련 점수: 0.7616529860122289
LogisticRegression 테스트 점수: 0.7617053354552308


Random Forest

In [14]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest 파이프라인 생성
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# 학습 및 평가
rf_pipeline.fit(X_tr, y_tr)
print("RandomForestClassifier 훈련 점수:", rf_pipeline.score(X_tr, y_tr))
print("RandomForestClassifier 테스트 점수:", rf_pipeline.score(X_te, y_te))

RandomForestClassifier 훈련 점수: 0.8298014909121367
RandomForestClassifier 테스트 점수: 0.7978055113493593
