데이터 전처리 없이 머신러닝 돌리기

In [1]:
# 라이브러리
import os
import pandas as pd
import numpy as np

# 데이터 시각화
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

import torch
import random

def reset_seeds(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus every CUDA device), and configures cuDNN for deterministic
    behaviour so repeated runs give the same results.

    Args:
        seed: Integer seed applied to all generators. Defaults to 42.
    """
    random.seed(seed)
    # Exported mainly for child processes: the hash seed of the interpreter
    # that is already running was fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)  # CPU generator
    # Seed all CUDA devices, not only the current one; this call is
    # silently ignored when CUDA is not available.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True  # deterministic cuDNN kernels
    # The cuDNN auto-tuner may pick different algorithms between runs,
    # so it is switched off as well.
    torch.backends.cudnn.benchmark = False

In [2]:
# Notebook-wide settings collected in one attribute-style dictionary.
import easydict

args = easydict.EasyDict(
    {
        # Relative path of the bookings CSV inside the archive folder.
        'default_path': './archive/hotel_bookings.csv',
        # Variable reserved for the data-analysis steps.
        'random_state': 21,
    }
)

데이터 로드

In [3]:
# Plot appearance: FiveThirtyEight theme, interactive mode switched on.
mpl.style.use('fivethirtyeight')
plt.ion()

# Suppress every warning for the rest of the session.
import warnings
warnings.filterwarnings(action='ignore')

In [4]:
# Read the CSV configured in args.default_path, then display (rows, columns).
df = pd.read_csv(filepath_or_buffer=args.default_path)
df.shape

(119390, 32)

In [5]:
# Most frequent value of the country column; mode() returns a Series,
# so its first entry is taken.
df["country"].mode()[0]

'PRT'

In [6]:
# Display the dtype of every column in df.
df.dtypes

hotel                              object
is_canceled                         int64
lead_time                           int64
arrival_date_year                   int64
arrival_date_month                 object
arrival_date_week_number            int64
arrival_date_day_of_month           int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
adults                              int64
children                          float64
babies                              int64
meal                               object
country                            object
market_segment                     object
distribution_channel               object
is_repeated_guest                   int64
previous_cancellations              int64
previous_bookings_not_canceled      int64
reserved_room_type                 object
assigned_room_type                 object
booking_changes                     int64
deposit_type                       object
agent                             

In [7]:
# Missing company / agent / children values become 0.
for zero_filled_col in ("company", "agent", "children"):
    df[zero_filled_col] = df[zero_filled_col].fillna(0)

# Missing country values become the most frequent country.
most_common_country = df["country"].mode()[0]
df["country"] = df["country"].fillna(most_common_country)

train, test 분리

In [8]:
from sklearn.model_selection import train_test_split

# Stage 1: hold out 20% of all rows as the test set, stratified on the target.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['is_canceled'],
)

# Stage 2: split 20% of the remaining training rows off as the eval set.
train_df, eval_df = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
    stratify=train_df['is_canceled'],
)

# The data is divided into train / eval / test to guard against overfitting.
for split_label, split_frame in (
    ("Train:", train_df),
    ("Eval :", eval_df),
    ("Test :", test_df),
):
    print(split_label, split_frame.shape)

Train: (76409, 32)
Eval : (19103, 32)
Test : (23878, 32)


feature 생성

In [9]:
from sklearn.ensemble import StackingClassifier

from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [11]:
# Features are every column except the target and the two
# reservation_status columns (presumably excluded because they describe the
# booking outcome - confirm against the dataset description).
X = df.drop(["is_canceled", "reservation_status", "reservation_status_date"], axis=1)
y = df["is_canceled"]

# Stratified 80/20 train/test split.
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Later cells add new columns to X_tr / X_te in place. Explicit copies make
# those assignments land on independent frames instead of on objects derived
# from X; any pandas chained-assignment warning would be invisible here
# because all warnings are silenced earlier in the notebook.
X_tr = X_tr.copy()
X_te = X_te.copy()

feature 생성

In [None]:
# Build 'is_alone' on both splits: 1 when adults + children + babies == 1,
# otherwise 0.
for frame in (X_tr, X_te):
    frame['total_guests'] = frame['adults'] + frame['children'] + frame['babies']
    # Vectorised comparison instead of a row-by-row apply/lambda; the
    # explicit int64 cast keeps the dtype the apply-based version produced.
    frame['is_alone'] = (frame['total_guests'] == 1).astype('int64')

# Verify the new feature has been added (total_guests is still present here).
print("X_tr shape:", X_tr.shape)
print("X_te shape:", X_te.shape)

# total_guests was only an intermediate value, so drop it again.
X_tr = X_tr.drop('total_guests', axis=1)
X_te = X_te.drop('total_guests', axis=1)

print("\nAfter dropping 'total_guests':")
print("X_tr columns:", X_tr.columns)
print("X_te columns:", X_te.columns)

company, agent  
회사와 대행사가 미치는 영향 분석

In [None]:
# has_company is 1 when the company value is greater than 0, otherwise 0.
# The boolean mask is converted to integers with .astype(int).
for frame in (X_te, X_tr):
    frame['has_company'] = frame['company'].gt(0).astype(int)

# Check: share of each has_company value in the training set.
print("has_company 분포:")
company_counts = X_tr['has_company'].value_counts()
print(company_counts / len(X_tr['has_company']))

In [None]:
# This cell was an exact repeat of the has_company cell above, while the
# section covers both company and agent and no agent flag was ever built.
# has_agent is 1 when the agent value is greater than 0, otherwise 0
# (missing agent values were filled with 0 earlier in the notebook).
# .astype(int) converts the boolean mask to integers.
X_te['has_agent'] = (X_te['agent'] > 0).astype(int)
X_tr['has_agent'] = (X_tr['agent'] > 0).astype(int)
# Check: share of each has_agent value in the training set.
print("has_agent 분포:")
print(X_tr['has_agent'].value_counts()/len(X_tr['has_agent']))

meal 형태  
FB(Full Board), 즉 하루 세 끼 식사를 모두 제공하는 경우 취소율이 높았음

In [None]:
# Binary feature: 1 when meal is 'FB', 0 for any other value.
import numpy as np

for frame in (X_tr, X_te):
    is_full_board = frame['meal'] == 'FB'
    # np.where assigns the value conditionally.
    frame['is_FB_meal'] = np.where(is_full_board, 1, 0)

# Check: share of each is_FB_meal value in the training set.
print("is_FB_meal 분포:")
fb_counts = X_tr['is_FB_meal'].value_counts()
print(fb_counts / len(X_tr['is_FB_meal']))

예약한 방과 실제 배정된 방이 다른 경우가 취소에 미치는 영향

In [1]:
import numpy as np

# is_room_changed on df: 0 when reserved and assigned room types are equal,
# 1 otherwise.
same_room = df['reserved_room_type'] == df['assigned_room_type']
df['is_room_changed'] = np.where(same_room, 0, 1)

# Preview the two source columns next to the newly created flag.
preview_cols = ['reserved_room_type', 'assigned_room_type', 'is_room_changed']
print(df[preview_cols].head())

NameError: name 'df' is not defined

In [None]:
import numpy as np

# Add is_room_changed to the train and test features: 0 when the reserved
# and assigned room types are equal, 1 otherwise.
for frame in (X_tr, X_te):
    same_room = frame['reserved_room_type'] == frame['assigned_room_type']
    frame['is_room_changed'] = np.where(same_room, 0, 1)

# Confirm the feature was added to both frames.
print("X_tr columns:", X_tr.columns)
print("X_te columns:", X_te.columns)

예약한 방과 배정된 방이 같으면 0, 다르면 1

adr 이상치 처리  
1~200 범위를 벗어난 값은 모두 중앙값으로 대체

In [None]:
import numpy as np

# Median of adr taken from the TRAINING rows only. Using the median of the
# full df would let test rows influence a value that is written into the
# features (data leakage); the lead_time cell follows the same
# training-only rule.
adr_median = X_tr['adr'].median()

# adr_processed: values outside the 1..200 range are replaced by the
# training median; everything else is kept as is. The same median is
# applied to both the training and the test set.
for frame in (X_tr, X_te):
    out_of_range = (frame['adr'] < 1) | (frame['adr'] > 200)
    frame['adr_processed'] = np.where(out_of_range, adr_median, frame['adr'])

# Compare the distribution before and after processing (training set).
print(f"Original adr median: {X_tr['adr'].median():.2f}")
print(f"Processed adr median: {X_tr['adr_processed'].median():.2f}")

print("\nOriginal adr value counts (outliers):")
print(X_tr[(X_tr['adr'] < 1) | (X_tr['adr'] > 200)]['adr'].describe())

print("\nProcessed adr value counts:")
print(X_tr['adr_processed'].describe())

lead_time 이상치 처리  
IQR 계산법으로 구한 범위(0~373)를 벗어난 값을 이상치로 본다.  
373보다 크면 중앙값으로 대체함

In [None]:
# The replacement value is the median lead time of the TRAINING set.
lead_time_median = X_tr['lead_time'].median()

# Build 'lead_time_processed' for the training set first, then for the
# testing set. Values below 0 or above 373 are swapped for the training
# median; the test set reuses that same median.
for frame in (X_tr, X_te):
    lead_time = frame['lead_time']
    is_outlier = (lead_time < 0) | (lead_time > 373)
    frame['lead_time_processed'] = np.where(is_outlier, lead_time_median, lead_time)


hotel 타입

In [None]:
# Derive 'is_resort' from the 'hotel' column:
# 'City Hotel' maps to 0 and 'Resort Hotel' maps to 1.
hotel_to_flag = {'City Hotel': 0, 'Resort Hotel': 1}
for frame in (X_tr, X_te):
    frame['is_resort'] = frame['hotel'].map(hotel_to_flag)

# Check the conversion through the value frequencies of each split.
print("X_tr의 'is_resort' 값 빈도수:")
print(X_tr['is_resort'].value_counts())

print("\nX_te의 'is_resort' 값 빈도수:")
print(X_te['is_resort'].value_counts())

In [None]:
# Pipeline and the preprocessor are imported/built inside this cell because
# it appears before the cell that otherwise defines them; without this the
# notebook fails with NameError when run from top to bottom.
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

SEED = 42
reset_seeds(SEED)

# ==========================
# Preprocessor
# ==========================
# One-hot encode the object-dtype columns; categories unseen during fit are
# ignored at transform time.
# NOTE(review): only these columns are listed, and ColumnTransformer drops
# unlisted columns by default, so numeric columns do not reach the model.
cat_cols = X_tr.select_dtypes(include=['object']).columns.tolist()
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)

# ==========================
# Stacking model definition
# ==========================
# Three base learners whose predictions feed a logistic-regression
# final estimator.
estimators = [
    ("mlp", MLPClassifier(max_iter=1000, random_state=SEED)),
    ("lr", LogisticRegression(max_iter=1000, random_state=SEED)),
    ("rf", RandomForestClassifier(random_state=SEED))
]

stack = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(random_state=SEED),
    n_jobs=-1
)

# ==========================
# Full pipeline
# ==========================
model = Pipeline(steps=[
    ("preprocessor", preprocessor),  # preprocessing step built above
    ("stack", stack)
])

# ==========================
# Training and evaluation
# ==========================
model.fit(X_tr, y_tr)

print(f"훈련 점수: {model.score(X_tr, y_tr):.4f}")
print(f"테스트 점수: {model.score(X_te, y_te):.4f}")


훈련 점수: 0.8227
테스트 점수: 0.8000


In [12]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Split the feature columns into numeric and categorical groups.
# include='number' selects every numeric dtype, not only int64/float64.
num_cols = X_tr.select_dtypes(include='number').columns.tolist()
cat_cols = X_tr.select_dtypes(include=['object']).columns.tolist()

# Preprocessing step for the model pipeline.
# The numeric transformer is enabled: with it commented out,
# ColumnTransformer's default remainder='drop' discarded every numeric
# column (engineered features included) before the data reached the model.
# Standardising also suits the MLP and logistic-regression learners used
# in the stacking model.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        # Categories unseen during fit are ignored at transform time.
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)


In [None]:
SEED = 42
reset_seeds(SEED)

# --------------------------
# Stacking model definition
# --------------------------
# Base learners, each created with the shared seed.
mlp_clf = MLPClassifier(max_iter=1000, random_state=SEED)
lr_clf = LogisticRegression(max_iter=1000, random_state=SEED)
rf_clf = RandomForestClassifier(random_state=SEED)
estimators = [("mlp", mlp_clf), ("lr", lr_clf), ("rf", rf_clf)]

# A logistic regression combines the base learners' outputs.
meta_clf = LogisticRegression(random_state=SEED)
stack = StackingClassifier(estimators=estimators, final_estimator=meta_clf, n_jobs=-1)

# --------------------------
# Full pipeline: preprocessing step followed by the stacked classifier
# --------------------------
pipeline_steps = [("preprocessor", preprocessor), ("stack", stack)]
model = Pipeline(steps=pipeline_steps)

# --------------------------
# Training and evaluation
# --------------------------
model.fit(X_tr, y_tr)

train_score = model.score(X_tr, y_tr)
print(f"훈련 점수: {train_score:.4f}")
test_score = model.score(X_te, y_te)
print(f"테스트 점수: {test_score:.4f}")


In [None]:
# Print the column index of the feature frame X.
print(X.columns)

Index(['hotel', 'lead_time', 'arrival_date_year', 'arrival_date_month',
       'arrival_date_week_number', 'arrival_date_day_of_month',
       'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children',
       'babies', 'meal', 'country', 'market_segment', 'distribution_channel',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'reserved_room_type',
       'assigned_room_type', 'booking_changes', 'deposit_type', 'agent',
       'company', 'days_in_waiting_list', 'customer_type', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'reservation_status', 'reservation_status_date'],
      dtype='object')
