In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os

# EDA 이미지 저장 폴더 생성
os.makedirs('eda_image', exist_ok=True)

# 그래프 스타일 설정
plt.style.use('default')
sns.set_palette("husl")

In [2]:
import pandas as pd

def load_dataset(path:str) -> pd.DataFrame:
    return pd.read_csv(path)


def split_features_targets(df_dataset:pd.DataFrame, target_name:str) -> tuple:
    df_target = df_dataset[target_name]
    df_features = df_dataset.drop(target_name, axis=1)
    return df_features, df_target


def do_load_dataset(train_path:str, test_path:str, target_name:str):
    df_train_full = load_dataset(path=train_path)
    df_test = load_dataset(path=test_path)

    df_train, df_trian_target = split_features_targets(
        df_dataset=df_train_full, target_name=target_name)
    
    return df_train, df_test, df_trian_target

In [3]:
df_train, df_test, df_trian_target = do_load_dataset(train_path="./data/hotel_bookings_train.csv", test_path="./data/hotel_bookings_test.csv", target_name="is_canceled")

In [4]:
'''
피처 생성
'''

import pandas as pd
import numpy as np

def do_feature_extraction(df_train:pd.DataFrame, df_test:pd.DataFrame):
    # has_conpany
    df_train['has_company'] = (df_train['company'] > 0).astype(int)
    df_test['has_company'] = (df_test['company'] > 0).astype(int) 

    # has_agent
    df_train['has_agent'] = (df_train['agent'] > 0).astype(int)
    df_test['has_agent'] = (df_test['agent'] > 0).astype(int)

    # is_FB_meal
    df_train['is_FB_meal'] = np.where(df_train['meal'] == 'FB', 1, 0)
    df_test['is_FB_meal'] = np.where(df_test['meal'] == 'FB', 1, 0)

    # market_rist_level -> 인코딩 필요(라이트gbm, 캣부스트 제외)
        # 리스크 레벨 매핑 딕셔너리
    risk_mapping = {
    'Groups': 'High risk',
    'Online TA': 'High risk', 
    'Offline TA/TO': 'Medium risk',
    'Direct': 'Low risk',
    'Corporate': 'Low risk',
    'Complementary': 'Low risk'}
    df_train['market_risk_level'] = df_train['market_segment'].map(risk_mapping)
    df_test['market_risk_level'] = df_test['market_segment'].map(risk_mapping)

    # is_HighRisk_markket_risk
    df_train['is_High_risk_market_risk'] = (df_train['market_risk_level'] == 'High risk').astype(int)
    df_test['is_High_risk_market_risk'] = (df_test['market_risk_level'] == 'High risk').astype(int)
    
    # adr_processed
    # 1. IQR을 사용하여 훈련 데이터(X_tr)의 이상치 범위 계산
    Q1 = df_train['adr'].quantile(0.25)
    Q3 = df_train['adr'].quantile(0.75)
    IQR = Q3 - Q1
    upper_bound = Q3 + 1.5 * IQR
    lower_bound = Q1 - 1.5 * IQR
    # 2. 이상치(outliers)를 제외한 훈련 데이터의 adr 중앙값 재계산
    # adr은 보통 0보다 크므로 하한선을 0으로 설정하거나 IQR로 계산된 값을 사용
    # 여기서는 IQR로 계산된 값을 사용하여 더 일반적인 방법으로 처리
    adr_filtered_median = df_train.loc[(df_train['adr'] >= lower_bound) & (df_train['adr'] <= upper_bound), 'adr'].median()
    # 3. 훈련 세트(X_tr)에 새로운 'adr_processed' 피처 생성
    # 이상치 범위(lower_bound ~ upper_bound)를 벗어나는 값들을 필터링된 중앙값으로 대체
    df_train['adr_processed'] = np.where(
        (df_train['adr'] < lower_bound) | (df_train['adr'] > upper_bound),
        adr_filtered_median,
        df_train['adr'])
    # 4. 테스트 세트(X_te)에 새로운 'adr_processed' 피처 생성
    # 훈련 데이터에서 계산한 동일한 이상치 범위와 중앙값을 사용
    df_test['adr_processed'] = np.where(
        (df_test['adr'] < lower_bound) | (df_test['adr'] > upper_bound),
        adr_filtered_median,
        df_test['adr'])

    # lead_time_processed
    # 1. 리드타임 700일 이상 이상치 제거 - 700일 미만 데이터만 유지
    df_train = df_train[df_train['lead_time'] < 700]
    df_test = df_test[df_test['lead_time'] < 700]
    # 2. # 1단계: 훈련 데이터(X_tr)에서 IQR을 사용하여 이상치 범위 계산
    Q1 = df_train['lead_time'].quantile(0.25)
    Q3 = df_train['lead_time'].quantile(0.75)
    IQR = Q3 - Q1
    upper_bound = Q3 + 1.5 * IQR
    # 2단계: 이상치를 제외한 데이터로 중앙값 재계산
    # lead_time은 음수일 수 없으므로 하한선은 0으로 설정합니다.
    lead_time_filtered_median = df_train.loc[(df_train['lead_time'] >= 0) & (df_train['lead_time'] <= upper_bound), 'lead_time'].median()
    # 3단계: X_tr에 'lead_time_processed' 피처 생성 후  리드타임에 저장
    # 0 미만이거나 상한선(upper_bound)을 초과하는 값을 필터링된 중앙값으로 대체
    df_train['lead_time_processed'] = np.where(
        (df_train['lead_time'] < 0) | (df_train['lead_time'] > upper_bound),
        lead_time_filtered_median,
        df_train['lead_time'])
    # 4단계: X_te에 'lead_time_processed' 피처 생성 후  리드타임에 저장
    # 훈련 데이터에서 계산한 동일한 상한선과 중앙값을 사용
    df_test['lead_time_processed'] = np.where(
        (df_test['lead_time'] < 0) | (df_test['lead_time'] > upper_bound),
        lead_time_filtered_median,
        df_test['lead_time'])

    # is_alone
    ## 준호님꺼
    df_train['total_guests'] = df_train['adults'] + df_train['children'] + df_train['babies']
    df_test['total_guests'] = df_test['adults'] + df_test['children'] + df_test['babies']
    # Create 'is_alone' feature for both sets
    # 1 if total_guests is 1, otherwise 0
    df_train['is_alone'] = df_train['total_guests'].apply(lambda x: 1 if x == 1 else 0)
    df_test['is_alone'] = df_test['total_guests'].apply(lambda x: 1 if x == 1 else 0)
    # Optionally, you can drop the intermediate 'total_guests' feature
    df_train = df_train.drop('total_guests', axis=1)
    df_test = df_test.drop('total_guests', axis=1)
    
    # is_resort
    # City Hotel은 0, Resort Hotel은 1로 변환
    df_train['is_resort'] = df_train['hotel'].map({'City Hotel': 0, 'Resort Hotel': 1})
    df_test['is_resort'] = df_test['hotel'].map({'City Hotel': 0, 'Resort Hotel': 1})


    return df_train, df_test




In [5]:
df_train, df_test = do_feature_extraction(df_train, df_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['lead_time_processed'] = np.where(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['total_guests'] = df_train['adults'] + df_train['children'] + df_train['babies']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['is_alone'] = df_train['total_guests'].apply(lambda x: 1 if x 

In [7]:
df_train.shape,  df_test.shape

((31329, 41), (8669, 42))

In [11]:
df_test['is_resort'].head()

0    1
1    1
2    1
3    1
4    1
Name: is_resort, dtype: int64

In [12]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 31329 entries, 0 to 31330
Data columns (total 41 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   hotel                           31329 non-null  object 
 1   lead_time                       31329 non-null  int64  
 2   arrival_date_year               31329 non-null  int64  
 3   arrival_date_month              31329 non-null  object 
 4   arrival_date_week_number        31329 non-null  int64  
 5   arrival_date_day_of_month       31329 non-null  int64  
 6   stays_in_weekend_nights         31329 non-null  int64  
 7   stays_in_week_nights            31329 non-null  int64  
 8   adults                          31329 non-null  int64  
 9   children                        31329 non-null  int64  
 10  babies                          31329 non-null  int64  
 11  meal                            31329 non-null  object 
 12  country                         30881

In [17]:
# df_train 데이터에 존재하는 모든 타입 조회
df_train.dtypes

hotel                              object
lead_time                           int64
arrival_date_year                   int64
arrival_date_month                 object
arrival_date_week_number            int64
arrival_date_day_of_month           int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
adults                              int64
children                            int64
babies                              int64
meal                               object
country                            object
market_segment                     object
distribution_channel               object
is_repeated_guest                   int64
previous_cancellations              int64
previous_bookings_not_canceled      int64
reserved_room_type                 object
assigned_room_type                 object
booking_changes                     int64
deposit_type                       object
agent                             float64
company                           

In [15]:
# 숫자 타입이 아닌 컬럼들 조회
df_train.select_dtypes(exclude=['int64', 'float64']).info(),
df_train.select_dtypes(exclude=['int64', 'float64']).columns

<class 'pandas.core.frame.DataFrame'>
Index: 31329 entries, 0 to 31330
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   hotel                    31329 non-null  object
 1   arrival_date_month       31329 non-null  object
 2   meal                     31329 non-null  object
 3   country                  30881 non-null  object
 4   market_segment           31329 non-null  object
 5   distribution_channel     31329 non-null  object
 6   reserved_room_type       31329 non-null  object
 7   assigned_room_type       31329 non-null  object
 8   deposit_type             31329 non-null  object
 9   customer_type            31329 non-null  object
 10  reservation_status       31329 non-null  object
 11  reservation_status_date  31329 non-null  object
 12  arrival_date_full        31329 non-null  object
 13  market_risk_level        31329 non-null  object
dtypes: object(14)
memory usage: 3.6+ MB


Index(['hotel', 'arrival_date_month', 'meal', 'country', 'market_segment',
       'distribution_channel', 'reserved_room_type', 'assigned_room_type',
       'deposit_type', 'customer_type', 'reservation_status',
       'reservation_status_date', 'arrival_date_full', 'market_risk_level'],
      dtype='object')

In [9]:
# 숫자타입인 컬럼들 조회
df_train.select_dtypes(exclude=['object']).info(),
df_train.select_dtypes(exclude=['object']).columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31331 entries, 0 to 31330
Data columns (total 19 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   lead_time                       31331 non-null  int64  
 1   arrival_date_year               31331 non-null  int64  
 2   arrival_date_week_number        31331 non-null  int64  
 3   arrival_date_day_of_month       31331 non-null  int64  
 4   stays_in_weekend_nights         31331 non-null  int64  
 5   stays_in_week_nights            31331 non-null  int64  
 6   adults                          31331 non-null  int64  
 7   children                        31331 non-null  int64  
 8   babies                          31331 non-null  int64  
 9   is_repeated_guest               31331 non-null  int64  
 10  previous_cancellations          31331 non-null  int64  
 11  previous_bookings_not_canceled  31331 non-null  int64  
 12  booking_changes                 

Index(['lead_time', 'arrival_date_year', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'children', 'babies',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'booking_changes', 'agent', 'company',
       'days_in_waiting_list', 'adr', 'required_car_parking_spaces',
       'total_of_special_requests'],
      dtype='object')

In [None]:
df_train['is_transient'] = (df_train['customer_type'] == 'Transient').astype(int)
df_test['is_transient'] = (df_test['customer_type'] == 'Transient').astype(int)



0    1
1    1
2    1
3    1
4    1
Name: is_transient, dtype: int64