# SKN 19기 mini-project 5팀(팀명: 여권어디있지) M/L

In [48]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')


# pandas 출력 옵션
pd.set_option('display.float_format', '{:.2f}'.format)

# 한글 폰트 사용을 위한 설정
import matplotlib.font_manager as fm
import matplotlib

font_path = 'C:\\Windows\\Fonts\\gulim.ttc'
font = fm.FontProperties(fname=font_path).get_name()
matplotlib.rc('font', family=font)

## 데이터 로드

In [49]:
# 데이터 로드
df_listings = pd.read_csv('./data/listings.csv')

df_listings.head(5)

Unnamed: 0,neighbourhood_cleansed,property_type,room_type,accommodates,bathrooms,bedrooms,beds,amenities,price,number_of_reviews,first_review,review_scores_rating
0,Bunkyo Ku,Entire rental unit,Entire home/apt,3,1.0,0.0,2.0,"[""Bidet"", ""Body soap"", ""Bed linens"", ""Hot wate...",100000.0,0,,
1,Bunkyo Ku,Entire rental unit,Entire home/apt,8,1.0,2.0,5.0,"[""Bidet"", ""Body soap"", ""Bed linens"", ""Hot wate...",100000.0,0,,
2,Taito Ku,Entire serviced apartment,Entire home/apt,4,2.0,2.0,2.0,"[""Room-darkening shades"", ""Body soap"", ""Carbon...",14550.0,24,2023-12-04,4.42
3,Kita Ku,Entire home,Entire home/apt,8,1.0,3.0,6.0,"[""Room-darkening shades"", ""Cooking basics"", ""C...",22012.0,15,2024-01-10,4.73
4,Sumida Ku,Entire rental unit,Entire home/apt,3,1.0,1.0,1.0,"[""Clothing storage: closet"", ""Elevator"", ""Hair...",15429.0,17,2023-12-26,5.0


In [50]:
# 5.2 추가 전처리 진행
# 컬럼 삭제
df_listings = df_listings.drop(['property_type', 'amenities'], axis=1, errors='ignore')

In [51]:
# 데이터 확인
print(df_listings.info())
print('-' * 50)
df_listings.describe(include='all')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23007 entries, 0 to 23006
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   neighbourhood_cleansed  23007 non-null  object 
 1   room_type               23007 non-null  object 
 2   accommodates            23007 non-null  int64  
 3   bathrooms               23007 non-null  float64
 4   bedrooms                23007 non-null  float64
 5   beds                    23007 non-null  float64
 6   price                   23007 non-null  float64
 7   number_of_reviews       23007 non-null  int64  
 8   first_review            19875 non-null  object 
 9   review_scores_rating    19875 non-null  float64
dtypes: float64(5), int64(2), object(3)
memory usage: 1.8+ MB
None
--------------------------------------------------


Unnamed: 0,neighbourhood_cleansed,room_type,accommodates,bathrooms,bedrooms,beds,price,number_of_reviews,first_review,review_scores_rating
count,23007,23007,23007.0,23007.0,23007.0,23007.0,23007.0,23007.0,19875,19875.0
unique,49,4,,,,,,,2946,
top,Shinjuku Ku,Entire home/apt,,,,,,,2025-03-30,
freq,4311,19885,,,,,,,66,
mean,,,4.44,1.15,1.39,2.9,17758.09,37.97,,4.73
std,,,2.89,0.49,0.94,2.19,12886.22,67.66,,0.32
min,,,1.0,0.0,0.0,0.0,1700.0,0.0,,1.0
25%,,,2.0,1.0,1.0,1.0,9429.0,3.0,,4.63
50%,,,4.0,1.0,1.0,2.0,13912.0,18.0,,4.8
75%,,,6.0,1.0,2.0,4.0,21572.0,45.0,,4.94


In [52]:

# 6. 데이터 변환 및 피처 엔지니어링

# 1) 범주형 컬럼의 데이터 변환
# 1.1) neighbourhood_cleansed, room-type 컬럼의 라벨 인코딩
# - 순서가 있는 범주형 변수는 아니지만 Gredint Boosting 알고리즘으로 모델링 할 예정이므로 원핫 인코딩 대신 라벨 인코딩 사용
# - 순서가 없는 범주에 라벨 인코딩 사용은,
# - 거리 기반/선형 모델에는 적합하지 않으나 (숫자 크기가 인위적 순서 의미를 만들어버림 → 잘못된 해석 위험)
# - 트리 모델은 값의 크기 자체보다는 **분할 조건(같다/다르다)**를 이용하므로, 라벨 인코딩을 써도 문제가 거의 없음
from sklearn.preprocessing import LabelEncoder

le_neighbourhood = LabelEncoder()
le_room_type = LabelEncoder()

df_listings['neighbourhood_cleansed_encoded'] = le_neighbourhood.fit_transform(df_listings['neighbourhood_cleansed'])
df_listings['room_type_encoded'] = le_room_type.fit_transform(df_listings['room_type'])

# 결과 확인
df_listings[['neighbourhood_cleansed', 'neighbourhood_cleansed_encoded',
             'room_type', 'room_type_encoded']].head(10)

Unnamed: 0,neighbourhood_cleansed,neighbourhood_cleansed_encoded,room_type,room_type_encoded
0,Bunkyo Ku,4,Entire home/apt,0
1,Bunkyo Ku,4,Entire home/apt,0
2,Taito Ku,46,Entire home/apt,0
3,Kita Ku,20,Entire home/apt,0
4,Sumida Ku,44,Entire home/apt,0
5,Sumida Ku,44,Entire home/apt,0
6,Kita Ku,20,Entire home/apt,0
7,Koto Ku,26,Entire home/apt,0
8,Taito Ku,46,Shared room,3
9,Taito Ku,46,Shared room,3


In [53]:
# 1.2) first_rivew 컬럼의 데이터 타입 변환
df_listings['first_review'] = pd.to_datetime(df_listings['first_review'], errors='coerce')

print(df_listings['first_review'].dtype)
print('-' * 50)
df_listings[['first_review']].head(15)

datetime64[ns]
--------------------------------------------------


Unnamed: 0,first_review
0,NaT
1,NaT
2,2023-12-04
3,2024-01-10
4,2023-12-26
5,2023-12-26
6,2023-12-10
7,2023-12-24
8,2023-12-10
9,2023-12-07


In [54]:
# 2) 피처 엔지니어링
# - 숙소의 open 시기가 다르므로 number_of_riews와 first_review를 이용해 새로운 특성 생성

# 기준 날짜 설정
reference_date = pd.to_datetime('2025-09-24')
# first_review가 결측치인 경우 NaT로 처리되어도 안전하게 계산
df_listings['days_since_first_review'] = (reference_date - df_listings['first_review']).dt.days
# 결측치가 있으면 결과도 NaN
df_listings['reviews_per_day_since_first_review'] = df_listings['number_of_reviews'] / df_listings['days_since_first_review']
# 결과 확인
df_listings[['number_of_reviews', 'first_review', 'days_since_first_review', 
             'reviews_per_day_since_first_review']].head(10)

Unnamed: 0,number_of_reviews,first_review,days_since_first_review,reviews_per_day_since_first_review
0,0,NaT,,
1,0,NaT,,
2,24,2023-12-04,660.0,0.04
3,15,2024-01-10,623.0,0.02
4,17,2023-12-26,638.0,0.03
5,14,2023-12-26,638.0,0.02
6,47,2023-12-10,654.0,0.07
7,56,2023-12-24,640.0,0.09
8,70,2023-12-10,654.0,0.11
9,55,2023-12-07,657.0,0.08


In [55]:
# 데이터 재확인
# - 리뷰 비율 컬럼과 리뷰 평점 컬럼에 결측치가 존재하나,
# - 부스팅 모델은 결측치를 자체적으로 처리하는 기능을 가지고 있으므로 결측치를 삭제하지 않고 모델 학습 진행
# - 예. XGBoost는 결측치가 있는 샘플은 손실을 최소화하는 방향으로 자동으로 배치
# - 예. LightGBM은 NaN 자체를 별도의 카테고리로 인식해 분기시 NaN 전용 경로로 학습하여 과적합 위험을 최소화 함
print(df_listings.info())
df_listings.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23007 entries, 0 to 23006
Data columns (total 14 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   neighbourhood_cleansed              23007 non-null  object        
 1   room_type                           23007 non-null  object        
 2   accommodates                        23007 non-null  int64         
 3   bathrooms                           23007 non-null  float64       
 4   bedrooms                            23007 non-null  float64       
 5   beds                                23007 non-null  float64       
 6   price                               23007 non-null  float64       
 7   number_of_reviews                   23007 non-null  int64         
 8   first_review                        19875 non-null  datetime64[ns]
 9   review_scores_rating                19875 non-null  float64       
 10  neighbourhood_cleansed

Unnamed: 0,neighbourhood_cleansed,room_type,accommodates,bathrooms,bedrooms,beds,price,number_of_reviews,first_review,review_scores_rating,neighbourhood_cleansed_encoded,room_type_encoded,days_since_first_review,reviews_per_day_since_first_review
0,Bunkyo Ku,Entire home/apt,3,1.0,0.0,2.0,100000.0,0,NaT,,4,0,,
1,Bunkyo Ku,Entire home/apt,8,1.0,2.0,5.0,100000.0,0,NaT,,4,0,,
2,Taito Ku,Entire home/apt,4,2.0,2.0,2.0,14550.0,24,2023-12-04,4.42,46,0,660.0,0.04
3,Kita Ku,Entire home/apt,8,1.0,3.0,6.0,22012.0,15,2024-01-10,4.73,20,0,623.0,0.02
4,Sumida Ku,Entire home/apt,3,1.0,1.0,1.0,15429.0,17,2023-12-26,5.0,44,0,638.0,0.03
5,Sumida Ku,Entire home/apt,3,1.0,1.0,1.0,11700.0,14,2023-12-26,4.93,44,0,638.0,0.02
6,Kita Ku,Entire home/apt,12,1.0,4.0,3.0,14000.0,47,2023-12-10,4.96,20,0,654.0,0.07
7,Koto Ku,Entire home/apt,6,1.0,1.0,1.0,19758.0,56,2023-12-24,4.88,26,0,640.0,0.09
8,Taito Ku,Shared room,1,3.0,1.0,1.0,1808.0,70,2023-12-10,4.67,46,3,654.0,0.11
9,Taito Ku,Shared room,1,3.0,1.0,1.0,1808.0,55,2023-12-07,4.69,46,3,657.0,0.08


In [56]:
# 모델 학습을 시키기 위한 컬럼만 추출
# 제외할 컬럼 리스트
cols_to_exclude = ['neighbourhood_cleansed', 'room_type', 'number_of_reviews', 
                   'first_review', 'days_since_first_review']

# 제외 후 새로운 데이터프레임 생성
df_listings_f = df_listings.drop(columns=cols_to_exclude)

# 결과 확인
df_listings_f.head(10)

Unnamed: 0,accommodates,bathrooms,bedrooms,beds,price,review_scores_rating,neighbourhood_cleansed_encoded,room_type_encoded,reviews_per_day_since_first_review
0,3,1.0,0.0,2.0,100000.0,,4,0,
1,8,1.0,2.0,5.0,100000.0,,4,0,
2,4,2.0,2.0,2.0,14550.0,4.42,46,0,0.04
3,8,1.0,3.0,6.0,22012.0,4.73,20,0,0.02
4,3,1.0,1.0,1.0,15429.0,5.0,44,0,0.03
5,3,1.0,1.0,1.0,11700.0,4.93,44,0,0.02
6,12,1.0,4.0,3.0,14000.0,4.96,20,0,0.07
7,6,1.0,1.0,1.0,19758.0,4.88,26,0,0.09
8,1,3.0,1.0,1.0,1808.0,4.67,46,3,0.11
9,1,3.0,1.0,1.0,1808.0,4.69,46,3,0.08


In [57]:
# 7. 데이터 분할
from sklearn.model_selection import train_test_split

target = 'price'

X = df_listings_f.drop(columns=[target])
y = df_listings_f[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [58]:
# 모델 학습
from sklearn.metrics import r2_score, mean_squared_error
import xgboost as xgb
import lightgbm as lgb

# 1) XGBoost 알고리즘 선택
# - DMatrix는 XGBoost 전용 데이터 구조로서, DMatrix로 변환하면 학습 속도와 메모리 효율이 좋아지고 결측치 처리도 편리
dtrain = xgb.DMatrix(X_train, label=y_train, missing=np.nan)
dtest = xgb.DMatrix(X_test, label=y_test, missing=np.nan)

params_xgb = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'seed': 42
}

xgb_model = xgb.train(params_xgb, dtrain, num_boost_round=100)
y_pred_xgb = xgb_model.predict(dtest)

r2_xgb = r2_score(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
print(f"XGBoost → R2: {r2_xgb:.4f}, RMSE: {rmse_xgb:.2f}")

XGBoost → R2: 0.5457, RMSE: 8645.13


In [59]:
# 2) LightGBM 알고리즘 선택
# - Dataset 은 LightGBM 전용 데이터 구조로서, 학습 최적화와 결측치 자동 처리 기능을 제공
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_test = lgb.Dataset(X_test, label=y_test, reference=lgb_train)

params_lgb = {
    'objective': 'regression',
    'metric': 'rmse',
    'seed': 42
}

lgb_model = lgb.train(
    params_lgb,
    lgb_train,
    num_boost_round=100,
    valid_sets=[lgb_train, lgb_test],
    callbacks=[lgb.early_stopping(stopping_rounds=10)]
)

y_pred_lgb = lgb_model.predict(X_test)
r2_lgb = r2_score(y_test, y_pred_lgb)
rmse_lgb = np.sqrt(mean_squared_error(y_test, y_pred_lgb))
print(f"LightGBM → R2: {r2_lgb:.4f}, RMSE: {rmse_lgb:.2f}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001259 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 491
[LightGBM] [Info] Number of data points in the train set: 18405, number of used features: 8
[LightGBM] [Info] Start training from score 17738.167651
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[80]	training's rmse: 7905.19	valid_1's rmse: 8432.31
LightGBM → R2: 0.5678, RMSE: 8432.31


In [None]:
# 하이퍼파라미터 튜닝
# 1) 랜덤 서치
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer

# 하이퍼파라미터 후보 설정
param_dist = {
    'num_leaves': [31, 50, 70, 100],
    'max_depth': [-1, 5, 10, 15, 20],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 300, 500, 1000],
    'min_child_samples': [5, 10, 20, 50],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0, 0.1, 0.5, 1],
    'reg_lambda': [0, 0.1, 0.5, 1]
}

# LightGBM 회귀 모델 객체
lgb_model = lgb.LGBMRegressor(random_state=42)

# R2 기준 스코어러
r2_scorer = make_scorer(r2_score)

# 랜덤서치 설정
random_search = RandomizedSearchCV(
    estimator=lgb_model,
    param_distributions=param_dist,
    n_iter=50,               # 50개의 랜덤 조합 시도
    scoring=r2_scorer,
    cv=5,                    # 5-폴드 교차검증
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# 학습
random_search.fit(X_train, y_train)

# 최적 파라미터와 최고 R2 확인
print("Best Parameters:", random_search.best_params_)
print("Best R2 Score:", random_search.best_score_)
best_model = random_search.best_estimator_

# 훈련 데이터에서 평가
y_train_pred = best_model.predict(X_train)
r2_train = r2_score(y_train, y_train_pred)
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
print(f"Train R2: {r2_train:.4f}, Train RMSE: {rmse_train:.2f}")

# 테스트 데이터에서 평가
y_pred = best_model.predict(X_test)
r2_test = r2_score(y_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Test R2: {r2_test:.4f}, Test RMSE: {rmse_test:.2f}")

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000741 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 491
[LightGBM] [Info] Number of data points in the train set: 18405, number of used features: 8
[LightGBM] [Info] Start training from score 17738.167651
Best Parameters: {'subsample': 0.6, 'reg_lambda': 0.5, 'reg_alpha': 0.1, 'num_leaves': 50, 'n_estimators': 1000, 'min_child_samples': 20, 'max_depth': 15, 'learning_rate': 0.01, 'colsample_bytree': 0.6}
Best R2 Score: 0.54151461686517
Train R2: 0.6584, Train RMSE: 7539.65
Test R2: 0.5765, Test RMSE: 8347.13


In [62]:
# 2) 로그 변환 
# 타깃 변수 로그 변환
y_train_log = np.log1p(y_train)  # log(1 + y) 형태로 변환
y_test_log = np.log1p(y_test)    # 평가용도

# 랜덤 서치 설정
param_dist = {
    'num_leaves': [31, 50, 70, 100],
    'max_depth': [-1, 5, 10, 15, 20],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 300, 500, 1000],
    'min_child_samples': [5, 10, 20, 50],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0, 0.1, 0.5, 1],
    'reg_lambda': [0, 0.1, 0.5, 1]
}

lgb_model = lgb.LGBMRegressor(random_state=42)
r2_scorer = make_scorer(r2_score)

random_search = RandomizedSearchCV(
    estimator=lgb_model,
    param_distributions=param_dist,
    n_iter=50,
    scoring=r2_scorer,
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# 학습 (타깃 로그 변환 적용)
random_search.fit(X_train, y_train_log)

# 최적 파라미터 확인
print("Best Parameters:", random_search.best_params_)
print("Best R2 Score (log target):", random_search.best_score_)

best_model = random_search.best_estimator_

# 훈련 데이터에서 평가 (원래 스케일로 역변환)
y_train_pred_log = best_model.predict(X_train)
y_train_pred = np.expm1(y_train_pred_log)  # 역변환
r2_train = r2_score(y_train, y_train_pred)
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
print(f"Train R2: {r2_train:.4f}, Train RMSE: {rmse_train:.2f}")

# 테스트 데이터에서 평가
y_pred_log = best_model.predict(X_test)
y_pred = np.expm1(y_pred_log)  # 역변환
r2_test = r2_score(y_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Test R2: {r2_test:.4f}, Test RMSE: {rmse_test:.2f}")


Fitting 5 folds for each of 50 candidates, totalling 250 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000836 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 491
[LightGBM] [Info] Number of data points in the train set: 18405, number of used features: 8
[LightGBM] [Info] Start training from score 9.586356
Best Parameters: {'subsample': 1.0, 'reg_lambda': 0.5, 'reg_alpha': 0.1, 'num_leaves': 100, 'n_estimators': 1000, 'min_child_samples': 10, 'max_depth': 10, 'learning_rate': 0.01, 'colsample_bytree': 0.8}
Best R2 Score (log target): 0.645228516769153
Train R2: 0.6593, Train RMSE: 7530.06
Test R2: 0.5700, Test RMSE: 8411.37


In [None]:
# # 3) 그리드 서치
# from sklearn.model_selection import GridSearchCV

# # 랜덤서치에서 찾은 최적 파라미터
# best_params = {
#     'subsample': 0.6,
#     'reg_lambda': 0.5,
#     'reg_alpha': 0.1,
#     'num_leaves': 50,
#     'n_estimators': 1000,
#     'min_child_samples': 20,
#     'max_depth': 15,
#     'learning_rate': 0.01,
#     'colsample_bytree': 0.6
# }

# # 그리드 서치용 후보 범위(최적 파라미터 기준 ± 주변 값)
# param_grid = {
#     'num_leaves': [45, 50, 55], # 하나의 트리가 가질 수 있는 최대 리프 개수
#     'max_depth': [12, 15, 18],
#     'learning_rate': [0.01, 0.02, 0.03],
#     'n_estimators': [900, 1000, 1100],
#     'min_child_samples': [15, 20, 25], # 리프 노드가 되기 위해 최소한으로 필요한 레코드 수 
#     'subsample': [0.6, 0.7, 0.8], # 데이터 샘플링 비율
#     'colsample_bytree': [0.6, 0.7, 0.8], # 무작위로 선택하는 특성의 비율
# }

# # LightGBM 모델 객체
# lgb_model = lgb.LGBMRegressor(random_state=42)

# # R2 기준 스코어러
# r2_scorer = make_scorer(r2_score)

# # 그리드 서치 설정
# grid_search = GridSearchCV(
#     estimator=lgb_model,
#     param_grid=param_grid,
#     scoring=r2_scorer,
#     cv=5,       
#     n_jobs=-1,
#     verbose=1
# )

# # 학습
# grid_search.fit(X_train, y_train)

# # 최적 파라미터와 최고 R2 확인
# print("Best Parameters (GridSearch):", grid_search.best_params_)
# print("Best R2 Score (CV):", grid_search.best_score_)

# # 테스트 데이터에서 평가
# best_model = grid_search.best_estimator_
# y_pred = best_model.predict(X_test)
# r2_test = r2_score(y_test, y_pred)
# rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
# print(f"Test R2: {r2_test:.4f}, Test RMSE: {rmse_test:.2f}")

Fitting 5 folds for each of 19683 candidates, totalling 98415 fits
