In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
# 트리 계열 모델(RandomForest, LightGBM, CatBoost, XGBoost) → Label Encoding만 해도 괜찮음. (특히 CatBoost는 카테고리 직접 처리 가능)

### 랜덤포레스트

In [64]:
airbnb = pd.read_csv('../EDA/data/listings_ml.csv')
select_columns = [
    'neighbourhood_cleansed',
    'property_type',
    'room_type',
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    'review_scores_rating',
    'number_of_reviews',
    'first_review',
    'price',
]
airbnb_select = airbnb[select_columns].copy()

airbnb_select['first_review'] = pd.to_datetime(airbnb_select['first_review'], errors='coerce')
airbnb_select['first_review_days_since'] = (pd.Timestamp("2025-07-01") - airbnb_select['first_review']).dt.days
airbnb_select.drop('first_review',axis=1, inplace=True)

X = airbnb_select.drop('price',axis=1)
y = np.log(airbnb_select['price'])

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
airbnb_select


Unnamed: 0,neighbourhood_cleansed,property_type,room_type,accommodates,bathrooms,bedrooms,beds,review_scores_rating,number_of_reviews,price,first_review_days_since
0,4,11,0,3,1.0,0.0,2.0,,0,100000.0,
1,4,11,0,8,1.0,2.0,5.0,,0,100000.0,
2,46,12,0,4,2.0,2.0,2.0,4.42,24,14550.0,575.0
3,20,8,0,8,1.0,3.0,6.0,4.73,15,22012.0,538.0
4,44,11,0,3,1.0,1.0,1.0,5.00,17,15429.0,553.0
...,...,...,...,...,...,...,...,...,...,...,...
23002,42,11,0,4,1.0,1.0,2.0,,0,12306.0,
23003,40,8,0,11,1.0,4.0,7.0,,0,40732.0,
23004,30,8,0,7,1.0,3.0,4.0,,0,64436.0,
23005,40,11,0,2,1.0,1.0,1.0,,0,17046.0,


### 랜덤포레스트

In [65]:
rf_clf = RandomForestRegressor(
    n_estimators=300,  # 생성할 트리의 개수, 기본값 : 100
    max_depth = 9,
    max_features= 'sqrt',
    min_samples_leaf= 3, 
    min_samples_split= 6, 
    random_state=42
)

rf_clf.fit(X_train, y_train)
print(f"학습 점수 : ", rf_clf.score(X_train,y_train))
print(f"학습 점수 : ", rf_clf.score(X_test,y_test))
rf_clf.feature_importances_, rf_clf.feature_names_in_

# 학습 점수 :  0.5936216984503482
# 학습 점수 :  0.5580308930667504
# array([0.08726452, 0.04785898, 0.01132883, 0.72052703, 0.01683077,
#        0.07185703, 0.00622649, 0.03147071, 0.00306331, 0.00357232])

학습 점수 :  0.6221948626363234
학습 점수 :  0.5806149369083187


(array([0.07035907, 0.084524  , 0.04082226, 0.35094865, 0.03902632,
        0.18113012, 0.15483824, 0.03971956, 0.01569676, 0.02293502]),
 array(['neighbourhood_cleansed', 'property_type', 'room_type',
        'accommodates', 'bathrooms', 'bedrooms', 'beds',
        'review_scores_rating', 'number_of_reviews',
        'first_review_days_since'], dtype=object))

### 그리드서치CV

In [21]:
# from sklearn.model_selection import GridSearchCV

# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [6, 10],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'max_features': ['sqrt', 'log2']
# }

# grid_search = GridSearchCV(
#     RandomForestRegressor(random_state=0),
#     param_grid,
#     cv=5,
#     scoring='r2',
#     # n_jobs=-1
# )

# grid_search.fit(X, y)
# print("최적 파라미터:", grid_search.best_params_)
# print("최적 교차검증 R²:", grid_search.best_score_)

# # 최적 파라미터: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 300}
# # 최적 교차검증 R²: 0.5921771879198708

In [22]:
# from sklearn.neighbors import KNeighborsRegressor
# knn = KNeighborsRegressor(n_neighbors= 5)

# knn.fit(X_train, y_train)
# print(knn.score(X_train, y_train))
# print(knn.score(X_test,y_test))


In [66]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBRegressor
# train_test_split()

xgb_clf = XGBRegressor(
    n_estimators =300,
    max_depth = 3,
    learning_rate = 0.1,
    random_state = 42
    )

    
xgb_clf.fit(X_train,y_train)
y_pred_train = xgb_clf.predict(X_train)
y_pred_test = xgb_clf.predict(X_test)
print(xgb_clf.score(X_train, y_train))
print(xgb_clf.score(X_test, y_test))


0.6781052283110882
0.6595899936898116


In [70]:
cleaned_df = pd.read_csv('../EDA/data/listings_cleaned_1st.csv')

In [40]:
cleaned_df


Unnamed: 0,neighbourhood_cleansed,property_type,room_type,accommodates,bathrooms,bedrooms,beds,price,number_of_reviews,first_review,...,amnt_ev_charger,amnt_crib,amnt_king_bed,amnt_gym,amnt_breakfast,amnt_fireplace,amnt_smoking_allowed,amnt_waterfront,amnt_smoke_alarm,amnt_carbon_monoxide_alarm
0,Bunkyo Ku,Entire rental unit,Entire home/apt,3,1.0,0.0,2.0,100000.0,0,,...,0,0,0,0,0,1,0,0,1,0
1,Bunkyo Ku,Entire rental unit,Entire home/apt,8,1.0,2.0,5.0,100000.0,0,,...,0,0,0,0,0,1,0,0,1,0
2,Taito Ku,Entire serviced apartment,Entire home/apt,4,2.0,2.0,2.0,14550.0,24,2023-12-04,...,0,0,0,0,0,0,0,0,1,1
3,Kita Ku,Entire home,Entire home/apt,8,1.0,3.0,6.0,22012.0,15,2024-01-10,...,0,0,0,0,0,0,0,0,1,1
4,Sumida Ku,Entire rental unit,Entire home/apt,3,1.0,1.0,1.0,15429.0,17,2023-12-26,...,0,1,0,0,0,1,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23002,Shinjuku Ku,Entire rental unit,Entire home/apt,4,1.0,1.0,2.0,12306.0,0,,...,0,0,0,0,0,0,0,0,1,0
23003,Shibuya Ku,Entire home,Entire home/apt,11,1.0,4.0,7.0,40732.0,0,,...,0,0,0,0,0,0,0,0,0,0
23004,Minato Ku,Entire home,Entire home/apt,7,1.0,3.0,4.0,64436.0,0,,...,0,0,0,0,0,0,0,0,1,1
23005,Shibuya Ku,Entire rental unit,Entire home/apt,2,1.0,1.0,1.0,17046.0,0,,...,0,0,0,0,0,0,0,0,1,0


In [154]:
cleaned_df['neighbourhood_cleansed'] = airbnb_select['neighbourhood_cleansed']
cleaned_df['property_type'] = airbnb_select['property_type']
cleaned_df['room_type'] = airbnb_select['room_type']
cleaned_df['first_review_days_since'] = airbnb_select['first_review_days_since']
final_df = cleaned_df.drop("first_review", axis=1)
# final_df['review_scores_rating'] = np.exp(final_df['review_scores_rating'])
# final_df['n_review/days'] = final_df['number_of_reviews'] / final_df['first_review_days_since']

X = final_df.drop('price',axis=1)
y = np.log(final_df['price'])

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
final_df.describe()

Unnamed: 0,neighbourhood_cleansed,property_type,room_type,accommodates,bathrooms,bedrooms,beds,price,number_of_reviews,review_scores_rating,...,amnt_crib,amnt_king_bed,amnt_gym,amnt_breakfast,amnt_fireplace,amnt_smoking_allowed,amnt_waterfront,amnt_smoke_alarm,amnt_carbon_monoxide_alarm,first_review_days_since
count,23007.0,23007.0,23007.0,23007.0,23007.0,23007.0,23007.0,23007.0,23007.0,19875.0,...,23007.0,23007.0,23007.0,23007.0,23007.0,23007.0,23007.0,23007.0,23007.0,19875.0
mean,36.326596,13.828487,0.276611,4.442996,1.147716,1.390446,2.900509,17758.09365,37.969618,4.730824,...,0.095319,0.0,0.012301,0.003999,0.198331,0.024123,0.017386,0.959882,0.41922,819.543698
std,12.618917,9.196355,0.707372,2.888349,0.489267,0.936746,2.192957,12886.215825,67.660024,0.321883,...,0.293661,0.0,0.110226,0.063111,0.398751,0.153435,0.130708,0.196241,0.493442,863.448763
min,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1700.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,33.0,11.0,0.0,2.0,1.0,1.0,1.0,9429.0,3.0,4.63,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,208.0
50%,42.0,11.0,0.0,4.0,1.0,1.0,2.0,13912.0,18.0,4.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,477.0
75%,44.0,11.0,0.0,6.0,1.0,2.0,4.0,21572.0,45.0,4.94,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1055.5
max,48.0,60.0,3.0,16.0,10.0,10.0,25.0,100286.0,3506.0,5.0,...,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,5032.0


In [150]:
X

Unnamed: 0,neighbourhood_cleansed,property_type,room_type,accommodates,bathrooms,bedrooms,beds,number_of_reviews,review_scores_rating,amnt_self_checkin,...,amnt_crib,amnt_king_bed,amnt_gym,amnt_breakfast,amnt_fireplace,amnt_smoking_allowed,amnt_waterfront,amnt_smoke_alarm,amnt_carbon_monoxide_alarm,first_review_days_since
0,4,11,0,3,1.0,0.0,2.0,0,,1,...,0,0,0,0,1,0,0,1,0,
1,4,11,0,8,1.0,2.0,5.0,0,,1,...,0,0,0,0,1,0,0,1,0,
2,46,12,0,4,2.0,2.0,2.0,24,4.42,1,...,0,0,0,0,0,0,0,1,1,575.0
3,20,8,0,8,1.0,3.0,6.0,15,4.73,1,...,0,0,0,0,0,0,0,1,1,538.0
4,44,11,0,3,1.0,1.0,1.0,17,5.00,1,...,1,0,0,0,1,0,0,1,1,553.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23002,42,11,0,4,1.0,1.0,2.0,0,,1,...,0,0,0,0,0,0,0,1,0,
23003,40,8,0,11,1.0,4.0,7.0,0,,1,...,0,0,0,0,0,0,0,0,0,
23004,30,8,0,7,1.0,3.0,4.0,0,,1,...,0,0,0,0,0,0,0,1,1,
23005,40,11,0,2,1.0,1.0,1.0,0,,1,...,0,0,0,0,0,0,0,1,0,


In [155]:
xgb_clf = XGBRegressor(
    n_estimators =270,
    max_depth = 5,
    learning_rate = 0.09,
    random_state = 42

    )

xgb_clf.fit(X_train,y_train)
y_pred_train = xgb_clf.predict(X_train)
y_pred_test = xgb_clf.predict(X_test)
print(xgb_clf.score(X_train, y_train))
print(xgb_clf.score(X_test, y_test))

0.7700769372658298
0.7007601561454638


In [162]:
xgb_clf = XGBRegressor(
    n_estimators =270,
    max_depth = 5,
    learning_rate = 0.09,
    random_state = 42,
    subsample=0.7,
    colsample_bytree=0.55,
    # reg_alpha = 0.01,
    # reg_lambda=3
    )

xgb_clf.fit(X_train,y_train)
y_pred_train = xgb_clf.predict(X_train)
y_pred_test = xgb_clf.predict(X_test)
print(xgb_clf.score(X_train, y_train))
print(xgb_clf.score(X_test, y_test))

0.7659628134885433
0.7021833313806256


In [114]:
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

base_models = [
    ('rf', RandomForestRegressor(n_estimators=200, random_state=42)),
    ('xgb', XGBRegressor(n_estimators=270, max_depth=5, learning_rate=0.09, random_state=42))
]

stack = StackingRegressor(
    estimators=base_models,
    final_estimator=XGBRegressor(n_estimators=100, random_state=42)  # 메타모델
)

stack.fit(X_train, y_train)
print(stack.score(X_test, y_test))


0.6988306650590563


In [87]:
xgb_clf.feature_importances_

array([0.04185425, 0.03479398, 0.05799178, 0.33897963, 0.04441934,
       0.08978827, 0.01749993, 0.01305063, 0.02359363, 0.01518942,
       0.        , 0.00625835, 0.00745863, 0.01396642, 0.01568825,
       0.        , 0.01010367, 0.01245292, 0.00801183, 0.01037276,
       0.01707665, 0.01045462, 0.01752649, 0.03309593, 0.00181697,
       0.0132028 , 0.00414631, 0.03027978, 0.        , 0.00683862,
       0.02103265, 0.02081413, 0.01198967, 0.01294215, 0.00728454,
       0.01853851, 0.01148643], dtype=float32)

##### 하이퍼 파라미터 최적화

In [78]:
import optuna
from sklearn.model_selection import cross_val_score


In [79]:

def objective(trial):
    params = {
        'n_estimators' : trial.suggest_int('n_estimators', 100, 500, 30),
        'max_depth' : trial.suggest_int('max_depth', 3, 10),
        'learning_rate' : trial.suggest_float('learning_rate', 0.01, 0.2),
        'colsample_bytree' : trial.suggest_float('colsample_bytree', 0.5, 1)
    }
    xgb_clf = XGBRegressor(**params)

    mean_acc = cross_val_score(xgb_clf, X_train, y_train, scoring='r2', cv = 3).mean()

    return mean_acc
    
# 2. study 객체 -> 최적화
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# 3. 결과 출력
print(study.best_value)
print(study.best_params)

[I 2025-09-26 17:29:50,001] A new study created in memory with name: no-name-37b014b0-f578-48a7-9b3f-ec4118236a10
Positional arguments ['self', 'name', 'low', 'high', 'step', 'log'] in suggest_int() have been deprecated since v3.5.0. They will be replaced with the corresponding keyword arguments in v5.0.0, so please use the keyword specification instead. See https://github.com/optuna/optuna/releases/tag/v3.5.0 for details.
  'n_estimators' : trial.suggest_int('n_estimators', 100, 500, 30),
[I 2025-09-26 17:29:52,591] Trial 0 finished with value: 0.688691516075355 and parameters: {'n_estimators': 280, 'max_depth': 6, 'learning_rate': 0.07058094888552192, 'colsample_bytree': 0.9441419154831245}. Best is trial 0 with value: 0.688691516075355.
Positional arguments ['self', 'name', 'low', 'high', 'step', 'log'] in suggest_int() have been deprecated since v3.5.0. They will be replaced with the corresponding keyword arguments in v5.0.0, so please use the keyword specification instead. See htt

0.7164228215077088
{'n_estimators': 340, 'max_depth': 10, 'learning_rate': 0.03685785896100095, 'colsample_bytree': 0.5174266385451538}


In [60]:
final_df.columns

Index(['neighbourhood_cleansed', 'property_type', 'room_type', 'accommodates',
       'bathrooms', 'bedrooms', 'beds', 'price', 'number_of_reviews',
       'review_scores_rating', 'amnt_self_checkin', 'amnt_instant_book',
       'amnt_kitchen', 'amnt_hair_dryer', 'amnt_free_parking', 'amnt_wifi',
       'amnt_private_bathroom', 'amnt_bbq_grill', 'amnt_washer',
       'amnt_pets_allowed', 'amnt_clothes_dryer', 'amnt_heating',
       'amnt_air_conditioning', 'amnt_workspace', 'amnt_iron', 'amnt_pool',
       'amnt_bathtub', 'amnt_ev_charger', 'amnt_crib', 'amnt_king_bed',
       'amnt_gym', 'amnt_breakfast', 'amnt_fireplace', 'amnt_smoking_allowed',
       'amnt_waterfront', 'amnt_smoke_alarm', 'amnt_carbon_monoxide_alarm',
       'first_review_days_since'],
      dtype='object')

##### 평가 함수

In [None]:
from sklearn.metrics import mean_squared_error, root_mean_squared_error, mean_absolute_error, r2_score

In [75]:
# 회귀를 평가하는 함수
def evaluate_regression(y_true, y_pred):
    mse = mean_squared_error(y_true,y_pred)
    rmse = root_mean_squared_error(y_true,y_pred)
    mae = mean_absolute_error(y_true,y_pred)
    r2 = r2_score(y_true,y_pred)
    print(f'MSE: {mse}, RMSE: {rmse}, MAE: {mae}, R2: {r2}')

In [77]:
evaluate_regression(y_train, y_pred_train)
evaluate_regression(y_test, y_pred_test)

MSE: 0.08576357401496959, RMSE: 0.29285418558553944, MAE: 0.2203018831214187, R2: 0.7700769372658298
MSE: 0.10967390390693783, RMSE: 0.3311705057926171, MAE: 0.24858998436020624, R2: 0.7007601561454638


##### 특성 중요도 0.01 미만 컬럼 삭제

In [None]:
# low_importance_cols = final_df.drop('price',axis=1).columns[xgb_clf.feature_importances_ < 0.01]
# final_drop = final_df.drop(columns=low_importance_cols)

# X2 = final_drop.drop('price',axis=1)
# y2 = np.log(final_drop['price'])

# X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, random_state=42)


In [None]:
# xgb_clf = XGBRegressor(
#     n_estimators =270,
#     max_depth = 5,
#     learning_rate = 0.09,
#     random_state = 42
#     )


# xgb_clf.fit(X2_train,y2_train)
# y_pred_train = xgb_clf.predict(X2_train)
# y_pred_test = xgb_clf.predict(X2_test)
# print(xgb_clf.score(X2_train, y2_train))
# print(xgb_clf.score(X2_test, y2_test))

0.7712440487293695
0.6987286753940016


In [164]:
df_listings = pd.read_csv('./data/listings_copy.csv.')

In [None]:
# from sklearn.preprocessing import LabelEncoder

# X3 = df_listings.drop('price',axis=1)
# y3 = np.log(df_listings['price'])

# for col in ['neighbourhood_cleansed', 'property_type', 'room_type']:
#     le = LabelEncoder()
#     X3[col] = le.fit_transform(X3[col].astype(str))
# X3['amenities_count'] = X3['amenities'].str.count(',') + 1
# X3 = X3.drop(columns=['amenities'])

# X3['first_review'] = pd.to_datetime(X3['first_review'], errors='coerce')
# X3['first_review_days_since'] = (pd.Timestamp("2025-07-01") - X3['first_review']).dt.days
# X3['first_review_days_since'] = X3['first_review_days_since'].fillna(-1)
# X3 = X3.drop(columns=['first_review'])

# X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, random_state=42)

# xgb_clf3 = XGBRegressor(
#     n_estimators =270,
#     max_depth = 5,
#     learning_rate = 0.09,
#     random_state = 42,
#     subsample=0.7,
#     colsample_bytree=0.55,
#     # reg_alpha = 0.01,
#     # reg_lambda=3
#     )

# xgb_clf3.fit(X3_train,y3_train)
# y_pred_train3 = xgb_clf3.predict(X3_train)
# y_pred_test3 = xgb_clf3.predict(X3_test)
# print(xgb_clf3.score(X3_train, y3_train))
# print(xgb_clf3.score(X3_test, y3_test))

0.7550660239981039
0.6916973267290285
