In [32]:
import pandas as pd

data = pd.read_csv('merged_data.csv')
data.drop(columns = ['date'], inplace = True)
data

Unnamed: 0,pm2.5,o3,no2,기온(°C),풍속(m/s),강수량(mm),0~9,20~29,80~89,over 100,하차총승객수,입장객 수
0,12.241500,0.024,0.026,8.641667,1.837500,0.000000,4678,8267,1636,9,9721,17070.0
1,25.930316,0.035,0.021,7.912500,2.162500,0.000000,4678,8267,1636,9,10397,32457.0
2,44.903672,0.036,0.035,10.837500,2.687500,0.000000,4678,8267,1636,9,3498,3032.0
3,40.147587,0.031,0.042,12.245833,1.516667,0.000000,4678,8267,1636,9,5501,3576.0
4,24.873408,0.018,0.041,11.941667,1.112500,1.416667,4678,8267,1636,9,3850,1067.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1000,18.000000,0.018,0.026,-0.579167,1.375000,0.000000,4502,7952,1713,9,3583,1302.0
1001,23.000000,0.007,0.035,0.262500,0.579167,0.000000,4502,7952,1713,9,5194,2110.0
1002,28.000000,0.007,0.033,3.300000,0.833333,0.020833,4502,7952,1713,9,3582,6913.0
1003,26.000000,0.014,0.026,4.725000,1.891667,0.020833,4502,7952,1713,9,2277,863.0


## X,y분리하기

In [33]:
X = data.iloc[:,:-1]
y = data.iloc[:,-1]

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 42)

#train, test 나눈 후에 정규화(서로 간섭하는걸 방지하기 위해)
scaler = RobustScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)



NameError: name 'X' is not defined

## 1. 다중회귀분석

In [35]:
from sklearn.linear_model import LinearRegression


reg = LinearRegression()
reg.fit(X_train_scaled, y_train)

y_pred = reg.predict(X_test_scaled)


### 모델 성능 평가

In [36]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

def evaluate_model_performance(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)  # RMSE는 MSE의 제곱근
    r2 = r2_score(y_true, y_pred)

    print(f'MAE: {mae}')
    print(f'MSE: {mse}')
    print(f'RMSE: {rmse}')
    print(f'R²: {r2}')

evaluate_model_performance(y_test, y_pred)

MAE: 2346.337529940856
MSE: 11919579.470222188
RMSE: 3452.4743981993824
R²: 0.7519705155869822


## 랜덤포레스트

In [38]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()
rf.fit(X_train_scaled, y_train)

y_pred = rf.predict(X_test_scaled)

evaluate_model_performance(y_test, y_pred)

MAE: 1958.7336318407959
MSE: 9824447.249287562
RMSE: 3134.397430015467
R²: 0.7955672352391924


## 랜덤포레스트  + Grid Search

In [44]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

rf = RandomForestRegressor()
param_grid = {
    'n_estimators':[100,150,200],
    'max_depth':[None, 10,20,30],
    'min_samples_split':[2,5,10,20],
    'min_samples_leaf':[1,2,4,10]
}

grid_search = GridSearchCV(estimator=rf,
                          param_grid = param_grid,
                          scoring = 'neg_mean_squared_error',
                          cv = 5,
                          verbose=2,
                          n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
print('Best parameters: ', grid_search.best_params_)

best_rf = grid_search.best_estimator_

y_pred = best_rf.predict(X_test_scaled)

evaluate_model_performance(y_test, y_pred)

Fitting 5 folds for each of 192 candidates, totalling 960 fits
Best parameters:  {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}
MAE: 1840.5192575031874
MSE: 8709565.404732976
RMSE: 2951.1972832619945
R²: 0.8187663396855469


## XGBoost

In [41]:
from xgboost import XGBRegressor

xgb = XGBRegressor()
xgb.fit(X_train_scaled, y_train)

y_pred = xgb.predict(X_test_scaled)
evaluate_model_performance(y_test, y_pred)

MAE: 2017.1681772108695
MSE: 10944662.861849323
RMSE: 3308.271884511508
R²: 0.7722571426718137


## XGBoost + Grid Search

In [47]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

xgb = XGBRegressor()
param_grid = {
    'n_estimators':[100,200,300,500],
    'learning_rate':[0.01,0.05,0.1,0.2],
    'max_depth':[3,5,7,10],
    'min_child_weight':[1,3,5,7],
    'colsample_bytree':[0.6,0.7,0.8]
}

grid_search = GridSearchCV(estimator=xgb,
                          param_grid = param_grid,
                          scoring = 'neg_mean_squared_error',
                          cv = 5,
                          verbose=2,
                          n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
print('Best parameters: ', grid_search.best_params_)

best_xgb = grid_search.best_estimator_

y_pred = best_xgb.predict(X_test_scaled)

evaluate_model_performance(y_test, y_pred)

Fitting 5 folds for each of 768 candidates, totalling 3840 fits
Best parameters:  {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 7, 'n_estimators': 100}
MAE: 2012.8141198229434
MSE: 10433645.589704584
RMSE: 3230.115414300948
R²: 0.7828906848074947


## LightGBM

In [43]:
from lightgbm import LGBMRegressor
lgbm = LGBMRegressor()
lgbm.fit(X_train_scaled, y_train)

y_pred = lgbm.predict(X_test_scaled)
evaluate_model_performance(y_test, y_pred)

MAE: 1946.0322808543845
MSE: 9757908.509740865
RMSE: 3123.7651175689994
R²: 0.7969518117088983


## LightGBM + Grid Search

In [49]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV

lgbm = LGBMRegressor()
param_grid = {
    'n_estimators':[100,200,500,1000],
    'learning_rate':[0.01,0.05,0.1],
    'num_leaves':[20,30,40,50],
    'max_depth':[3,5,7,10],
    'min_child_samples':[10,20,30,50]
}

grid_search = GridSearchCV(estimator=lgbm,
                          param_grid = param_grid,
                          scoring = 'neg_mean_squared_error',
                          cv = 5,
                          verbose=2,
                          n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
print('Best parameters: ', grid_search.best_params_)

best_lgbm = grid_search.best_estimator_

y_pred = best_lgbm.predict(X_test_scaled)

evaluate_model_performance(y_test, y_pred)

Fitting 5 folds for each of 768 candidates, totalling 3840 fits
Best parameters:  {'learning_rate': 0.01, 'max_depth': 5, 'min_child_samples': 10, 'n_estimators': 1000, 'num_leaves': 30}
MAE: 1867.1833354547268
MSE: 9348940.74982316
RMSE: 3057.6037594533336
R²: 0.805461848735569
