In [19]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import *
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, cross_val_predict
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

import statsmodels.api as sm

import warnings
# Silence every warning for the rest of the notebook.
# NOTE(review): this is a blanket filter, so it also hides warnings raised by
# the modelling cells below (e.g. failed cross-validation fits or undefined
# metrics). Consider narrowing it to specific categories while debugging.
warnings.filterwarnings('ignore')

In [20]:
# Load the prepared dataset from the project-relative rawdata folder.
data = pd.read_csv(filepath_or_buffer='./rawdata/final_data.csv')

In [21]:
# Display the frame exactly as loaded from the CSV.
data

Unnamed: 0,년도,강수여부,추석 전 휴일 수,코로나 유무,예측 소요시간
0,2018,0,2,0,379.083
1,2019,1,1,0,471.191
2,2020,0,1,1,316.005333
3,2021,0,3,1,296.066
4,2022,0,1,1,


In [22]:
# Replace the target column loaded from the CSV with hand-entered values,
# one per row in the order the frame was displayed after loading.
# NOTE(review): the origin of these replacement values is not shown here.
data['예측 소요시간'] = [
    366,     # 2018
    492,     # 2019
    318,     # 2020
    318,     # 2021
    np.nan,  # 2022: unknown, this is the value the models forecast
]

In [23]:
# Single hand-entered observation for 2017, built row-wise with the same
# column layout as `data` so the two frames can be concatenated.
data2 = pd.DataFrame(
    data=[[2017, 0, 4, 0, 462]],
    columns=['년도', '강수여부', '추석 전 휴일 수', '코로나 유무', '예측 소요시간'],
)

In [24]:
# Prepend the 2017 row so the frame is in chronological order.
# `data` is rebound to its own extension, so re-running this cell would stack
# another copy of the 2017 row on top; de-duplicating on the year column keeps
# the cell idempotent (the first run is unaffected because every year is unique).
data = (
    pd.concat([data2, data], axis=0, ignore_index=True)
    .drop_duplicates(subset='년도', keep='first', ignore_index=True)
)

In [25]:
# Display the combined frame (2017 row prepended, target column overridden).
data

Unnamed: 0,년도,강수여부,추석 전 휴일 수,코로나 유무,예측 소요시간
0,2017,0,4,0,462.0
1,2018,0,2,0,366.0
2,2019,1,1,0,492.0
3,2020,0,1,1,318.0
4,2021,0,3,1,318.0
5,2022,0,1,1,


In [26]:
# Separate target and features.
# The target stays a one-column DataFrame; the year is dropped from the
# features together with the target itself.
y = data.loc[:, ['예측 소요시간']]
x = data.drop(columns=['년도', '예측 소요시간'])

In [27]:
# Hold out exactly one row as the test set. With shuffling disabled this is
# the last row of the frame; everything before it is used for training.
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=1,
    shuffle=False,
)

In [28]:
# Min-max scale the features. The scaler learns its ranges from the training
# rows only and then applies the same mapping to both splits.
scaler = MinMaxScaler().fit(x_train)
x_train_s = scaler.transform(x_train)
x_test_s = scaler.transform(x_test)

### 1. Linear Regression

In [29]:
# Ordinary least squares on the unscaled training features.
m1 = LinearRegression()
m1.fit(X=x_train, y=y_train)

In [30]:
# Predict the held-out last row (the one whose target is missing) with the
# linear model, then display the result.
pred = m1.predict(x_test)
pred

array([[294.]])

### 2. KNN

In [31]:
# Hyper-parameter grid for the KNN search: neighbour counts 2..9 crossed with
# two distance metrics.
params = dict(
    n_neighbors=range(2, 10),
    metric=['euclidean', 'manhattan'],
)

In [32]:
# Grid-search KNN on the scaled training features.
#
# The training set is tiny, so the search is set up to produce usable scores:
#   * cv=n_train makes every fold validate on exactly one row (leave-one-out),
#     leaving n_train - 1 rows to fit on;
#   * R^2 (the default regressor score) is undefined on a one-row validation
#     fold, so candidates are ranked by negative mean absolute error instead;
#   * a neighbour count larger than the rows available in a training fold
#     cannot be fitted, so such values are filtered out of the grid.
m2 = KNeighborsRegressor()
n_train = len(x_train_s)
m2_gs = GridSearchCV(
    m2,
    {**params, 'n_neighbors': [k for k in params['n_neighbors'] if k < n_train]},
    cv=n_train,
    scoring='neg_mean_absolute_error',
)
m2_gs.fit(x_train_s, y_train)

In [33]:
# Predict the held-out row with the refitted best KNN estimator; the scaled
# features are used here to match what the search was fitted on.
pred = m2_gs.predict(x_test_s)
pred

array([[318.]])

### 3. ARIMA

In [34]:
# Fit an ARIMA(1, 1, 1) on the training target series alone (no exogenous
# features are passed), then forecast a single step past the end of it.
m3 = sm.tsa.SARIMAX(endog=y_train, order=(1, 1, 1)).fit()
pred = m3.forecast(steps=1)

In [35]:
# Display the one-step-ahead ARIMA forecast.
pred

5    232.289291
dtype: float64

### 4. RF

In [36]:
# Random forest on the unscaled training features.
# A fixed random_state pins the bootstrap sampling and feature selection so
# the prediction below is reproducible across kernel restarts.
m4 = RandomForestRegressor(random_state=42)
# The target is a one-column DataFrame; flatten it to the 1-D shape the
# estimator expects for a single-output regression.
m4.fit(x_train, y_train.values.ravel())

In [37]:
# Predict the held-out row with the random forest and display the result.
pred = m4.predict(x_test)
pred

array([339.24])

### 5. XGB

In [38]:
# Gradient-boosted trees on the unscaled training features, with hand-picked
# hyper-parameters (shallow trees plus explicit regularisation terms).
m5 = XGBRegressor(
    max_depth=2,
    gamma=2,
    eta=0.8,
    reg_alpha=0.5,
    reg_lambda=0.5,
)
m5.fit(X=x_train, y=y_train)

In [39]:
# Predict the held-out row with the XGBoost model and display the result.
pred = m5.predict(x_test)
pred

array([318.1294], dtype=float32)