In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

## 데이터 준비

In [2]:
# -------------------------------------------------
# Load the data
# -------------------------------------------------
# 'datetime' is parsed into timestamps while reading so the .dt accessor works below.
bike_raw = pd.read_csv('data/bike_sharing_demand.csv', parse_dates=['datetime'])

# -------------------------------------------------
# Add derived columns
# -------------------------------------------------
# Calendar parts taken from the timestamp: year, month, day, hour, day of week.
stamp = bike_raw['datetime'].dt
df = bike_raw.assign(
    year=stamp.year,
    month=stamp.month,
    day=stamp.day,
    hour=stamp.hour,
    dayofweek=stamp.dayofweek,  # Monday: 0, Sunday: 6
)
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,day,hour,dayofweek
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,2011,1,1,0,5
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,2011,1,1,1,5
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,2011,1,1,2,5
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,2011,1,1,3,5
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,2011,1,1,4,5


## 변수선택

In [3]:
# --------------------------------------------
# Feature selection
#   - casual and registered are left out: they are not independent variables
#   - datetime is left out
#   - count is the prediction target, so it is removed from X and becomes y
# --------------------------------------------
TARGET = 'count'
EXCLUDED = ['datetime', 'casual', 'registered', TARGET]

X = df.drop(columns=EXCLUDED).copy()
y = df[TARGET]

## 훈련세트/테스트세트 분할

In [4]:
# Split into a training set and a test set (fixed seed so the split is repeatable)
from sklearn.model_selection import train_test_split

split_parts = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Show the shape of each piece in the order: X_train, X_test, y_train, y_test
print(*(part.shape for part in split_parts))

(8164, 13) (2722, 13) (8164,) (2722,)


# LinearRegression

In [5]:
# --------------------------------------------
# Build and train the model
# --------------------------------------------
from sklearn.linear_model import LinearRegression

# fit() returns the fitted estimator, so creation and training chain into one statement.
model = LinearRegression().fit(X_train, y_train)

# --------------------------------------------
# Predict on the test data
# --------------------------------------------
pred = model.predict(X_test)

# --------------------------------------------
# Evaluate model performance
# --------------------------------------------
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_error, mean_squared_error

r2, rmse, mae, mse = (
    r2_score(y_test, pred),
    root_mean_squared_error(y_test, pred),
    mean_absolute_error(y_test, pred),
    mean_squared_error(y_test, pred),
)
print(f'r2:{r2}', f'rmse:{rmse}', f'mae:{mae}', f'mse:{mse}', sep='\n')

# --------------------------------------------
# Check for overfitting
# --------------------------------------------
# Compare the score on the data the model was trained on with the score on held-out data;
# a large gap between the two points to overfitting.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print('train>>>>>>>>>>>>', train_score)
print('test>>>>>>>>>>>>>', test_score)

r2:0.38775297175891066
rmse:141.74025022251396
mae:105.50229321537446
mse:20090.29853314087
train>>>>>>>>>>>> 0.38970071368542136
test>>>>>>>>>>>>> 0.38775297175891066


# KNeighborsRegressor

In [6]:
# --------------------------------------------
# Build and train the model
# --------------------------------------------
from sklearn.neighbors import KNeighborsRegressor

# fit() returns the fitted estimator, so creation and training chain into one statement.
# NOTE(review): the features are passed in unscaled; distance-based models are usually
# sensitive to feature scale — consider standardizing before fitting.
model = KNeighborsRegressor().fit(X_train, y_train)

# --------------------------------------------
# Predict on the test data
# --------------------------------------------
pred = model.predict(X_test)

# --------------------------------------------
# Evaluate model performance
# --------------------------------------------
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_error, mean_squared_error

r2, rmse, mae, mse = (
    r2_score(y_test, pred),
    root_mean_squared_error(y_test, pred),
    mean_absolute_error(y_test, pred),
    mean_squared_error(y_test, pred),
)
print(f'r2:{r2}', f'rmse:{rmse}', f'mae:{mae}', f'mse:{mse}', sep='\n')

# --------------------------------------------
# Check for overfitting
# --------------------------------------------
# Compare the score on the data the model was trained on with the score on held-out data;
# a large gap between the two points to overfitting.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print('train>>>>>>>>>>>>', train_score)
print('test>>>>>>>>>>>>>', test_score)


r2:0.5279691576869651
rmse:124.45577120423012
mae:84.2998530492285
mse:15489.238986039674
train>>>>>>>>>>>> 0.6996429003429661
test>>>>>>>>>>>>> 0.5279691576869651


# DecisionTreeRegressor

In [7]:
# --------------------------------------------
# Build and train the model
# --------------------------------------------
from sklearn.tree import DecisionTreeRegressor

# fit() returns the fitted estimator, so creation and training chain into one statement.
# The fixed random_state keeps the fitted tree repeatable between runs.
model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# --------------------------------------------
# Predict on the test data
# --------------------------------------------
pred = model.predict(X_test)

# --------------------------------------------
# Evaluate model performance
# --------------------------------------------
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_error, mean_squared_error

r2, rmse, mae, mse = (
    r2_score(y_test, pred),
    root_mean_squared_error(y_test, pred),
    mean_absolute_error(y_test, pred),
    mean_squared_error(y_test, pred),
)
print(f'r2:{r2}', f'rmse:{rmse}', f'mae:{mae}', f'mse:{mse}', sep='\n')

# --------------------------------------------
# Check for overfitting
# --------------------------------------------
# Compare the score on the data the model was trained on with the score on held-out data;
# a large gap between the two points to overfitting.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print('train>>>>>>>>>>>>', train_score)
print('test>>>>>>>>>>>>>', test_score)



r2:0.9080131345646314
rmse:54.9405195704284
mae:33.06759735488611
mse:3018.460690668626
train>>>>>>>>>>>> 1.0
test>>>>>>>>>>>>> 0.9080131345646314


# RandomForestRegressor

In [8]:
# --------------------------------------------
# Build and train the model
# --------------------------------------------
from sklearn.ensemble import RandomForestRegressor

# fit() returns the fitted estimator, so creation and training chain into one statement.
# The fixed random_state keeps the fitted forest repeatable between runs.
model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# --------------------------------------------
# Predict on the test data
# --------------------------------------------
pred = model.predict(X_test)

# --------------------------------------------
# Evaluate model performance
# --------------------------------------------
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_error, mean_squared_error

r2, rmse, mae, mse = (
    r2_score(y_test, pred),
    root_mean_squared_error(y_test, pred),
    mean_absolute_error(y_test, pred),
    mean_squared_error(y_test, pred),
)
print(f'r2:{r2}', f'rmse:{rmse}', f'mae:{mae}', f'mse:{mse}', sep='\n')

# --------------------------------------------
# Check for overfitting
# --------------------------------------------
# Compare the score on the data the model was trained on with the score on held-out data;
# a large gap between the two points to overfitting.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print('train>>>>>>>>>>>>', train_score)
print('test>>>>>>>>>>>>>', test_score)


r2:0.9554040957647562
rmse:38.25404278866102
mae:24.149974283614988
mse:1463.3717896767082
train>>>>>>>>>>>> 0.992831684167838
test>>>>>>>>>>>>> 0.9554040957647562
