In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Use matplotlib's built-in 'ggplot' style sheet for figures drawn after this point.
plt.style.use('ggplot')

## 데이터 준비

In [2]:
# -------------------------------------------------
# Load the data and add derived calendar columns
#   - 'datetime' is parsed as a datetime column at read time so that the
#     .dt accessor is available for the derived columns
#   - derived columns (appended in this order): year, month, day, hour, dayofweek
# -------------------------------------------------
df = pd.read_csv(
    'data/bike_sharing_demand.csv',
    parse_dates=['datetime'],
).assign(
    year=lambda frame: frame['datetime'].dt.year,
    month=lambda frame: frame['datetime'].dt.month,
    day=lambda frame: frame['datetime'].dt.day,
    hour=lambda frame: frame['datetime'].dt.hour,
    dayofweek=lambda frame: frame['datetime'].dt.dayofweek,  # Monday: 0, Sunday: 6
)
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,day,hour,dayofweek
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,2011,1,1,0,5
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,2011,1,1,1,5
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,2011,1,1,2,5
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,2011,1,1,3,5
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,2011,1,1,4,5


## 변수선택

In [3]:
# --------------------------------------------
# Feature selection
#   - target: 'count'
#   - 'casual' and 'registered' are left out because they do not act as
#     independent variables
#   - 'datetime' is left out as well
# --------------------------------------------

y = df['count']
X = df.drop(columns=['datetime', 'casual', 'registered', 'count']).copy()

## 훈련세트 테스트세트 분할

In [4]:
# Split into training and test sets (fixed random_state for a reproducible split)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Shapes of the four pieces, in the order: X_train, X_test, y_train, y_test
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(8164, 13) (2722, 13) (8164,) (2722,)


## 전처리
#### 수치형 변수 스케일링

In [5]:
from sklearn.preprocessing import StandardScaler

# -------------------------
#   Columns to standardize
# -------------------------
numerical_features = ['temp', 'atemp', 'humidity', 'windspeed']

# -------------------------
#   Scaling
#   - the scaler is fitted on the training split only; the fitted
#     parameters are then applied to both splits
# -------------------------
scaler = StandardScaler()
scaler.fit(X_train[numerical_features])

X_train[numerical_features] = scaler.transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

In [6]:
# Preview the first rows of the training features (numeric columns are standardized at this point).
X_train.head()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,hour,dayofweek
2930,3,0,1,1,1.084597,1.051561,0.164369,0.025162,2011,7,11,0,0
7669,2,0,1,1,0.348279,0.335944,-0.510793,1.121515,2012,5,18,22,4
1346,2,0,1,1,-1.01917,-0.916388,-0.043373,-0.82647,2011,4,1,23,4
9432,3,0,0,1,0.453467,0.425691,-0.095309,-0.461834,2012,9,16,9,6
453,1,0,1,3,-1.545112,-1.632006,1.618566,0.025162,2011,2,1,23,1


#### 범주형 변수 원핫인코딩

In [7]:
from sklearn.preprocessing import OneHotEncoder

# -------------------------
#   Columns to one-hot encode
# -------------------------
categorical_features = ['season', 'weather', 'year', 'month', 'day', 'hour', 'dayofweek']

# -------------------------
#   One-hot encoding
#   - the encoder is fitted on the training split only
#   - sparse_output=False asks for a dense array; the parameter was
#     spelled sparse=False in older scikit-learn releases
# -------------------------
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
ohe.fit(X_train[categorical_features])

# -------------------------
#   Names of the generated columns
# -------------------------
ohe_columns = ohe.get_feature_names_out(categorical_features)

# -------------------------
#   Wrap the encoded arrays in DataFrames that keep the original row index
# -------------------------
X_train_ohe = pd.DataFrame(
    ohe.transform(X_train[categorical_features]),
    index=X_train.index,
    columns=ohe_columns,
)

X_test_ohe = pd.DataFrame(
    ohe.transform(X_test[categorical_features]),
    index=X_test.index,
    columns=ohe_columns,
)




In [8]:
# -------------------------
#   Build the final training / test feature matrices
#   - the raw categorical columns are removed and replaced by their
#     one-hot encoded counterparts, placed after the remaining columns
# -------------------------
X_train = pd.concat(
    [X_train.drop(columns=categorical_features), X_train_ohe],
    axis='columns',
)
X_test = pd.concat(
    [X_test.drop(columns=categorical_features), X_test_ohe],
    axis='columns',
)


In [9]:
# Preview the first rows of the final training feature matrix.
X_train.head()

Unnamed: 0,holiday,workingday,temp,atemp,humidity,windspeed,season_1,season_2,season_3,season_4,...,hour_21,hour_22,hour_23,dayofweek_0,dayofweek_1,dayofweek_2,dayofweek_3,dayofweek_4,dayofweek_5,dayofweek_6
2930,0,1,1.084597,1.051561,0.164369,0.025162,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7669,0,1,0.348279,0.335944,-0.510793,1.121515,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1346,0,1,-1.01917,-0.916388,-0.043373,-0.82647,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
9432,0,0,0.453467,0.425691,-0.095309,-0.461834,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
453,0,1,-1.545112,-1.632006,1.618566,0.025162,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


# LinearRegression

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_error, mean_squared_error

# --------------------------------------------
# Create the model and train it on the training split
# --------------------------------------------
model = LinearRegression().fit(X_train, y_train)

# --------------------------------------------
# Predict on the test split
# --------------------------------------------
pred = model.predict(X_test)

# --------------------------------------------
# Evaluate the predictions against the test targets
# --------------------------------------------
r2, rmse, mae, mse = (
    metric(y_test, pred)
    for metric in (r2_score, root_mean_squared_error, mean_absolute_error, mean_squared_error)
)
print(f'r2:{r2}', f'rmse:{rmse}', f'mae:{mae}', f'mse:{mse}', sep='\n')

# --------------------------------------------
# Overfitting check: compare the model's score on train vs. test
# --------------------------------------------
print('train>>>>>>>>>>>>', model.score(X_train, y_train))
print('test>>>>>>>>>>>>>', model.score(X_test, y_test))

r2:0.6919904269321043
rmse:100.53376786180628
mae:74.48484604507097
mse:10107.038480491554
train>>>>>>>>>>>> 0.6959975059016567
test>>>>>>>>>>>>> 0.6919904269321043


# KNeighborsRegressor

In [11]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_error, mean_squared_error

# --------------------------------------------
# Create the model and train it on the training split
# --------------------------------------------
model = KNeighborsRegressor().fit(X_train, y_train)

# --------------------------------------------
# Predict on the test split
# --------------------------------------------
pred = model.predict(X_test)

# --------------------------------------------
# Evaluate the predictions against the test targets
# --------------------------------------------
r2, rmse, mae, mse = (
    metric(y_test, pred)
    for metric in (r2_score, root_mean_squared_error, mean_absolute_error, mean_squared_error)
)
print(f'r2:{r2}', f'rmse:{rmse}', f'mae:{mae}', f'mse:{mse}', sep='\n')

# --------------------------------------------
# Overfitting check: compare the model's score on train vs. test
# --------------------------------------------
print('train>>>>>>>>>>>>', model.score(X_train, y_train))
print('test>>>>>>>>>>>>>', model.score(X_test, y_test))


r2:0.4259714150618783
rmse:137.24502541878982
mae:96.89963262307127
mse:18836.197002204262
train>>>>>>>>>>>> 0.6169661057343325
test>>>>>>>>>>>>> 0.4259714150618783


# DecisionTreeRegressor

In [12]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_error, mean_squared_error

# --------------------------------------------
# Create the model (fixed random_state) and train it on the training split
# --------------------------------------------
model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# --------------------------------------------
# Predict on the test split
# --------------------------------------------
pred = model.predict(X_test)

# --------------------------------------------
# Evaluate the predictions against the test targets
# --------------------------------------------
r2, rmse, mae, mse = (
    metric(y_test, pred)
    for metric in (r2_score, root_mean_squared_error, mean_absolute_error, mean_squared_error)
)
print(f'r2:{r2}', f'rmse:{rmse}', f'mae:{mae}', f'mse:{mse}', sep='\n')

# --------------------------------------------
# Overfitting check: compare the model's score on train vs. test
# --------------------------------------------
print('train>>>>>>>>>>>>', model.score(X_train, y_train))
print('test>>>>>>>>>>>>>', model.score(X_test, y_test))

r2:0.8585850217297903
rmse:68.12045777211884
mae:41.308596620132256
mse:4640.396767083027
train>>>>>>>>>>>> 1.0
test>>>>>>>>>>>>> 0.8585850217297903


# RandomForestRegressor

In [13]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_error, mean_squared_error

# --------------------------------------------
# Create the model (fixed random_state) and train it on the training split
# --------------------------------------------
model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# --------------------------------------------
# Predict on the test split
# --------------------------------------------
pred = model.predict(X_test)

# --------------------------------------------
# Evaluate the predictions against the test targets
# --------------------------------------------
r2, rmse, mae, mse = (
    metric(y_test, pred)
    for metric in (r2_score, root_mean_squared_error, mean_absolute_error, mean_squared_error)
)
print(f'r2:{r2}', f'rmse:{rmse}', f'mae:{mae}', f'mse:{mse}', sep='\n')

# --------------------------------------------
# Overfitting check: compare the model's score on train vs. test
# --------------------------------------------
print('train>>>>>>>>>>>>', model.score(X_train, y_train))
print('test>>>>>>>>>>>>>', model.score(X_test, y_test))

r2:0.9279300332289547
rmse:48.63030720977073
mae:31.639217487141803
mse:2364.9067793166787
train>>>>>>>>>>>> 0.9885742781419462
test>>>>>>>>>>>>> 0.9279300332289547
