# 패키지 로드

In [37]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd 
import numpy as np 
from datetime import datetime, date
from tqdm.auto import tqdm 

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler 
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor 
from sklearn.svm import SVR

from xgboost import XGBRegressor, plot_tree, plot_importance

random_state = 42 

import warnings 
warnings.filterwarnings(action='ignore')


  from .autonotebook import tqdm as notebook_tqdm


# 데이터 로드 

In [2]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

sample_submission = pd.read_csv('./data/sample_submission.csv')

In [3]:
train.head()

Unnamed: 0,date,precipitation,temp_mean,temp_highest,temp_lowest,PM10,PM2.5,humidity,sunshine_sum,sunshine_rate,wind_mean,wind_max,rental
0,2018-01-01,,-1.3,3.8,-5.1,34.0,17.0,39.1,8.3,86.5,1.4,3.8,4950
1,2018-01-02,,-1.8,1.8,-4.3,36.0,22.0,42.0,7.9,82.3,1.8,4.9,7136
2,2018-01-03,,-4.7,-0.4,-7.1,31.0,19.0,42.3,8.6,88.7,2.2,3.5,7156
3,2018-01-04,,-4.7,-0.7,-8.7,39.0,24.0,43.0,6.2,63.9,1.4,3.5,7102
4,2018-01-05,,-3.0,1.6,-5.6,51.0,35.0,48.4,8.2,84.5,1.7,3.6,7705


In [4]:
train.tail()

Unnamed: 0,date,precipitation,temp_mean,temp_highest,temp_lowest,PM10,PM2.5,humidity,sunshine_sum,sunshine_rate,wind_mean,wind_max,rental
1090,2020-12-27,0.0,5.8,10.0,1.4,70.0,42.0,62.9,5.9,61.5,1.8,2.8,37103
1091,2020-12-28,1.3,6.7,11.4,4.2,66.0,44.0,72.1,8.0,83.3,1.4,3.1,46912
1092,2020-12-29,0.2,0.1,4.3,-6.2,69.0,46.0,70.8,0.0,0.0,2.9,6.1,35747
1093,2020-12-30,,-10.9,-6.2,-12.9,39.0,15.0,55.5,8.3,86.5,4.1,6.2,22488
1094,2020-12-31,0.0,-8.9,-5.0,-12.9,28.0,12.0,53.9,6.0,62.5,2.4,4.2,24535


In [5]:
test.head()

Unnamed: 0,date,precipitation,temp_mean,temp_highest,temp_lowest,PM10,PM2.5,humidity,sunshine_sum,sunshine_rate,wind_mean,wind_max
0,2021-01-01,,-4.2,1.6,-9.8,30.0,17.0,64.0,6.5,67.7,2.0,4.1
1,2021-01-02,,-5.0,-1.4,-8.4,34.0,12.0,38.5,9.0,93.8,2.6,5.4
2,2021-01-03,,-5.6,-2.0,-9.1,39.0,14.0,45.0,5.5,56.7,2.0,4.5
3,2021-01-04,0.0,-3.5,0.3,-8.4,40.0,23.0,51.4,4.6,47.4,1.7,3.2
4,2021-01-05,0.0,-5.5,-2.1,-9.9,30.0,17.0,52.8,8.6,88.7,2.9,5.7


In [6]:
test.tail()

Unnamed: 0,date,precipitation,temp_mean,temp_highest,temp_lowest,PM10,PM2.5,humidity,sunshine_sum,sunshine_rate,wind_mean,wind_max
360,2021-12-27,0.0,-7.6,-3.9,-12.9,33.0,20.0,60.9,3.8,39.6,1.7,3.1
361,2021-12-28,,-4.1,-0.9,-8.5,51.0,38.0,73.8,1.7,17.7,2.2,3.1
362,2021-12-29,0.2,0.4,5.9,-3.8,66.0,49.0,72.9,1.8,18.8,2.6,5.9
363,2021-12-30,0.0,-3.9,0.2,-6.8,30.0,17.0,48.5,7.3,76.0,3.3,6.6
364,2021-12-31,,-6.7,-3.9,-8.8,23.0,7.0,35.9,9.0,93.8,3.5,5.4


train 의 경우 2018-01-01 ~ 2021-12-31 의 데이터  
test 으 경우 2021-01-01 ~ 2021-12-31 의 데이터

# 데이터 전처리 
na 값 처리 : 강수량의 경우 0으로 대체 / 다른 날씨 데이터의 경우 1일 전 날짜의 데이터로 대체

feature engineering : 요일 추출 / 코로나 방역 단계 데이터 생성 (외부 데이터 사용) 

https://terms.naver.com/entry.naver?docId=5928099&cid=43667&categoryId=43667 (역대 사회적 거리두기 데이터)

In [7]:
#train 데이터 전처리 

# 연도, 월, 일 추출
year = []
month = []
day = [] 

for date in train["date"] :
    y = date.split('-')[0]
    m = date.split('-')[1]
    d = date.split('-')[2]
    
    year.append(int(y))
    month.append(int(m))
    day.append(int(d))

train["year"] = year 
train["month"] = month 
train["day"] = day

In [8]:
# na 값 처리 
# 강수량 na 값 처리 : 0 으로 대체 
train["precipitation"] = train["precipitation"].replace(np.nan, 0)

# 미세먼지/초미세먼지 na 값 처리 : 이전 날짜 데이터 복사 
train["PM10"] = train["PM10"].fillna(method="bfill")
train["PM2.5"] = train["PM2.5"].fillna(method="bfill")
train["sunshine_sum"] = train["sunshine_sum"].fillna(method="bfill")

In [9]:
# 날짜로 부터 요일 추출 
train_week_day = [] 

for i in train.date:
    date = datetime.strptime(i, "%Y-%m-%d")
    day = date.weekday() 
    train_week_day.append(day) 

train["week_day"] = train_week_day 

시기별 방역 단계 

2018-01-01 ~ 2020-03-21 : pre corona 

2020-03-22 ~ 2021-12-31 : corona

In [10]:
# 코로나 feature 생성 
threshold = datetime(2020, 3, 21)

corona = [] 

for i in train["date"]:
    date = datetime.strptime(i, "%Y-%m-%d")
    if date < threshold:
        corona.append(0)
    else :
        corona.append(1)

train["corona"] = corona

In [11]:

# 정규화 
num_features = ['precipitation', 'temp_mean', 'temp_highest', 'temp_lowest', 'PM10', 'PM2.5', 'humidity', 'sunshine_sum', 'sunshine_rate','wind_mean', 'wind_max']

scaler = MinMaxScaler()
scaler.fit(train[num_features])
train[num_features] = scaler.transform(train[num_features])


test 데이터 전처리

In [15]:
#test 데이터 전처리 

#연도, 월, 일 추출 
year_ = []
month_ = [] 
day_ = []

for i in test["date"]:
    y_ = i.split("-")[0]
    m_ = i.split("-")[1]
    d_ = i.split("-")[2]
    
    year_.append(int(y_))
    month_.append(int(m_))
    day_.append(int(d_))

test["year"] = year_
test["month"] = month_
test["day"] = day_ 

In [16]:
# na 값 처리 
#강수량 na 값 처리 
test["precipitation"] = test["precipitation"].replace(np.nan, 0)

#채광량 na값 처리
test["sunshine_sum"] = test["sunshine_sum"].fillna(method="bfill") 

In [18]:
# 날짜로 부터 요일 추출 
test_week_day = [] 

for i in test.date:
    date = datetime.strptime(i, "%Y-%m-%d")
    day = date.weekday() 
    test_week_day.append(day) 

test["week_day"] = test_week_day 

In [19]:
test['corona'] = 1

In [20]:
# 정규화 
test[num_features] = scaler.transform(test[num_features])


In [36]:
test

Unnamed: 0,date,precipitation,temp_mean,temp_highest,temp_lowest,PM10,PM2.5,humidity,sunshine_sum,sunshine_rate,wind_mean,wind_max,year,month,day,week_day,corona
0,2021-01-01,0.00000,0.218557,0.244533,0.166320,0.150602,0.122951,0.582807,0.477941,0.705944,0.333333,0.304878,2021,1,1,4,1
1,2021-01-02,0.00000,0.202062,0.184891,0.195426,0.174699,0.081967,0.260430,0.661765,0.978102,0.476190,0.463415,2021,1,2,5,1
2,2021-01-03,0.00000,0.189691,0.172962,0.180873,0.204819,0.098361,0.342604,0.404412,0.591241,0.333333,0.353659,2021,1,3,6,1
3,2021-01-04,0.00000,0.232990,0.218688,0.195426,0.210843,0.172131,0.423515,0.338235,0.494265,0.261905,0.195122,2021,1,4,0,1
4,2021-01-05,0.00000,0.191753,0.170974,0.164241,0.150602,0.122951,0.441214,0.632353,0.924922,0.547619,0.500000,2021,1,5,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,2021-12-27,0.00000,0.148454,0.135189,0.101871,0.168675,0.147541,0.543616,0.279412,0.412930,0.261905,0.182927,2021,12,27,0,1
361,2021-12-28,0.00000,0.220619,0.194831,0.193347,0.277108,0.295082,0.706700,0.125000,0.184567,0.380952,0.182927,2021,12,28,1,1
362,2021-12-29,0.00194,0.313402,0.330020,0.291060,0.367470,0.385246,0.695322,0.132353,0.196038,0.476190,0.524390,2021,12,29,2,1
363,2021-12-30,0.00000,0.224742,0.216700,0.228690,0.150602,0.122951,0.386852,0.536765,0.792492,0.642857,0.609756,2021,12,30,3,1


In [21]:
X = train.drop(["date","rental"], axis=1)
y = train["rental"]

train_x, val_x, train_y, val_y = train_test_split(X,y,test_size = 0.3)

test_x = test.drop(["date"], axis=1)

In [22]:
test_x

Unnamed: 0,precipitation,temp_mean,temp_highest,temp_lowest,PM10,PM2.5,humidity,sunshine_sum,sunshine_rate,wind_mean,wind_max,year,month,day,week_day,corona
0,0.00000,0.218557,0.244533,0.166320,0.150602,0.122951,0.582807,0.477941,0.705944,0.333333,0.304878,2021,1,1,4,1
1,0.00000,0.202062,0.184891,0.195426,0.174699,0.081967,0.260430,0.661765,0.978102,0.476190,0.463415,2021,1,2,5,1
2,0.00000,0.189691,0.172962,0.180873,0.204819,0.098361,0.342604,0.404412,0.591241,0.333333,0.353659,2021,1,3,6,1
3,0.00000,0.232990,0.218688,0.195426,0.210843,0.172131,0.423515,0.338235,0.494265,0.261905,0.195122,2021,1,4,0,1
4,0.00000,0.191753,0.170974,0.164241,0.150602,0.122951,0.441214,0.632353,0.924922,0.547619,0.500000,2021,1,5,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,0.00000,0.148454,0.135189,0.101871,0.168675,0.147541,0.543616,0.279412,0.412930,0.261905,0.182927,2021,12,27,0,1
361,0.00000,0.220619,0.194831,0.193347,0.277108,0.295082,0.706700,0.125000,0.184567,0.380952,0.182927,2021,12,28,1,1
362,0.00194,0.313402,0.330020,0.291060,0.367470,0.385246,0.695322,0.132353,0.196038,0.476190,0.524390,2021,12,29,2,1
363,0.00000,0.224742,0.216700,0.228690,0.150602,0.122951,0.386852,0.536765,0.792492,0.642857,0.609756,2021,12,30,3,1


# EDA 

In [23]:
train

Unnamed: 0,date,precipitation,temp_mean,temp_highest,temp_lowest,PM10,PM2.5,humidity,sunshine_sum,sunshine_rate,wind_mean,wind_max,rental,year,month,day,week_day,corona
0,2018-01-01,0.000000,0.278351,0.288270,0.264033,0.174699,0.122951,0.268015,0.610294,0.901981,0.190476,0.268293,4950,2018,1,1,0,0
1,2018-01-02,0.000000,0.268041,0.248509,0.280665,0.186747,0.163934,0.304678,0.580882,0.858186,0.285714,0.402439,7136,2018,1,2,1,0
2,2018-01-03,0.000000,0.208247,0.204771,0.222453,0.156627,0.139344,0.308470,0.632353,0.924922,0.380952,0.231707,7156,2018,1,3,2,0
3,2018-01-04,0.000000,0.208247,0.198807,0.189189,0.204819,0.180328,0.317320,0.455882,0.666319,0.190476,0.231707,7102,2018,1,4,3,0
4,2018-01-05,0.000000,0.243299,0.244533,0.253638,0.277108,0.270492,0.385588,0.602941,0.881126,0.261905,0.243902,7705,2018,1,5,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1090,2020-12-27,0.000000,0.424742,0.411531,0.399168,0.391566,0.327869,0.568900,0.433824,0.641293,0.285714,0.146341,37103,2020,12,27,6,1
1091,2020-12-28,0.012609,0.443299,0.439364,0.457380,0.367470,0.344262,0.685209,0.588235,0.868613,0.190476,0.182927,46912,2020,12,28,0,1
1092,2020-12-29,0.001940,0.307216,0.298211,0.241164,0.385542,0.360656,0.668774,0.000000,0.000000,0.547619,0.548780,35747,2020,12,29,1,1
1093,2020-12-30,0.000000,0.080412,0.089463,0.101871,0.204819,0.106557,0.475348,0.610294,0.901981,0.833333,0.560976,22488,2020,12,30,2,1


In [24]:
total_2018 = sum(train.loc[train['year'] == 2018, 'rental'].values)
total_2019 = sum(train.loc[train['year'] == 2019, 'rental'].values)
total_2020 = sum(train.loc[train['year'] == 2020, 'rental'].values)
print(total_2019/total_2018)
print(total_2020/total_2019)

1.8836727252111978
1.2429259327402773


2018 년도 대비 2019 년도에는 전체적으로 사용량이 1.8배 늘었다. 

2019년도 대비 2020 년도에는 전체적으로 사용량이 1.2배 늘었다. 

2020년도 대비 2021 년도 상승분은??

# 모델링

In [25]:
#NMAE metric
import numpy as np

def NMAE(true, pred):
    score = np.mean(np.abs(true-pred) / true)
    return score

In [28]:
# baseline model performance on validation data

xgr = XGBRegressor()
xgr.fit(train_x,train_y)
xgr_val_pred = xgr.predict(val_x)
NMAE(val_y,xgr_val_pred)

0.1887269810756372

In [29]:
rf = RandomForestRegressor()
rf.fit(train_x,train_y)
rf_val_pred = rf.predict(val_x)
NMAE(val_y,rf_val_pred)

0.2000505322820149

In [30]:
val_ensemble = (xgr_val_pred + rf_val_pred) / 2 
NMAE(val_y, val_ensemble)

0.18732469743332797

In [31]:
# model performance on validation
# rf : 0.20432992846002493
# lr : 0.4704906394251638
# ada : 0.48456421809707245
# gbr : 0.19139416721595723
# xgboost : 0.1721447495761992

In [32]:
#final model 
xgr.fit(X,y)
rf.fit(X,y)
xgr_pred = xgr.predict(test_x)
rf_pred = rf.predict(test_x)

ensemble_pred = (xgr_pred + rf_pred) / 2 

In [35]:
sample_submission["rental"] = ensemble_pred 
sample_submission.to_csv("xgr_rf_v03.csv",index=False)