# 패키지 로드

In [98]:
import pandas as pd 
import numpy as np 
from datetime import datetime, date

from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import MinMaxScaler, StandardScaler 
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor 
# Seed constant for reproducibility; pass it as `random_state=` to any
# splitter or estimator that accepts one.
random_state = 42 


# 데이터 로드 

In [53]:
# Read the three CSV files from ./data in one pass: the training set (with the
# `rental` target), the test set (without it), and the sample submission file.
train, test, sample_submission = map(
    pd.read_csv,
    ('./data/train.csv', './data/test.csv', './data/sample_submission.csv'),
)

In [54]:
# Preview the first five rows of the training data.
train.head(n=5)

Unnamed: 0,date,precipitation,temp_mean,temp_highest,temp_lowest,PM10,PM2.5,humidity,sunshine_sum,sunshine_rate,wind_mean,wind_max,rental
0,2018-01-01,,-1.3,3.8,-5.1,34.0,17.0,39.1,8.3,86.5,1.4,3.8,4950
1,2018-01-02,,-1.8,1.8,-4.3,36.0,22.0,42.0,7.9,82.3,1.8,4.9,7136
2,2018-01-03,,-4.7,-0.4,-7.1,31.0,19.0,42.3,8.6,88.7,2.2,3.5,7156
3,2018-01-04,,-4.7,-0.7,-8.7,39.0,24.0,43.0,6.2,63.9,1.4,3.5,7102
4,2018-01-05,,-3.0,1.6,-5.6,51.0,35.0,48.4,8.2,84.5,1.7,3.6,7705


In [55]:
# Preview the last five rows of the training data.
train.tail(n=5)

Unnamed: 0,date,precipitation,temp_mean,temp_highest,temp_lowest,PM10,PM2.5,humidity,sunshine_sum,sunshine_rate,wind_mean,wind_max,rental
1090,2020-12-27,0.0,5.8,10.0,1.4,70.0,42.0,62.9,5.9,61.5,1.8,2.8,37103
1091,2020-12-28,1.3,6.7,11.4,4.2,66.0,44.0,72.1,8.0,83.3,1.4,3.1,46912
1092,2020-12-29,0.2,0.1,4.3,-6.2,69.0,46.0,70.8,0.0,0.0,2.9,6.1,35747
1093,2020-12-30,,-10.9,-6.2,-12.9,39.0,15.0,55.5,8.3,86.5,4.1,6.2,22488
1094,2020-12-31,0.0,-8.9,-5.0,-12.9,28.0,12.0,53.9,6.0,62.5,2.4,4.2,24535


In [56]:
# Preview the first five rows of the test data.
test.head(n=5)

Unnamed: 0,date,precipitation,temp_mean,temp_highest,temp_lowest,PM10,PM2.5,humidity,sunshine_sum,sunshine_rate,wind_mean,wind_max
0,2021-01-01,,-4.2,1.6,-9.8,30.0,17.0,64.0,6.5,67.7,2.0,4.1
1,2021-01-02,,-5.0,-1.4,-8.4,34.0,12.0,38.5,9.0,93.8,2.6,5.4
2,2021-01-03,,-5.6,-2.0,-9.1,39.0,14.0,45.0,5.5,56.7,2.0,4.5
3,2021-01-04,0.0,-3.5,0.3,-8.4,40.0,23.0,51.4,4.6,47.4,1.7,3.2
4,2021-01-05,0.0,-5.5,-2.1,-9.9,30.0,17.0,52.8,8.6,88.7,2.9,5.7


In [57]:
# Preview the last five rows of the test data.
test.tail(n=5)

Unnamed: 0,date,precipitation,temp_mean,temp_highest,temp_lowest,PM10,PM2.5,humidity,sunshine_sum,sunshine_rate,wind_mean,wind_max
360,2021-12-27,0.0,-7.6,-3.9,-12.9,33.0,20.0,60.9,3.8,39.6,1.7,3.1
361,2021-12-28,,-4.1,-0.9,-8.5,51.0,38.0,73.8,1.7,17.7,2.2,3.1
362,2021-12-29,0.2,0.4,5.9,-3.8,66.0,49.0,72.9,1.8,18.8,2.6,5.9
363,2021-12-30,0.0,-3.9,0.2,-6.8,30.0,17.0,48.5,7.3,76.0,3.3,6.6
364,2021-12-31,,-6.7,-3.9,-8.8,23.0,7.0,35.9,9.0,93.8,3.5,5.4


train 의 경우 2018-01-01 ~ 2020-12-31 의 데이터  
test 의 경우 2021-01-01 ~ 2021-12-31 의 데이터

# 데이터 전처리 
na 값 처리 : 강수량의 경우 0으로 대체 / 다른 날씨 데이터의 경우 1일 전 날짜의 데이터로 대체

feature engineering : 요일 추출 / 코로나 방역 단계 데이터 생성 (외부 데이터 사용) 

https://terms.naver.com/entry.naver?docId=5928099&cid=43667&categoryId=43667 (역대 사회적 거리두기 데이터)

In [73]:
# Preprocess the training data.

# Extract year, month and day from the "YYYY-MM-DD" date strings.
# A single vectorized split replaces the per-row loop; this also avoids
# rebinding the name `date`, which would clobber the imported `datetime.date`.
# The parts are kept as the zero-padded strings found in the date
# (e.g. "2018", "01", "05"), exactly as the loop produced them.
train_date_parts = train["date"].str.split("-", expand=True)

train["year"] = train_date_parts[0]
train["month"] = train_date_parts[1]
train["day"] = train_date_parts[2]

In [59]:
# Handle missing values.
# Precipitation: replace missing readings with 0.
train["precipitation"] = train["precipitation"].fillna(0)

# PM10 / PM2.5 / sunshine: copy the previous day's value.
# Forward-fill is what "previous day" means for rows sorted by date; the old
# back-fill copied the *next* day's value instead. The trailing bfill() only
# covers gaps at the very start of the series, where no earlier value exists.
# (`fillna(method=...)` is deprecated in recent pandas, hence ffill()/bfill().)
for na_col in ["PM10", "PM2.5", "sunshine_sum"]:
    train[na_col] = train[na_col].ffill().bfill()

In [71]:
# Derive the day of the week from each date string
# (datetime.weekday(): Monday is 0, Sunday is 6).
train_week_day = [
    datetime.strptime(stamp, "%Y-%m-%d").weekday() for stamp in train.date
]

train["week_day"] = train_week_day

시기별 방역 단계 

2018-01-01 ~ 2020-03-21 : pre corona 

2020-03-22 ~ 2021-12-31 : corona

In [86]:
# Build the corona indicator feature: 0 = pre-corona, 1 = corona.
# The notebook defines the periods as 2018-01-01 ~ 2020-03-21 (pre-corona)
# and 2020-03-22 onward (corona), so the threshold is the FIRST corona day.
# The previous value (2020-03-21) combined with `<` mislabelled 2020-03-21
# itself as a corona day.
threshold = datetime(2020, 3, 22)

corona = [
    0 if datetime.strptime(stamp, "%Y-%m-%d") < threshold else 1
    for stamp in train["date"]
]

train["corona"] = corona

In [105]:
# Min-max scaling of the numeric weather columns.
# The scaler is fitted on the training data and kept in `scaler` for reuse.
num_features = [
    'precipitation',
    'temp_mean',
    'temp_highest',
    'temp_lowest',
    'PM10',
    'PM2.5',
    'humidity',
    'sunshine_sum',
    'sunshine_rate',
    'wind_mean',
    'wind_max',
]

scaler = MinMaxScaler().fit(train[num_features])
scaled_train_values = scaler.transform(train[num_features])
train[num_features] = scaled_train_values

test 데이터 전처리

In [88]:
# Preprocess the test data.

# Extract year, month and day from the "YYYY-MM-DD" date strings.
# Fixes the index mix-up in the earlier loop, which stored the year in
# `month` (split[0]) and the month in `day` (split[1]).
# The parts are kept as zero-padded strings, matching the train columns.
test_date_parts = test["date"].str.split("-", expand=True)

test["year"] = test_date_parts[0]
test["month"] = test_date_parts[1]
test["day"] = test_date_parts[2]

In [91]:
# Handle missing values.
# Precipitation: replace missing readings with 0. The result must be assigned
# back; a bare `fillna(0)` returns a new Series and leaves `test` unchanged.
test["precipitation"] = test["precipitation"].fillna(0)

# PM10 / PM2.5 / sunshine: copy the previous day's value (forward-fill), the
# same rule and the same columns as the training data. Columns without gaps
# are left untouched. The trailing bfill() only covers gaps at the very start
# of the series, where no earlier value exists.
for na_col in ["PM10", "PM2.5", "sunshine_sum"]:
    test[na_col] = test[na_col].ffill().bfill()

In [96]:
# Derive the day of the week from each date string
# (datetime.weekday(): Monday is 0, Sunday is 6).
test_week_day = [
    datetime.strptime(stamp, "%Y-%m-%d").weekday() for stamp in test.date
]

test["week_day"] = test_week_day

In [108]:
# Scale the numeric test columns with the already-fitted `scaler`
# (transform only; the scaler is not refitted here).
scaled_test_values = scaler.transform(test[num_features])
test[num_features] = scaled_test_values

In [110]:
# Separate features and target.
# "Unnamed: 0" is the row counter that was saved with the CSV (0..1094 in
# train, restarting at 0 in test), so it is dropped rather than fed to the
# model as a feature. errors="ignore" keeps this working if the column has
# already been removed.
X = train.drop(["date", "rental"], axis=1).drop(
    columns=["Unnamed: 0"], errors="ignore"
)
y = train["rental"]

# Hold out 30% for validation; seed the shuffle so the split is reproducible.
train_x, val_x, train_y, val_y = train_test_split(
    X, y, test_size=0.3, random_state=random_state
)

test_x = test.drop(["date"], axis=1)
if "corona" in X.columns and "corona" not in test_x.columns:
    # The test set covers 2021-01-01 ~ 2021-12-31, which lies entirely inside
    # the corona period (2020-03-22 onward), so the flag is 1 for every row.
    test_x["corona"] = 1
# Same columns in the same order as the training features; this raises a
# KeyError if the test frame is missing a feature, instead of failing later.
test_x = test_x[X.columns]

# 모델링