# 패키지 로드

In [1]:
import pandas as pd 
import numpy as np 

from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler 
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor 
random_state = 42


import matplotlib.pyplot as plt 

import warnings 
warnings.filterwarnings(action = 'ignore')

# 데이터 로드

In [2]:
train = pd.read_csv('open/train.csv')
test = pd.read_csv('open/test.csv')
sample_submission = pd.read_csv('open/sample_submission.csv')

# 데이터 살펴보기
## 데이터 설명
- date: 날짜
- precipitation: 강수량(mm)
- temp_mean: 평균 기온(℃)
- temp_highest: 최고 기온(℃)
- temp_lowest: 최저 기온(℃)
- PM10: 미세먼지(㎍/㎥)
- PM2.5: 초미세먼지(㎍/㎥)
- humidity: 습도(%rh)
- sunshine_sum: 일조합
- sunshine_rate: 일조율
- wind_mean: 평균 풍속(m/s)
- wind_max: 최대 풍속(m/s)
- rentals : 따릉이 대여량

In [3]:
train.head()

Unnamed: 0,date,precipitation,temp_mean,temp_highest,temp_lowest,PM10,PM2.5,humidity,sunshine_sum,sunshine_rate,wind_mean,wind_max,rental
0,2018-01-01,,-1.3,3.8,-5.1,34.0,17.0,39.1,8.3,86.5,1.4,3.8,4950
1,2018-01-02,,-1.8,1.8,-4.3,36.0,22.0,42.0,7.9,82.3,1.8,4.9,7136
2,2018-01-03,,-4.7,-0.4,-7.1,31.0,19.0,42.3,8.6,88.7,2.2,3.5,7156
3,2018-01-04,,-4.7,-0.7,-8.7,39.0,24.0,43.0,6.2,63.9,1.4,3.5,7102
4,2018-01-05,,-3.0,1.6,-5.6,51.0,35.0,48.4,8.2,84.5,1.7,3.6,7705


In [4]:
train.tail()

Unnamed: 0,date,precipitation,temp_mean,temp_highest,temp_lowest,PM10,PM2.5,humidity,sunshine_sum,sunshine_rate,wind_mean,wind_max,rental
1090,2020-12-27,0.0,5.8,10.0,1.4,70.0,42.0,62.9,5.9,61.5,1.8,2.8,37103
1091,2020-12-28,1.3,6.7,11.4,4.2,66.0,44.0,72.1,8.0,83.3,1.4,3.1,46912
1092,2020-12-29,0.2,0.1,4.3,-6.2,69.0,46.0,70.8,0.0,0.0,2.9,6.1,35747
1093,2020-12-30,,-10.9,-6.2,-12.9,39.0,15.0,55.5,8.3,86.5,4.1,6.2,22488
1094,2020-12-31,0.0,-8.9,-5.0,-12.9,28.0,12.0,53.9,6.0,62.5,2.4,4.2,24535


In [5]:
train.isnull().sum()

date               0
precipitation    678
temp_mean          0
temp_highest       0
temp_lowest        0
PM10              67
PM2.5             68
humidity           0
sunshine_sum       5
sunshine_rate      0
wind_mean          0
wind_max           0
rental             0
dtype: int64

강수량 데이터가 nan 값인 경우 강수량이 0 인 데이터로 예상됨 

기상청 포털, 서울 데이터 광장을 통해 nan 데이터를 불러올 수 있을 듯 

# 데이터 전처리

In [6]:
#preprocess train data
#extract year/month/day 
year = []
month = []
day = [] 

for date in train["date"] :
    y = date.split('-')[0]
    m = date.split('-')[1]
    d = date.split('-')[2]
    
    year.append(y)
    month.append(m)
    day.append(d)

train["year"] = year 
train["month"] = month 
train["day"] = day

#fill na 
train["precipitation"] = train["precipitation"].replace(np.nan, 0)
train["PM10"] = train["PM10"].fillna(train["PM10"].mean())
train["PM2.5"] = train["PM2.5"].fillna(train["PM2.5"].mean())
train["sunshine_sum"] = train["sunshine_sum"].fillna(train["sunshine_sum"].mean())


In [28]:
test.head()

Unnamed: 0,date,precipitation,temp_mean,temp_highest,temp_lowest,PM10,PM2.5,humidity,sunshine_sum,sunshine_rate,wind_mean,wind_max,year,month,day
0,2021-01-01,0.0,-4.2,1.6,-9.8,30.0,17.0,64.0,6.5,67.7,2.0,4.1,2021,1,1
1,2021-01-02,0.0,-5.0,-1.4,-8.4,34.0,12.0,38.5,9.0,93.8,2.6,5.4,2021,1,1
2,2021-01-03,0.0,-5.6,-2.0,-9.1,39.0,14.0,45.0,5.5,56.7,2.0,4.5,2021,1,1
3,2021-01-04,0.0,-3.5,0.3,-8.4,40.0,23.0,51.4,4.6,47.4,1.7,3.2,2021,1,1
4,2021-01-05,0.0,-5.5,-2.1,-9.9,30.0,17.0,52.8,8.6,88.7,2.9,5.7,2021,1,1


In [29]:
#processs test data
#extract year/month/day 

year_ =[] 
month_ = []
day_= [] 

for date in test["date"]:
    y_ = date.split("-")[0]
    m_ = date.split("-")[1]
    d_ = date.split("-")[2]

    year_.append(y_)
    month_.append(m_)
    day_.append(m_)

test["year"] = year_
test["month"] = month_
test["day"] = day_ 

test["precipitation"] = test["precipitation"].replace(np.nan, 0)
test["PM10"] = test["PM10"].fillna(train["PM10"].mean())
test["PM2.5"] = test["PM2.5"].fillna(train["PM2.5"].mean())
test["sunshine_sum"] = test["sunshine_sum"].fillna(train["sunshine_sum"].mean())

In [30]:
X = train.drop(["date","rental"],axis=1)
y = train["rental"]

train_x, val_x, train_y, val_y = train_test_split(X,y,test_size=0.25,shuffle=True)

test_x = test.drop(["date"], axis = 1)

# Model

In [31]:
import numpy as np

def NMAE(true, pred):
    score = np.mean(np.abs(true-pred) / true)
    return score

In [27]:
reg = RandomForestRegressor() 
reg.fit(train_x,train_y) 
val_pred = reg.predict(val_x)

NMAE(val_y, val_pred)


0.19613750364248292

# Prediction / Submission 

In [33]:
X

Unnamed: 0,precipitation,temp_mean,temp_highest,temp_lowest,PM10,PM2.5,humidity,sunshine_sum,sunshine_rate,wind_mean,wind_max,year,month,day
0,0.0,-1.3,3.8,-5.1,34.0,17.0,39.1,8.3,86.5,1.4,3.8,2018,01,01
1,0.0,-1.8,1.8,-4.3,36.0,22.0,42.0,7.9,82.3,1.8,4.9,2018,01,02
2,0.0,-4.7,-0.4,-7.1,31.0,19.0,42.3,8.6,88.7,2.2,3.5,2018,01,03
3,0.0,-4.7,-0.7,-8.7,39.0,24.0,43.0,6.2,63.9,1.4,3.5,2018,01,04
4,0.0,-3.0,1.6,-5.6,51.0,35.0,48.4,8.2,84.5,1.7,3.6,2018,01,05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1090,0.0,5.8,10.0,1.4,70.0,42.0,62.9,5.9,61.5,1.8,2.8,2020,12,27
1091,1.3,6.7,11.4,4.2,66.0,44.0,72.1,8.0,83.3,1.4,3.1,2020,12,28
1092,0.2,0.1,4.3,-6.2,69.0,46.0,70.8,0.0,0.0,2.9,6.1,2020,12,29
1093,0.0,-10.9,-6.2,-12.9,39.0,15.0,55.5,8.3,86.5,4.1,6.2,2020,12,30
