# 패키지 로드 

In [71]:
import pandas as pd 
import numpy as np 

from sklearn.linear_model import LinearRegression 
from sklearn.ensemble import RandomForestRegressor 
from xgboost import XGBRegressor, plot_tree, plot_importance

import warnings 
warnings.filterwarnings(action='ignore')

# 데이터 로드 

In [72]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
test_answer = pd.read_csv('./data/test_answer.csv',thousands=',')

# 데이터 전처리

In [73]:
#train preprocess

#split y/m/d
year = []
month = []
day = [] 

for date in train["date"] :
    y = date.split('-')[0]
    m = date.split('-')[1]
    d = date.split('-')[2]
    
    year.append(int(y))
    month.append(int(m))
    day.append(int(d))

train["year"] = year 
train["month"] = month 
train["day"] = day

#fill na 
train["precipitation"] = train["precipitation"].replace(np.nan, 0)
train["PM10"] = train["PM10"].fillna(method="ffill")
train["PM2.5"] = train["PM2.5"].fillna(method="ffill")
train["sunshine_sum"] = train["sunshine_sum"].fillna(method="ffill")


#test preprocess

#split y/m/d
year_ =[] 
month_ = []
day_= [] 

for date in test["date"]:
    y_ = date.split("-")[0]
    m_ = date.split("-")[1]
    d_ = date.split("-")[2]

    year_.append(int(y_))
    month_.append(int(m_))
    day_.append(int(m_))

test["year"] = year_
test["month"] = month_
test["day"] = day_ 

#fill na
test["precipitation"] = test["precipitation"].replace(np.nan, 0)
test["PM10"] = test["PM10"].fillna(method="ffill")
test["PM2.5"] = test["PM2.5"].fillna(method="ffill")
test["sunshine_sum"] = test["sunshine_sum"].fillna(method="ffill")

In [74]:
train.head() 

Unnamed: 0,date,precipitation,temp_mean,temp_highest,temp_lowest,PM10,PM2.5,humidity,sunshine_sum,sunshine_rate,wind_mean,wind_max,rental,year,month,day
0,2018-01-01,0.0,-1.3,3.8,-5.1,34.0,17.0,39.1,8.3,86.5,1.4,3.8,4950,2018,1,1
1,2018-01-02,0.0,-1.8,1.8,-4.3,36.0,22.0,42.0,7.9,82.3,1.8,4.9,7136,2018,1,2
2,2018-01-03,0.0,-4.7,-0.4,-7.1,31.0,19.0,42.3,8.6,88.7,2.2,3.5,7156,2018,1,3
3,2018-01-04,0.0,-4.7,-0.7,-8.7,39.0,24.0,43.0,6.2,63.9,1.4,3.5,7102,2018,1,4
4,2018-01-05,0.0,-3.0,1.6,-5.6,51.0,35.0,48.4,8.2,84.5,1.7,3.6,7705,2018,1,5


# 특성 공학 

In [75]:
#불쾌지수 특성 생성 

def get_discomfort(temp_mean, humidity):
    temp = temp_mean
    humidity = humidity / 100
    
    discomfort = 1.8 * temp - 0.55 * (1 - humidity) * (1.8*temp - 26) + 32
    return discomfort

train['discomfort'] = [0] * len(train)
for i in range(len(train)):
    train.discomfort[i] = get_discomfort(train.humidity[i], 
                                            train.temp_mean[i])

test['discomfort'] = [0] * len(test)
for i in range(len(test)):
    test.discomfort[i] = get_discomfort(test.humidity[i], 
                                           test.temp_mean[i])

In [76]:
# 일교차 특성 생성 

train['temp_diff'] = train['temp_highest'] - train['temp_lowest'] 
test['temp_diff'] = test['temp_highest'] - test['temp_lowest']

In [77]:
# 채감온도 특성 생성 
def get_sense_temp(temp_mean, wind_mean): 
    sense_temp = 13.12 + (0.6215 * temp_mean) - (11.37 * wind_mean *0.16) + (0.3965 * wind_mean * temp_mean * 0.16)

    return sense_temp

train['sense_temp'] = [0] * len(train)
for i in range(len(train)):
    train.discomfort[i] = get_sense_temp(train.temp_mean[i], 
                                            train.wind_mean[i])

test['sense_temp'] = [0] * len(test)
for i in range(len(test)):
    test.discomfort[i] = get_sense_temp(test.temp_mean[i], 
                                            test.wind_mean[i])


In [78]:
# 추위 특성 생성 
train["coldness"] = train["temp_lowest"] / train["wind_mean"]
test["coldness"] = test["temp_lowest"] / test["wind_mean"]


In [91]:
#create final data 
train_x = train.drop(["date", "rental"],axis=1)
train_y = train["rental"]

test_x = test.drop(["date"],axis=1)
test_y = test_answer["rental"]

# 모델링

In [90]:
#metric
def NMAE(true, pred):
    score = np.mean(np.abs(true-pred) / true)
    return score

In [95]:
reg_2018 = sum(train.loc[train['year'] == 2018, 'rental'].values)
reg_2019 = sum(train.loc[train['year'] == 2019, 'rental'].values)
reg_2020 = sum(train.loc[train['year'] == 2020, 'rental'].values)
reg_2021 = sum(test_answer.rental.values)

print(reg_2019/reg_2018)
print(reg_2020/reg_2019)
print(reg_2021/reg_2020)

1.8836727252111978
1.2429259327402773
1.3522322697892684


In [100]:
model = XGBRegressor() 
model.fit(train_x, train_y) 
pred = model.predict(test_x)
pred = pred * 1.35
NMAE(test_y, pred)

0.2706224998825547

In [93]:
train

Unnamed: 0,date,precipitation,temp_mean,temp_highest,temp_lowest,PM10,PM2.5,humidity,sunshine_sum,sunshine_rate,wind_mean,wind_max,rental,year,month,day,discomfort,temp_diff,sense_temp,coldness
0,2018-01-01,0.0,-1.3,3.8,-5.1,34.0,17.0,39.1,8.3,86.5,1.4,3.8,4950,2018,1,1,9,8.9,0,-3.642857
1,2018-01-02,0.0,-1.8,1.8,-4.3,36.0,22.0,42.0,7.9,82.3,1.8,4.9,7136,2018,1,2,8,6.1,0,-2.388889
2,2018-01-03,0.0,-4.7,-0.4,-7.1,31.0,19.0,42.3,8.6,88.7,2.2,3.5,7156,2018,1,3,5,6.7,0,-3.227273
3,2018-01-04,0.0,-4.7,-0.7,-8.7,39.0,24.0,43.0,6.2,63.9,1.4,3.5,7102,2018,1,4,7,8.0,0,-6.214286
4,2018-01-05,0.0,-3.0,1.6,-5.6,51.0,35.0,48.4,8.2,84.5,1.7,3.6,7705,2018,1,5,7,7.2,0,-3.294118
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1090,2020-12-27,0.0,5.8,10.0,1.4,70.0,42.0,62.9,5.9,61.5,1.8,2.8,37103,2020,12,27,14,8.6,0,0.777778
1091,2020-12-28,1.3,6.7,11.4,4.2,66.0,44.0,72.1,8.0,83.3,1.4,3.1,46912,2020,12,28,15,7.2,0,3.000000
1092,2020-12-29,0.2,0.1,4.3,-6.2,69.0,46.0,70.8,0.0,0.0,2.9,6.1,35747,2020,12,29,7,10.5,0,-2.137931
1093,2020-12-30,0.0,-10.9,-6.2,-12.9,39.0,15.0,55.5,8.3,86.5,4.1,6.2,22488,2020,12,30,-3,6.7,0,-3.146341
