# 패키지 로드 

In [1]:
import pandas as pd 
import numpy as np 
from datetime import datetime, date

from sklearn.preprocessing import StandardScaler, MinMaxScaler 

from sklearn.linear_model import LinearRegression 
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor 
from sklearn.svm import SVR 

from sklearn.model_selection import GridSearchCV 
random_state = 42 

import matplotlib as plt 
import seaborn as sns 

from tqdm.auto import tqdm 

import warnings 
warnings.filterwarnings(action='ignore') 

  from .autonotebook import tqdm as notebook_tqdm


# 데이터 로드 

In [2]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')   
test_answer = pd.read_csv('./data/test_answer.csv',thousands=',')

sample_submission = pd.read_csv('./data/sample_submission.csv')

In [3]:
train

Unnamed: 0,date,precipitation,temp_mean,temp_highest,temp_lowest,PM10,PM2.5,humidity,sunshine_sum,sunshine_rate,wind_mean,wind_max,rental
0,2018-01-01,,-1.3,3.8,-5.1,34.0,17.0,39.1,8.3,86.5,1.4,3.8,4950
1,2018-01-02,,-1.8,1.8,-4.3,36.0,22.0,42.0,7.9,82.3,1.8,4.9,7136
2,2018-01-03,,-4.7,-0.4,-7.1,31.0,19.0,42.3,8.6,88.7,2.2,3.5,7156
3,2018-01-04,,-4.7,-0.7,-8.7,39.0,24.0,43.0,6.2,63.9,1.4,3.5,7102
4,2018-01-05,,-3.0,1.6,-5.6,51.0,35.0,48.4,8.2,84.5,1.7,3.6,7705
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1090,2020-12-27,0.0,5.8,10.0,1.4,70.0,42.0,62.9,5.9,61.5,1.8,2.8,37103
1091,2020-12-28,1.3,6.7,11.4,4.2,66.0,44.0,72.1,8.0,83.3,1.4,3.1,46912
1092,2020-12-29,0.2,0.1,4.3,-6.2,69.0,46.0,70.8,0.0,0.0,2.9,6.1,35747
1093,2020-12-30,,-10.9,-6.2,-12.9,39.0,15.0,55.5,8.3,86.5,4.1,6.2,22488


In [4]:
test

Unnamed: 0,date,precipitation,temp_mean,temp_highest,temp_lowest,PM10,PM2.5,humidity,sunshine_sum,sunshine_rate,wind_mean,wind_max
0,2021-01-01,,-4.2,1.6,-9.8,30.0,17.0,64.0,6.5,67.7,2.0,4.1
1,2021-01-02,,-5.0,-1.4,-8.4,34.0,12.0,38.5,9.0,93.8,2.6,5.4
2,2021-01-03,,-5.6,-2.0,-9.1,39.0,14.0,45.0,5.5,56.7,2.0,4.5
3,2021-01-04,0.0,-3.5,0.3,-8.4,40.0,23.0,51.4,4.6,47.4,1.7,3.2
4,2021-01-05,0.0,-5.5,-2.1,-9.9,30.0,17.0,52.8,8.6,88.7,2.9,5.7
...,...,...,...,...,...,...,...,...,...,...,...,...
360,2021-12-27,0.0,-7.6,-3.9,-12.9,33.0,20.0,60.9,3.8,39.6,1.7,3.1
361,2021-12-28,,-4.1,-0.9,-8.5,51.0,38.0,73.8,1.7,17.7,2.2,3.1
362,2021-12-29,0.2,0.4,5.9,-3.8,66.0,49.0,72.9,1.8,18.8,2.6,5.9
363,2021-12-30,0.0,-3.9,0.2,-6.8,30.0,17.0,48.5,7.3,76.0,3.3,6.6


# 데이터 전처리

In [3]:
#fill na 
train["precipitation"] = train["precipitation"].replace(np.nan, 0)
train["PM10"] = train["PM10"].fillna(method="ffill")
train["PM2.5"] = train["PM2.5"].fillna(method="ffill")
train["sunshine_sum"] = train["sunshine_sum"].fillna(method="ffill")

#fill na
test["precipitation"] = test["precipitation"].replace(np.nan, 0)
test["sunshine_sum"] = test["sunshine_sum"].fillna(method="ffill")

# 특성 공학 

In [4]:
def extract_ymd(df):
    year = []
    month = [] 
    day = [] 

    for date in df["date"]:
        y = date.split('-')[0]
        m = date.split('-')[1]
        d = date.split('-')[2]
    
        year.append(int(y))
        month.append(int(m))
        day.append(int(d))

    df["year"] = year 
    df["month"] = month 
    df["day"] = day 

    return df 

train = extract_ymd(train)
test = extract_ymd(test)

In [5]:
def extract_week_day(df): 
    week_day = [] 
    for i in df.date: 
        date = datetime.strptime(i, "%Y-%m-%d")
        day = date.weekday() 
        week_day.append(day) 
        
    df["week_day"] = week_day 

    return df 

train = extract_week_day(train)
test = extract_week_day(test)

In [6]:
#불쾌지수 특성 생성 
def get_discomfort(temp_mean, humidity):
    temp = temp_mean
    humidity = humidity / 100
    
    discomfort = 1.8 * temp - 0.55 * (1 - humidity) * (1.8*temp - 26) + 32
    return discomfort

train['discomfort'] = [0] * len(train)
for i in range(len(train)):
    train.discomfort[i] = get_discomfort(train.humidity[i], train.temp_mean[i])

test['discomfort'] = [0] * len(test)
for i in range(len(test)):
    test.discomfort[i] = get_discomfort(test.humidity[i], test.temp_mean[i])


In [7]:
# 체감온도 특성 생성 
def get_sense_temp(temp_mean, wind_mean): 
    sense_temp = 13.12 + (0.6215 * temp_mean) - (11.37 * wind_mean *0.16) + (0.3965 * wind_mean * temp_mean * 0.16)
    return sense_temp

train['sense_temp'] = [0] * len(train)
for i in range(len(train)):
    train.sense_temp[i] = get_sense_temp(train.temp_mean[i], train.wind_mean[i])

test['sense_temp'] = [0] * len(test)
for i in range(len(test)):
    test.sense_temp[i] = get_sense_temp(test.temp_mean[i], test.wind_mean[i])


In [8]:
# 일교차 특성 생성 
train['temp_diff'] = train['temp_highest'] - train['temp_lowest'] 
test['temp_diff'] = test['temp_highest'] - test['temp_lowest']

In [9]:
# 추위 특성 생성 
train["coldness"] = train["temp_lowest"] / train["wind_mean"]
test["coldness"] = test["temp_lowest"] / test["wind_mean"]

In [10]:
# 정규화
num_features = ['precipitation', 'temp_mean', 'temp_highest', 'temp_lowest',
       'PM10', 'PM2.5', 'humidity', 'sunshine_sum', 'sunshine_rate',
       'wind_mean', 'wind_max','discomfort', 'temp_diff', 'sense_temp', 'coldness']

scaler = MinMaxScaler()
scaler.fit(train[num_features])

train_scaled = train.copy()
train_scaled[num_features] = scaler.fit_transform(train_scaled[num_features])

test_scaled = test.copy()
test_scaled[num_features] = scaler.transform(test_scaled[num_features])

In [11]:
#create final data 
train_x = train.drop(["date", "rental"],axis=1)
train_scaled_x = train_scaled.drop(["date", "rental"],axis=1)
train_y = train["rental"]

test_x = test.drop(["date"],axis=1)
test_scaled_x = test_scaled.drop(["date"],axis=1)
test_y = test_answer["rental"]

In [12]:
train_x.head()

Unnamed: 0,precipitation,temp_mean,temp_highest,temp_lowest,PM10,PM2.5,humidity,sunshine_sum,sunshine_rate,wind_mean,wind_max,year,month,day,week_day,discomfort,sense_temp,temp_diff,coldness
0,0.0,-1.3,3.8,-5.1,34.0,17.0,39.1,8.3,86.5,1.4,3.8,2018,1,1,0,77,9,8.9,-3.642857
1,0.0,-1.8,1.8,-4.3,36.0,22.0,42.0,7.9,82.3,1.8,4.9,2018,1,2,1,79,8,6.1,-2.388889
2,0.0,-4.7,-0.4,-7.1,31.0,19.0,42.3,8.6,88.7,2.2,3.5,2018,1,3,2,79,5,6.7,-3.227273
3,0.0,-4.7,-0.7,-8.7,39.0,24.0,43.0,6.2,63.9,1.4,3.5,2018,1,4,3,79,7,8.0,-6.214286
4,0.0,-3.0,1.6,-5.6,51.0,35.0,48.4,8.2,84.5,1.7,3.6,2018,1,5,4,84,7,7.2,-3.294118


In [13]:
train_scaled_x.head()

Unnamed: 0,precipitation,temp_mean,temp_highest,temp_lowest,PM10,PM2.5,humidity,sunshine_sum,sunshine_rate,wind_mean,wind_max,year,month,day,week_day,discomfort,sense_temp,temp_diff,coldness
0,0.0,0.278351,0.28827,0.264033,0.174699,0.122951,0.268015,0.610294,0.901981,0.190476,0.268293,2018,1,1,0,0.190476,0.342105,0.459627,0.189142
1,0.0,0.268041,0.248509,0.280665,0.186747,0.163934,0.304678,0.580882,0.858186,0.285714,0.402439,2018,1,2,1,0.214286,0.315789,0.285714,0.219887
2,0.0,0.208247,0.204771,0.222453,0.156627,0.139344,0.30847,0.632353,0.924922,0.380952,0.231707,2018,1,3,2,0.214286,0.236842,0.322981,0.199331
3,0.0,0.208247,0.198807,0.189189,0.204819,0.180328,0.31732,0.455882,0.666319,0.190476,0.231707,2018,1,4,3,0.214286,0.289474,0.403727,0.126095
4,0.0,0.243299,0.244533,0.253638,0.277108,0.270492,0.385588,0.602941,0.881126,0.261905,0.243902,2018,1,5,4,0.27381,0.289474,0.354037,0.197692


# 모델링

In [14]:
#metric
def NMAE(true, pred):
    score = np.mean(np.abs(true-pred) / true)
    return score

In [15]:
#trend 반영
reg_2018 = sum(train.loc[train['year'] == 2018, 'rental'].values)
reg_2019 = sum(train.loc[train['year'] == 2019, 'rental'].values)
reg_2020 = sum(train.loc[train['year'] == 2020, 'rental'].values)
reg_2021 = sum(test_answer.rental.values)

print(reg_2019/reg_2018)
print(reg_2020/reg_2019)
print(reg_2021/reg_2020) 

1.8836727252111978
1.2429259327402773
1.3522322697892684


In [16]:
def ensemble_model(models,X,y,test_X,test_y,trend):
    predictions = []
    for model in models: 
        model.fit(X,y)
        pred = model.predict(test_X)
        predictions.append(pred) 
    
    for i in range(0,len(models)): 
        print(type(models[i]).__name__ ,":", NMAE(test_y, predictions[i]))

    for i in range(0,len(models)): 
        ensemble_pred = np.sum(predictions,axis=0) / len(models) 

    ensemble_pred = ensemble_pred * trend
    print("ensemble model :" , NMAE(test_y, ensemble_pred))
    
    return ensemble_pred 
    

In [24]:
models = [] 

xgr = XGBRegressor(random_state=random_state) 
models.append(xgr)

rf = RandomForestRegressor(random_state=random_state) 
models.append(rf)

gbr = GradientBoostingRegressor(random_state=random_state) 
models.append(gbr)

mlp = MLPRegressor(max_iter=5000, random_state=random_state)
models.append(mlp)

ada = AdaBoostRegressor(random_state=random_state)
models.append(ada)

In [25]:
pred = ensemble_model(models, train_x, train_y, test_x, test_y,trend=1.4)

XGBRegressor : 0.3466012899638067
RandomForestRegressor : 0.3395287197242903
GradientBoostingRegressor : 0.3215845191481981
MLPRegressor : 0.4692129204180189
AdaBoostRegressor : 0.39529746412650335
ensemble model : 0.22243329409299756


# Submit

In [19]:
sample_submission["rental"] = pred 
sample_submission

Unnamed: 0,date,rental
0,2021-01-01,27540.126074
1,2021-01-02,24521.086608
2,2021-01-03,21908.356226
3,2021-01-04,29739.230769
4,2021-01-05,26376.153154
...,...,...
360,2021-12-27,35499.620385
361,2021-12-28,35143.087491
362,2021-12-29,55138.521135
363,2021-12-30,39663.327656


In [20]:
sample_submission.to_csv('rental_prediction_v02_04.csv', index=False)