# 패키지 로드 

In [1]:
import pandas as pd 
import numpy as np 
from datetime import datetime, date

from sklearn.linear_model import LinearRegression 
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor  
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor 
from sklearn.model_selection import GridSearchCV 
random_state = 42 

import matplotlib as plt 
import seaborn as sns 

from tqdm.auto import tqdm 

import warnings 
warnings.filterwarnings(action='ignore') 


  from .autonotebook import tqdm as notebook_tqdm


# 데이터 로드 

In [2]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')   
test_answer = pd.read_csv('./data/test_answer.csv',thousands=',')

# 데이터 전처리

In [3]:
#split y/m/d
year = []
month = []
day = [] 

for date in train["date"] :
    y = date.split('-')[0]
    m = date.split('-')[1]
    d = date.split('-')[2]
    
    year.append(int(y))
    month.append(int(m))
    day.append(int(d))

train["year"] = year 
train["month"] = month 
train["day"] = day

#test
year_ =[] 
month_ = []
day_= [] 

for date in test["date"]:
    y_ = date.split("-")[0]
    m_ = date.split("-")[1]
    d_ = date.split("-")[2]

    year_.append(int(y_))
    month_.append(int(m_))
    day_.append(int(m_))

test["year"] = year_
test["month"] = month_
test["day"] = day_ 

In [4]:
#fill na 
train["precipitation"] = train["precipitation"].replace(np.nan, 0)
train["PM10"] = train["PM10"].fillna(method="ffill")
train["PM2.5"] = train["PM2.5"].fillna(method="ffill")
train["sunshine_sum"] = train["sunshine_sum"].fillna(method="ffill")

#fill na
test["precipitation"] = test["precipitation"].replace(np.nan, 0)
test["sunshine_sum"] = test["sunshine_sum"].fillna(method="ffill")

# 특성 공학 

In [5]:
# 날짜로 부터 요일 추출 
train_week_day = [] 

for i in train.date:
    date = datetime.strptime(i, "%Y-%m-%d")
    day = date.weekday() 
    train_week_day.append(day) 

train["week_day"] = train_week_day 

test_week_day = [] 

for i in test.date:
    date = datetime.strptime(i, "%Y-%m-%d")
    day = date.weekday() 
    test_week_day.append(day) 

test["week_day"] = test_week_day 

In [6]:
#불쾌지수 특성 생성 
def get_discomfort(temp_mean, humidity):
    temp = temp_mean
    humidity = humidity / 100
    
    discomfort = 1.8 * temp - 0.55 * (1 - humidity) * (1.8*temp - 26) + 32
    return discomfort

train['discomfort'] = [0] * len(train)
for i in range(len(train)):
    train.discomfort[i] = get_discomfort(train.humidity[i], 
                                            train.temp_mean[i])

test['discomfort'] = [0] * len(test)
for i in range(len(test)):
    test.discomfort[i] = get_discomfort(test.humidity[i], 
                                           test.temp_mean[i])



In [7]:
# 일교차 특성 생성 

train['temp_diff'] = train['temp_highest'] - train['temp_lowest'] 
test['temp_diff'] = test['temp_highest'] - test['temp_lowest']

In [8]:
'''
# 채감온도 특성 생성 
def get_sense_temp(temp_mean, wind_mean): 
    sense_temp = 13.12 + (0.6215 * temp_mean) - (11.37 * wind_mean *0.16) + (0.3965 * wind_mean * temp_mean * 0.16)
    return sense_temp

train['sense_temp'] = [0] * len(train)
for i in range(len(train)):
    train.discomfort[i] = get_sense_temp(train.temp_mean[i], 
                                            train.wind_mean[i])

test['sense_temp'] = [0] * len(test)
for i in range(len(test)):
    test.discomfort[i] = get_sense_temp(test.temp_mean[i], 
                                            test.wind_mean[i])
'''

"\n# 채감온도 특성 생성 \ndef get_sense_temp(temp_mean, wind_mean): \n    sense_temp = 13.12 + (0.6215 * temp_mean) - (11.37 * wind_mean *0.16) + (0.3965 * wind_mean * temp_mean * 0.16)\n    return sense_temp\n\ntrain['sense_temp'] = [0] * len(train)\nfor i in range(len(train)):\n    train.discomfort[i] = get_sense_temp(train.temp_mean[i], \n                                            train.wind_mean[i])\n\ntest['sense_temp'] = [0] * len(test)\nfor i in range(len(test)):\n    test.discomfort[i] = get_sense_temp(test.temp_mean[i], \n                                            test.wind_mean[i])\n"

In [9]:
# 추위 특성 생성 
train["coldness"] = train["temp_lowest"] / train["wind_mean"]
test["coldness"] = test["temp_lowest"] / test["wind_mean"]

In [10]:
#create final data 
train_x = train.drop(["date", "rental"],axis=1)
train_y = train["rental"]

test_x = test.drop(["date"],axis=1)
test_y = test_answer["rental"]

# 모델링

In [11]:
#metric
def NMAE(true, pred):
    score = np.mean(np.abs(true-pred) / true)
    return score

In [12]:
#trend 반영
reg_2018 = sum(train.loc[train['year'] == 2018, 'rental'].values)
reg_2019 = sum(train.loc[train['year'] == 2019, 'rental'].values)
reg_2020 = sum(train.loc[train['year'] == 2020, 'rental'].values)
reg_2021 = sum(test_answer.rental.values)

print(reg_2019/reg_2018)
print(reg_2020/reg_2019)
print(reg_2021/reg_2020) 

1.8836727252111978
1.2429259327402773
1.3522322697892684


In [13]:
xgr = XGBRegressor(random_state=random_state) 
xgr.fit(train_x, train_y)

rf = RandomForestRegressor(random_state=random_state) 
rf.fit(train_x,train_y)

gbr = GradientBoostingRegressor(random_state=random_state) 
gbr.fit(train_x,train_y)

mlp = MLPRegressor(max_iter=5000, random_state=random_state)
mlp.fit(train_x, train_y)

xgr_pred = xgr.predict(test_x)
rf_pred = rf.predict(test_x) 
gbr_pred = gbr.predict(test_x)
mlp_pred = mlp.predict(test_x)

In [16]:
ensemble_pred = (xgr_pred + rf_pred + gbr_pred + mlp_pred) / 4
ensemble_pred = ensemble_pred * 1.4 #trend 반영 

print("xgr : ", NMAE(test_y, xgr_pred))
print("rf : ",NMAE(test_y, rf_pred))
print("gbr : ",NMAE(test_y, gbr_pred))
print("mlp : ",NMAE(test_y, mlp_pred))
print("ensmble : ",NMAE(test_y, ensemble_pred)) 

xgr :  0.3461155474123027
rf :  0.3440714858439987
gbr :  0.3233371410136635
mlp :  0.46216328806204676
ensmble :  0.20372571314752802


# Submit

In [14]:
sample_submission = pd.read_csv('./data/sample_submission.csv')
sample_submission["rental"] = ensemble_pred 

sample_submission

Unnamed: 0,date,rental
0,2021-01-01,26022.024720
1,2021-01-02,23951.872843
2,2021-01-03,20834.716331
3,2021-01-04,27495.829266
4,2021-01-05,25529.724990
...,...,...
360,2021-12-27,38091.791305
361,2021-12-28,36535.326617
362,2021-12-29,57381.676302
363,2021-12-30,41214.028738


In [15]:
sample_submission.to_csv('rental_prediction_v02_02.csv', index=False)

# Ensemble Model 
with girdsearchcv optimization 

In [14]:
def gridSearchCV(models,params, x, y): 
    best_models =[] 
    for i in tqdm(range(0,len(models))):
        model_gird = GridSearchCV(models[i], params[i], n_jobs = 4, cv=5)
        model_gird.fit(x,y)
        best_models.append(model_gird.best_estimator_)
    
    return best_models

In [15]:
models = []

xgr = XGBRegressor()
models.append(xgr)

rf = RandomForestRegressor()
models.append(rf)

gbr = GradientBoostingRegressor() 
models.append(gbr)

mlp = MLPRegressor() 
models.append(mlp)

params = []

params_xgr = { 
    'max_depth' : [3,4,5,6,7,8,9,10]
}
params.append(params_xgr)

params_rf = {
    'n_estimators' : [70,80,90,100,110,120,130],
    'min_samples_split': [1,2,3,4,5,6,7,8]
}
params.append(params_rf)

params_gbr = {
    'learning_rate' : [0.07, 0.08, 0.09, 0.1, 1.1, 1.2],
    'n_estimators' : [70,80,90,100,110,120,130]
}
params.append(params_gbr)

params_mlp ={ 
    'max_iter' : [500,1000,1500,2000,2500,3000]
}
params.append(params_mlp)


In [16]:
best_models = gridSearchCV(models,params,train_x,train_y)

100%|██████████| 4/4 [02:04<00:00, 31.03s/it]


In [17]:
for i in range(0, len(models)):
    print(best_models[i])

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=3, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=32,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)
RandomForestRegressor(min_samples_split=5, n_estimators=130)
GradientBoostingRegressor(learning_rate=0.09, n_estimators=130)
MLPRegressor(max_iter=2500)


In [18]:
for model in best_models:
    model.fit(train_x, train_y)
    
predictions = [] 
for model in best_models:
    pred = model.predict(test_x)
    predictions.append(pred)

ensemble_pred = (predictions[0] + predictions[1] + predictions[2] + predictions[3]) / 4

print("xgr : ", NMAE(test_y, predictions[0]))
print("rf : ",NMAE(test_y, predictions[1]))
print("gbr : ",NMAE(test_y, predictions[2]))
print("mlp : ",NMAE(test_y, predictions[3]))
print("ensmble : ",NMAE(test_y, ensemble_pred))


xgr :  0.32619025055284
rf :  0.34570357965447013
gbr :  0.3255613228215589
mlp :  0.47204208501424866
ensmble :  0.33486932743456194
