## Goal: 각 LGBM 모델별로 Early stopping의 효과 확인

---

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns
import math
from lightgbm import LGBMRegressor, plot_importance
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
from sklearn.metrics import mean_squared_error # squared=True 기본
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import joblib
from tqdm import tqdm
import shutil

In [3]:
!pip install --trusted-host pypi.org --trusted-host files.pythonhosted.org tqdm

Collecting tqdm
  Downloading tqdm-4.56.0-py2.py3-none-any.whl (72 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.56.0


---

### train data 만들기

In [192]:
train = pd.read_csv('train/train_new.csv')
print(train.shape)
train.head(2)

(52560, 20)


Unnamed: 0,Day,Hour,Minute,DHI,DNI,WS,RH,T,TARGET,GHI,dew,sunny,sunrise,sunset,date,cloudy,temp_diff,delta,zenith,azi
0,0,0,0,0,0,1.5,69.08,-12,0.0,0,-4.035806,8.5,8.0,16.5,0,0.0,9,-23.169132,167.169132,0.0
1,0,0,30,0,0,1.5,69.06,-12,0.0,0,-4.03967,8.5,8.0,16.5,0,0.0,9,-23.169132,167.169132,0.0


In [193]:
# date를 뺀 train dataset: date는 단순히 delta, zenith, azi feature를 생성하기 위한 것이였음
no_date = train.drop(['date'],axis=1)
print(no_date.shape)
no_date.head(1)

(52560, 19)


Unnamed: 0,Day,Hour,Minute,DHI,DNI,WS,RH,T,TARGET,GHI,dew,sunny,sunrise,sunset,cloudy,temp_diff,delta,zenith,azi
0,0,0,0,0,0,1.5,69.08,-12,0.0,0,-4.035806,8.5,8.0,16.5,0.0,9,-23.169132,167.169132,0.0


In [194]:
no_date.tail(1)

Unnamed: 0,Day,Hour,Minute,DHI,DNI,WS,RH,T,TARGET,GHI,dew,sunny,sunrise,sunset,cloudy,temp_diff,delta,zenith,azi
52559,1094,23,30,0,0,2.1,67.7,-4,0.0,0,-4.304766,8.5,8.0,16.5,0.0,6,-23.227986,161.791433,310.43504


In [195]:
# for col in list(no_date.columns)[3:]:
    
train_df = pd.DataFrame()
for i in tqdm(range(1087)):
    days_df = pd.DataFrame() 
    # x_columns
    for day in range(7):
        if day == 0:
            day_df = no_date[no_date.Day == (i+day)].drop(['Day'],axis=1).reset_index(drop=True) 
            cols = ['Hour','Minute'] + ['{}_{}'.format(col,day) for col in list(no_date.columns)[3:]]
        else:
            day_df = no_date[no_date.Day == (i+day)].drop(['Day','Hour','Minute'],axis=1).reset_index(drop=True) 
            cols = ['{}_{}'.format(col,day) for col in list(no_date.columns)[3:]]
        day_df.columns = cols
        days_df = pd.concat([days_df,day_df],axis=1)
    # y_columns
    target1 = no_date[no_date.Day == (i+7)][['TARGET']].rename(columns={'TARGET':'target1'}).reset_index(drop=True)
    target2 = no_date[no_date.Day == (i+8)][['TARGET']].rename(columns={'TARGET':'target2'}).reset_index(drop=True)
    # train_df
    days_df = pd.concat([days_df,target1,target2],axis=1)
    train_df = pd.concat([train_df,days_df],axis=0).reset_index(drop=True)
train_df.head(2)

100%|██████████████████████████████████████████████████████████████████████████████| 1087/1087 [01:32<00:00, 11.71it/s]


Unnamed: 0,Hour,Minute,DHI_0,DNI_0,WS_0,RH_0,T_0,TARGET_0,GHI_0,dew_0,...,sunny_6,sunrise_6,sunset_6,cloudy_6,temp_diff_6,delta_6,zenith_6,azi_6,target1,target2
0,0,0,0,0,1.5,69.08,-12,0.0,0,-4.035806,...,8.5,8.0,16.5,0.0,9,-22.672581,166.672581,0.0,0.0,0.0
1,0,30,0,0,1.5,69.06,-12,0.0,0,-4.03967,...,8.5,8.0,16.5,0.0,9,-22.672581,166.672581,0.0,0.0,0.0


In [196]:
train_df.to_csv('train_refined/new0.csv', index=False)

(52176, 116)

---

### test data 만들기 (LGBM ver.)

#### 새로운 컬럼들 추가
- 1. GHI (= DHI + DNI)
- 2. dewpoint (이슬점)
- 3. sunny (해가 총 떠있는 시간)
- 4. sunrise
- 5. sunset
- 6. cloudy (DHI/DNI+0.001)
- 7. temp_diff (일교차)
- 8. delta (적위) & zenith (Zenith Angle: 천정각)
- 9. azi (Azimuth: 방위각)

#### 1. GHI (= DHI + DNI)

In [8]:
# test['GHI'] = test['DHI'] + test['DNI']
def GHI(data):
    return data['DHI'] + data['DNI']

#### 2. dew (이슬점)

In [9]:
def dewpoint(data):
    rh = data['RH']
    t = data['T']
    gamma = math.log(rh/100) + (17.62*t)/(243.12*t+0.001)
    dew = (243.12*gamma)/(17.62-gamma)
    return dew

In [10]:
def dew(data):
    return [dewpoint(data.iloc[i]) for i in range(len(data.index))]

#### 3. sunny (해가 총 떠있는 시간) & 4. sunrise (일출) & 5. sunset(일몰)

In [11]:
# Hour & Minute 모두 비교
def time_minute(data):
    hour = data['Hour']
    if data['Minute'] == 0:
        minute = 0
    else: # ==30
        minute = 0.5
    return hour + minute

In [12]:
def suns(data,col='sunny'):
    ghi = data.iloc[np.where(data['DHI'] + data['DNI'] > 0)[0],:].reset_index(drop=True)
    ghi['time_float'] = [time_minute(ghi.iloc[i]) for i in range(len(ghi.index))]
    ghi_min_hour = pd.DataFrame(ghi.groupby(['Day'])['time_float'].min()).reset_index()
    ghi_max_hour = pd.DataFrame(ghi.groupby(['Day'])['time_float'].max()).reset_index()
    
    if col == 'sunny':
        ghi_minus_hour = ghi_max_hour.copy()
        ghi_minus_hour['sunny'] = ghi_max_hour['time_float'] - ghi_min_hour['time_float']
        result = pd.merge(data, ghi_minus_hour[['Day','sunny']], on='Day')
        return result['sunny']
    elif col == 'sunrise':
        ghi_min_hour.columns = ['Day', 'sunrise']
        result = pd.merge(data, ghi_min_hour, on='Day')
        return result['sunrise']
    elif col == 'sunset':
        ghi_max_hour.columns = ['Day', 'sunset']
        result = pd.merge(data, ghi_max_hour, on='Day')
        return result['sunset']
    else:
        raise Exception('wrong input')

#### 6. cloudy (DHI/DNI+0.001)

In [13]:
def cloudy(data):
    return data['DHI']/(data['DNI']+0.001)

#### 7. temp_diff (일교차)

In [14]:
def temp_diff(data):
    max_temp = pd.DataFrame(data.groupby(['Day'])['T'].max()).reset_index()
    min_temp = pd.DataFrame(data.groupby(['Day'])['T'].min()).reset_index()
    diff = min_temp.copy() ; diff=diff.drop(['T'],axis=1,)
    diff['temp_diff'] = max_temp['T'] - min_temp['T']
    result = pd.merge(data,diff,on='Day')
    return result['temp_diff']

#### 8. delta & zenith (delta: 적위 & Zenith Angle: 천정각)

In [15]:
def delta(data):
    del_model = joblib.load('model/test_delta.pkl')
    predicted_del = del_model.predict(data.drop(['Day'],axis=1))
    return predicted_del  

In [16]:
def HRA(data):
    hour = data['Hour']
    return 15*(hour-12)

def elevation(data): # -90 ~ 90
    de = math.radians(data['delta'])
    lat = math.radians(36) # 위도는 남한 반도의 중간인 36으로 설정
    hra = math.radians(HRA(data))
    return math.degrees(math.asin(math.sin(de)*math.sin(lat) + math.cos(de)*math.cos(lat)*math.cos(hra)))

def zen_angle(data):
    angle = 90 - elevation(data)
    return angle 

def zenith(data):
    return [zen_angle(data.iloc[i]) for i in range(len(data.index))]

#### 9. azi (Azimuth: 방위각)

In [17]:
def azimuth(data):
    hra = math.radians(HRA(data))
    de = math.radians(data['delta'])
    lat = math.radians(36)
    ele = math.radians(elevation(data))
    
    top = round(math.sin(de)*math.cos(lat) - math.cos(de)*math.sin(lat)*math.cos(hra),5)
    bottom = round(math.cos(ele),5)
    azi = math.degrees(math.acos(top/bottom))
    if hra > 0:
        return 360 - azi
    else:
        return azi

In [18]:
def azi(data):
    return [azimuth(data.iloc[i]) for i in range(len(data.index))]

---

In [157]:
def test_preprocess1(df):
    data = df.copy()
    data['GHI'] = GHI(data)
    data['dew'] = dew(data)
    data['sunny'] = suns(data,col='sunny')
    data['sunrise'] = suns(data,col='sunrise')
    data['sunset'] = suns(data,col='sunset')
    data['cloudy'] = cloudy(data)
    data['temp_diff'] = temp_diff(data)
    data['delta'] = delta(data)
    return data

def test_preprocess2(df):
    data = df.copy()
    data['zenith'] = zenith(data)
    data['azi'] = azi(data)
    return data


---

### Train 시키기 

In [152]:
def make_id(i):
    id_ls = []
    for day in [7,8]:
        for hour in range(0,24):
            for minute in ['00','30']:
                id_ls.append('{}.csv_Day{}_{}h{}m'.format(i,day,hour,minute))
    return id_ls

def test_predict(i,df,date):
    result = pd.DataFrame({'id':make_id(i)})
    model1_pred = pd.DataFrame()
    model2_pred = pd.DataFrame()
    
    quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    for q in quantiles:
        model1 = joblib.load('model/{}/model1/q{}'.format(date, str(q).replace('.','')))
        model2 = joblib.load('model/{}/model2/q{}'.format(date, str(q).replace('.','')))
        pred1 = pd.Series(model1.predict(df).round(6))
        pred2 = pd.Series(model2.predict(df).round(6))
        model1_pred = pd.concat([model1_pred,pred1],axis=1)
        model2_pred = pd.concat([model2_pred,pred2],axis=1)
    predict_df = pd.concat([model1_pred,model2_pred],axis=0).reset_index(drop=True)
    
    result = pd.concat([result,predict_df],axis=1)
    result.columns = ['id','q_0.1','q_0.2','q_0.3','q_0.4','q_0.5','q_0.6','q_0.7','q_0.8','q_0.9']
        
    return result

In [62]:
def train_real(date, train_x1, valid_x1, train_y1, valid_y1, train_x2, valid_x2, train_y2, valid_y2):
    quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    
    if not os.path.isdir('model/{}'.format(date)):
        os.makedirs('model/{}/model1'.format(date), exist_ok=True)
        os.makedirs('model/{}/model2'.format(date), exist_ok=True)
    
    for q in tqdm(quantiles):
        model1, error1 = LGBM(q, train_x1, train_y1, valid_x1, valid_y1)
        joblib.dump(model1,'model/{}/model1/q{}'.format(date,str(q).replace('.','')))

        model2, error2 = LGBM(q, train_x2, train_y2, valid_x2, valid_y2)
        joblib.dump(model2,'model/{}/model2/q{}'.format(date, str(q).replace('.','')))

In [63]:
def train_quantiles(train_x1, valid_x1, train_y1, valid_y1, train_x2, valid_x2, train_y2, valid_y2):
    error_dict1 = {} ; error_dict2 = {}
    quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    
    for q in tqdm(quantiles):
        model1, error1 = LGBM(q, train_x1, train_y1, valid_x1, valid_y1)
        error_dict1['q{}'.format(str(q).replace('.',''))] = error1

        model2, error2 = LGBM(q, train_x2, train_y2, valid_x2, valid_y2)
        error_dict2['q{}'.format(str(q).replace('.',''))] = error2
    return error_dict1, error_dict2

---

**0109_2.csv 버전의 parameter setting**: 0112까지 최고 점수
- **validation result:** 1.5159237110248014 / 1.506349717600947 | 1.4957201456318443 1.5023286354497691
- **real result:** 1.9097930439 - **사실상 얘도 model1로만 Day7, Day8로 예측한 잘못된 결과임**
- **제대로 된 result:** 1.8761783965 - **제대로 model을 적용한 결과**

In [22]:
train_x1, valid_x1, train_y1, valid_y1 = train_test_split(train_df.iloc[:,:-2], train_df.iloc[:,-2], test_size=0.2, random_state=42)
train_x2, valid_x2, train_y2, valid_y2 = train_test_split(train_df.iloc[:,:-2], train_df.iloc[:,-1], test_size=0.2, random_state=42)

In [23]:
def LGBM(q, train_x, train_y, valid_x, valid_y):
    model = LGBMRegressor(objective='quantile', alpha=q,
                         n_estimators=10000, bagging_fraction=0.7, learning_rate=0.027, subsample=0.7)
    model.fit(train_x, train_y, eval_metric = ['quantile'], 
          eval_set=[(valid_x, valid_y)], early_stopping_rounds=300, verbose=False)
    min_error = min(model.evals_result_['valid_0']['quantile'])
    return model, min_error

In [24]:
model1_errors, model2_errors = train_quantiles(train_x1, valid_x1, train_y1, valid_y1, train_x2, valid_x2, train_y2, valid_y2)
print(np.mean(list(model1_errors.values())), np.mean(list(model2_errors.values())))

  0%|                                                                                            | 0/9 [00:00<?, ?it/s]



100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [07:48<00:00, 52.02s/it]

1.5159237110248014 1.506349717600947





In [66]:
def LGBM(q, train_x, train_y, valid_x, valid_y):
    model = LGBMRegressor(objective='quantile', alpha=q,
                         n_estimators=10000, bagging_fraction=0.7, learning_rate=0.027, subsample=0.7)
    model.fit(train_x, train_y, eval_metric = ['quantile'], 
          eval_set=[(valid_x, valid_y)], early_stopping_rounds=300, verbose=False)
    min_error = min(model.evals_result_['valid_0']['quantile'])
    iter_num = len(model.evals_result_['valid_0']['quantile']) - 300 # early_stopping_rounds=300
    return model, min_error, iter_num

In [71]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)

error_dict1 = {} ; error_dict2 = {} ; num_dict1 = {} ; num_dict2 = {}
for i, (train_index, valid_index) in tqdm(enumerate(cv.split(train_df)), total=5):
    train_x, valid_x = train_df.iloc[train_index,:-2],train_df.iloc[valid_index,:-2]
    train_y1, valid_y1 = train_df.iloc[train_index,-2], train_df.iloc[valid_index,-2] # model1
    train_y2, valid_y2 = train_df.iloc[train_index,-1], train_df.iloc[valid_index,-1] # model2
    
    quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    for q in quantiles:
        model1, error1, num1 = LGBM(q, train_x, train_y1, valid_x, valid_y1)
        model2, error2, num2 = LGBM(q, train_x, train_y2, valid_x, valid_y2)

        if i == 0:
            error_dict1['q{}'.format(str(q).replace('.',''))] = [error1]
            error_dict2['q{}'.format(str(q).replace('.',''))] = [error2]
            num_dict1['q{}'.format(str(q).replace('.',''))] = [num1]
            num_dict2['q{}'.format(str(q).replace('.',''))] = [num2]
        else:
            error_dict1['q{}'.format(str(q).replace('.',''))].append(error1)
            error_dict2['q{}'.format(str(q).replace('.',''))].append(error2)    
            num_dict1['q{}'.format(str(q).replace('.',''))].append(num1)
            num_dict2['q{}'.format(str(q).replace('.',''))].append(num2)

100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [36:09<00:00, 433.93s/it]


In [73]:
print(np.mean(list(error_dict1.values())), np.mean(list(error_dict2.values())), np.mean(list(num_dict1.values())), np.mean(list(num_dict2.values())))

1.4957201456318443 1.5023286354497691 8289.022222222222 8488.577777777778


In [81]:
num_dict1, num_dict2

({'q01': [3026, 6498, 3296, 7499, 5814],
  'q02': [6514, 8410, 8477, 9700, 9700],
  'q03': [9700, 9700, 9700, 9700, 9700],
  'q04': [9700, 9700, 9700, 9700, 9700],
  'q05': [9700, 9700, 9700, 9700, 9700],
  'q06': [9700, 9700, 9700, 9700, 9700],
  'q07': [8605, 9700, 9700, 9700, 9700],
  'q08': [9700, 9700, 7602, 4660, 9700],
  'q09': [4960, 4965, 4516, 3886, 2978]},
 {'q01': [5816, 9700, 9449, 7251, 3018],
  'q02': [8222, 9700, 7748, 7508, 8618],
  'q03': [9700, 9698, 9700, 9700, 9700],
  'q04': [9700, 9700, 9700, 9700, 9700],
  'q05': [9700, 9700, 9700, 9700, 9700],
  'q06': [9700, 8291, 9700, 9700, 9700],
  'q07': [9700, 9700, 9700, 9700, 9700],
  'q08': [9700, 4620, 9700, 5541, 9700],
  'q09': [5235, 3798, 4659, 6969, 3945]})

---

**parameter tuning ver(4)** : max_depth: 21 - Cross validation에서는 결과가 좋았으나 실제 값은 좋지 않았던 parameter setting
- **validation result:** 1.481943053800047 / 1.4627980519984785 | 1.4608987400320006 / 1.4533393522667164
- **real result:** 1.9627326801
- 아마도 너무 20%의 validation data에 과적합이 되어버린 듯 → **실제 test 데이터와 20%의 validation 데이터의 분포가 달랐던 듯**
    - 아마 이럴 경우 random_state에 따라 값이 확확 바뀔 수 있음. 이러면 나머지 test data에 대해서는 제대로 예측하지 못 할 것
- 아니면 scaling이 test 부분에서 잘못 적용이 되었나?
    - train/test 데이터를 분석해본 결과 **데이터의 range가 다른 경우**가 많음 → **train의 scaling이 test에 맞지 않음**
    - scaling을 하지 않는 방향으로 모델링 진행하자

In [25]:
train_x1, valid_x1, train_y1, valid_y1 = train_test_split(train_df.iloc[:,:-2], train_df.iloc[:,-2], test_size=0.2, random_state=42)
train_x2, valid_x2, train_y2, valid_y2 = train_test_split(train_df.iloc[:,:-2], train_df.iloc[:,-1], test_size=0.2, random_state=42)
# scaler 적용
scaler = StandardScaler()
scaler.fit(train_x1) # train_x1 == train_x2
scaled_train_x = scaler.transform(train_x1)
scaled_valid_x1 = scaler.transform(valid_x1)
scaled_valid_x2 = scaler.transform(valid_x2) 

In [26]:
def LGBM(q, train_x, train_y, valid_x, valid_y):
    model = LGBMRegressor(objective='quantile', alpha=q,
                         n_estimators=10000, bagging_fraction=0.9, learning_rate=0.027, 
                         max_depth=21, colsample_bytree=0.9,
                         num_leaves= 170, min_data_in_leaf = 70)
    model.fit(train_x, train_y, eval_metric = ['quantile'], eval_set=[(valid_x, valid_y)], verbose=False)
    min_error = min(model.evals_result_['valid_0']['quantile'])
    return model, min_error

In [28]:
model1_errors, model2_errors = train_quantiles(scaled_train_x, scaled_valid_x1, train_y1, valid_y1, scaled_train_x, scaled_valid_x2, train_y2, valid_y2)
print(np.mean(list(model1_errors.values())), np.mean(list(model2_errors.values())))

100%|███████████████████████████████████████████████████████████████████████████████████| 9/9 [23:19<00:00, 155.46s/it]

1.481943053800047 1.4627980519984785





In [82]:
def LGBM(q, train_x, train_y, valid_x, valid_y):
    model = LGBMRegressor(objective='quantile', alpha=q,
                         n_estimators=10000, bagging_fraction=0.9, learning_rate=0.027, 
                         max_depth=21, colsample_bytree=0.9,
                         num_leaves= 170, min_data_in_leaf = 70)
    model.fit(train_x, train_y, eval_metric = ['quantile'], eval_set=[(valid_x, valid_y)], verbose=False)
    min_error = min(model.evals_result_['valid_0']['quantile'])
    iter_num = len(model.evals_result_['valid_0']['quantile']) - 300 # early_stopping_rounds=300
    return model, min_error, iter_num

In [83]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)

error_dict1 = {} ; error_dict2 = {} ; num_dict1 = {} ; num_dict2 = {}
for i, (train_index, valid_index) in tqdm(enumerate(cv.split(train_df)), total=5):
    train_x, valid_x = train_df.iloc[train_index,:-2],train_df.iloc[valid_index,:-2]
    train_y1, valid_y1 = train_df.iloc[train_index,-2], train_df.iloc[valid_index,-2] # model1
    train_y2, valid_y2 = train_df.iloc[train_index,-1], train_df.iloc[valid_index,-1] # model2
    
    scaler = StandardScaler()
    scaler.fit(train_x) 
    scaled_train_x = scaler.transform(train_x)
    scaled_valid_x = scaler.transform(valid_x)
    
    quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    for q in quantiles:
        model1, error1, num1 = LGBM(q, scaled_train_x, train_y1, scaled_valid_x, valid_y1)
        model2, error2, num2 = LGBM(q, scaled_train_x, train_y2, scaled_valid_x, valid_y2)

        if i == 0:
            error_dict1['q{}'.format(str(q).replace('.',''))] = [error1]
            error_dict2['q{}'.format(str(q).replace('.',''))] = [error2]
            num_dict1['q{}'.format(str(q).replace('.',''))] = [num1]
            num_dict2['q{}'.format(str(q).replace('.',''))] = [num2]
        else:
            error_dict1['q{}'.format(str(q).replace('.',''))].append(error1)
            error_dict2['q{}'.format(str(q).replace('.',''))].append(error2)    
            num_dict1['q{}'.format(str(q).replace('.',''))].append(num1)
            num_dict2['q{}'.format(str(q).replace('.',''))].append(num2)

100%|████████████████████████████████████████████████████████████████████████████████| 5/5 [2:00:27<00:00, 1445.57s/it]


In [84]:
print(np.mean(list(error_dict1.values())), np.mean(list(error_dict2.values())), np.mean(list(num_dict1.values())), np.mean(list(num_dict2.values())))

1.4608987400320006 1.4533393522667164 9700.0 9700.0


---

### 새로운 데이터로 Train (train_refined/new1.csv)

**0109_2.csv 버전의 parameter setting**
- **cross validation result (dew X, sunny X):** 1.5269704395358443 1.5355730494647777
- **cross validation result (dew O, sunny X):** 1.5144389202211872 1.5215790291380176
- **cross validation result (dew O, sunny O):** 1.5117458767635403 1.516240773841569 
    - model1: [3527.4, 5438.6, 9176.2, 10000. , 10000. , 9566.8, 9162.6, 8743.4, 4154. ]
    - model2: [5371.8, 7145. , 10000. , 10000. , 10000. , 10000. , 7026.8, 9011.4, 4676. ]
- **cross validation result (dew O, sunny O + RH):** 1.513805356818006 1.5169129223516677
    - model1: [ 4117. ,  6959.6,  9839.6, 10000. , 10000. ,  6950. ,  8801.4, 7687. ,  4825.2]
    - model2: [ 5679.8,  6932.6,  9876.8, 10000. , 10000. ,  7839. ,  9599. , 8580.6,  5192.6]
- **real result:** 

In [221]:
train = pd.read_csv('train/train_new.csv')
print(train.shape)
train.head(2)

(52560, 20)


Unnamed: 0,Day,Hour,Minute,DHI,DNI,WS,RH,T,TARGET,GHI,dew,sunny,sunrise,sunset,date,cloudy,temp_diff,delta,zenith,azi
0,0,0,0,0,0,1.5,69.08,-12,0.0,0,-4.035806,8.5,8.0,16.5,0,0.0,9,-23.169132,167.169132,0.0
1,0,0,30,0,0,1.5,69.06,-12,0.0,0,-4.03967,8.5,8.0,16.5,0,0.0,9,-23.169132,167.169132,0.0


In [222]:
no_date = train.drop(['date'],axis=1)
to_log = ['DHI','DNI','WS','TARGET','GHI','cloudy']
for col in to_log:
    no_date[col] = np.log(no_date[col] + 0.001)
print(no_date.shape)
no_date.head(1)

(52560, 19)


Unnamed: 0,Day,Hour,Minute,DHI,DNI,WS,RH,T,TARGET,GHI,dew,sunny,sunrise,sunset,cloudy,temp_diff,delta,zenith,azi
0,0,0,0,-6.907755,-6.907755,0.406132,69.08,-12,-6.907755,-6.907755,-4.035806,8.5,8.0,16.5,-6.907755,9,-23.169132,167.169132,0.0


In [223]:
train_df = pd.DataFrame()
for i in tqdm(range(1087)):
    days_df = pd.DataFrame() 
    # x_columns
    for day in range(7):
        if day == 0:
            day_df = no_date[no_date.Day == (i+day)].drop(['Day'],axis=1).reset_index(drop=True) 
            cols = ['Hour','Minute'] + ['{}_{}'.format(col,day) for col in list(no_date.columns)[3:]]
        else:
            day_df = no_date[no_date.Day == (i+day)].drop(['Day','Hour','Minute'],axis=1).reset_index(drop=True) 
            cols = ['{}_{}'.format(col,day) for col in list(no_date.columns)[3:]]
        day_df.columns = cols
        days_df = pd.concat([days_df,day_df],axis=1)
    # y_columns
    target1 = no_date[no_date.Day == (i+7)][['TARGET']].rename(columns={'TARGET':'target1'}).reset_index(drop=True)
    target2 = no_date[no_date.Day == (i+8)][['TARGET']].rename(columns={'TARGET':'target2'}).reset_index(drop=True)
    # train_df
    days_df = pd.concat([days_df,target1,target2],axis=1)
    train_df = pd.concat([train_df,days_df],axis=0).reset_index(drop=True)
train_df.head(2)

100%|██████████████████████████████████████████████████████████████████████████████| 1087/1087 [01:21<00:00, 13.34it/s]


Unnamed: 0,Hour,Minute,DHI_0,DNI_0,WS_0,RH_0,T_0,TARGET_0,GHI_0,dew_0,...,sunny_6,sunrise_6,sunset_6,cloudy_6,temp_diff_6,delta_6,zenith_6,azi_6,target1,target2
0,0,0,-6.907755,-6.907755,0.406132,69.08,-12,-6.907755,-6.907755,-4.035806,...,8.5,8.0,16.5,-6.907755,9,-22.672581,166.672581,0.0,-6.907755,-6.907755
1,0,30,-6.907755,-6.907755,0.406132,69.06,-12,-6.907755,-6.907755,-4.03967,...,8.5,8.0,16.5,-6.907755,9,-22.672581,166.672581,0.0,-6.907755,-6.907755


In [122]:
def LGBM(q, train_x, train_y, valid_x, valid_y):
    model = LGBMRegressor(objective='quantile', alpha=q,
                         n_estimators=10000, bagging_fraction=0.7, learning_rate=0.027, subsample=0.7)
    model.fit(train_x, train_y, eval_metric = ['quantile'], 
          eval_set=[(valid_x, valid_y)], early_stopping_rounds=300, verbose=False)
    min_error = min(model.evals_result_['valid_0']['quantile'])
    if len(model.evals_result_['valid_0']['quantile']) == 10000:
        iter_num = 10000
    else:
        iter_num = len(model.evals_result_['valid_0']['quantile']) - 300
    return model, min_error, iter_num

In [198]:
train_df = pd.read_csv('train_refined/new0.csv')
print(train_df.shape)
train_df.head(1)

(52176, 116)


Unnamed: 0,Hour,Minute,DHI_0,DNI_0,WS_0,RH_0,T_0,TARGET_0,GHI_0,dew_0,...,sunny_6,sunrise_6,sunset_6,cloudy_6,temp_diff_6,delta_6,zenith_6,azi_6,target1,target2
0,0,0,0,0,1.5,69.08,-12,0.0,0,-4.035806,...,8.5,8.0,16.5,0.0,9,-22.672581,166.672581,0.0,0.0,0.0


In [124]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)

error_dict1 = {} ; error_dict2 = {} ; num_dict1 = {} ; num_dict2 = {}
for i, (train_index, valid_index) in tqdm(enumerate(cv.split(train_df)), total=5):
    train_x, valid_x = train_df.iloc[train_index,:-2],train_df.iloc[valid_index,:-2]
    train_y1, valid_y1 = train_df.iloc[train_index,-2], train_df.iloc[valid_index,-2] # model1
    train_y2, valid_y2 = train_df.iloc[train_index,-1], train_df.iloc[valid_index,-1] # model2
    
    quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    for q in quantiles:
        model1, error1, num1 = LGBM(q, train_x, train_y1, valid_x, valid_y1)
        model2, error2, num2 = LGBM(q, train_x, train_y2, valid_x, valid_y2)

        if i == 0:
            error_dict1['q{}'.format(str(q).replace('.',''))] = [error1]
            error_dict2['q{}'.format(str(q).replace('.',''))] = [error2]
            num_dict1['q{}'.format(str(q).replace('.',''))] = [num1]
            num_dict2['q{}'.format(str(q).replace('.',''))] = [num2]
        else:
            error_dict1['q{}'.format(str(q).replace('.',''))].append(error1)
            error_dict2['q{}'.format(str(q).replace('.',''))].append(error2)    
            num_dict1['q{}'.format(str(q).replace('.',''))].append(num1)
            num_dict2['q{}'.format(str(q).replace('.',''))].append(num2)

100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [43:15<00:00, 519.09s/it]


In [125]:
print(np.mean(list(error_dict1.values())), np.mean(list(error_dict2.values())), np.mean(list(num_dict1.values())), np.mean(list(num_dict2.values())))

1.513805356818006 1.5169129223516677 7686.644444444444 8188.933333333333


In [127]:
np.mean(list(num_dict2.values()),axis=1)

array([ 5679.8,  6932.6,  9876.8, 10000. , 10000. ,  7839. ,  9599. ,
        8580.6,  5192.6])

In [117]:
list(num_dict2.values())

[[4462, 8560, 4968, 3996, 4873],
 [6454, 6094, 9272, 5472, 8433],
 [9700, 9700, 9700, 9700, 9700],
 [9700, 9700, 9700, 9700, 9700],
 [9700, 9700, 9700, 9700, 9700],
 [9700, 9700, 9700, 9700, 9700],
 [5130, 5626, 9235, 9700, 5143],
 [8524, 7288, 9245, 9700, 9700],
 [4520, 5068, 4939, 5950, 2903]]

---

### Test 및 submission (1) : 80%의 training data만 사용하는 경우

#### data 

In [224]:
# train_df = pd.read_csv('train_refined/new_row_removed.csv')
print(train_df.shape)
train_x1, valid_x1, train_y1, valid_y1 = train_test_split(train_df.iloc[:,:-2], train_df.iloc[:,-2], test_size=0.2, random_state=42)
train_x2, valid_x2, train_y2, valid_y2 = train_test_split(train_df.iloc[:,:-2], train_df.iloc[:,-1], test_size=0.2, random_state=42)

(52176, 116)


In [225]:
train_df.head(1)

Unnamed: 0,Hour,Minute,DHI_0,DNI_0,WS_0,RH_0,T_0,TARGET_0,GHI_0,dew_0,...,sunny_6,sunrise_6,sunset_6,cloudy_6,temp_diff_6,delta_6,zenith_6,azi_6,target1,target2
0,0,0,-6.907755,-6.907755,0.406132,69.08,-12,-6.907755,-6.907755,-4.035806,...,8.5,8.0,16.5,-6.907755,9,-22.672581,166.672581,0.0,-6.907755,-6.907755


#### Model 및 Training

In [226]:
def LGBM(q, train_x, train_y, valid_x, valid_y):
    model = LGBMRegressor(objective='quantile', alpha=q,
                         n_estimators=10000, bagging_fraction=0.7, learning_rate=0.027, subsample=0.7)
    model.fit(train_x, train_y, eval_metric = ['quantile'], eval_set=[(valid_x, valid_y)], 
              early_stopping_rounds=300, verbose=False)
    min_error = min(model.evals_result_['valid_0']['quantile'])
    return model, min_error

In [227]:
def train_real(date, train_x1, valid_x1, train_y1, valid_y1, train_x2, valid_x2, train_y2, valid_y2):
    quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    
    if not os.path.isdir('model/{}'.format(date)):
        os.makedirs('model/{}/model1'.format(date), exist_ok=True)
        os.makedirs('model/{}/model2'.format(date), exist_ok=True)
    
    for q in tqdm(quantiles):
        model1, error1 = LGBM(q, train_x1, train_y1, valid_x1, valid_y1)
        joblib.dump(model1,'model/{}/model1/q{}'.format(date,str(q).replace('.','')))

        model2, error2 = LGBM(q, train_x2, train_y2, valid_x2, valid_y2)
        joblib.dump(model2,'model/{}/model2/q{}'.format(date, str(q).replace('.','')))

In [228]:
train_real('0115_2',train_x1, valid_x1, train_y1, valid_y1, train_x2, valid_x2, train_y2, valid_y2)

  0%|                                                                                            | 0/9 [00:00<?, ?it/s]



100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [07:30<00:00, 50.01s/it]


#### Test data 및 submission 생성

In [229]:
def test_predict(i,df,date):
    result = pd.DataFrame({'id':make_id(i)})
    model1_pred = pd.DataFrame()
    model2_pred = pd.DataFrame()
    
    quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    for q in quantiles:
        model1 = joblib.load('model/{}/model1/q{}'.format(date, str(q).replace('.','')))
        model2 = joblib.load('model/{}/model2/q{}'.format(date, str(q).replace('.','')))
        # log씌운 TARGET인 경우
        pred1 = pd.Series(np.round(np.exp(model1.predict(df))-0.001,6))  
        pred2 = pd.Series(np.round(np.exp(model2.predict(df))-0.001,6))
        # log 안 씌운 TARGET인 경우
#         pred1 = pd.Series(model1.predict(df).round(6))
#         pred2 = pd.Series(model2.predict(df).round(6))
        model1_pred = pd.concat([model1_pred,pred1],axis=1)
        model2_pred = pd.concat([model2_pred,pred2],axis=1)
    predict_df = pd.concat([model1_pred,model2_pred],axis=0).reset_index(drop=True)
    
    result = pd.concat([result,predict_df],axis=1)
    result.columns = ['id','q_0.1','q_0.2','q_0.3','q_0.4','q_0.5','q_0.6','q_0.7','q_0.8','q_0.9']
        
    return result

In [230]:
dfs = []
for i in tqdm(range(81)):
    path = 'test/{}.csv'.format(i)
    df = pd.read_csv(path)
    df1 = test_preprocess1(df)
    df2 = test_preprocess2(df1)
    
    test_df = pd.DataFrame()
    for day in range(7):
        if day == 0:
            day_df = df2[df2.Day == day].drop(['Day'],axis=1).reset_index(drop=True) 
            cols = ['Hour','Minute'] + ['{}_{}'.format(col,day) for col in list(df2.columns)[3:]]
        else:
            day_df = df2[df2.Day == day].drop(['Day','Hour','Minute'],axis=1).reset_index(drop=True) 
            cols = ['{}_{}'.format(col,day) for col in list(df2.columns)[3:]]
        day_df.columns = cols
        test_df = pd.concat([test_df,day_df],axis=1)  
        #
        to_log = ['DHI','DNI','WS','TARGET','GHI','cloudy']
        for col in to_log:
            test_df['{}_{}'.format(col,day)] = np.log(test_df['{}_{}'.format(col,day)] + 0.001)
#         test_df.drop(['RH_{}'.format(day),'sunrise_{}'.format(day),'sunset_{}'.format(day)], axis=1, inplace=True)
    
    # scaler 미적용
    predict_result = test_predict(i,test_df,'0115_2') 
    
    dfs.append(predict_result)
    
sub = pd.concat(dfs,axis=0).reset_index(drop=True)
print(sub.shape)

100%|██████████████████████████████████████████████████████████████████████████████████| 81/81 [05:04<00:00,  3.76s/it]

(7776, 10)





In [231]:
sub.to_csv('submission/{}.csv'.format('0115_2') , index=False)

In [219]:
dfs = []
for i in tqdm(range(81)):
    path = 'test/{}.csv'.format(i)
    df = pd.read_csv(path)
    test_df = pd.DataFrame([df.iloc[i:(i+289),:][['TARGET']].T.values.tolist()[0] for i in range((df.shape[0]-288))]) # df.shape[0]-288
    test_df.columns = ['{}'.format(j) for j in range(289)] 
    
    predict_result = test_predict(i,test_df,'0115_1')
    dfs.append(predict_result)
sub = pd.concat(dfs,axis=0).reset_index(drop=True)
print(sub.shape)

100%|██████████████████████████████████████████████████████████████████████████████████| 81/81 [01:39<00:00,  1.23s/it]

(7776, 10)





In [220]:
sub.to_csv('submission/{}.csv'.format('0115_1') , index=False)

---

### Test 및 submission (2) : 100%의 training data를 사용하는 경우

#### data 

In [135]:
train_df = pd.read_csv('train_refined/new1.csv')
print(train_df.shape)
train_df.head(1)

(52176, 101)


Unnamed: 0,Hour,Minute,DHI_0,DNI_0,WS_0,T_0,TARGET_0,GHI_0,dew_0,sunny_0,...,zenith_6,azi_6,TARGET_diff_0,TARGET_diff_1,TARGET_diff_2,TARGET_diff_3,TARGET_diff_4,TARGET_diff_5,target1,target2
0,0,0,0,0,1.5,-12,0.0,0,-4.035806,8.5,...,166.672581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [136]:
train_x, train_y1, train_y2 = train_df.iloc[:,:-2], train_df.iloc[:,-2], train_df.iloc[:,-1]

#### Model 및 Training

In [148]:
def LGBM(q, train_x, train_y, nums):
    model = LGBMRegressor(objective='quantile', alpha=q,
                         n_estimators=nums, bagging_fraction=0.7, learning_rate=0.027, subsample=0.7)
    model.fit(train_x, train_y)
    return model

In [149]:
def train_alldata(date, train_x, train_y1, train_y2, iters1, iters2):
    quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    
    if not os.path.isdir('model/{}'.format(date)):
        os.makedirs('model/{}/model1'.format(date), exist_ok=True)
        os.makedirs('model/{}/model2'.format(date), exist_ok=True)
    
    for i in tqdm(range(len(quantiles))):
        q = quantiles[i]
        num1 = int(iters1[i]) ; num2 = int(iters2[i])
        
        model1 = LGBM(q, train_x, train_y1, num1)
        joblib.dump(model1,'model/{}/model1/q{}'.format(date,str(q).replace('.','')))

        model2 = LGBM(q, train_x, train_y2, num2)
        joblib.dump(model2,'model/{}/model2/q{}'.format(date, str(q).replace('.','')))

In [150]:
iters1 = [3527.4, 5438.6, 9176.2, 10000. , 10000. , 9566.8, 9162.6, 8743.4, 4154. ]
iters2 =  [5371.8, 7145. , 10000. , 10000. , 10000. , 10000. , 7026.8, 9011.4, 4676. ]
train_alldata('0113',train_x, train_y1, train_y2, iters1, iters2)

100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [07:42<00:00, 51.35s/it]


In [155]:
train_x.head()

Unnamed: 0,Hour,Minute,DHI_0,DNI_0,WS_0,T_0,TARGET_0,GHI_0,dew_0,sunny_0,...,temp_diff_6,delta_6,zenith_6,azi_6,TARGET_diff_0,TARGET_diff_1,TARGET_diff_2,TARGET_diff_3,TARGET_diff_4,TARGET_diff_5
0,0,0,0,0,1.5,-12,0.0,0,-4.035806,8.5,...,9,-22.672581,166.672581,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,30,0,0,1.5,-12,0.0,0,-4.03967,8.5,...,9,-22.672581,166.672581,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,0,0,0,1.6,-12,0.0,0,-3.523105,8.5,...,9,-22.672581,161.375576,48.400354,0.0,0.0,0.0,0.0,0.0,0.0
3,1,30,0,0,1.6,-12,0.0,0,-3.528707,8.5,...,9,-22.672581,161.375576,48.400354,0.0,0.0,0.0,0.0,0.0,0.0
4,2,0,0,0,1.6,-12,0.0,0,-2.897725,8.5,...,9,-22.672581,150.815833,71.112247,0.0,0.0,0.0,0.0,0.0,0.0


#### Test data 및 submission 생성

In [158]:
dfs = []
for i in tqdm(range(81)):
    path = 'test/{}.csv'.format(i)
    df = pd.read_csv(path)
    df1 = test_preprocess1(df)
    df2 = test_preprocess2(df1)
    
    test_df = pd.DataFrame()
    for day in range(7):
        if day == 0:
            day_df = df2[df2.Day == day].drop(['Day'],axis=1).reset_index(drop=True) 
            cols = ['Hour','Minute'] + ['{}_{}'.format(col,day) for col in list(df2.columns)[3:]]
        else:
            day_df = df2[df2.Day == day].drop(['Day','Hour','Minute'],axis=1).reset_index(drop=True) 
            cols = ['{}_{}'.format(col,day) for col in list(df2.columns)[3:]]
        day_df.columns = cols
        test_df = pd.concat([test_df,day_df],axis=1) 
        test_df.drop(['RH_{}'.format(day),'sunrise_{}'.format(day),'sunset_{}'.format(day)], axis=1, inplace=True)
        
    # TARGET_diff
    for day in range(6):
        test_df['TARGET_diff_{}'.format(day)] = test_df['TARGET_{}'.format(day+1)] - test_df['TARGET_{}'.format(day)]
        
    # scaler 적용
#     scaled_test_df = scaler.transform(test_df)
#     predict_result = test_predict(i,scaled_test_df,'0112') 
    
    # scaler 미적용
    predict_result = test_predict(i,test_df,'0113') 
    
    dfs.append(predict_result)
    
sub = pd.concat(dfs,axis=0).reset_index(drop=True)
print(sub.shape)

100%|██████████████████████████████████████████████████████████████████████████████████| 81/81 [04:04<00:00,  3.02s/it]

(7776, 10)





In [159]:
sub.to_csv('submission/{}.csv'.format('0113') , index=False)

In [160]:
test_df.head(1)

Unnamed: 0,Hour,Minute,DHI_0,DNI_0,WS_0,T_0,TARGET_0,GHI_0,dew_0,sunny_0,...,temp_diff_6,delta_6,zenith_6,azi_6,TARGET_diff_0,TARGET_diff_1,TARGET_diff_2,TARGET_diff_3,TARGET_diff_4,TARGET_diff_5
0,0,0,0,0,2.1,3.9,0.0,0,-1.620077,14.0,...,15.9,22.231059,121.768941,0.0,0.0,0.0,0.0,0.0,0.0,0.0
