In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns
import math
from lightgbm import LGBMRegressor, plot_importance
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
from sklearn.metrics import mean_squared_error # squared=True 기본
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import joblib
from tqdm import tqdm
import shutil

In [4]:
# Load the raw training table; the path is relative to the notebook's working directory.
train = pd.read_csv('./data/train/train.csv')
print(train.shape)
# Preview the first two rows to check the column layout.
train.head(2)

(52560, 9)


Unnamed: 0,Day,Hour,Minute,DHI,DNI,WS,RH,T,TARGET
0,0,0,0,0,0,1.5,69.08,-12,0.0
1,0,0,30,0,0,1.5,69.06,-12,0.0


### train_by_day 데이터 구성

In [9]:
def time_minute(data):
    """Convert a record's ``Hour`` / ``Minute`` fields into a fractional hour.

    A ``Minute`` of 0 adds nothing; any other ``Minute`` value counts as
    half an hour.

    Args:
        data: Mapping-like object (dict, pandas row, ...) that can be indexed
            with ``'Hour'`` and ``'Minute'``.

    Returns:
        ``data['Hour']`` plus 0 or 0.5.
    """
    base_hour = data['Hour']
    half_hour = 0 if data['Minute'] == 0 else 0.5
    return base_hour + half_hour

In [10]:
# Keep only the records that have some irradiance (DHI + DNI > 0).
has_irradiance = (train['DHI'] + train['DNI']) > 0
ghi = train[has_irradiance].reset_index(drop=True)

# Fractional hour of each record: Hour, plus 0.5 when Minute is non-zero.
# Vectorised form of applying time_minute() to every row one at a time.
ghi['time_float'] = ghi['Hour'] + np.where(ghi['Minute'] == 0, 0.0, 0.5)

# One row per Day holding the first (sunrise) and last (sunset) time with irradiance.
# Both values come from a single aggregation, so they are matched by Day itself
# rather than by the row index happening to equal the Day number.
train_by_day = (
    ghi.groupby('Day')['time_float']
    .agg(['min', 'max'])
    .rename(columns={'min': 'sunrise', 'max': 'sunset'})
    .reset_index()
)

# Span between the first and last time with irradiance, in hours.
train_by_day['sunny'] = train_by_day['sunset'] - train_by_day['sunrise']

In [11]:
train_by_day['date'] = train_by_day['Day']%365 # day index folded onto a 365-day year

def day_to_month(date):
    """Map a zero-based day-of-year to a zero-based month index (0-11).

    The cut-offs are the cumulative month lengths of a 365-day (non-leap)
    year. Any value that is not below the last cut-off maps to 11.

    Args:
        date: Day-of-year, where 0 is the first day of the year.

    Returns:
        Integer month index from 0 to 11.
    """
    # Exclusive upper day bound for months 0 through 10; month 11 takes the rest.
    month_ends = (31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334)
    for month, upper_bound in enumerate(month_ends):
        if date < upper_bound:
            return month
    return 11
    
# Month index (0-11) for each day, derived from its day-of-year.
train_by_day['month'] = train_by_day['date'].map(day_to_month)

In [12]:
# Daily temperature range (max T - min T). The result is looked up by Day value,
# so it stays correct even if train_by_day's row index does not equal Day
# (e.g. when a day is missing from train_by_day).
daily_temperature = train.groupby('Day')['T']
temp_range_by_day = daily_temperature.max() - daily_temperature.min()
train_by_day['temp_diff'] = train_by_day['Day'].map(temp_range_by_day)

In [13]:
# Inspect the per-day feature table (sunrise, sunset, sunny, date, month, temp_diff).
train_by_day

Unnamed: 0,Day,sunrise,sunset,sunny,date,month,temp_diff
0,0,8.0,16.5,8.5,0,0,9
1,1,8.0,16.5,8.5,1,0,14
2,2,8.0,16.5,8.5,2,0,13
3,3,8.0,16.5,8.5,3,0,15
4,4,8.0,16.5,8.5,4,0,11
...,...,...,...,...,...,...,...
1090,1090,8.0,16.5,8.5,360,11,14
1091,1091,8.0,16.5,8.5,361,11,8
1092,1092,8.0,16.5,8.5,362,11,11
1093,1093,8.0,16.5,8.5,363,11,14


### 기본 train 데이터 만들기 (시계열 구조화 하기 전)

In [26]:
# GHI -----------------------------------------------------------------------------
# Work on a copy of the raw table with GHI (= DHI + DNI) appended as a new column.
df_train = train.assign(GHI=train['DHI'] + train['DNI'])

# dew -----------------------------------------------------------------------------
def dewpoint(rh, t):
    """Dew point from relative humidity and temperature.

    Uses a Magnus-type approximation with coefficients 17.62 and 243.12.

    Args:
        rh: Relative humidity as a percentage (it is divided by 100 before
            the logarithm). Must be greater than 0, otherwise ``math.log``
            raises ``ValueError``.
        t: Temperature -- presumably in degrees Celsius, the unit these
            coefficients are normally used with; confirm against the data
            source.

    Returns:
        The dew point, on the same temperature scale as ``t``.
    """
    a, b = 17.62, 243.12
    gamma = math.log(rh/100) + a*t/(b+t)
    return (b*gamma)/(a-gamma)

# Dew point of every record, pairing each RH value with the T value of the same row.
df_train['dew'] = [dewpoint(rh, t) for rh, t in zip(train['RH'], train['T'])]

# cloudy -----------------------------------------------------------------------------
# Ratio of DHI to DNI; the +0.1 keeps the denominator non-zero on rows where DNI is 0.
df_train['cloudy'] = train['DHI']/(train['DNI']+0.1)

In [27]:
# Half-hour slot index within the day: two slots per hour, plus one when Minute is 30.
# The value is computed from the raw `train` columns and cast to int in one step.
# The previous version cast train['Hour_Minute'], a column that only exists on
# df_train, and therefore raised KeyError.
df_train['Hour_Minute'] = (2*train['Hour'] + train['Minute']/30).astype(int)

# Hour and Minute are now fully encoded in Hour_Minute.
df_train = df_train.drop(['Hour', 'Minute'], axis=1)

df_train[:50]

Unnamed: 0,Day,DHI,DNI,WS,RH,T,TARGET,GHI,dew,cloudy,Hour_Minute
0,0,0,0,1.5,69.08,-12,0.0,0,-16.522271,0.0,0
1,0,0,0,1.5,69.06,-12,0.0,0,-16.525742,0.0,1
2,0,0,0,1.6,71.78,-12,0.0,0,-16.061776,0.0,2
3,0,0,0,1.6,71.75,-12,0.0,0,-16.066807,0.0,3
4,0,0,0,1.6,75.2,-12,0.0,0,-15.500215,0.0,4
5,0,0,0,1.5,69.29,-11,0.0,0,-15.524406,0.0,5
6,0,0,0,1.5,72.56,-11,0.0,0,-14.965431,0.0,6
7,0,0,0,1.4,72.55,-11,0.0,0,-14.967106,0.0,7
8,0,0,0,1.3,74.62,-11,0.0,0,-14.624744,0.0,8
9,0,0,0,1.3,74.61,-11,0.0,0,-14.626377,0.0,9


In [28]:
# Re-inspect the per-day table before merging it into df_train.
train_by_day

Unnamed: 0,Day,sunrise,sunset,sunny,date,month,temp_diff
0,0,8.0,16.5,8.5,0,0,9
1,1,8.0,16.5,8.5,1,0,14
2,2,8.0,16.5,8.5,2,0,13
3,3,8.0,16.5,8.5,3,0,15
4,4,8.0,16.5,8.5,4,0,11
...,...,...,...,...,...,...,...
1090,1090,8.0,16.5,8.5,360,11,14
1091,1091,8.0,16.5,8.5,361,11,8
1092,1092,8.0,16.5,8.5,362,11,11
1093,1093,8.0,16.5,8.5,363,11,14


In [30]:
# Attach the per-day features to every record of the same Day ('date' is left out).
df_train = df_train.merge(train_by_day.drop(columns=['date']), on='Day')

In [39]:
# Reorder the columns so the identifiers (Day, Hour_Minute) come first,
# followed by the per-record features and then the per-day features.
df_train = df_train[['Day', 'Hour_Minute', 'DHI', 'DNI', 'WS', 'RH', 'T', 'TARGET', 'GHI', 'dew', 'cloudy','sunrise', 'sunset', 'sunny', 'month', 'temp_diff']]

In [40]:
# Inspect the flat (not yet windowed) training table.
df_train

Unnamed: 0,Day,Hour_Minute,DHI,DNI,WS,RH,T,TARGET,GHI,dew,cloudy,sunrise,sunset,sunny,month,temp_diff
0,0,0,0,0,1.5,69.08,-12,0.0,0,-16.522271,0.0,8.0,16.5,8.5,0,9
1,0,1,0,0,1.5,69.06,-12,0.0,0,-16.525742,0.0,8.0,16.5,8.5,0,9
2,0,2,0,0,1.6,71.78,-12,0.0,0,-16.061776,0.0,8.0,16.5,8.5,0,9
3,0,3,0,0,1.6,71.75,-12,0.0,0,-16.066807,0.0,8.0,16.5,8.5,0,9
4,0,4,0,0,1.6,75.20,-12,0.0,0,-15.500215,0.0,8.0,16.5,8.5,0,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52555,1094,43,0,0,2.4,70.70,-4,0.0,0,-8.540093,0.0,8.0,16.5,8.5,11,6
52556,1094,44,0,0,2.4,66.79,-4,0.0,0,-9.268639,0.0,8.0,16.5,8.5,11,6
52557,1094,45,0,0,2.2,66.78,-4,0.0,0,-9.270551,0.0,8.0,16.5,8.5,11,6
52558,1094,46,0,0,2.1,67.72,-4,0.0,0,-9.091976,0.0,8.0,16.5,8.5,11,6


### 시계열 데이터 만들기 (feature: 0~6일 → target: 7일, 8일)

In [42]:
FEATURE_DAYS = 7            # days 0..6 of each window supply the input features
TARGET_OFFSETS = (7, 8)     # days 7 and 8 of each window supply target1 / target2

# Every column except the identifiers is a feature; selecting by name avoids
# depending on the position of Day / Hour_Minute in df_train.
feature_cols = [col for col in df_train.columns if col not in ('Day', 'Hour_Minute')]

# Split the table once per day instead of re-filtering the full frame nine
# times per window. Row order inside each day is preserved.
day_frames = {day: rows.reset_index(drop=True) for day, rows in df_train.groupby('Day')}

# A window can start on any day for which all feature days and both target
# days exist. With contiguous days this is every day except the last
# max(TARGET_OFFSETS) ones, so the window count follows the data.
needed_offsets = list(range(FEATURE_DAYS)) + list(TARGET_OFFSETS)
window_starts = [
    day for day in sorted(day_frames)
    if all(day + offset in day_frames for offset in needed_offsets)
]

window_frames = []
for first_day in tqdm(window_starts):
    # Hour_Minute is taken once, from the first day of the window.
    parts = [day_frames[first_day][['Hour_Minute']]]

    # X columns: features of days 0..6, each suffixed with its day offset.
    for offset in range(FEATURE_DAYS):
        parts.append(day_frames[first_day + offset][feature_cols].add_suffix('_{}'.format(offset)))

    # y columns: TARGET of days 7 and 8, renamed to target1 / target2.
    for number, offset in enumerate(TARGET_OFFSETS, start=1):
        parts.append(
            day_frames[first_day + offset][['TARGET']]
            .rename(columns={'TARGET': 'target{}'.format(number)})
        )

    window_frames.append(pd.concat(parts, axis=1))

# Concatenate once at the end; growing a DataFrame inside the loop copies all
# accumulated rows on every iteration.
train_df = pd.concat(window_frames, axis=0, ignore_index=True) if window_frames else pd.DataFrame()

train_df.head(2)

100%|██████████████████████████████████████████████████████████████████████████████| 1087/1087 [02:07<00:00,  8.50it/s]


Unnamed: 0,Hour_Minute,DHI_0,DNI_0,WS_0,RH_0,T_0,TARGET_0,GHI_0,dew_0,cloudy_0,...,GHI_6,dew_6,cloudy_6,sunrise_6,sunset_6,sunny_6,month_6,temp_diff_6,target1,target2
0,0,0,0,1.5,69.08,-12,0.0,0,-16.522271,0.0,...,0,-3.950791,0.0,8.0,16.5,8.5,0,9,0.0,0.0
1,1,0,0,1.5,69.06,-12,0.0,0,-16.525742,0.0,...,0,-3.946161,0.0,8.0,16.5,8.5,0,9,0.0,0.0


# List every column name of the windowed training table.
list(train_df.columns)

In [48]:
# Show the month index of each of the seven feature days side by side.
train_df[['month_{}'.format(day) for day in range(7)]]

Unnamed: 0,month_0,month_1,month_2,month_3,month_4,month_5,month_6
0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...
52171,11,11,11,11,11,11,11
52172,11,11,11,11,11,11,11
52173,11,11,11,11,11,11,11
52174,11,11,11,11,11,11,11


In [65]:
# NOTE(review): scratch arithmetic. 52176 looks like the row count of train_df
# (1087 windows x 48 rows each); the reason for dividing by 36 is not evident
# from this notebook -- confirm or remove.
52176/36

1449.3333333333333

In [50]:
def mode(lst):
    """Return the most frequent element of ``lst``.

    When several elements share the highest count, the one that appears
    first in ``lst`` wins. An empty list yields 0.

    Args:
        lst: A list (``list.count`` is used, so it must be an actual list).

    Returns:
        The most frequent element, or 0 if ``lst`` is empty.
    """
    best_value = 0
    best_count = 0
    for value in lst:
        occurrences = lst.count(value)  # was misspelled `coxunt`, raising AttributeError
        # Strict comparison keeps the earliest element on ties.
        if occurrences > best_count:
            best_count = occurrences
            best_value = value

    return best_value

In [54]:
# One month per window: the most frequent month among its seven feature days.
train_df['month'] = [
    mode(months)
    for months in train_df[['month_{}'.format(day) for day in range(7)]].values.tolist()
]

In [70]:
# The per-day month columns are superseded by the single 'month' column.
train_df.drop(columns=['month_{}'.format(day) for day in range(7)], inplace=True)

In [71]:
# Inspect the final windowed training table.
train_df

Unnamed: 0,Hour_Minute,DHI_0,DNI_0,WS_0,RH_0,T_0,TARGET_0,GHI_0,dew_0,cloudy_0,...,GHI_6,dew_6,cloudy_6,sunrise_6,sunset_6,sunny_6,temp_diff_6,target1,target2,month
0,0,0,0,1.5,69.08,-12,0.0,0,-16.522271,0.0,...,0,-3.950791,0.0,8.0,16.5,8.5,9,0.0,0.0,0
1,1,0,0,1.5,69.06,-12,0.0,0,-16.525742,0.0,...,0,-3.946161,0.0,8.0,16.5,8.5,9,0.0,0.0,0
2,2,0,0,1.6,71.78,-12,0.0,0,-16.061776,0.0,...,0,-5.056141,0.0,8.0,16.5,8.5,9,0.0,0.0,0
3,3,0,0,1.6,71.75,-12,0.0,0,-16.066807,0.0,...,0,-5.054597,0.0,8.0,16.5,8.5,9,0.0,0.0,0
4,4,0,0,1.6,75.20,-12,0.0,0,-15.500215,0.0,...,0,-5.838443,0.0,8.0,16.5,8.5,9,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52171,43,0,0,4.1,74.26,-7,0.0,0,-10.810680,0.0,...,0,-8.689729,0.0,8.0,16.5,8.5,11,0.0,0.0,11
52172,44,0,0,3.6,70.65,-8,0.0,0,-12.399657,0.0,...,0,-10.028583,0.0,8.0,16.5,8.5,11,0.0,0.0,11
52173,45,0,0,3.1,70.67,-8,0.0,0,-12.396139,0.0,...,0,-10.033261,0.0,8.0,16.5,8.5,11,0.0,0.0,11
52174,46,0,0,2.7,67.70,-9,0.0,0,-13.887054,0.0,...,0,-9.516353,0.0,8.0,16.5,8.5,11,0.0,0.0,11


### Train 시키기

In [74]:
# Random seed passed to train_test_split so the train/validation partition is repeatable.
SEED = 42

In [72]:
def train_real(date, train_x1, valid_x1, train_y1, valid_y1, train_x2, valid_x2, train_y2, valid_y2):
    """Train and save one quantile model per quantile for each of the two targets.

    For every quantile in 0.1 .. 0.9 a model is fitted with ``LGBM`` for
    target 1 and for target 2, and each is dumped with joblib to
    ``model/<date>/model1/q<NN>`` and ``model/<date>/model2/q<NN>``
    (``q01`` .. ``q09``).

    Args:
        date: Label used as the sub-directory name under ``model/``.
        train_x1, valid_x1, train_y1, valid_y1: Train/validation features and
            labels for the first target.
        train_x2, valid_x2, train_y2, valid_y2: Train/validation features and
            labels for the second target.

    Returns:
        None. The fitted models are written to disk.
    """
    quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

    # Always ensure both output directories exist. The previous
    # `if not os.path.isdir('model/<date>')` guard skipped creation whenever
    # the parent already existed, so a missing model1/ or model2/ made
    # joblib.dump fail. exist_ok=True makes the calls safe to repeat.
    model1_dir = 'model/{}/model1'.format(date)
    model2_dir = 'model/{}/model2'.format(date)
    os.makedirs(model1_dir, exist_ok=True)
    os.makedirs(model2_dir, exist_ok=True)

    for q in tqdm(quantiles):
        # 0.1 -> 'q01', 0.2 -> 'q02', ...
        tag = 'q{}'.format(str(q).replace('.',''))

        # The validation error returned by LGBM is not needed here.
        model1, _ = LGBM(q, train_x1, train_y1, valid_x1, valid_y1)
        joblib.dump(model1, '{}/{}'.format(model1_dir, tag))

        model2, _ = LGBM(q, train_x2, train_y2, valid_x2, valid_y2)
        joblib.dump(model2, '{}/{}'.format(model2_dir, tag))

In [73]:
def train_quantiles(train_x1, valid_x1, train_y1, valid_y1, train_x2, valid_x2, train_y2, valid_y2):
    """Fit quantile models for both targets and collect their validation errors.

    For each quantile in 0.1 .. 0.9, ``LGBM`` is called first for target 1
    and then for target 2. Only the returned error is kept; the fitted
    models are discarded.

    Args:
        train_x1, valid_x1, train_y1, valid_y1: Train/validation features and
            labels for the first target.
        train_x2, valid_x2, train_y2, valid_y2: Train/validation features and
            labels for the second target.

    Returns:
        A pair of dicts ``(errors_target1, errors_target2)`` keyed by
        ``'q01'`` .. ``'q09'``, each holding the error reported by ``LGBM``.
    """
    errors_target1, errors_target2 = {}, {}

    for q in tqdm([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]):
        key = 'q{}'.format(str(q).replace('.',''))
        _, errors_target1[key] = LGBM(q, train_x1, train_y1, valid_x1, valid_y1)
        _, errors_target2[key] = LGBM(q, train_x2, train_y2, valid_x2, valid_y2)

    return errors_target1, errors_target2

In [76]:
def LGBM(q, train_x, train_y, valid_x, valid_y):
    """Fit a LightGBM quantile regressor and report its best validation error.

    Args:
        q: Quantile to estimate; passed to the model as ``alpha``.
        train_x, train_y: Training features and labels.
        valid_x, valid_y: Validation features and labels, used as the only
            evaluation set and for early stopping.

    Returns:
        ``(model, min_error)`` where ``min_error`` is the smallest value of
        the ``'quantile'`` metric recorded on the validation set.
    """
    # NOTE(review): LightGBM documents `subsample` as an alias of
    # `bagging_fraction`, so passing both looks redundant; bagging is also
    # documented as active only when bagging_freq / subsample_freq > 0, which
    # is not set here -- confirm whether row subsampling is actually intended.
    model = LGBMRegressor(objective='quantile', alpha=q,
                         n_estimators=10000, bagging_fraction=0.7, learning_rate=0.027, subsample=0.7)
    # NOTE(review): `early_stopping_rounds` and `verbose` as fit() arguments
    # were removed in LightGBM 4.0 in favour of callbacks -- confirm the
    # installed version before re-running.
    model.fit(train_x, train_y, eval_metric = ['quantile'], 
          eval_set=[(valid_x, valid_y)], early_stopping_rounds=300, verbose=False)
    # 'valid_0' is the key under which the first (only) eval_set is read back here.
    min_error = min(model.evals_result_['valid_0']['quantile'])
    return model, min_error

In [75]:
# Both targets use the same feature matrix and the same split settings, so the
# two calls partition the rows identically.
# NOTE(review): train_df rows come from overlapping 7-day windows, so a random
# row-wise split puts near-duplicate days on both sides -- consider a
# time-based split when judging the validation error.
features = train_df.drop(columns=['target1', 'target2'])
split_options = dict(test_size=0.2, random_state=SEED)
train_x1, valid_x1, train_y1, valid_y1 = train_test_split(features, train_df['target1'], **split_options)
train_x2, valid_x2, train_y2, valid_y2 = train_test_split(features, train_df['target2'], **split_options)

In [81]:
# Fit all nine quantile models for both targets and collect their validation errors.
model1_errors, model2_errors = train_quantiles(train_x1, valid_x1, train_y1, valid_y1, train_x2, valid_x2, train_y2, valid_y2)
# Report the mean validation error across the quantiles, for target1 and target2.
print(np.mean(list(model1_errors.values())), np.mean(list(model2_errors.values())))

100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [10:50<00:00, 72.23s/it]

1.4943374337791515 1.4997964055286768



