In [43]:
import numpy as np
import pandas as pd
import pickle
import joblib
import math
from tqdm import tqdm

import xgboost as xgb
import lightgbm as lgbm
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Let pandas print up to 100 rows before it truncates displayed frames/series.
pd.set_option('display.max_rows', 100)

In [11]:
# Read the 81 test CSVs (0.csv ... 80.csv) in numeric order, with a progress bar.
tmps = [pd.read_csv('./data/test/{}.csv'.format(i)) for i in tqdm(range(81))]

# Stack them row-wise; ignore_index=True renumbers the rows 0..N-1.
test = pd.concat(tmps, axis=0, ignore_index=True)

100%|█████████████████████████████████████████████████████████████████████████████████| 81/81 [00:00<00:00, 423.73it/s]


In [12]:
# Sanity check: overall (rows, columns) and the first few records of the combined frame.
print(test.shape)
test.head()

(27216, 9)


Unnamed: 0,Day,Hour,Minute,DHI,DNI,WS,RH,T,TARGET
0,0,0,0,0,0,2.7,34.42,0.0,0.0
1,0,0,30,0,0,2.7,34.17,0.1,0.0
2,0,1,0,0,0,2.7,34.23,0.2,0.0
3,0,1,30,0,0,2.7,33.99,0.3,0.0
4,0,2,0,0,0,2.8,33.97,0.4,0.0


In [13]:
# Each source CSV contributes 336 rows (27216 rows / 81 files), so the row
# position divided by 336 and truncated gives the originating file number.
test['file'] = (test.index.values / 336).astype(int)
# 'Day' restarts at 0 in every file; shift it by 7 per file to get a running day counter.
test['date'] = test['file'] * 7 + test['Day']
test

Unnamed: 0,Day,Hour,Minute,DHI,DNI,WS,RH,T,TARGET,file,date
0,0,0,0,0,0,2.7,34.42,0.0,0.0,0,0
1,0,0,30,0,0,2.7,34.17,0.1,0.0,0,0
2,0,1,0,0,0,2.7,34.23,0.2,0.0,0,0
3,0,1,30,0,0,2.7,33.99,0.3,0.0,0,0
4,0,2,0,0,0,2.8,33.97,0.4,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
27211,6,21,30,0,0,0.8,63.35,13.7,0.0,80,566
27212,6,22,0,0,0,0.7,64.82,13.1,0.0,80,566
27213,6,22,30,0,0,0.7,66.10,12.8,0.0,80,566
27214,6,23,0,0,0,0.6,67.64,12.4,0.0,80,566


In [14]:
def time_minute(data):
    """Convert a record's clock time into fractional hours.

    Parameters
    ----------
    data : mapping-like
        Anything indexable by the keys 'Hour' and 'Minute'
        (for example a single DataFrame row).

    Returns
    -------
    number
        ``data['Hour']`` when 'Minute' equals 0, otherwise ``data['Hour'] + 0.5``.
    """
    # Any non-zero minute value counts as half an hour.
    return data['Hour'] + (0.5 if data['Minute'] != 0 else 0)

In [15]:
# Keep only the rows with positive irradiance (DHI + DNI > 0) and renumber them.
ghi = test.loc[(test['DHI'] + test['DNI']) > 0].reset_index(drop=True)

# Fractional hour of each record, computed on whole columns instead of calling
# time_minute() once per row through ghi.iloc[i]. Same rule as time_minute():
# a non-zero minute adds half an hour.
ghi['time_float'] = ghi['Hour'] + np.where(ghi['Minute'] == 0, 0.0, 0.5)

In [22]:
# Inspect the rows kept in ghi, including the new time_float column.
ghi

Unnamed: 0,Day,Hour,Minute,DHI,DNI,WS,RH,T,TARGET,file,date,time_float
0,0,7,30,8,0,4.2,32.62,2.7,0.750761,0,0,7.5
1,0,8,0,35,2,4.3,31.98,3.9,3.284535,0,0,8.0
2,0,8,30,64,12,4.4,29.00,5.3,6.193599,0,0,8.5
3,0,9,0,90,21,4.4,30.59,6.7,8.914892,0,0,9.0
4,0,9,30,128,137,4.3,28.19,7.9,15.952754,0,0,9.5
...,...,...,...,...,...,...,...,...,...,...,...,...
14062,6,17,30,130,244,1.7,32.01,20.6,19.891246,80,566,17.5
14063,6,18,0,91,181,1.4,41.65,19.4,12.666762,80,566,18.0
14064,6,18,30,54,126,1.2,45.74,17.9,6.849547,80,566,18.5
14065,6,19,0,21,51,1.0,53.81,16.4,2.251943,80,566,19.0


### test_by_date 만들기

In [25]:
# One row per date: earliest and latest time with positive irradiance.
# Both values come from a single groupby, so they are matched on the 'date' key
# itself rather than on the positional index happening to equal the date number
# (which silently misaligns as soon as one date has no row in ghi).
test_by_date = (
    ghi.groupby('date')['time_float']
    .agg(['min', 'max'])
    .rename(columns={'min': 'sunrise', 'max': 'sunset'})
    .reset_index()
)
# Hours between the first and last record with positive irradiance.
test_by_date['sunny'] = test_by_date['sunset'] - test_by_date['sunrise']

In [32]:
# Daily aggregates over all half-hourly rows of `test`.
# Each aggregate is looked up by the row's 'date' value (Series.map), so the result
# stays correct even when test_by_date's positional index does not coincide with
# the date numbers. The column insertion order is the same as before.
daily = test.groupby('date')
for col in ['T', 'DHI', 'DNI', 'WS', 'RH', 'TARGET']:
    test_by_date[col + '_mean'] = test_by_date['date'].map(daily[col].mean())

# Daily temperature range (max - min).
test_by_date['temp_diff'] = test_by_date['date'].map(daily['T'].max() - daily['T'].min())

In [36]:
# Preview the per-date table built so far.
test_by_date.head()

Unnamed: 0,date,sunrise,sunset,sunny,T_mean,DHI_mean,DNI_mean,WS_mean,RH_mean,TARGET_mean,temp_diff
0,0,7.5,16.5,9.0,4.652083,45.3125,84.395833,2.55625,35.827292,6.703528,11.8
1,1,7.5,16.5,9.0,5.45,40.395833,78.166667,1.454167,39.045208,6.21477,9.3
2,2,7.5,16.5,9.0,3.414583,38.083333,121.583333,1.185417,44.850625,7.370306,9.8
3,3,7.5,16.5,9.0,0.5125,22.229167,156.229167,1.616667,56.891042,7.507346,12.9
4,4,7.5,16.5,9.0,-2.258333,16.125,317.333333,1.839583,49.736667,11.253597,10.1


In [38]:
# GHI -----------------------------------------------------------------------------
# Stored as the plain element-wise sum of the DHI and DNI columns.
test['GHI'] = test['DHI'].add(test['DNI'])

# dew -----------------------------------------------------------------------------
def dewpoint(rh, t):
    """Estimate the dew point from relative humidity and temperature.

    The expression has the form of the Magnus approximation with the
    coefficients 17.62 and 243.12 (presumably for temperatures in degrees
    Celsius -- TODO confirm against the data description).

    Parameters
    ----------
    rh : float
        Relative humidity in percent. Must be greater than 0 because its
        logarithm is taken.
    t : float
        Temperature, in the unit the coefficients assume.

    Returns
    -------
    float
        Dew point, in the same unit as ``t``.

    Raises
    ------
    ValueError
        Raised by ``math.log`` when ``rh`` is zero or negative.
    """
    a, b = 17.62, 243.12
    gamma = math.log(rh / 100) + a * t / (b + t)
    return (b * gamma) / (a - gamma)

# Pair the RH and T values positionally instead of fetching each one with
# test['RH'][i]: that form is a label lookup driven by a positional counter, so it
# is slow and only works while the index happens to be exactly 0..N-1.
test['dew'] = [dewpoint(rh, t) for rh, t in zip(test['RH'], test['T'])]

# cloudy -----------------------------------------------------------------------------
# Ratio of DHI to DNI; the 0.1 added to the denominator keeps it non-zero when DNI is 0.
test['cloudy'] = test['DHI']/(test['DNI']+0.1)

In [40]:
# Daily means of the derived columns, looked up by each row's 'date' value so the
# assignment does not depend on test_by_date's positional index matching the dates.
for col in ['GHI', 'dew', 'cloudy']:
    test_by_date[col + '_mean'] = test_by_date['date'].map(test.groupby('date')[col].mean())

In [44]:
# Show the per-date table with the GHI / dew / cloudy means added.
test_by_date

Unnamed: 0,date,sunrise,sunset,sunny,T_mean,DHI_mean,DNI_mean,WS_mean,RH_mean,TARGET_mean,temp_diff,GHI_mean,dew_mean,cloudy_mean
0,0,7.5,16.5,9.0,4.652083,45.312500,84.395833,2.556250,35.827292,6.703528,11.8,129.708333,-9.413587,2.421375
1,1,7.5,16.5,9.0,5.450000,40.395833,78.166667,1.454167,39.045208,6.214770,9.3,118.562500,-7.566488,17.958062
2,2,7.5,16.5,9.0,3.414583,38.083333,121.583333,1.185417,44.850625,7.370306,9.8,159.666667,-7.617549,2.019084
3,3,7.5,16.5,9.0,0.512500,22.229167,156.229167,1.616667,56.891042,7.507346,12.9,178.458333,-7.265113,8.756155
4,4,7.5,16.5,9.0,-2.258333,16.125000,317.333333,1.839583,49.736667,11.253597,10.1,333.458333,-11.659642,0.020597
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
562,562,4.5,19.0,14.5,14.922917,107.312500,274.875000,1.414583,47.613958,25.421188,16.4,382.187500,2.560023,0.607048
563,563,4.5,19.0,14.5,15.972917,85.770833,326.041667,1.627083,51.993333,27.129276,16.8,411.812500,4.674903,0.388461
564,564,4.5,19.0,14.5,18.062500,108.479167,291.604167,1.772917,49.728333,25.689886,18.5,400.083333,5.257152,1.107088
565,565,4.5,19.0,14.5,17.520833,108.520833,229.416667,1.304167,48.447292,23.092370,15.8,337.937500,4.891991,1.945497


### month 예측

In [47]:
# Select the model inputs by name, in the same order the frame had them.
# Unlike drop(['date', 'cloudy_mean']), an explicit list keeps this cell safe to
# re-run after 'month' has been written to test_by_date, so the predicted label can
# never end up among the features.
feature_columns = [
    'sunrise', 'sunset', 'sunny',
    'T_mean', 'DHI_mean', 'DNI_mean', 'WS_mean', 'RH_mean', 'TARGET_mean',
    'temp_diff', 'GHI_mean', 'dew_mean',
]
test_X = test_by_date[feature_columns]

In [48]:
# Check the feature matrix that will be passed to the models.
test_X

Unnamed: 0,sunrise,sunset,sunny,T_mean,DHI_mean,DNI_mean,WS_mean,RH_mean,TARGET_mean,temp_diff,GHI_mean,dew_mean
0,7.5,16.5,9.0,4.652083,45.312500,84.395833,2.556250,35.827292,6.703528,11.8,129.708333,-9.413587
1,7.5,16.5,9.0,5.450000,40.395833,78.166667,1.454167,39.045208,6.214770,9.3,118.562500,-7.566488
2,7.5,16.5,9.0,3.414583,38.083333,121.583333,1.185417,44.850625,7.370306,9.8,159.666667,-7.617549
3,7.5,16.5,9.0,0.512500,22.229167,156.229167,1.616667,56.891042,7.507346,12.9,178.458333,-7.265113
4,7.5,16.5,9.0,-2.258333,16.125000,317.333333,1.839583,49.736667,11.253597,10.1,333.458333,-11.659642
...,...,...,...,...,...,...,...,...,...,...,...,...
562,4.5,19.0,14.5,14.922917,107.312500,274.875000,1.414583,47.613958,25.421188,16.4,382.187500,2.560023
563,4.5,19.0,14.5,15.972917,85.770833,326.041667,1.627083,51.993333,27.129276,16.8,411.812500,4.674903
564,4.5,19.0,14.5,18.062500,108.479167,291.604167,1.772917,49.728333,25.689886,18.5,400.083333,5.257152
565,4.5,19.0,14.5,17.520833,108.520833,229.416667,1.304167,48.447292,23.092370,15.8,337.937500,4.891991


In [49]:
# Load a previously saved model (XGBoost, judging by the file name) and predict one
# label per row of test_X.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code --
# only load model files from a trusted source.
model = joblib.load('./model/month_xgb_200121.pkl')
predict_values = model.predict(test_X)

In [50]:
# Store the predictions as the 'month' column (one value per date row).
test_by_date['month'] = predict_values

In [51]:
# Distinct labels that were predicted.
test_by_date['month'].unique()

array([10,  0,  2,  1,  6,  7,  3,  5,  8,  4,  9], dtype=int64)

In [52]:
# Number of dates assigned to each predicted label.
test_by_date['month'].value_counts()

6     78
2     67
0     65
10    60
3     56
5     55
4     50
8     37
9     36
1     35
7     28
Name: month, dtype: int64

In [53]:
# First 50 dates together with their predicted label.
test_by_date[:50]

Unnamed: 0,date,sunrise,sunset,sunny,T_mean,DHI_mean,DNI_mean,WS_mean,RH_mean,TARGET_mean,temp_diff,GHI_mean,dew_mean,cloudy_mean,month
0,0,7.5,16.5,9.0,4.652083,45.3125,84.395833,2.55625,35.827292,6.703528,11.8,129.708333,-9.413587,2.421375,10
1,1,7.5,16.5,9.0,5.45,40.395833,78.166667,1.454167,39.045208,6.21477,9.3,118.5625,-7.566488,17.958062,10
2,2,7.5,16.5,9.0,3.414583,38.083333,121.583333,1.185417,44.850625,7.370306,9.8,159.666667,-7.617549,2.019084,10
3,3,7.5,16.5,9.0,0.5125,22.229167,156.229167,1.616667,56.891042,7.507346,12.9,178.458333,-7.265113,8.756155,0
4,4,7.5,16.5,9.0,-2.258333,16.125,317.333333,1.839583,49.736667,11.253597,10.1,333.458333,-11.659642,0.020597,10
5,5,7.5,16.5,9.0,-1.810417,27.625,193.1875,1.202083,64.367917,8.706162,9.2,220.8125,-7.891506,0.177984,0
6,6,7.5,16.5,9.0,-5.4875,32.0,8.333333,4.258333,65.420833,3.280946,5.6,40.333333,-11.034327,14.563181,0
7,7,7.5,16.5,9.0,-7.3875,40.604167,101.541667,3.029167,50.174167,7.160091,8.6,142.145833,-16.674219,9.992749,0
8,8,7.5,16.5,9.0,-9.289583,41.604167,73.25,1.866667,56.062292,6.401682,6.1,114.854167,-16.556945,5.348181,0
9,9,7.5,16.5,9.0,-6.152083,14.520833,325.5,2.633333,46.826458,11.353767,14.0,340.020833,-16.439323,0.018161,10


In [None]:
# Class labels predicted by the model: 0 1 2 3 4 5 6 7 8 9 10

In [None]:
# Presumably the month numbers those labels stand for (label + 1): 1 2 3 4 5 6 7 8 9 10 11

In [54]:
# Load a second saved model (LightGBM, judging by the file name) and predict on the
# same feature matrix; `model` and `predict_values` are rebound here.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code --
# only load model files from a trusted source.
model = joblib.load('./model/month_lgb_200121.pkl')
predict_values = model.predict(test_X)

In [56]:
# Write this model's predictions into 'month'; any values already in that column are overwritten.
test_by_date['month'] = predict_values

In [57]:
# Distinct labels predicted by this model.
test_by_date['month'].unique()

array([10,  0,  1,  2,  6,  7,  3,  5,  8,  4,  9], dtype=int64)

In [58]:
# Number of dates assigned to each label by this model.
test_by_date['month'].value_counts()

6     76
0     73
2     62
5     56
3     55
10    54
4     49
8     39
1     37
9     35
7     31
Name: month, dtype: int64