In [280]:
import numpy as np
import pandas as pd
import pickle
import joblib
import math
from tqdm import tqdm

import xgboost as xgb
import lightgbm as lgbm
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

pd.set_option('display.max_rows', 1000)

In [2]:
train = pd.read_csv('./data/train/train.csv')
print(train.shape)
train.head(1)

(52560, 9)


Unnamed: 0,Day,Hour,Minute,DHI,DNI,WS,RH,T,TARGET
0,0,0,0,0,0,1.5,69.08,-12,0.0


### sunrise sunset sunny 변수 추가	

In [3]:
def time_minute(data):
    hour = data['Hour']
    if data['Minute'] == 0:
        minute = 0
    else:
        minute = 0.5
    
    return hour + minute

In [10]:
ghi = train.iloc[np.where(train['DHI'] + train['DNI'] >0)[0]].reset_index(drop=True)
ghi['time_float'] = [time_minute(ghi.iloc[i]) for i in range(len(ghi.index))]

train_by_day = pd.DataFrame(ghi.groupby(['Day'])['time_float'].min()).reset_index()
train_by_day.rename(columns = {'time_float': 'sunrise'}, inplace=True)
train_by_day['sunset'] = pd.DataFrame(ghi.groupby(['Day'])['time_float'].max())['time_float']
train_by_day['sunny'] = train_by_day['sunset'] - train_by_day['sunrise']

In [None]:
print(train_by_day.shape)
train_by_day.head(1)

### date, month 변수 추가

In [16]:
train_by_day[train_by_day['sunny'] == 8] # 마지막 동지를 2018년 동지, 12월22일로 지정 (1월1일부터 데이터 맞음)

Unnamed: 0,Day,sunrise,sunset,sunny
356,356,8.0,16.0,8.0
721,721,8.0,16.0,8.0
1086,1086,8.0,16.0,8.0


In [17]:
train_by_day['date'] = train_by_day['Day']%365 # 365일 기준 날짜 생성

In [18]:
def day_to_month(date):
    
    if date < 31 :
        return 0
    elif date < 59 :
        return 1
    elif date < 90 :
        return 2
    elif date < 120 :
        return 3
    elif date < 151 :
        return 4
    elif date < 181 :
        return 5
    elif date < 212 :
        return 6
    elif date < 243 :
        return 7
    elif date < 273 :
        return 8
    elif date < 304 :
        return 9
    elif date < 334 :
        return 10
    else :
        return 11

In [21]:
train_by_day['month'] = [day_to_month(i) for i in train_by_day['date']]
train_by_day.head(1)

Unnamed: 0,Day,sunrise,sunset,sunny,date,month
0,0,8.0,16.5,8.5,0,0


### 기타 변수 추가 (day 기준으로 변환)

In [40]:
train_by_day['temp_diff'] = pd.DataFrame(pd.DataFrame(train.groupby(['Day'])['T'].max()) - pd.DataFrame(train.groupby(['Day'])['T'].min()))['T']
train_by_day['T_mean'] = pd.DataFrame(train.groupby(['Day'])['T'].mean())['T']
train_by_day['DHI_mean'] = pd.DataFrame(train.groupby(['Day'])['DHI'].mean())['DHI']
train_by_day['DNI_mean'] = pd.DataFrame(train.groupby(['Day'])['DNI'].mean())['DNI']
train_by_day['WS_mean'] = pd.DataFrame(train.groupby(['Day'])['WS'].mean())['WS']
train_by_day['RH_mean'] = pd.DataFrame(train.groupby(['Day'])['RH'].mean())['RH']
train_by_day['TARGET_mean'] = pd.DataFrame(train.groupby(['Day'])['TARGET'].mean())['TARGET']

In [41]:
train_by_day

Unnamed: 0,Day,sunrise,sunset,sunny,date,month,T_mean,DHI_mean,DNI_mean,WS_mean,RH_mean,TARGET_mean,temp_diff
0,0,8.0,16.5,8.5,0,0,-7.979167,44.937500,78.708333,1.929167,70.329375,6.520751,9
1,1,8.0,16.5,8.5,1,0,-6.312500,18.604167,295.187500,1.718750,74.231250,11.025184,14
2,2,8.0,16.5,8.5,2,0,-6.479167,28.937500,24.645833,2.470833,76.275000,3.165478,13
3,3,8.0,16.5,8.5,3,0,-5.687500,39.312500,66.979167,2.279167,65.695208,5.810807,15
4,4,8.0,16.5,8.5,4,0,0.854167,44.125000,22.500000,3.995833,73.361042,4.817273,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1090,1090,8.0,16.5,8.5,360,11,-0.895833,18.729167,306.437500,2.810417,57.857500,11.261230,14
1091,1091,8.0,16.5,8.5,361,11,0.854167,29.833333,206.583333,1.970833,55.269792,9.349126,8
1092,1092,8.0,16.5,8.5,362,11,2.187500,34.562500,171.291667,3.368750,49.227292,8.875729,11
1093,1093,8.0,16.5,8.5,363,11,1.687500,16.708333,309.812500,3.252083,49.218333,11.114171,14


### GHI, dew, cloudy 추가

In [26]:
train[:3]

Unnamed: 0,Day,Hour,Minute,DHI,DNI,WS,RH,T,TARGET
0,0,0,0,0,0,1.5,69.08,-12,0.0
1,0,0,30,0,0,1.5,69.06,-12,0.0
2,0,1,0,0,0,1.6,71.78,-12,0.0


In [84]:
# GHI -----------------------------------------------------------------------------
train['GHI'] = train['DHI'] + train['DNI']

# dew -----------------------------------------------------------------------------
def dewpoint(rh, t) :
    gamma = math.log(rh/100) + 17.62*t/(243.12+t)
    dew = (243.12*gamma)/(17.62-gamma)
    return dew

train['dew'] = [dewpoint(train['RH'][i], train['T'][i]) for i in range(len(train))]

# cloudy -----------------------------------------------------------------------------
train['cloudy'] = train['DHI']/(train['DNI']+0.1)

In [85]:
train[:3]

Unnamed: 0,Day,Hour,Minute,DHI,DNI,WS,RH,T,TARGET,GHI,dew,cloudy
0,0,0,0,0,0,1.5,69.08,-12,0.0,0,-16.522271,0.0
1,0,0,30,0,0,1.5,69.06,-12,0.0,0,-16.525742,0.0
2,0,1,0,0,0,1.6,71.78,-12,0.0,0,-16.061776,0.0


In [86]:
train_by_day['GHI_mean'] = pd.DataFrame(train.groupby(['Day'])['GHI'].mean())['GHI']
train_by_day['dew_mean'] = pd.DataFrame(train.groupby(['Day'])['dew'].mean())['dew']
train_by_day['cloudy_mean'] = pd.DataFrame(train.groupby(['Day'])['cloudy'].mean())['cloudy']

In [87]:
train_by_day

Unnamed: 0,Day,sunrise,sunset,sunny,date,month,T_mean,DHI_mean,DNI_mean,WS_mean,RH_mean,TARGET_mean,temp_diff,GHI_mean,dew_mean,cloudy_mean
0,0,8.0,16.5,8.5,0,0,-7.979167,44.937500,78.708333,1.929167,70.329375,6.520751,9,123.645833,-12.552381,16.837523
1,1,8.0,16.5,8.5,1,0,-6.312500,18.604167,295.187500,1.718750,74.231250,11.025184,14,313.791667,-10.362553,4.812806
2,2,8.0,16.5,8.5,2,0,-6.479167,28.937500,24.645833,2.470833,76.275000,3.165478,13,53.583333,-10.028902,145.157079
3,3,8.0,16.5,8.5,3,0,-5.687500,39.312500,66.979167,2.279167,65.695208,5.810807,15,106.291667,-11.207274,51.440077
4,4,8.0,16.5,8.5,4,0,0.854167,44.125000,22.500000,3.995833,73.361042,4.817273,11,66.625000,-3.405437,99.004255
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1090,1090,8.0,16.5,8.5,360,11,-0.895833,18.729167,306.437500,2.810417,57.857500,11.261230,14,325.166667,-8.348487,0.022263
1091,1091,8.0,16.5,8.5,361,11,0.854167,29.833333,206.583333,1.970833,55.269792,9.349126,8,236.416667,-7.286454,3.383069
1092,1092,8.0,16.5,8.5,362,11,2.187500,34.562500,171.291667,3.368750,49.227292,8.875729,11,205.854167,-7.465153,3.202771
1093,1093,8.0,16.5,8.5,363,11,1.687500,16.708333,309.812500,3.252083,49.218333,11.114171,14,326.520833,-8.257195,0.019912


### month 예측 모델 생성

In [264]:
X = train_by_day.drop(['Day', 'date', 'month', 'cloudy_mean'], axis=1)
y = train_by_day['month']

SEED = 777
TEST_RATIO = 0.2

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=TEST_RATIO, random_state=SEED)

In [265]:
len(X_train), len(X_val)

(876, 219)

#### XGB

In [266]:
model = xgb.XGBClassifier(use_label_encoder=False, objective='multi:softmax', eval_metric='mlogloss', n_estimators=1000, random_state=SEED)
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=200)

Parameters: { use_label_encoder } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-mlogloss:1.63318
Will train until validation_0-mlogloss hasn't improved in 200 rounds.
[1]	validation_0-mlogloss:1.29301
[2]	validation_0-mlogloss:1.07842
[3]	validation_0-mlogloss:0.93635
[4]	validation_0-mlogloss:0.82380
[5]	validation_0-mlogloss:0.74052
[6]	validation_0-mlogloss:0.67853
[7]	validation_0-mlogloss:0.63132
[8]	validation_0-mlogloss:0.59391
[9]	validation_0-mlogloss:0.56118
[10]	validation_0-mlogloss:0.54015
[11]	validation_0-mlogloss:0.52212
[12]	validation_0-mlogloss:0.50940
[13]	validation_0-mlogloss:0.49943
[14]	validation_0-mlogloss:0.49069
[15]	validation_0-mlogloss:0.48179
[16]	validation_0-mlogloss:0.47265
[17]	validation_0-mlogloss:0.46566
[18]	validation_0

[218]	validation_0-mlogloss:0.50130
[219]	validation_0-mlogloss:0.50216
[220]	validation_0-mlogloss:0.50243
[221]	validation_0-mlogloss:0.50224
Stopping. Best iteration:
[21]	validation_0-mlogloss:0.45124



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='mlogloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=1000, n_jobs=0,
              num_parallel_tree=1, objective='multi:softprob', random_state=777,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [267]:
y_pred = model.predict(X_val)
y_check = np.array(y_val)

accuracy_score(y_pred, y_check)

0.8356164383561644

In [268]:
X_train.columns

Index(['sunrise', 'sunset', 'sunny', 'T_mean', 'DHI_mean', 'DNI_mean',
       'WS_mean', 'RH_mean', 'TARGET_mean', 'temp_diff', 'GHI_mean',
       'dew_mean'],
      dtype='object')

In [269]:
model.feature_importances_

array([0.26369366, 0.18907659, 0.36609125, 0.05706494, 0.0123616 ,
       0.01040507, 0.00917696, 0.01144751, 0.01923084, 0.00786069,
       0.00844096, 0.04514994], dtype=float32)

In [272]:
# 모델 저장
joblib.dump(model, './model/month_xgb_200121.pkl')

['./model/month_xgb_200121.pkl']

In [273]:
for i in range(len(y_pred)):
    
    if y_check[i] != y_pred[i]: 
        print(y_check[i]-y_pred[i], ': ', y_check[i]+1, y_pred[i]+1)

-1 :  6 7
-1 :  6 7
1 :  3 2
-1 :  8 9
-1 :  6 7
-1 :  1 2
-1 :  8 9
-1 :  11 12
-11 :  1 12
1 :  7 6
-1 :  6 7
-1 :  4 5
-5 :  4 9
-1 :  5 6
10 :  11 1
-1 :  6 7
1 :  7 6
-1 :  4 5
1 :  5 4
-1 :  6 7
1 :  3 2
-1 :  6 7
-1 :  10 11
4 :  8 4
-1 :  7 8
-1 :  2 3
-11 :  1 12
1 :  7 6
-1 :  10 11
-1 :  7 8
1 :  7 6
-1 :  11 12
1 :  5 4
-1 :  10 11
1 :  8 7
1 :  7 6


#### LGBM

In [274]:
model = lgbm.LGBMClassifier(objective='multi:softmax', eval_metric='mlogloss', n_estimators=1000, random_state=SEED)
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=200)

[1]	valid_0's multi_logloss: 2.14974
Training until validation scores don't improve for 200 rounds.
[2]	valid_0's multi_logloss: 1.9225
[3]	valid_0's multi_logloss: 1.74682
[4]	valid_0's multi_logloss: 1.60458
[5]	valid_0's multi_logloss: 1.48654
[6]	valid_0's multi_logloss: 1.38166
[7]	valid_0's multi_logloss: 1.29442
[8]	valid_0's multi_logloss: 1.2205
[9]	valid_0's multi_logloss: 1.15549
[10]	valid_0's multi_logloss: 1.09604
[11]	valid_0's multi_logloss: 1.04584
[12]	valid_0's multi_logloss: 0.997121
[13]	valid_0's multi_logloss: 0.954978
[14]	valid_0's multi_logloss: 0.91612
[15]	valid_0's multi_logloss: 0.879326
[16]	valid_0's multi_logloss: 0.849159
[17]	valid_0's multi_logloss: 0.819984
[18]	valid_0's multi_logloss: 0.792509
[19]	valid_0's multi_logloss: 0.767484
[20]	valid_0's multi_logloss: 0.742779
[21]	valid_0's multi_logloss: 0.723081
[22]	valid_0's multi_logloss: 0.701311
[23]	valid_0's multi_logloss: 0.684113
[24]	valid_0's multi_logloss: 0.665689
[25]	valid_0's multi_log

[220]	valid_0's multi_logloss: 0.729443
[221]	valid_0's multi_logloss: 0.733273
[222]	valid_0's multi_logloss: 0.735685
[223]	valid_0's multi_logloss: 0.737157
[224]	valid_0's multi_logloss: 0.73941
[225]	valid_0's multi_logloss: 0.741179
[226]	valid_0's multi_logloss: 0.743417
[227]	valid_0's multi_logloss: 0.745126
[228]	valid_0's multi_logloss: 0.746481
[229]	valid_0's multi_logloss: 0.748937
[230]	valid_0's multi_logloss: 0.752931
[231]	valid_0's multi_logloss: 0.754006
[232]	valid_0's multi_logloss: 0.754628
[233]	valid_0's multi_logloss: 0.755187
[234]	valid_0's multi_logloss: 0.755712
[235]	valid_0's multi_logloss: 0.758896
[236]	valid_0's multi_logloss: 0.76077
[237]	valid_0's multi_logloss: 0.76396
[238]	valid_0's multi_logloss: 0.765535
[239]	valid_0's multi_logloss: 0.766126
[240]	valid_0's multi_logloss: 0.767003
[241]	valid_0's multi_logloss: 0.769105
[242]	valid_0's multi_logloss: 0.77259
[243]	valid_0's multi_logloss: 0.774053
[244]	valid_0's multi_logloss: 0.775302
[245

LGBMClassifier(eval_metric='mlogloss', n_estimators=1000,
               objective='multi:softmax', random_state=777)

In [275]:
y_pred = model.predict(X_val)
y_check = np.array(y_val)

accuracy_score(y_pred, y_check)

0.8356164383561644

In [277]:
joblib.dump(model, './model/month_lgb_200121.pkl')

['./model/month_lgb_200121.pkl']

In [261]:
for i in range(len(y_pred)):
    
    if y_check[i] != y_pred[i]: 
        print(y_check[i]-y_pred[i], ': ', y_check[i]+1, y_pred[i]+1)

-1 :  6 7
1 :  8 7
-1 :  8 9
-1 :  2 3
-1 :  6 7
-1 :  1 2
-1 :  2 3
-1 :  11 12
-1 :  7 8
10 :  11 1
-11 :  1 12
-1 :  6 7
1 :  2 1
-1 :  4 5
1 :  11 10
-11 :  1 12
10 :  11 1
-1 :  6 7
1 :  7 6
-1 :  6 7
-1 :  10 11
-1 :  8 9
-1 :  8 9
-1 :  7 8
-11 :  1 12
1 :  7 6
-1 :  10 11
-1 :  7 8
1 :  7 6
-11 :  1 12
1 :  4 3
1 :  8 7
-1 :  10 11
-1 :  8 9
-1 :  2 3
1 :  7 6


In [262]:
model.feature_importances_

array([1072,  842, 1538, 4083, 2643, 1974, 2575, 2381, 3268,  685, 1765,
       3219])

In [263]:
X_train.columns

Index(['sunrise', 'sunset', 'sunny', 'T_mean', 'DHI_mean', 'DNI_mean',
       'WS_mean', 'RH_mean', 'TARGET_mean', 'temp_diff', 'GHI_mean',
       'dew_mean'],
      dtype='object')

#### RF

In [287]:
model = RandomForestClassifier(n_estimators=1000, random_state=SEED)
model.fit(X_train, y_train)

RandomForestClassifier(n_estimators=1000, random_state=777)

In [288]:
y_pred = model.predict(X_val)
y_check = np.array(y_val)

accuracy_score(y_pred, y_check)

0.821917808219178

In [289]:
for i in range(len(y_pred)):
    
    if y_check[i] != y_pred[i]: 
        print(y_check[i]-y_pred[i], ': ', y_check[i]+1, y_pred[i]+1)

-1 :  6 7
-1 :  6 7
1 :  8 7
-1 :  8 9
-1 :  6 7
-1 :  1 2
-1 :  8 9
-1 :  11 12
-1 :  7 8
-11 :  1 12
-1 :  6 7
1 :  2 1
-1 :  4 5
11 :  12 1
-5 :  4 9
-1 :  5 6
1 :  11 10
10 :  11 1
-1 :  6 7
1 :  7 6
-1 :  4 5
-1 :  6 7
1 :  7 6
-1 :  10 11
-1 :  8 9
-1 :  6 7
-1 :  8 9
-1 :  7 8
-1 :  5 6
-11 :  1 12
1 :  7 6
1 :  7 6
-1 :  10 11
-1 :  7 8
1 :  7 6
1 :  8 7
-1 :  10 11
-1 :  8 9
1 :  7 6


In [290]:
model.feature_importances_

array([0.1230732 , 0.136161  , 0.14551505, 0.1426119 , 0.06782852,
       0.04187108, 0.04181205, 0.05357496, 0.07367155, 0.02918218,
       0.0453983 , 0.0993002 ])