In [98]:
import pandas as pd
import sklearn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

* Hour - 시간
* Minute - 분
* DHI - 수평면 산란일사량(Diffuse Horizontal Irradiance (W/m2))
* DNI - 직달일사량(Direct Normal Irradiance (W/m2))
* WS - 풍속(Wind Speed (m/s))
* RH - 상대습도(Relative Humidity (%))
* T - 기온(Temperature (Degree C))
* Target - 태양광 발전량 (kW)

In [99]:
sun_df = pd.read_csv('/content/drive/MyDrive/files/DACON - 태양열 발전양 예측/train/train.csv')
sun_df.sample(15)

Unnamed: 0,Day,Hour,Minute,DHI,DNI,WS,RH,T,TARGET
34030,708,23,0,0,0,2.2,78.93,1,0.0
42235,879,21,30,0,0,1.2,73.83,10,0.0
36623,762,23,30,0,0,1.0,77.16,-3,0.0
16058,334,13,0,123,625,5.5,40.52,0,37.539151
24809,516,20,30,0,0,3.1,90.4,12,0.0
48314,1006,13,0,78,940,5.9,36.76,23,67.27192
19427,404,17,30,0,0,1.8,70.05,0,0.0
13913,289,20,30,0,0,2.9,57.9,11,0.0
45097,939,12,30,349,24,2.2,49.31,24,34.808379
46147,961,9,30,93,908,3.5,18.34,27,70.365023


In [100]:
submission_df = pd.read_csv('/content/drive/MyDrive/files/DACON - 태양열 발전양 예측/sample_submission.csv')
submission_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7776 entries, 0 to 7775
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      7776 non-null   object 
 1   q_0.1   7776 non-null   float64
 2   q_0.2   7776 non-null   float64
 3   q_0.3   7776 non-null   float64
 4   q_0.4   7776 non-null   float64
 5   q_0.5   7776 non-null   float64
 6   q_0.6   7776 non-null   float64
 7   q_0.7   7776 non-null   float64
 8   q_0.8   7776 non-null   float64
 9   q_0.9   7776 non-null   float64
dtypes: float64(9), object(1)
memory usage: 607.6+ KB


In [101]:
sun_df = sun_df.drop(['WS','RH','T'], axis=1,inplace=False)

In [102]:
sun_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52560 entries, 0 to 52559
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Day     52560 non-null  int64  
 1   Hour    52560 non-null  int64  
 2   Minute  52560 non-null  int64  
 3   DHI     52560 non-null  int64  
 4   DNI     52560 non-null  int64  
 5   TARGET  52560 non-null  float64
dtypes: float64(1), int64(5)
memory usage: 2.4 MB


In [103]:
sun_df['TARGET'].value_counts()

0.000000     26660
1.501566        12
1.689020        11
1.783110        11
1.970805         8
             ...  
45.875977        1
38.568086        1
87.068817        1
60.049440        1
41.198313        1
Name: TARGET, Length: 16931, dtype: int64

In [104]:
sun_data = sun_df.drop(['TARGET'], axis=1, inplace=False)
sun_target = sun_df['TARGET']

X_train, X_train_test, y_train, y_train_test = train_test_split(sun_data, sun_target, test_size=0.25, random_state=0)

In [105]:
print(X_train.shape, y_train.shape)

(39420, 5) (39420,)


In [108]:
from lightgbm import LGBMRegressor
from sklearn.metrics import accuracy_score, mean_squared_error

lgbm = LGBMRegressor(n_estimators = 5000, num_leaves=32)
evals = [(X_train_test, y_train_test)]
lgbm.fit(X_train, y_train, early_stopping_rounds=100, eval_metric="logloss", eval_set=evals, verbose=False)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=5000, n_jobs=-1, num_leaves=32, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [113]:
preds = lgbm.predict(X_train_test)

In [109]:
sun_test = []

for i in range(81):
  path = '/content/drive/MyDrive/files/DACON - 태양열 발전양 예측/test/' + str(i) + '.csv'
  read_file = pd.read_csv(path)
  sun_test.append(read_file)

sun_test_df = pd.concat(sun_test)
sun_test_df.shape

(27216, 9)

In [110]:
sun_test_df.reset_index()

Unnamed: 0,index,Day,Hour,Minute,DHI,DNI,WS,RH,T,TARGET
0,0,0,0,0,0,0,2.7,34.42,0.0,0.0
1,1,0,0,30,0,0,2.7,34.17,0.1,0.0
2,2,0,1,0,0,0,2.7,34.23,0.2,0.0
3,3,0,1,30,0,0,2.7,33.99,0.3,0.0
4,4,0,2,0,0,0,2.8,33.97,0.4,0.0
...,...,...,...,...,...,...,...,...,...,...
27211,331,6,21,30,0,0,0.8,63.35,13.7,0.0
27212,332,6,22,0,0,0,0.7,64.82,13.1,0.0
27213,333,6,22,30,0,0,0.7,66.10,12.8,0.0
27214,334,6,23,0,0,0,0.6,67.64,12.4,0.0


In [111]:
sun_test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27216 entries, 0 to 335
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Day     27216 non-null  int64  
 1   Hour    27216 non-null  int64  
 2   Minute  27216 non-null  int64  
 3   DHI     27216 non-null  int64  
 4   DNI     27216 non-null  int64  
 5   WS      27216 non-null  float64
 6   RH      27216 non-null  float64
 7   T       27216 non-null  float64
 8   TARGET  27216 non-null  float64
dtypes: float64(4), int64(5)
memory usage: 2.1 MB


In [129]:
quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

def LGBM(q, X_train, y_train, X_train_test, y_train_test, X_test):
    
    # (a) Modeling
    model = LGBMRegressor(objective='quantile', alpha=q,
                         n_estimators=10000, bagging_fraction=0.7, learning_rate=0.027, subsample=0.7)                   
                         
                         
    model.fit(X_train, y_train, eval_metric = ['quantile'], 
          eval_set=[(X_train_test, y_train_test)], early_stopping_rounds=300, verbose=500)

    # (b) Predictions
    pred = pd.Series(model.predict(X_test).round(2))
    return pred, model

In [130]:
def get_preds(X_train, X_train_test, y_train, y_train_test, X_test):
  LGBM_models = []
  LGBM_pred_df = pd.DataFrame()

  for q in quantiles:
    print(q)
    pred, model = LGBM(q, X_train, y_train, X_train_test, y_train_test, X_test)
    LGBM_models.append(model)
    LGBM_pred_df = pd.concat([LGBM_pred_df, pred], axis=1)

  LGBM_pred_df.columns = quantiles

  return LGBM_models, LGBM_pred_df

In [None]:
import warnings
warnings.filterwarnings("ignore")

model, result = get_preds(X_train, X_train_test, y_train, y_train_test, sun_test_df)

In [133]:
lgbm = LGBMRegressor(objective='quantile', alpha = 0.2, n_estimators=10000, learning_rate=0.027, subsample=0.7)
lgbm.fit(X_train, y_train, eval_metric=['quantile'], eval_set=[(X_train_test, y_train_test)], early_stopping_rounds=300, verbose=500)

Training until validation scores don't improve for 300 rounds.
[500]	valid_0's quantile: 0.185625
[1000]	valid_0's quantile: 0.139699
[1500]	valid_0's quantile: 0.12687
[2000]	valid_0's quantile: 0.122589
[2500]	valid_0's quantile: 0.119009
[3000]	valid_0's quantile: 0.11707
[3500]	valid_0's quantile: 0.114837
[4000]	valid_0's quantile: 0.113592
[4500]	valid_0's quantile: 0.112662
[5000]	valid_0's quantile: 0.111973
[5500]	valid_0's quantile: 0.111389
[6000]	valid_0's quantile: 0.110191
[6500]	valid_0's quantile: 0.109031
[7000]	valid_0's quantile: 0.108655
[7500]	valid_0's quantile: 0.108095
[8000]	valid_0's quantile: 0.107551
[8500]	valid_0's quantile: 0.107224
[9000]	valid_0's quantile: 0.106943
[9500]	valid_0's quantile: 0.106843
[10000]	valid_0's quantile: 0.106629
Did not meet early stopping. Best iteration is:
[9994]	valid_0's quantile: 0.106628


LGBMRegressor(alpha=0.2, boosting_type='gbdt', class_weight=None,
              colsample_bytree=1.0, importance_type='split',
              learning_rate=0.027, max_depth=-1, min_child_samples=20,
              min_child_weight=0.001, min_split_gain=0.0, n_estimators=10000,
              n_jobs=-1, num_leaves=31, objective='quantile', random_state=None,
              reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=0.7,
              subsample_for_bin=200000, subsample_freq=0)