In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import warnings

# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas / Keras that
# may point at real incompatibilities — consider narrowing the filter.
warnings.filterwarnings("ignore")

# 데이터 불러오기

In [None]:
# Hard-coded absolute directory that holds the competition CSV files.
PATHS = '/content/drive/MyDrive/DataAnalysis/전력사용량 예측 AI 경진대회/Data/'

# All three files are read with the cp949 encoding.
train = pd.read_csv(PATHS + 'train.csv', encoding='cp949')

# Two test headers are renamed to the shorter '강수량(mm)' / '일조(hr)' names
# that the feature code uses on the combined train+test frame.
test = (
    pd.read_csv(PATHS + 'test.csv', encoding='cp949')
    .rename(columns={
        '강수량(mm, 6시간)': '강수량(mm)',
        '일조(hr, 3시간)': '일조(hr)',
    })
)

# Submission template; the prediction cells overwrite its column at position 1.
submission = pd.read_csv(PATHS + 'sample_submission.csv', encoding='cp949')

# 피쳐엔지니어링 및 데이터 생성

In [None]:
def CDH(xs):
  """Accumulate ``xs - 26`` over a trailing window of at most 12 samples.

  For each position ``i`` the result is the sum of ``xs[j] - 26`` for ``j``
  from ``max(0, i - 11)`` through ``i`` inclusive, so the first 11 entries
  use a shorter, growing window.

  Args:
    xs: 1-D NumPy array of values (slices of it are used in array
      arithmetic, so a plain Python list is not supported).

  Returns:
    NumPy array with one accumulated value per element of ``xs``.
  """
  window = 12
  base = 26
  totals = [
      np.sum(xs[max(0, end - window):end] - base)
      for end in range(1, len(xs) + 1)
  ]
  return np.array(totals)

def get_month_hour_mean(x, df):
  """Look up the precomputed value for a row's (month, hour) pair.

  Args:
    x: Row-like object exposing ``'month'`` and ``'hour'`` entries.
    df: Container indexable by a ``(month, hour)`` tuple; the callers in this
      notebook pass a Series produced by grouping on month and hour.

  Returns:
    ``df[(x['month'], x['hour'])]``.
  """
  key = (x['month'], x['hour'])
  return df[key]

def build_dataset(time_series, hour_series, seq_length):
  """Cut a 2-D series into overlapping windows with aligned targets.

  For every target row ``t`` in ``seq_length .. len(time_series) - 1``:
    * the input window is rows ``t - seq_length .. t`` (inclusive, so
      ``seq_length + 1`` rows) of every column except the last one;
    * the target is the last column of row ``t``;
    * the auxiliary label is ``hour_series[t]``.

  Args:
    time_series: 2-D NumPy array whose last column is the target.
    hour_series: Sequence with one auxiliary label per row of ``time_series``.
    seq_length: Number of rows preceding the target row in each window.

  Returns:
    Tuple ``(X, y, labels)`` of NumPy arrays built from the collected windows.
  """
  target_rows = range(seq_length, len(time_series))

  windows = [time_series[t - seq_length:t + 1, :-1] for t in target_rows]
  targets = [time_series[t, -1] for t in target_rows]
  labels = [hour_series[t] for t in target_rows]

  return np.array(windows), np.array(targets), np.array(labels)

def get_data(train_df, test_df, num):
  """Build sliding-window model inputs for one building.

  The rows of ``train_df`` and ``test_df`` whose ``'num'`` equals ``num`` are
  concatenated (train first), derived features are added, the 14 feature
  columns are min-max scaled over the combined frame, and the result is cut
  into 72-step windows with ``build_dataset``. The last ``24 * 7`` windows are
  returned as the test inputs; everything before them is training data.

  Args:
    train_df: Training frame with at least ``'num'``, ``'date_time'``, the
      weather columns used below and ``'전력사용량(kWh)'``.
    test_df: Test frame with ``'num'``, ``'date_time'`` and the same weather
      column names as ``train_df``.
    num: Building identifier to select.

  Returns:
    Tuple ``(X_train, y_train, Hour_train, X_test)`` where ``X_*`` have shape
    ``(n_windows, 72, 14)``, ``y_train`` holds the target of each training
    window and ``Hour_train`` the per-window auxiliary label (see the note on
    ``_hour`` below).
  """
  seq_length = 71      # each window spans seq_length + 1 = 72 rows
  test_size = 24 * 7   # number of trailing windows returned as X_test

  _train_df = train_df[train_df['num'] == num].reset_index(drop=True)
  _train_df['date_time'] = pd.to_datetime(_train_df['date_time'])

  _test_df = test_df[test_df['num'] == num].reset_index(drop=True)
  # Parse this building's own timestamps. The previous code parsed the
  # unfiltered ``test_df`` column, which index-aligned to the first rows of the
  # whole test file rather than to the rows selected for ``num``.
  _test_df['date_time'] = pd.to_datetime(_test_df['date_time'])
  _test_df = _test_df.interpolate(method='linear')

  df = pd.concat([_train_df, _test_df]).reset_index(drop=True)

  temp = df['기온(°C)']
  humidity = df['습도(%)']

  # Discomfort index (DI): 9/5*Ta - 0.55*(1-RH)*(9/5*Ta-26) + 32
  df['DI'] = 9/5*temp - 0.55*(1-humidity/100)*(9/5*temp-26)+32

  # Calendar features taken from the timestamp.
  df['hour'] = df['date_time'].dt.hour
  df['weekday'] = df['date_time'].dt.weekday
  df['month'] = df['date_time'].dt.month

  # Cooling degree hours (CDH) over the trailing 12 rows.
  df['CDH'] = CDH(temp.values)

  # Cosine encoding of the hour of day.
  df['cos_time'] = np.cos(2*np.pi*(df['hour']/24))

  # DP: Magnus-type approximation from temperature and relative humidity.
  c = 243.12
  b = 17.62
  gamma = (b * temp / (c + temp)) + np.log(humidity / 100)
  df['DP'] = (c * gamma) / (b - gamma)

  # Mean temperature / humidity of each (month, hour) group, broadcast back to
  # every row of the group.
  month_hour_mean = (
      df.groupby(['month', 'hour'])[['기온(°C)', '습도(%)']].transform('mean')
  )
  df['month_hour_temp'] = month_hour_mean['기온(°C)']
  df['month_hour_hu'] = month_hour_mean['습도(%)']

  # NOTE(review): despite the name, this label holds the unscaled *month* of
  # each row, not the hour. It is kept as-is because callers stratify on it;
  # confirm whether month or hour was intended.
  _hour = df['month'].values.copy()

  feature_cols = ['기온(°C)', '풍속(m/s)', '습도(%)', '강수량(mm)', '일조(hr)',
                  'DI', 'hour', 'weekday', 'month', 'CDH',
                  'cos_time', 'DP', 'month_hour_temp', 'month_hour_hu']

  # Min-max scale each feature column over the combined train+test rows.
  feature_values = df[feature_cols].values
  col_min = feature_values.min(axis=0)
  col_range = feature_values.max(axis=0) - col_min
  # A constant column would divide by zero; scale it to 0 instead of NaN.
  col_range[col_range == 0] = 1
  # Replace the columns outright so the integer calendar columns become float
  # without an in-place dtype-changing assignment.
  df[feature_cols] = (df[feature_cols] - col_min) / col_range

  # The target goes last: build_dataset treats the final column as y.
  _X = df[feature_cols + ['전력사용량(kWh)']].values

  X, y, hour = build_dataset(time_series = _X, hour_series = _hour, seq_length = seq_length)

  X_train = X[:-test_size]
  y_train = y[:-test_size]
  Hour_train = hour[:-test_size]

  X_test = X[-test_size:]

  return X_train, y_train, Hour_train, X_test

# 모델 생성

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Conv1D, Flatten,MaxPooling1D,BatchNormalization, Lambda, AveragePooling1D, Dropout, Input
from keras.callbacks import EarlyStopping, ModelCheckpoint,ReduceLROnPlateau
import tensorflow.keras as keras

def smape(true, pred):
  """Symmetric mean absolute percentage error, in percent.

  Computes ``mean(2 * |pred - true| / (|pred| + |true|)) * 100``. Elements
  where both ``true`` and ``pred`` are zero contribute 0 instead of the NaN
  that a 0/0 division would produce, so a single such element no longer turns
  the whole score into NaN.

  Args:
    true: Array-like of ground-truth values.
    pred: Array-like of predictions, broadcastable against ``true``.

  Returns:
    The SMAPE as a NumPy scalar.
  """
  true = np.asarray(true)
  pred = np.asarray(pred)

  denominator = np.abs(pred) + np.abs(true)
  # The denominator is zero only when pred == true == 0, where the numerator
  # is zero too; dividing by 1 there yields the intended 0 contribution.
  safe_denominator = np.where(denominator == 0, 1, denominator)

  v = 2 * np.abs(pred - true) / safe_denominator
  output = np.mean(v) * 100

  return output

def my_metric(true, pred):
  """Keras-compatible metric that evaluates ``smape`` through ``tf.py_function``.

  Args:
    true: Tensor of ground-truth values.
    pred: Tensor of model predictions.

  Returns:
    A ``tf.float32`` tensor produced by calling ``smape(true, pred)``.
  """
  return tf.py_function(
      func=smape,
      inp=[true, pred],
      Tout=tf.float32,
      name='custom_nmae',
  )

def set_model(input_shape=(72, 14)):
  """Build and compile the 1-D CNN regressor.

  Architecture: six causal ``Conv1D`` blocks (each Conv1D -> BatchNormalization
  -> ELU) with 16, 32, 64, 128, 256 and 512 filters, then ``Flatten`` and a
  dense stack of 512, 128, 32, 8 and 4 ELU units ending in one linear output.
  Layers are added in the same order as before, so weight files saved from the
  previous definition still match.

  Args:
    input_shape: ``(time_steps, features)`` of one input window. Defaults to
      ``(72, 14)``.

  Returns:
    A compiled ``Sequential`` model using MAE loss, RMSprop (learning rate
    0.001) and ``my_metric`` as the tracked metric.
  """
  nf = 16                 # filters in the first conv block
  fs = 3                  # convolution kernel size
  padding = 'causal'
  activation = 'elu'

  model = Sequential()

  model.add(keras.layers.InputLayer(input_shape))

  # Filter count doubles with every block: nf * 1, 2, 4, 8, 16, 32.
  for multiplier in (1, 2, 4, 8, 16, 32):
    model.add(Conv1D(filters = nf * multiplier, kernel_size = fs, padding = padding))
    model.add(BatchNormalization())
    model.add(Activation(activation = activation))

  model.add(Flatten())
  for units in (512, 128, 32, 8, 4):
    model.add(Dense(units, activation = activation))
  model.add(Dense(1))  # single regression output

  # `lr` is a deprecated alias that newer Keras releases reject; the supported
  # argument name is `learning_rate`.
  optimizer = keras.optimizers.RMSprop(learning_rate=0.001)

  model.compile(loss = 'mae', optimizer = optimizer, metrics=[my_metric])

  return model

# 학습

In [None]:
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split
from tqdm.auto import tqdm

BATCH_SIZE = 256
EPOCHS = 100
n_split = 5
kfold = StratifiedKFold(n_splits = n_split, shuffle=True, random_state=22)
PATHS = '/content/drive/MyDrive/DataAnalysis/전력사용량 예측 AI 경진대회/Model/'

# Train one model per building (num 1..60) on a single random 80/20 split,
# keep the weights with the best validation SMAPE, and write that model's
# forecast into the building's 24 * 7 rows of `submission`.
#
# NOTE(review): only `num_{num}_cv_study1.h5` is written here, while the
# prediction cell loads `cv_study1` .. `cv_study5`. The other four files must
# come from a k-fold run (looping over `kfold.split(X_train, Hour_train)` with
# the same training steps) that is not part of this cell.
# NOTE(review): the split shuffles overlapping sliding windows, so validation
# windows share rows with training windows — the validation score is likely
# optimistic.
for num in tqdm(range(1, 61)):

  # Drop graph/state left over from the previous building's model so memory
  # does not keep growing across the 60 iterations.
  keras.backend.clear_session()

  X_train, y_train, Hour_train, X_test = get_data(train, test, num)
  i = 0
  weight_path = PATHS + f'num_{num}_cv_study{i + 1}.h5'

  train_X, val_X, train_y, val_y = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, random_state=34)

  model = set_model()
  mc = ModelCheckpoint(weight_path, save_best_only=True, verbose=0, monitor = 'val_my_metric', mode = 'min', save_weights_only=True)
  reLR = ReduceLROnPlateau(monitor = 'val_my_metric', patience = 7,verbose = 0,factor = 0.5)

  history = model.fit(train_X, train_y, epochs = EPOCHS, validation_data = (val_X, val_y),
        verbose = 0, batch_size=BATCH_SIZE, callbacks = [mc, reLR])

  # Restore the best checkpoint before scoring and predicting.
  model.load_weights(weight_path)

  # evaluate() returns [loss, my_metric]; run it once instead of twice.
  val_loss, val_smape = model.evaluate(val_X, val_y, verbose=0)

  print(f'num: {num}, SMAPE: {val_smape:.4f}, Loss: {val_loss:.4f}')

  # predict() returns shape (n, 1); flatten it before writing into a single
  # column of the submission frame.
  pred = model.predict(X_test)
  submission.iloc[(24 * 7) * (num - 1) : (24 * 7) * (num), 1] = pred.ravel()

# 예측

In [None]:
import os

from sklearn.model_selection import StratifiedKFold, KFold
from tqdm.auto import tqdm

BATCH_SIZE = 256
EPOCHS = 100
n_split = 5
kfold = StratifiedKFold(n_splits = n_split, shuffle=True, random_state=22)
PATHS = '/content/drive/MyDrive/DataAnalysis/전력사용량 예측 AI 경진대회/Model/'

# For every building, score each saved fold model on its validation fold,
# average the test forecasts of the folds whose SMAPE is <= 100, and write the
# mean into the building's 24 * 7 rows of `submission`.
for num in tqdm(range(1, 61)):
  X_train, y_train, Hour_train, X_test = get_data(train, test, num)

  accuracy = {}   # fold number (1-based) -> validation SMAPE
  i_li = []       # 0-based indices of the folds that passed the threshold
  preds = []      # test forecasts of those folds

  for i, (train_idx, val_idx) in enumerate(kfold.split(X_train, Hour_train)):
    weight_path = PATHS + f'num_{num}_cv_study{i + 1}.h5'

    # Skip folds that were never trained instead of crashing on load.
    if not os.path.exists(weight_path):
      print(f'num: {num}, fold {i + 1}: weight file not found, skipped ({weight_path})')
      continue

    val_X, val_y = X_train[val_idx], y_train[val_idx]

    # Release the previous fold's model state before building a new one.
    keras.backend.clear_session()
    model = set_model()
    model.load_weights(weight_path)

    # evaluate() returns [loss, my_metric]; index 1 is the SMAPE.
    k_accuracy = model.evaluate(val_X, val_y)[1]
    accuracy[i + 1] = k_accuracy

    if k_accuracy <= 100:
      i_li.append(i)
      # Predict right away so the model is not rebuilt and reloaded later.
      preds.append(model.predict(X_test))

  print(f'num: {num}, SMAPE per fold: {accuracy}, i_li: {i_li}, len_preds: {len(preds)} \n')

  # With no usable fold, np.mean([]) would be NaN and would overwrite the
  # existing forecast; leave the submission rows untouched instead.
  if not preds:
    print(f'num: {num}: no fold passed the SMAPE threshold; submission rows left unchanged')
    continue

  # Each prediction has shape (n, 1); average over folds and flatten before
  # writing into a single column.
  mean_pred = np.mean(preds, axis=0)
  submission.iloc[(24 * 7) * (num - 1) : (24 * 7) * (num), 1] = mean_pred.ravel()

In [None]:
# Output directory for prediction files (reuses the PATHS name on purpose).
PATHS = '/content/drive/MyDrive/DataAnalysis/전력사용량 예측 AI 경진대회/Predict/'

# Persist the filled-in submission frame without the index column.
submission.to_csv(
    PATHS + 'CNN1D_0616.csv',
    index=False,
)

In [None]:
# Display the first rows of the submission frame as a quick sanity check.
submission.head()

In [None]:
# Display the last rows of the submission frame as a quick sanity check.
submission.tail()