# 1. Colab drive mount

In [None]:
# Mount Google Drive at /content/drive; every data/model path below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold

from sklearn.metrics import mean_squared_error

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, ConvLSTM1D, Input, Activation, BatchNormalization, Flatten, LSTM, GRU, SimpleRNN
from keras import backend as K
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard, ReduceLROnPlateau
import kerastuner as kt
from kerastuner.tuners import RandomSearch
from kerastuner.tuners import Hyperband

# 2. Data

In [None]:
# Load the preprocessed data frame from Drive
# (the file name suggests KNN-imputed data — not verifiable from this notebook).
df = pd.read_pickle("/content/drive/MyDrive/날씨/data_자외선/전처리_knn/knn_imp(0616).pkl")


# Time Encoding(month, hour)
def encode(data, col, max_val):
    """Add cyclical sine/cosine encodings of ``data[col]`` to ``data``.

    Two columns, ``<col>_sin`` and ``<col>_cos``, are written into ``data``
    in place, computed from the angle ``2 * pi * data[col] / max_val``.

    Args:
        data: DataFrame that contains ``col``; it is modified in place.
        col: Name of the numeric column to encode.
        max_val: Divisor of the angle, i.e. the length of one full cycle.

    Returns:
        The same ``data`` object, with the two new columns appended.
    """
    for suffix, trig in (('_sin', np.sin), ('_cos', np.cos)):
        data[col + suffix] = trig(2 * np.pi * data[col]/max_val)
    return data

# Month is encoded with a cycle length of 12.
df['month'] = df["date_time"].dt.month
df = encode(df, 'month', 12)

# Hour of day takes the 24 values 0..23, so one full cycle is 24 hours.
# Dividing by 23 (as before) gives hour 0 and hour 23 identical sin/cos values.
# NOTE: checkpoints trained with the old encoding were fitted on different
# hour features; retrain before comparing scores.
df['hour'] = df["date_time"].dt.hour
df = encode(df, 'hour', 24)

# Drop features (including the raw month/hour columns, now replaced by their sin/cos encodings)
df.drop(columns=["sateza", "height", "landtype", "month", "hour"], inplace=True)

# Reorder columns: date_time, stn, uv and the four time encodings come first (7 columns),
# followed by the remaining features. The scaling cell slices by position and relies on this order.
df = df[['date_time','stn', 'uv', 'month_sin', 'month_cos', 'hour_sin', 'hour_cos', 'lon', 'lat', 
        'band1', 'band2', 'band3', 'band4', 'band5',
       'band6', 'band7', 'band8', 'band9', 'band10', 'band11', 'band12',
       'band13', 'band14', 'band15', 'band16', 'solarza', 'esr']]

# Training data: July-September of 2020 and of 2021
df_train_1 = df.loc[(df["date_time"] >= "2020-07-01") & (df["date_time"] < "2020-10-01")]
df_train_2 = df.loc[(df["date_time"] >= "2021-07-01") & (df["date_time"] < "2021-10-01")]
df_train_789 = pd.concat([df_train_1, df_train_2], axis=0)

# Test data: August 2019
df_test = df.loc[(df["date_time"] >= "2019-08-01") & (df["date_time"] < "2019-09-01")]

# Reset to a 0..n-1 index so the column-wise concat in the scaling cell lines up row by row
df_train_789.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)

df_train_789.head()

# 3. Scaling

In [None]:
# df_train_789
# Keep the first 7 columns (date_time, stn, uv and the sin/cos time encodings) unscaled.
# Note: this rebinds the name df_train_1 that the data cell used for the 2020 slice.
df_train_1= df_train_789.iloc[:, :7]

# Standardize the remaining columns; the scaler is fitted on the training data only
scaler = StandardScaler()
df_train_std = df_train_789.iloc[:, 7:]
df_train_std = scaler.fit_transform(df_train_std)
df_train_std = pd.DataFrame(df_train_std, columns=df_train_789.columns[7:])

# Re-attach the unscaled columns (both frames carry a 0..n-1 index)
df_train_789 = pd.concat([df_train_1, df_train_std], axis=1)

# df_test
df_test_1 = df_test.iloc[:, :7]

# Apply the statistics learned from the training data to the test data
df_test_std = df_test.iloc[:, 7:]
df_test_std = scaler.transform(df_test_std)
df_test_std = pd.DataFrame(df_test_std, columns=df_test.columns[7:])

df_test = pd.concat([df_test_1, df_test_std], axis=1)

# 4. Build_Dataset

In [None]:
# Group a time-ordered frame into fixed-length windows for sequence models
def build_dataset(time_series, seq_length):
  """Build sliding-window inputs and next-step targets from a DataFrame.

  Window ``i`` holds rows ``i .. i+seq_length-1`` of every column except the
  last one; its target is the last column's value at row ``i+seq_length``.
  Rows are used in the order given, as one contiguous sequence: windows are
  not split by any grouping column, so the caller decides what is adjacent.

  The windows are gathered with a single NumPy index operation instead of a
  per-row ``.iloc`` loop, so no progress bar is needed.

  Args:
    time_series: DataFrame whose last column is the target and whose other
      columns are the features.
    seq_length: Number of consecutive rows per window.

  Returns:
    ``(dataX, dataY)`` where ``dataX`` has shape
    ``(n_windows, seq_length, n_features)`` and ``dataY`` has shape
    ``(n_windows,)``, with ``n_windows = max(len(time_series) - seq_length, 0)``.
    With too few rows the arrays are empty but keep those shapes.
  """
  feature_values = time_series.iloc[:, :-1].to_numpy()
  target_values = time_series.iloc[:, -1].to_numpy()

  n_windows = max(len(time_series) - seq_length, 0)

  # Row i of window_idx lists the row positions that make up window i.
  window_idx = np.arange(n_windows)[:, None] + np.arange(seq_length)[None, :]

  dataX = feature_values[window_idx]
  dataY = target_values[seq_length:seq_length + n_windows]

  return dataX, dataY

In [None]:
# Reorder columns for build_dataset, which treats the last column as the target: uv moves to the end.
# lon and lat are not selected here, so they are dropped from both frames.
df_train_789 = df_train_789[['date_time','stn', 'month_sin', 'month_cos', 'hour_sin', 'hour_cos', 
        'band1', 'band2', 'band3', 'band4', 'band5',
       'band6', 'band7', 'band8', 'band9', 'band10', 'band11', 'band12',
       'band13', 'band14', 'band15', 'band16', 'solarza', 'esr', 'uv']]
  
df_test = df_test[['date_time','stn', 'month_sin', 'month_cos', 'hour_sin', 'hour_cos', 
        'band1', 'band2', 'band3', 'band4', 'band5', 
       'band6', 'band7', 'band8', 'band9', 'band10', 'band11', 'band12',
       'band13', 'band14', 'band15', 'band16', 'solarza', 'esr', 'uv']]

# Exclude date_time and stn (the first two columns): 22 feature columns plus uv remain
df_train_789_timeseries = df_train_789.iloc[:, 2:]
df_test_timeseries = df_test.iloc[:, 2:]

In [None]:
# Build windows of 3 consecutive rows; this length matches the input_shape=(3, 22) used by the models below.
train_789_x, train_789_y = build_dataset(df_train_789_timeseries, 3)
test_x, test_y = build_dataset(df_test_timeseries, 3)

# 5. Models

In [None]:
# Custom RMSE used as the training loss
def rmse(y_true, y_pred):
        """Root-mean-squared error between ``y_true`` and ``y_pred``.

        The mean is taken over every element of the batch, so the result is
        a single scalar tensor.

        Written with ``tf`` ops rather than the standalone ``keras.backend``
        module so the loss comes from the same package as the
        ``tensorflow.keras`` models that are compiled with it.
        """
        return tf.sqrt(tf.reduce_mean(tf.square(y_pred - y_true)))

## 1D CNN

### Model

In [None]:
def set_cnn1d(): # Build and compile a 1D CNN that uses causal padding
  """Create the compiled 1D-CNN regressor.

  Architecture, for inputs of shape (3, 22):
    * three Conv1D blocks with 16, 32 and 64 filters (kernel size 3, causal
      padding), each followed by BatchNormalization and a ReLU activation;
    * Flatten, then Dense(32, relu) -> Dense(8, relu) -> Dense(1).

  Returns:
    A Sequential model compiled with the ``rmse`` loss, Adam
    (learning rate 1e-4) and the 'mae' / 'mse' metrics.
  """
  base_filters = 16
  kernel = 3
  # Causal padding makes each output step depend only on the current and earlier input steps.
  pad_mode = 'causal'
  act = 'relu'

  stack = []
  for block_no, multiplier in enumerate((1, 2, 4)):
    # Only the first layer of the network declares the input shape.
    first_layer_kwargs = {'input_shape': (3, 22)} if block_no == 0 else {}
    stack.append(Conv1D(filters = base_filters * multiplier, kernel_size = kernel,
                        padding = pad_mode, **first_layer_kwargs))
    # Batch normalization is applied before the non-linearity in every block.
    stack.append(BatchNormalization())
    stack.append(Activation(activation = act))

  # Flatten the convolution output and reduce it to the single predicted value.
  stack.extend([
      Flatten(),
      Dense(32, activation = act),
      Dense(8, activation = act),
      Dense(1),  # output size
  ])

  model = Sequential(stack)
  model.compile(loss = rmse,
                optimizer = keras.optimizers.Adam(learning_rate=0.0001),
                metrics=['mae', 'mse'])

  return model

In [None]:
# Instantiate a freshly initialized 1D CNN for training
cnn1d = set_cnn1d()

### Model fit

In [None]:
# Output locations for checkpoints and TensorBoard logs
model_directory = '/content/drive/MyDrive/날씨/박형준/model/'
tensorboard_directory = '/content/drive/MyDrive/날씨/박형준/tensorboard/cnn1d/'

# Callbacks
# Checkpoint: monitor val_loss every epoch and save the weights only when it improves
CP = ModelCheckpoint(filepath=model_directory+'cnn1d0625_789-{epoch:03d}-{val_loss:.4f}.hdf5',
            monitor='val_loss', save_weights_only=True, verbose=1, save_best_only=True, mode='min')

# Log the training progress for TensorBoard
TB = TensorBoard(log_dir=tensorboard_directory, write_graph=True, write_images=True)

# Scale the learning rate by 0.8 when val_loss has not improved for 3 epochs (floor: 1e-7)
LR = ReduceLROnPlateau(monitor='val_loss',factor=0.8,patience=3, verbose=1, min_lr=1e-7)

CALLBACK = [CP, TB, LR]

# Train for 20 epochs, holding out 20% of the training windows for validation
cnn1d.fit(train_789_x, train_789_y, validation_split=0.2, shuffle=True, batch_size=8, callbacks=CALLBACK, epochs=20)

### Load & Predict

In [None]:
# Rebuild the architecture so the weights of the checkpoint with the lowest val_loss can be loaded into it
pretrained_model = set_cnn1d()
pretrained_model.load_weights('/content/drive/MyDrive/날씨/박형준/model/cnn1d0625_789-017-0.3714.hdf5')

# Create a new model by extracting layers from the original model:
extracted_layers = pretrained_model.layers[:]
cnn1d = keras.Sequential(extracted_layers)

# Predict on the test windows and report the RMSE (square root of the MSE)
pred = cnn1d.predict(test_x)
mean_squared_error(y_true=test_y, y_pred=pred) ** 0.5

## CNN-LSTM

### Model

In [None]:
def set_cnnlstm():
  """Create the compiled CNN-LSTM regressor.

  Architecture, for inputs of shape (3, 22):
    * two Conv1D blocks with 32 and 64 filters (kernel size 3, causal
      padding), each followed by BatchNormalization and an activation;
    * LSTM(16, return_sequences=True) -> LSTM(16) -> Dense(1).

  Returns:
    A Sequential model compiled with the ``rmse`` loss, Adam with its
    default settings and the 'mae' / 'mse' metrics.
  """
  nf = 16
  fs = 3
  padding = 'causal'
  # `activation` was never defined in this function, so it raised NameError on
  # a fresh kernel. 'relu' matches the other model builders in this notebook.
  activation = 'relu'

  model = Sequential()

  model.add(Conv1D(filters = nf * 2, kernel_size = fs, padding = padding, input_shape=(3, 22)))
  model.add(BatchNormalization())
  model.add(Activation(activation = activation))

  model.add(Conv1D(filters = nf * 4, kernel_size = fs, padding = padding))
  model.add(BatchNormalization())
  model.add(Activation(activation = activation))

  # The first LSTM returns the full sequence so the second LSTM can consume it.
  model.add(LSTM(16, return_sequences=True))
  model.add(LSTM(16))
  model.add(Dense(1, activation=activation))

  optimizer = keras.optimizers.Adam()

  model.compile(loss = rmse, optimizer = optimizer, metrics=['mae', 'mse'])

  return model

In [None]:
# Instantiate a freshly initialized CNN-LSTM for training
cnnlstm = set_cnnlstm()

### Model fit

In [None]:
# Output locations for checkpoints and TensorBoard logs
model_directory = '/content/drive/MyDrive/날씨/박형준/model/'
tensorboard_directory = '/content/drive/MyDrive/날씨/박형준/tensorboard/cnnlstm/'

# Callbacks
# Checkpoint: monitor val_loss every epoch and save the weights only when it improves
CP = ModelCheckpoint(filepath=model_directory+'cnnlstm0625-{epoch:03d}-{val_loss:.4f}.hdf5',
            monitor='val_loss', save_weights_only=True, verbose=1, save_best_only=True, mode='min')

# Log the training progress for TensorBoard
TB = TensorBoard(log_dir=tensorboard_directory, write_graph=True, write_images=True)

# Scale the learning rate by 0.8 when val_loss has not improved for 3 epochs (floor: 1e-7)
LR = ReduceLROnPlateau(monitor='val_loss',factor=0.8,patience=3, verbose=1, min_lr=1e-7)

CALLBACK = [CP, TB, LR]

# Train for 20 epochs, holding out 20% of the training windows for validation
cnnlstm.fit(train_789_x, train_789_y, validation_split=0.2, batch_size=8, callbacks=CALLBACK, epochs=20)

### Load & Predict

In [None]:
# Rebuild the architecture so the weights of the checkpoint with the lowest val_loss can be loaded into it
pretrained_model = set_cnnlstm()
pretrained_model.load_weights('/content/drive/MyDrive/날씨/박형준/model/cnnlstm0625-020-0.3774.hdf5')

# Create a new model by extracting layers from the original model:
extracted_layers = pretrained_model.layers[:]
cnnlstm = keras.Sequential(extracted_layers)

# Predict on the test windows and report the RMSE (square root of the MSE)
pred = cnnlstm.predict(test_x)
mean_squared_error(y_true=test_y, y_pred=pred) ** 0.5

## SimpleRNN

### Model

In [None]:
def set_rnn():
  """Create the compiled SimpleRNN regressor.

  Two stacked SimpleRNN layers of 16 units read inputs of shape (3, 22);
  a single Dense unit with ReLU produces the prediction.

  Returns:
    A Sequential model compiled with the ``rmse`` loss, Adam with its
    default settings and the 'mae' / 'mse' metrics.
  """
  net = Sequential([
      # The first layer returns the full sequence so the second one can consume it.
      SimpleRNN(16, input_shape=(3, 22), return_sequences=True),
      SimpleRNN(16),
      Dense(1, activation='relu'),
  ])

  net.compile(loss = rmse, optimizer = keras.optimizers.Adam(), metrics=['mae', 'mse'])

  return net

In [None]:
# Instantiate a freshly initialized SimpleRNN model for training
rnn = set_rnn()

### Model fit

In [None]:
# Output locations for checkpoints and TensorBoard logs
model_directory = '/content/drive/MyDrive/날씨/박형준/model/'
tensorboard_directory = '/content/drive/MyDrive/날씨/박형준/tensorboard/rnn/'

# Callbacks
# Checkpoint: monitor val_loss every epoch and save the weights only when it improves
CP = ModelCheckpoint(filepath=model_directory+'rnn_789-{epoch:03d}-{val_loss:.4f}.hdf5',
            monitor='val_loss', save_weights_only=True, verbose=1, save_best_only=True, mode='min')

# Log the training progress for TensorBoard
TB = TensorBoard(log_dir=tensorboard_directory, write_graph=True, write_images=True)

# Scale the learning rate by 0.8 when val_loss has not improved for 3 epochs (floor: 1e-7)
LR = ReduceLROnPlateau(monitor='val_loss',factor=0.8,patience=3, verbose=1, min_lr=1e-7)

CALLBACK = [CP, TB, LR]

# Train for 20 epochs, holding out 20% of the training windows for validation
rnn.fit(train_789_x, train_789_y, batch_size=8, callbacks=CALLBACK, validation_split=0.2, epochs=20)

### Load & Predict

In [None]:
# Rebuild the architecture so the weights of the checkpoint with the lowest val_loss can be loaded into it
pretrained_model = set_rnn()
pretrained_model.load_weights('/content/drive/MyDrive/날씨/박형준/model/rnn_789-019-0.3720.hdf5')

# Create a new model by extracting layers from the original model:
extracted_layers = pretrained_model.layers[:]
rnn = keras.Sequential(extracted_layers)
rnn.summary()

# Predict on the test windows and report the RMSE (square root of the MSE)
pred = rnn.predict(test_x)
mean_squared_error(y_true=test_y, y_pred=pred) ** 0.5

## LSTM

### Model

In [None]:
def set_lstm():
  """Create the compiled LSTM regressor.

  Two stacked LSTM layers of 64 units read inputs of shape (3, 22);
  a single Dense unit with ReLU produces the prediction.

  Returns:
    A Sequential model compiled with the ``rmse`` loss, Adam with its
    default settings and the 'mae' / 'mse' metrics.
  """
  net = Sequential([
      # The first layer returns the full sequence so the second one can consume it.
      LSTM(64, input_shape=(3, 22), return_sequences=True),
      LSTM(64),
      Dense(1, activation='relu'),
  ])

  net.compile(loss = rmse, optimizer = keras.optimizers.Adam(), metrics=['mae', 'mse'])

  return net

In [None]:
# Instantiate a freshly initialized LSTM model for training
lstm = set_lstm()

### Model fit

In [None]:
# Output locations for checkpoints and TensorBoard logs
model_directory = '/content/drive/MyDrive/날씨/박형준/model/lstm(64)_batch16/'
tensorboard_directory = '/content/drive/MyDrive/날씨/박형준/tensorboard/lstm(64)_batch16/'

# Callbacks
# Checkpoint: monitor val_loss every epoch and save the weights only when it improves
CP = ModelCheckpoint(filepath=model_directory+'lstm(64)_batch16-{epoch:03d}-{val_loss:.4f}.hdf5',
            monitor='val_loss', save_weights_only=True, verbose=1, save_best_only=True, mode='min')

# Log the training progress for TensorBoard
TB = TensorBoard(log_dir=tensorboard_directory, write_graph=True, write_images=True)

# Scale the learning rate by 0.8 when val_loss has not improved for 3 epochs (floor: 1e-7)
LR = ReduceLROnPlateau(monitor='val_loss',factor=0.8,patience=3, verbose=1, min_lr=1e-7)

CALLBACK = [CP, TB, LR]

# Train for 20 epochs with batch size 16, holding out 20% of the training windows for validation
lstm.fit(train_789_x, train_789_y, batch_size=16, callbacks=CALLBACK, shuffle=True, validation_split=0.2, epochs=20)

### Load & Predict

In [None]:
# Rebuild the architecture so the weights of the checkpoint with the lowest val_loss can be loaded into it
pretrained_model = set_lstm()
# NOTE(review): this path ('model/lstm_789-...') does not match the pattern the LSTM fit cell writes
# ('model/lstm(64)_batch16/lstm(64)_batch16-...'); presumably it comes from an earlier run — confirm.
pretrained_model.load_weights('/content/drive/MyDrive/날씨/박형준/model/lstm_789-011-0.3706.hdf5')

# Create a new model by extracting layers from the original model:
extracted_layers = pretrained_model.layers[:]
lstm = keras.Sequential(extracted_layers)

# Predict on the test windows and report the RMSE (square root of the MSE)
pred = lstm.predict(test_x)
mean_squared_error(y_true=test_y, y_pred=pred) ** 0.5

## GRU

### Model

In [None]:
def set_gru():
  """Create the compiled GRU regressor.

  Two stacked GRU layers of 64 units read inputs of shape (3, 22);
  a single Dense unit with ReLU produces the prediction.

  Returns:
    A Sequential model compiled with the ``rmse`` loss, Adam
    (learning rate 1e-3) and the 'mae' / 'mse' metrics.
  """
  act = 'relu'

  net = Sequential([
      # The first layer returns the full sequence so the second one can consume it.
      GRU(64, input_shape=(3, 22), return_sequences=True),
      GRU(64),
      Dense(1, activation=act),
  ])

  net.compile(loss = rmse,
              optimizer = keras.optimizers.Adam(learning_rate=0.001),
              metrics=['mae', 'mse'])

  return net

In [None]:
# Instantiate a freshly initialized GRU model for training
gru = set_gru()

### Model fit

In [None]:
# Output locations for checkpoints and TensorBoard logs
model_directory = '/content/drive/MyDrive/날씨/박형준/model/gru(64)_batch16_epochs150/'
tensorboard_directory = '/content/drive/MyDrive/날씨/박형준/tensorboard/gru(64)_batch16_epochs150/'

# Callbacks
# Checkpoint: monitor val_loss every epoch and save the weights only when it improves
CP = ModelCheckpoint(filepath=model_directory+'gru(64)_batch16_epochs150-{epoch:03d}-{val_loss:.4f}.hdf5',
            monitor='val_loss', save_weights_only=True, verbose=1, save_best_only=True, mode='min')

# Log the training progress for TensorBoard
TB = TensorBoard(log_dir=tensorboard_directory, write_graph=True, write_images=True)

# Scale the learning rate by 0.8 when val_loss has not improved for 3 epochs (floor: 1e-7)
LR = ReduceLROnPlateau(monitor='val_loss',factor=0.8,patience=3, verbose=1, min_lr=1e-7)

CALLBACK = [CP, TB, LR]

# Train for 150 epochs with batch size 16, holding out 20% of the training windows for validation
gru.fit(train_789_x, train_789_y, batch_size=16, callbacks=CALLBACK, shuffle=True, validation_split=0.2, epochs=150)

### Load & Predict

In [None]:
# Rebuild the architecture so the weights of the checkpoint with the lowest val_loss can be loaded into it
pretrained_model = set_gru()
# NOTE(review): the file name says 'epochs200' while the GRU fit cell writes files named
# 'gru(64)_batch16_epochs150-...'; presumably this checkpoint comes from a different run — confirm.
pretrained_model.load_weights('/content/drive/MyDrive/날씨/박형준/model/gru(64)_batch16_epochs150/gru(64)_batch16_epochs200-006-0.3888.hdf5')

# Create a new model by extracting layers from the original model:
extracted_layers = pretrained_model.layers[:]
gru = keras.Sequential(extracted_layers)

# Predict on the test windows and report the RMSE (square root of the MSE)
pred = gru.predict(test_x)
mean_squared_error(y_true=test_y, y_pred=pred) ** 0.5

# 6. Kerastuner-GRU

### Tuner(GRU)

In [None]:
def build_model(hp):
    """Build a compiled two-layer GRU model from a KerasTuner hyperparameter set.

    Searched hyperparameters:
      * 'units': integer from 16 to 64 in steps of 8. Both GRU layers request
        it under the same name.
        NOTE(review): presumably the tuner then gives both layers the same
        value — confirm against the KerasTuner docs.
      * 'learning_rate': one of 1e-3 or 1e-4 for Adam.

    Args:
        hp: The hyperparameter container handed in by the tuner.

    Returns:
        A Sequential model for inputs of shape (3, 22), compiled with the
        ``rmse`` loss and the 'mae' / 'mse' metrics.
    """
    unit_range = dict(min_value=16, max_value=64, step=8)

    model = keras.Sequential([
        GRU(units=hp.Int('units', **unit_range),
            input_shape=(3, 22), return_sequences=True),
        GRU(units=hp.Int('units', **unit_range)),
        Dense(1, activation='relu'),
    ])

    # The learning rate is searched over the two values 1e-3 and 1e-4.
    learning_rate = hp.Choice('learning_rate', values=[1e-3, 1e-4])
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate),
        loss=rmse,
        metrics=['mae', 'mse'])
    return model

# RandomSearch (alternative tuner, kept for reference but disabled)
# tuner = RandomSearch(
#     build_model, # HyperModel
#     objective='val_loss', #  metric to optimize
#     max_trials=5,
#     executions_per_trial=3, # number of training runs per trial
#     directory='/content/drive/MyDrive/날씨/박형준/kerastuner/gru/', # folder where the search results are stored
#     project_name='gru') # project name for the stored results


# Hyperband
# NOTE(review): the argument descriptions below follow the KerasTuner Hyperband documentation — confirm against the installed version.
tuner = kt.Hyperband(
        build_model, # HyperModel
        objective ='val_loss', #  metric to optimize
        max_epochs =10, # maximum number of epochs to train a single model
        factor = 3,    # reduction factor for the number of epochs and models per bracket
        directory ='/content/drive/MyDrive/날씨/박형준/kerastuner/gru_1/', # folder where the search results are stored
        project_name ='gru_1') # project name for the stored results
      
# Source: https://iyk2h.tistory.com/145 [Tistory blog]

### Tuner search

In [None]:
# Run the hyperparameter search on the training windows with a 20% validation split.
# NOTE(review): Hyperband assigns the epoch budget of each trial itself, so epochs=3 is
# presumably overridden during the search — confirm against the KerasTuner docs.
tuner.search(train_789_x, train_789_y,
             epochs=3,
             validation_split=0.2, batch_size=8)

### Get best model

In [None]:
# Retrieve the two best models found by the search
models = tuner.get_best_models ( num_models = 2 )

In [None]:
# Print a summary of the search results
tuner.results_summary ()

# Submission
- 처음 값 0으로 채우기
- label값 재정렬 해야 함 (시간순, 그다음 stn순)

In [None]:
# Load the submission template, index it by (timestamp, station) and sort it by station.
# NOTE(review): the predictions are written into this frame by position in the next cell, so this
# row order presumably has to match the row order of df_test — confirm.
df_sub = pd.read_csv("/content/drive/MyDrive/날씨/박형준/1-1_검증데이터셋.csv")
df_sub.set_index(['YearMonthDayHourMinute', 'STN'], inplace=True)
df_sub.sort_index(level='STN', inplace=True)

### Best pred

In [None]:
pred = gru.predict(test_x)
df_pred = pd.DataFrame(pred)

# The first 3 rows have no complete look-back window and therefore no prediction; fill them with 0.
# The predictions are written by position, in the current row order of df_sub.
n_pad = 3  # must equal the seq_length passed to build_dataset
assert len(df_sub) == n_pad + len(pred), "number of predictions does not match the submission template"

# Assign the whole column at once. The previous chained form (df_sub["UV"][3:] = ...) raises
# SettingWithCopyWarning and does not modify df_sub when pandas Copy-on-Write is enabled.
df_sub["UV"] = np.concatenate([np.zeros(n_pad), pred.ravel()])

# Restore the ordering expected for the submission: by timestamp, then by station.
df_sub.reset_index(inplace=True)
df_sub.sort_values(by=['YearMonthDayHourMinute', 'STN'], inplace=True)
df_sub.reset_index(drop=True, inplace=True)
df_sub

### To_csv

In [None]:
# Write the submission file to Drive as UTF-8, without the index column
df_sub.to_csv("/content/drive/MyDrive/날씨/박형준/submission.csv", index=False, encoding='utf-8')