In [1]:
# Mount Google Drive into the Colab filesystem. Later cells read the project
# sources, the datasets and the model checkpoint from paths under
# /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install --quiet shap

[?25l[K     |▋                               | 10 kB 24.4 MB/s eta 0:00:01[K     |█▏                              | 20 kB 10.0 MB/s eta 0:00:01[K     |█▊                              | 30 kB 14.1 MB/s eta 0:00:01[K     |██▎                             | 40 kB 6.1 MB/s eta 0:00:01[K     |██▉                             | 51 kB 6.4 MB/s eta 0:00:01[K     |███▌                            | 61 kB 7.6 MB/s eta 0:00:01[K     |████                            | 71 kB 8.1 MB/s eta 0:00:01[K     |████▋                           | 81 kB 7.6 MB/s eta 0:00:01[K     |█████▏                          | 92 kB 8.4 MB/s eta 0:00:01[K     |█████▊                          | 102 kB 6.5 MB/s eta 0:00:01[K     |██████▎                         | 112 kB 6.5 MB/s eta 0:00:01[K     |███████                         | 122 kB 6.5 MB/s eta 0:00:01[K     |███████▌                        | 133 kB 6.5 MB/s eta 0:00:01[K     |████████                        | 143 kB 6.5 MB/s eta 0:00:01[K  

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
import warnings
import random
import os
import datetime
import torch
import shap

sys.path.append('/content/drive/MyDrive/KNP/src')

from torch.utils.data import TensorDataset  # 텐서데이터셋
from torch.utils.data import DataLoader  # 데이터로더

# Import user libraries
from config import *
from utils.utils import *
from utils.plot import *
from pipelines.weather_pipeline import *
from pipelines.train_pipeline import *
from utils.train_model import *
from models.LD import *
from utils.model_test import *

In [5]:
# Silence every warning for the rest of the session.
warnings.filterwarnings(action="ignore")

# Network size: LSTM and dense layer widths handed to LstmDense further down.
lstm_node, dense_node = 16, 2

# Optimiser step size comes from the shared project config.
learning_rate = config.lr

# Passed to load_train_pipeline as its third argument.
# NOTE(review): meaning of the value 31 is not visible in this notebook — confirm.
masking = 31

# False -> load the saved checkpoint instead of training from scratch.
start_train = False

# Project helper; presumably fixes the random seeds — confirm in utils.
seed_everything()

In [6]:
print(f"{'Weather Pipeline Started':=^40}")

# Load the raw weather data and run it through the weather pipeline.
weather = load_weather('/content/drive/MyDrive/KNP/dataset/weather/')
weather_pipeline = get_weather_pipeline()
processed_weather = weather_pipeline.fit_transform(weather)

print(f"{'Train Pipeline Started':=^40}")

train = pd.read_csv("/content/drive/MyDrive/KNP/dataset/train/train.csv")

# `threhold` is the attribute name exactly as this notebook reads it from config.
train_pipeline = load_train_pipeline(processed_weather, config.threhold, masking)

# Fit on the training frame first, then look up the fitted final step so its
# output column names can label the transformed matrix.
train_matrix = train_pipeline.fit_transform(train)
final_pipe = train_pipeline["final_pipe"]
processed_train = pd.DataFrame(
    train_matrix,
    columns=final_pipe.get_feature_names_out(),
)

# Target scaler stored inside the final step; it is handed to
# model_valid / model_test in the evaluation cell.
scaler = final_pipe.named_transformers_["y_scaler"]

print(f"{'Test Pipeline Started':=^40}")

# Build the test set and push it through the already-fitted train pipeline
# (transform only, no re-fitting).
test = load_test_data('/content/drive/MyDrive/KNP/dataset/test/')
test_matrix = train_pipeline.transform(test)
processed_test = pd.DataFrame(
    test_matrix,
    columns=final_pipe.get_feature_names_out(),
)

print(f"{'Setting Dataset Started':=^40}")



In [7]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Turn the processed training frame into train / validation tensors.
# Original author's note: the data is ordered and split 70% train / 30% test.
# NOTE(review): the split happens inside set_pytorch_dataset — confirm the ratio there.
trainX_tensor, trainY_tensor, validX_tensor, validY_tensor = set_pytorch_dataset(
    processed_train, device=device
)

# Build the test inputs from the processed test frame and move them to `device`.
testX = build_test_dataset(np.array(processed_test), config.window_size)
testX_tensor = torch.FloatTensor(testX).to(device)

# Wrap the (input, target) tensor pairs as datasets.
trainset = TensorDataset(trainX_tensor, trainY_tensor)
validset = TensorDataset(validX_tensor, validY_tensor)

# Training batches are reshuffled every epoch; an incomplete last batch is dropped.
trainloader = DataLoader(
    dataset=trainset,
    batch_size=config.batch_size,
    shuffle=True,
    drop_last=True,
)

# Validation batches keep their order.
# NOTE(review): drop_last=True also discards the final partial validation
# batch — confirm this is intended.
validloader = DataLoader(
    dataset=validset,
    batch_size=config.batch_size,
    shuffle=False,
    drop_last=True,
)

# Model dimensions: one input feature per processed column, two outputs.
data_dim = processed_train.shape[1]
output_dim = 2


In [8]:
# Build the network once; training and checkpoint loading need the same architecture.
# NOTE(review): the meaning of the trailing positional argument `2` is defined
# by LstmDense, which is not visible here — confirm.
lstm_dense = LstmDense(
    data_dim, lstm_node, dense_node, config.window_size, output_dim, 2
).to(device)

if start_train:
    print(f"{'Model Training Started':=^40}")

    # Train the model; the loss histories are kept for optional plotting.
    model, train_hist, valid_hist = train_model(
        lstm_dense,
        trainloader,
        validloader,
        device=device,
        lr=learning_rate,
        verbose=1,
        num_epochs=config.epochs,
        patience=config.es,
    )
else:
    # Load previously trained weights instead of training.
    model = lstm_dense
    checkpoint_state = torch.load(
        "/content/drive/MyDrive/KNP/src/checkpoint.pt", map_location=device
    )
    # strict=False tolerates key mismatches, so surface them explicitly:
    # otherwise a checkpoint from a different architecture would load silently.
    # print() is used because this notebook silences the warnings module.
    load_report = model.load_state_dict(checkpoint_state, strict=False)
    if load_report.missing_keys or load_report.unexpected_keys:
        print("WARNING: checkpoint does not fully match the model")
        print("  missing keys   :", list(load_report.missing_keys))
        print("  unexpected keys:", list(load_report.unexpected_keys))

In [25]:
# Gather the input tensors of at most the first 1000 training batches.
np_processed_train = []

for batch_idx, batch in enumerate(trainloader):
    if batch_idx >= 1000:
        break
    # batch[0] is the input tensor; the target (batch[1]) is not needed here.
    np_processed_train.append(batch[0])

In [26]:
# Inspect the shape of the first collected input batch.
np_processed_train[0].shape

torch.Size([32, 6, 32])

In [27]:
# Stack the collected batches along a new leading axis (one entry per batch).
np_processed = torch.stack(np_processed_train)

In [29]:
# Collapse all leading axes into one sample axis. The two trailing axes are
# read from the tensor itself instead of being hard-coded as (6, 32), so the
# cell keeps working when the window size or the number of features changes.
# Re-running it on an already flattened tensor leaves the tensor unchanged.
np_processed = np_processed.reshape(-1, *np_processed.shape[-2:])

In [62]:
print(f"{'SHAP Started':=^40}")

# Build a DeepExplainer for the model, with the collected training inputs
# as its background data.
explainer = shap.DeepExplainer(model, np_processed)



In [63]:
# NOTE(review): the model is switched to training mode before SHAP values are
# computed — presumably a workaround for back-propagating through the LSTM.
# If the model contains dropout this makes the attributions stochastic — confirm.
model.train()

# Explain a fixed slice of rows taken from np_processed, the same tensor that
# this notebook passes to the explainer as background data.
# The slice comes back short or empty when np_processed has fewer than 10100
# rows; the print below shows how many samples were actually selected.
x_samples = np_processed[10000:10100]
print(len(x_samples))
shap_values = explainer.shap_values(x_samples)



In [65]:
# Keep the explainer's expected_value attribute in a notebook variable.
expected_value = explainer.expected_value

In [66]:
# Display the stored expected value(s).
expected_value

array([0.43105033, 0.42779154], dtype=float32)

In [75]:
# Mean absolute SHAP value per time step for the first model output:
# average over samples (axis 0), then over the last axis of what remains.
abs_shap_first_output = np.abs(shap_values[0])

# Derive the lag labels from the array itself ("t-<n>" ... "t-1") so the table
# still builds when the window length is not 6.
n_time_steps = abs_shap_first_output.shape[1]

shap_time_df = pd.DataFrame({
    "mean_abs_shap": np.mean(np.mean(abs_shap_first_output, axis=0), axis=1),
    "name": [f"t-{lag}" for lag in range(n_time_steps, 0, -1)],
})
shap_time_df.sort_values("mean_abs_shap", ascending=False)

Unnamed: 0,mean_abs_shap,name
5,0.008248,t-1
4,0.002766,t-2
2,0.000599,t-4
3,0.000322,t-3
1,4.7e-05,t-5
0,2e-05,t-6


In [73]:
# Mean absolute SHAP value per input column, computed separately for each of
# the two model outputs: average over samples first, then over the leading
# axis of what remains.
# NOTE(review): the column names suggest output 0 is the "max" target and
# output 1 the "min" target — confirm against the model definition.
first_output_importance = np.mean(np.mean(np.abs(shap_values[0]), axis=0), axis=0)
second_output_importance = np.mean(np.mean(np.abs(shap_values[1]), axis=0), axis=0)

shap_df = pd.DataFrame({
    "max_mean_abs_shap": first_output_importance,
    "min_mean_abs_shap": second_output_importance,
    "name": processed_test.columns,
})

# Rank the columns by their importance for the first output.
shap_df.sort_values("max_mean_abs_shap", ascending=False)

Unnamed: 0,max_mean_abs_shap,min_mean_abs_shap,name
0,0.014278,0.014084,PIA205B-02A_MIN
1,0.012776,0.012639,PIA205B-02A_MAX
23,0.012003,0.011837,TI_P_MAX
6,0.007251,0.007214,TI_MEAN
18,0.004469,0.004421,PRESSURE_MAX_DIFF
11,0.00233,0.002302,Ground_temperature
24,0.001459,0.001443,TI_VOL_MAX
3,0.001347,0.00134,PRESSURE-S
25,0.001311,0.001302,TI_T_DIV
21,0.001151,0.001151,OUTLET_SUM


In [74]:
# Same table, ranked by the importance for the second output instead.
shap_df.sort_values(by="min_mean_abs_shap", ascending=False)

Unnamed: 0,max_mean_abs_shap,min_mean_abs_shap,name
0,0.014278,0.014084,PIA205B-02A_MIN
1,0.012776,0.012639,PIA205B-02A_MAX
23,0.012003,0.011837,TI_P_MAX
6,0.007251,0.007214,TI_MEAN
18,0.004469,0.004421,PRESSURE_MAX_DIFF
11,0.00233,0.002302,Ground_temperature
24,0.001459,0.001443,TI_VOL_MAX
3,0.001347,0.00134,PRESSURE-S
25,0.001311,0.001302,TI_T_DIV
21,0.001151,0.001151,OUTLET_SUM


In [None]:

# The notebook imports the `datetime` *module*, so `datetime.now()` raises
# AttributeError. Import the class locally under an alias, so the timestamp
# works whatever `datetime` is bound to.
from datetime import datetime as _datetime_cls

# Switch the model to evaluation mode for inference (the SHAP cells put it
# in training mode).
model.eval()


print(f"{'Model Testing Started':=^40}")

# Run predictions on the validation and test inputs; the scaler is handed to
# the helpers (the `_inverse` names suggest they undo the target scaling — confirm).
valid_pred_inverse, validY_inverse = model_valid(
    model, scaler, validX_tensor, validY_tensor
)
test_pred_inverse = model_test(model, scaler, testX_tensor)

# Measure validation performance.
mae = mae_score(valid_pred_inverse, validY_inverse)
print("MAE SCORE : ", mae)

# Timestamp plus score gives each run a unique file name.
nowDatetime = _datetime_cls.now().strftime("%Y%m%d%H%M%S")

file_name = f"{nowDatetime}_{mae:06f}"

# Export the test predictions via the project helper `result`.
# NOTE(review): the original comment here said "save model" — confirm what
# `result` actually writes.
submit_csv = result(test_pred_inverse, file_name)


print(f"{'Model Visualizing Started':=^40}")

# Visualisation (disabled). train_hist / valid_hist only exist when
# start_train is True.
# epoch_hist(train_hist, valid_hist, file_name)
# plot_two(valid_pred_inverse, validY_inverse, file_name)
# plot_diff(valid_pred_inverse, validY_inverse, file_name)
