In [1]:
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

from time import time
from pathlib import Path
import sys, os
from tqdm.auto import tqdm

import datetime
import numpy as np
import pandas as pd
import random

from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from scipy.stats import spearmanr
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE, RandomOverSampler

import torch
from torch.utils.data import TensorDataset, DataLoader

from pytorch_grad_cam import GradCAM, ScoreCAM, GradCAMPlusPlus, AblationCAM, XGradCAM, EigenCAM
from pytorch_grad_cam.utils.image import show_cam_on_image
from pytorch_grad_cam.utils.model_targets import ClassifierOutputTarget

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import train_test_split

np.set_printoptions(precision=3, suppress=True)

# helper package for this study
from utils import *

In [2]:
# Run on the GPU whenever CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Seed PyTorch's global random number generator.
torch.manual_seed(777)

print(device)

cuda


In [3]:
# Read the raw CSV, parse the YYYYMMDD `date` column into datetimes and
# index the frame by (date, time).
data_raw = (
    pd.read_csv("data/2.kospi200futures_clustered.csv")
    .assign(date=lambda frame: pd.to_datetime(frame.date, format='%Y%m%d'))
    .set_index(['date', 'time'])
)

# Columns removed from the model inputs, and the column used as the label.
drop_cols = ["1_updown", "5_updown", "21_updown", "close"]
target = "1_updown"

# Working copy containing only rows without missing values.
data = data_raw.dropna()

# Distinct dates present in the cleaned data.
date_list = [d for d in data.index.get_level_values("date").unique()]

In [4]:
# Number of test-window start dates to generate.
test_num = 26 * 5

# The latest window starts on 2021-12-20; every earlier one starts 14 days before
# its successor.
date = datetime.datetime.strptime("2021-12-20", "%Y-%m-%d")
window_step = datetime.timedelta(days=14)

test_start_dates = [date]
for n in range(1, test_num):
    date -= window_step
    test_start_dates.append(date)

# The list was built newest-first; flip it into chronological order.
test_start_dates.reverse()

In [5]:
# Seed Python's built-in random module.
random.seed(907)

# Window sizes forwarded to make_rolling for the validation and test splits.
valid_size = test_size = 14

# Mini-batch size and the upper bound on training epochs.
batch_size = 64
n_epochs = 1_000

# Scheduler settings handed to get_scheduler (T_max is a float: 1000 / 100 = 10.0).
T_max = n_epochs / 100

# Initial learning rate for Adam and the minimum rate given to the scheduler.
start_lr = 1e-3
min_lr = 1e-5

# Early-stopping patience: how long to wait since the last time the
# validation loss improved.
patience = 20
# Passed to train_model as `view_time`.
view_time = 4

# Text after the last underscore of each of the first 16 column names.
feature_list = [name.rsplit('_', 1)[-1] for name in data.columns[:16]]
# Every column that remains once the label/price columns are dropped.
columns_list = data.drop(columns=drop_cols).columns
# Text before the first underscore of every 16th remaining column.
num_list = [name.partition('_')[0] for name in columns_list[::16]]

# Training window: number of months, converted with 30 units per month.
train_n = 18
train_size = train_n * 30

In [6]:
%%time

# Accumulators filled by the rolling loop that follows.
# test_df collects per-window test labels and predictions.
test_df = pd.DataFrame()
# One row per window of averaged Grad-CAM values, kept separately for label 0 and label 1.
importance_0, importance_1 = (pd.DataFrame(columns=columns_list) for _ in range(2))
# Maps each window start date to its running window number.
count_df = pd.DataFrame(columns=['count'])

# Pooled true labels / predictions across all windows, used for the final metrics.
valid_true_all, valid_pred_all = [], []
test_true_all, test_pred_all = [], []

# Running window number and how many of the most recent windows to evaluate.
count, total_count = 1, 26
# Rolling evaluation over the last `total_count` test windows. For each window:
#   1. split the data with make_rolling and scale it with statistics from the training split,
#   2. train a freshly initialised CNN via train_model, validating on an oversampled
#      validation set,
#   3. record validation / test predictions,
#   4. average Grad-CAM maps over the correctly classified validation samples,
#      separately for true label 0 and true label 1.
for test_start in tqdm(test_start_dates[-total_count:]):

    print(f"{count}/{total_count} {test_start.strftime('%Y-%m-%d')} training start")

    train_data, valid_data, test_data = make_rolling(data, test_start, test_size, valid_size=valid_size, train_size=train_size)

    # Generate dataset
    train_X = train_data.drop(drop_cols, axis=1)
    train_y = train_data[target]

    valid_X = valid_data.drop(drop_cols, axis=1)
    valid_y = valid_data[target]

    test_X = test_data.drop(drop_cols, axis=1)
    test_y = test_data[target]

    test_indexs = test_data.index

    # Scaler: fitted on the training split only, then applied to all three splits.
    scaler = RobustScaler()
    scaler.fit(train_X)

    train_X = scaler.transform(train_X)
    valid_X = scaler.transform(valid_X)
    test_X = scaler.transform(test_X)

    # SMOTE over sample for validation dataset; if SMOTE cannot be applied, fall back
    # to plain random oversampling. `Exception` (not a bare except) so that
    # KeyboardInterrupt / SystemExit still stop the run.
    smote = SMOTE(random_state=0, )
    randomoversampling = RandomOverSampler(random_state=0)
    try:
        valid_X_SMOTE, valid_y_SMOTE = smote.fit_resample(valid_X, valid_y)
    except Exception as smote_error:
        print(f"SMOTE failed ({smote_error}); falling back to RandomOverSampler")
        valid_X_SMOTE, valid_y_SMOTE = randomoversampling.fit_resample(valid_X, valid_y)

    # Reshape each flat feature row into a single-channel 16x16 "image".
    train_X = train_X.reshape(-1, 1, 16, 16)
    valid_X_SMOTE = valid_X_SMOTE.reshape(-1, 1, 16, 16)
    test_X = test_X.reshape(-1, 1, 16, 16)

    # convert to tensor
    train_X_tensor = torch.FloatTensor(train_X).to(device)
    train_y_tensor = torch.LongTensor(train_y.values).to(device)

    valid_X_tensor = torch.FloatTensor(valid_X_SMOTE).to(device)
    valid_y_tensor = torch.LongTensor(valid_y_SMOTE.values).to(device)

    test_X_tensor = torch.FloatTensor(test_X).to(device)
    test_y_tensor = torch.LongTensor(test_y.values).to(device)

    model = CNN(output_size=2).to(device)
    model.apply(weight_init_uniform)

    # loss & optimizer setting
    criterion = torch.nn.CrossEntropyLoss().to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=start_lr)
    # NOTE(review): `scheduler` is created here but never handed to train_model below,
    # so unless train_model obtains it some other way the learning rate is never
    # annealed — confirm against train_model's signature.
    scheduler = get_scheduler(optimizer, T_max, min_lr)

    # make dataset and dataloader
    train_dataset = TensorDataset(train_X_tensor, train_y_tensor)
    valid_dataset = TensorDataset(valid_X_tensor, valid_y_tensor)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True)

    model, train_loss, valid_loss = train_model(model, train_loader, valid_loader, optimizer, criterion, patience, n_epochs, view_time=view_time)

    # predict on the CPU. train_model is not visible here, so put the model into
    # inference mode explicitly instead of relying on whatever mode it was left in.
    model = model.cpu()
    model.eval()

    # set datasets
    train_X_tensor = train_X_tensor.to("cpu")
    train_y_tensor = train_y_tensor.to("cpu")

    # Predictions are made on the original (not oversampled) validation rows.
    valid_X = valid_X.reshape(-1, 1, 16, 16)
    valid_X_tensor = torch.FloatTensor(valid_X)
    valid_y_tensor = torch.LongTensor(valid_y.values)

    test_X_tensor = test_X_tensor.to("cpu")
    test_y_tensor = test_y_tensor.to("cpu")

    # predict; no_grad avoids building an autograd graph over entire splits.
    with torch.no_grad():
        train_y_pred = model(train_X_tensor).argmax(dim=1).tolist()
        valid_y_pred = model(valid_X_tensor).argmax(dim=1).tolist()
        test_y_pred = model(test_X_tensor).argmax(dim=1).tolist()

    valid_true_all += valid_y.to_list()
    valid_pred_all += valid_y_pred
    test_true_all += test_y.to_list()
    test_pred_all += test_y_pred

    test_df_temp = test_y.to_frame()
    test_df_temp['pred'] = test_y_pred
    test_df_temp['count'] = count

    test_df = pd.concat([test_df, test_df_temp])

    # average of correct answers

    target_layers = model.cnn_layer
    cam = GradCAM(model=model, target_layers=target_layers)

    # Positions of validation samples whose prediction matches the true label.
    correct_idx = [i for i, x in enumerate(np.array(valid_y) == np.array(valid_y_pred)) if x]
    np.random.seed(907)

    answer_0 = []
    answer_1 = []

    for num in correct_idx:
        # `.iloc` because `num` is a position; valid_y is indexed by (date, time).
        answer = int(valid_y.iloc[num])
        X = torch.FloatTensor(valid_X[num])

        # Label 0 is explained against class 0, anything else against class 1.
        cam_class = 0 if answer == 0 else 1

        # You can also pass aug_smooth=True and eigen_smooth=True, to apply smoothing.
        grayscale_cam = cam(input_tensor=X.unsqueeze(0), targets=[ClassifierOutputTarget(cam_class)])
        # grayscale_cam has only one image in the batch:
        grayscale_cam = grayscale_cam[0, :]

        if cam_class == 0:
            answer_0.append(grayscale_cam)
        else:
            answer_1.append(grayscale_cam)

    idx = test_start.strftime('%Y-%m-%d')

    # Mean CAM per class, flattened to one value per input column. A class with no
    # correctly classified validation sample gets an all-zero row (this is the case
    # the original bare `except:` fallback covered); other errors now surface.
    importance_0.loc[idx] = np.mean(answer_0, axis=0).reshape(-1) if answer_0 else 0
    importance_1.loc[idx] = np.mean(answer_1, axis=0).reshape(-1) if answer_1 else 0

    count_df.loc[idx] = count
    count += 1
    print("")

# Attach the window number to each importance table (aligned on the window-start index).
# Assign the 'count' Series explicitly rather than the whole single-column DataFrame.
importance_0['count'] = count_df['count']
importance_1['count'] = count_df['count']

# Metrics over the predictions pooled from every window.
val_acc = accuracy_score(valid_true_all, valid_pred_all)
val_f1 = f1_score(valid_true_all, valid_pred_all, average='macro')

test_acc = accuracy_score(test_true_all, test_pred_all)
test_f1 = f1_score(test_true_all, test_pred_all, average='macro')

print(f"{train_n} months")
print(f'Valid Acc : {val_acc*100:.2f}% Valid F1 : {val_f1*100:.2f}%')
print(f'Test Acc : {test_acc*100:.2f}% Test F1 : {test_f1*100:.2f}%')
print("")
print("Valid Confusion Matrix")
print(confusion_matrix(valid_true_all, valid_pred_all))
print("")
print("Test Confusion Matrix")
print(confusion_matrix(test_true_all, test_pred_all))
print("")

# Make sure the output directory exists so the results of the long training run
# are not lost to a missing-folder error at save time.
Path("importance").mkdir(parents=True, exist_ok=True)

importance_0.to_csv("importance/importance_0.csv")
importance_1.to_csv("importance/importance_1.csv")
test_df.to_csv("importance/pred.csv")

  0%|          | 0/26 [00:00<?, ?it/s]

1/26 2021-01-04 training start
[   5/1000] train_loss: 0.68977 train acc: 0.53 valid_loss: 0.69877 valid acc: 0.49
[  10/1000] train_loss: 0.68813 train acc: 0.53 valid_loss: 0.70170 valid acc: 0.47
[  15/1000] train_loss: 0.68303 train acc: 0.54 valid_loss: 0.71570 valid acc: 0.47
[  20/1000] train_loss: 0.67648 train acc: 0.54 valid_loss: 0.70641 valid acc: 0.47
[  25/1000] train_loss: 0.67023 train acc: 0.55 valid_loss: 0.70069 valid acc: 0.48
[  30/1000] train_loss: 0.66621 train acc: 0.55 valid_loss: 0.69253 valid acc: 0.50
[  35/1000] train_loss: 0.65403 train acc: 0.56 valid_loss: 0.70272 valid acc: 0.50
[  40/1000] train_loss: 0.64378 train acc: 0.57 valid_loss: 0.69225 valid acc: 0.50
[  45/1000] train_loss: 0.62458 train acc: 0.57 valid_loss: 0.69876 valid acc: 0.51

2/26 2021-01-18 training start
[   5/1000] train_loss: 0.69045 train acc: 0.53 valid_loss: 0.68873 valid acc: 0.51
[  10/1000] train_loss: 0.68832 train acc: 0.54 valid_loss: 0.69639 valid acc: 0.51
[  15/1000] t