In [None]:
!pip install scikit-learn==1.1 --user

In [None]:
import sklearn
# Display the version of scikit-learn that the kernel actually imported.
sklearn.__version__

In [None]:
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

import os

import pandas as pd
import torch
import numpy as np
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
# Setting this option to None turns off pandas' chained-assignment
# (SettingWithCopy) warning for the rest of the notebook.
pd.options.mode.chained_assignment = None

# fix_seed(0)

# Feature Engineering

In [None]:
# Observation: the recording is not continuous; the signal breaks off abruptly in places.
sc = MinMaxScaler()  # NOTE(review): created here but not used in this cell
file = "../../data/RAW_EEG/sub-01/sub-01_ses-01.csv"
tmp = pd.read_csv(file)
fig = go.Figure()
# `go.Line` is a deprecated plotly class; a line trace is drawn with
# `go.Scatter(mode="lines")` instead.
fig.add_trace(go.Scatter(x=tmp.index, y=tmp["A1"], mode="lines", name="A1"))
# The event column is multiplied by 100 before plotting
# (presumably so it is visible next to the signal on the shared axis).
fig.add_trace(go.Scatter(x=tmp.index, y=tmp["event"] * 100, mode="lines", name="event"))
fig.show()

In [None]:
# Observation: the value range differs from column to column.
# Show the minimum and maximum of every column of `tmp`.
tmp.agg(["min","max"])

In [None]:
# Observation: the value range also differs from file to file.
# Plot column A1 of two sessions of the same subject for comparison.
pd.read_csv("../../data/RAW_EEG/sub-01/sub-01_ses-01.csv")["A1"].plot()
pd.read_csv("../../data/RAW_EEG/sub-01/sub-01_ses-02.csv")["A1"].plot()

In [None]:
# Compare the amount of change between files:
# plot the first difference of column A2 for two sessions of the same subject.
pd.read_csv("../../data/RAW_EEG/sub-01/sub-01_ses-01.csv")["A2"].diff(1).plot()
pd.read_csv("../../data/RAW_EEG/sub-01/sub-01_ses-02.csv")["A2"].diff(1).plot()

In [None]:
# Observation: the per-column first differences contain outliers; roughly -50..50
# looks like a reasonable range to treat as normal.
sns.set(style="whitegrid")
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(12, 12))
# One box plot per channel A1..A9, filling the 3x3 grid row by row.
for channel, ax in zip([f"A{n}" for n in range(1, 10)], axes.ravel()):
    sns.boxplot(tmp[channel].diff(1), ax=ax)
    ax.set_title(channel)
plt.show()

# Convert the data to first differences. Every column except the last one is
# differenced; the last column is treated as the label and left as is.
# The differences are written back row for row, and only afterwards are the rows
# that `diff` left as NaN dropped from the whole frame, so the signal columns and
# the label column always keep the same number of rows and stay on the same row.
signal_columns = tmp.columns[:-1]
tmp[signal_columns] = tmp[signal_columns].diff(1)
tmp = tmp.dropna(subset=signal_columns).reset_index(drop=True)
tmp["A1"].plot()

In [None]:
# Original note (translated): check the amount of change again after removing
# outliers per column and min-max normalising.
# NOTE(review): this cell performs neither outlier removal nor scaling; `sc` is
# created but never applied, and the plots show diff(1) of `tmp` as it currently stands.
sc = MinMaxScaler()
sns.set(style="whitegrid")
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(12, 12))

# `axes.flat` walks the 3x3 grid row by row, matching the channel order A1..A9.
channel_names = ["A1", "A2", "A3", "A4", "A5", "A6", "A7", "A8", "A9"]
for ax, channel_name in zip(axes.flat, channel_names):
    sns.boxplot(tmp[channel_name].diff(1), ax=ax)
plt.show()

# Preprocessing

In [None]:
# Preprocessing plan (translated from the original note):
#   0. read the files
#   1. convert the data (label excluded) to first differences
#   2. remove outliers
#   3. normalise per file and per column (StandardScaler)
#   4. min-max normalise the concatenated data
# This cell only performs step 0, plus a row filter on column A1.
SESSION_FILES = [
    "../../data/RAW_EEG/sub-04/sub-04_ses-01.csv",
    "../../data/RAW_EEG/sub-04/sub-04_ses-02.csv",
    "../../data/RAW_EEG/sub-04/sub-04_ses-03.csv",
]
# Rows are kept only when A1 is strictly greater than this value.
# NOTE(review): presumably a missing-sample marker in the raw export; confirm.
A1_LOWER_BOUND = -999999999

session_frames = []
for csv_path in SESSION_FILES:
    tmp_df = pd.read_csv(csv_path)
    tmp_df = tmp_df[tmp_df["A1"] > A1_LOWER_BOUND]
    session_frames.append(tmp_df)

# Concatenate once at the end instead of growing the frame inside the loop
# (every in-loop concat copies everything accumulated so far).
df = pd.concat(session_frames).reset_index(drop=True)
df

In [None]:
# Original note (translated): 234440, 234440, 119140; the first start within a file is 3841.
# Row-to-row change of the `event` column; the first row has no predecessor and is dropped.
tmp = df["event"].diff(1).dropna()
# Index labels of the rows whose event value differs from the previous row.
label_change_rows = tmp[tmp != 0].index.to_list()
# NOTE: despite its name, `seq_lengths` holds boundary positions (0 followed by
# every change row), not lengths.
seq_lengths = torch.tensor([0] + label_change_rows)


In [None]:
# Turn consecutive boundary positions into one (start, length) row per segment:
# the start is a boundary and the length is the distance to the next boundary.
segment_starts = seq_lengths[:-1]
segment_ends = seq_lengths[1:]
diff_seq_lengths = segment_ends - segment_starts
seq_indices = torch.stack((segment_starts, diff_seq_lengths), dim=-1)

In [None]:
# Number of (start, length) rows currently held in `seq_indices`.
len(seq_indices)

In [None]:
# Keep only the rows whose length (last column) is an exact multiple of 1153.
# NOTE(review): 1153 is presumably the number of samples in one trial; confirm.
is_multiple_of_1153 = (seq_indices[:, -1] % 1153) == 0
seq_indices = seq_indices[is_multiple_of_1153]
len(seq_indices)

In [None]:
# Split every (start, length) row into consecutive pieces of exactly 1153 samples.
SEGMENT_LEN = 1153

# `.tolist()` yields plain Python ints, so `result` holds numbers rather than 0-d tensors.
result = [
    [start + part * SEGMENT_LEN, SEGMENT_LEN]
    for start, length in seq_indices.tolist()
    for part in range(length // SEGMENT_LEN)
]
# The explicit dtype and reshape keep this an (N, 2) integer tensor even when
# `result` is empty (a bare torch.tensor([]) would be a 1-D float tensor).
seq_indices = torch.tensor(result, dtype=torch.long).reshape(-1, 2)
# NOTE: Tensor.numpy() shares memory with the tensor, so any in-place write to
# `np_seq_indices` is also visible through `seq_indices`.
np_seq_indices = seq_indices.numpy()


In [None]:
import numpy as np
import pandas as pd

# Lag of the difference, and also the number of leading rows dropped from every
# segment (those rows have no predecessor inside the segment, so their diff is NaN).
window_size = 1

df_list_diff = []
df_list_y = []
cur_idx = 0  # running row offset into the concatenated output

# NOTE: `np_seq_indices` is overwritten in place below, so this cell must not be
# re-run on its own; re-run the cell that builds `np_seq_indices` first.
for i in range(len(np_seq_indices)):
    start = np_seq_indices[i][0]
    end = start + np_seq_indices[i][1]

    # Difference inside the segment only, so no difference is ever taken across
    # a segment boundary. The last column is the label and is not differenced.
    diff = df.iloc[start:end, :-1].diff(window_size).iloc[window_size:]
    y = df.iloc[start:end, -1:].iloc[window_size:]

    # Re-base the segment onto the concatenated output: it now starts at
    # `cur_idx` and is as long as the rows actually kept.
    add_len = len(diff)
    np_seq_indices[i] = (cur_idx, add_len)
    cur_idx = cur_idx + add_len

    df_list_diff.append(diff)
    df_list_y.append(y)

df_diff = pd.concat(df_list_diff)
df_y = pd.concat(df_list_y)

new_column_names = ['diff_' + name for name in df.columns[:-1]]
df_diff.columns = new_column_names

# Later cells address the differenced data through `seq_indices`. Publish the
# re-based (start, length) pairs under that name explicitly instead of relying on
# `seq_indices` and `np_seq_indices` happening to share memory.
seq_indices = torch.from_numpy(np_seq_indices)

In [None]:
# Place the differenced feature columns and the label column side by side;
# the label ends up as the last column of `df_combined`.
df_combined = pd.concat([df_diff, df_y], axis=1)

In [None]:
# Scale every column except the last one by that column's mean absolute value.
# NOTE(review): the scale is computed over all rows, i.e. before the train/test
# split performed further down.
feature_block = df_combined.iloc[:, :-1]
mean_abs_per_column = feature_block.abs().mean()
df_combined.iloc[:, :-1] = feature_block.div(mean_abs_per_column)

In [None]:
# Longest and shortest segment length (second column of `seq_indices`).
segment_lengths = seq_indices[:, 1]
max_seq_size = segment_lengths.max()
min_len = segment_lengths.min()
print(max_seq_size)
print(min_len)

In [None]:
from sklearn.model_selection import train_test_split
def stratified_sequence_train_test_split(seq_indices, y, test_size=0.2, shuffle=True, random_state=None):
    """Split sequences into train and test sets separately for every label.

    The label of a sequence is the value of ``y`` at the sequence's start index
    (the first element of its entry in ``seq_indices``). The sequences of each
    label are split on their own with ``train_test_split`` and the per-label
    results are concatenated in ascending label order.

    Args:
        seq_indices: Iterable of sequence descriptors whose first element is the
            start index into ``y`` (e.g. ``(start, length)`` pairs).
        y: Label container indexable by a start index.
        test_size: Forwarded unchanged to ``train_test_split``.
        shuffle: Forwarded unchanged to ``train_test_split``.
        random_state: Forwarded unchanged to ``train_test_split``.

    Returns:
        ``(train_seq_indices, test_seq_indices)``: two lists holding the original
        entries of ``seq_indices``.
    """
    # Bucket the sequences by label in a single pass instead of rescanning every
    # sequence once per label. Insertion order inside a bucket is the input order.
    sequences_by_label = {}
    for seq in seq_indices:
        label = y[seq[0]]
        # Use a plain Python scalar as the key so equal labels share one bucket
        # whatever array type `y` is.
        if hasattr(label, "item"):
            label = label.item()
        sequences_by_label.setdefault(label, []).append(seq)

    train_seq_indices = []
    test_seq_indices = []

    # Ascending label order keeps the output order stable and independent of
    # the order in which labels first appear.
    for label in sorted(sequences_by_label):
        train_part, test_part = train_test_split(
            sequences_by_label[label],
            test_size=test_size,
            shuffle=shuffle,
            random_state=random_state,
        )
        train_seq_indices += train_part
        test_seq_indices += test_part

    return train_seq_indices, test_seq_indices


In [None]:
# Separate the feature matrix (every column but the last) from the label vector
# (the last column), then split the sequences label by label.
# shuffle=False is passed through to train_test_split for every label.
X, y = df_combined.iloc[:,:-1].values, df_combined.iloc[:,-1].values
train_seq_indices, test_seq_indices = stratified_sequence_train_test_split(seq_indices, y, test_size=0.2, shuffle=False)


# Training

In [None]:
import os
import sys
# The parent directory is added to the import path before the project imports
# below (presumably that is where the `ccnets` and `nn` packages live).
# `path_append` is also reused later as the prefix of the output paths.
path_append = "../"
sys.path.append(path_append)  # Go up one directory from where you are.
import torch
from ccnets.config import get_parser
from ccnets.ccnets import CCNets
from ccnets.resnets import ResNets
from ccnets.utils.loader import save_dataset, load_dataset
from nn.transformer import TFEncoder, TFDecoder
from ccnets.utils.log import create_log_details, create_log_name
from ccnets.utils.setting import set_random_seed

from torch.utils.tensorboard import SummaryWriter

In [None]:
import torch
import random

class SequenceDataset(torch.utils.data.Dataset):
    """Dataset whose items are contiguous slices of ``x`` and ``y``.

    Args:
        x: Sliceable container of features.
        y: Sliceable container of labels, indexed the same way as ``x``.
        seq_lengths: Sequence of ``(start, length)`` pairs; item ``i`` is
            ``x[start:start + length]`` and ``y[start:start + length]``.
            (The parameter name is kept for backward compatibility.)
        min_len: Optional value stored as ``self.min_len``. When omitted it is
            the shortest ``length`` among the given pairs (0 if there are none),
            so the class no longer depends on a notebook-level global.
    """

    def __init__(self, x, y, seq_lengths, min_len=None):
        self.x = []
        self.y = []
        lengths = []
        for pair in seq_lengths:
            # int() accepts Python ints, NumPy integers and 0-d integer tensors alike.
            start = int(pair[0])
            length = int(pair[1])
            stop = start + length
            self.x.append(x[start:stop])
            self.y.append(y[start:stop])
            lengths.append(length)
        if min_len is None:
            min_len = min(lengths, default=0)
        self.min_len = min_len

    def __len__(self):
        """Return the number of sequences."""
        return len(self.x)

    def __getitem__(self, index):
        """Return the ``(x_slice, y_slice)`` pair of sequence ``index``."""
        return self.x[index], self.y[index]

In [None]:
# Build the run configuration object from the project's parser.
args = get_parser()
# Select the first CUDA device only when CUDA is available and the configuration
# asks for at least one GPU; otherwise run on the CPU.
cuda_requested_and_available = torch.cuda.is_available() and args.ngpu > 0
args.device = torch.device('cuda:0' if cuda_requested_and_available else "cpu")

In [None]:
import IPython
from pathlib import Path

# The notebook's own path is read from the variable `__vsc_ipynb_file__` in the
# calling namespace.
# NOTE(review): this variable appears to be provided by VS Code's notebook
# integration; the lookup raises KeyError when it is absent.
file_path = IPython.extract_module_locals()[1]['__vsc_ipynb_file__']
file_name = Path(file_path).stem
model_path = path_append + f"models/{file_name}/"
temp_path = path_append + f"models/{'temp_'}{file_name}/"
log_path = path_append + f"log/{file_name}/"

# parents=True also creates a missing `models/` or `log/` parent directory (a bare
# os.mkdir would fail there); exist_ok=True makes re-running the cell harmless.
for directory in (temp_path, model_path, log_path):
    Path(directory).mkdir(parents=True, exist_ok=True)

args.model_path = model_path
args.temp_path = temp_path

In [None]:
# Datasets are cached under `data_path`. They are loaded from there only when
# `load_data` is set and the directory exists; otherwise they are rebuilt from
# X / y and saved again.
data_path = path_append + f"data/custom_dataset/{file_name}/"
load_data = False
trainset = testset = None
if load_data and os.path.isdir(data_path):
    trainset, testset = load_dataset(data_path)
else:
    trainset = SequenceDataset(X, y, train_seq_indices)
    testset = SequenceDataset(X, y, test_seq_indices)
    save_dataset(trainset, testset, data_path)

In [None]:
# Hyperparameters for the training runs below, set as attributes on `args`.
# NOTE(review): what each attribute controls is defined by the project's ccnets
# code, which is not visible in this notebook.
args.num_epoch = 5000
args.batch_size = 64
args.step_size = 20

args.num_layer = 3
args.hidden_size = 256 
args.lr = 2e-4

# NOTE(review): obs_size presumably has to match the number of feature columns
# in X, and label_size the number of distinct event labels; confirm.
args.obs_size = 128
args.label_size = 14
args.explain_size = 14
# NOTE(review): seq_len is 60, while the segments built earlier in this notebook
# are 1153 rows long before differencing; confirm how the model consumes them.
args.seq_len = 60

args.num_checkpoints = 50
args.use_one_hot = True

args.use_reasoner_swap_inputs = False
args.use_producer_swap_inputs = True    

args.reasoner_joint_type = "none"
args.producer_joint_type = "none"

args.label_type = "UC" 

args.obs_fn = "none"
args.label_fn = "softmax"

args.use_report = True

In [None]:
# First run: a ResNets model with loss_type "MSE".
args.loss_type = "MSE"
args.loss_reduction = "all"

# The TensorBoard log directory name is built from `log_path` and the details
# derived from the current `args`, so the loss settings above must be set first.
log_details = create_log_details(args)
args.log = SummaryWriter(log_dir=create_log_name(log_path, log_details))

# Seed right before the model is constructed and trained.
set_random_seed(0)
resnets = ResNets(args, TFEncoder, TFDecoder)
resnets.train(trainset, testset = testset)

In [None]:
# Second run: a CCNets model with loss_type "L1". This reuses the same `args`
# object, so every setting not overridden here carries over from the cells above.
args.loss_type = "L1"
args.error_type = "Sub" 
args.loss_reduction = "all"
args.error_reduction = "none"

# A new TensorBoard writer is created for this run; its directory name is built
# from `log_path` and the details derived from the updated `args`.
log_details = create_log_details(args)
args.log = SummaryWriter(log_dir=create_log_name(log_path, log_details))

# Same seed as the previous run.
set_random_seed(0)
# CCNets is given one encoder class and two decoder classes.
ccnets = CCNets(args, TFEncoder, TFDecoder, TFDecoder)
# ccnets.load_models()
ccnets.train(trainset, testset = testset)