In [1]:
import gc
import os
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import pickle
from sklearn.utils import resample
from collections import Counter
import math
from sklearn.model_selection import train_test_split
import helper
import dataset
import torch
import torch.nn as nn
from resnet1d.net1d import Net1D

In [2]:
data_files = [name for name in os.listdir('./data')]

dfs = []

for i in range(len(data_files)):
    fname = './data/' + data_files[i]
    df = pd.read_csv(fname)
    # df.columns = [x.strip().lstrip() for x in df.columns]

    dfs.append(df)

df = pd.concat(dfs, axis=0, ignore_index=True)

df.shape

(2830743, 79)

In [3]:
# Remove spaces in the front and the end of the column names for better human reading
df.columns = [x.lstrip().strip().replace('�', '-') for x in df.columns]
df.shape

(2830743, 79)

In [4]:
# replace inf values to nan
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)
df.shape

(2827876, 79)

In [5]:
# remove this when experimenting on the whole dataset
df = resample(df, replace=False, n_samples=20000, stratify=df['Label'])

In [6]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

pca__n_components = 20

preprocessor = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("normalize", MinMaxScaler()),
    ("PCA", PCA(n_components=pca__n_components))
])

label_encoder = LabelEncoder()

In [7]:
X = df.drop(['Label'], axis=1)
y = df['Label']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y)

X_train = preprocessor.fit_transform(X_train)
X_val = preprocessor.transform(X_val)

# doing this since we need all labels in case there's any labels not found in test set
label_encoder.fit(y)

y_train = label_encoder.transform(y_train)
y_val = label_encoder.transform(y_val)

In [8]:
X_smote, _ = helper.smote_sampling(X_train, y_train)
X_gaussion = helper.add_gaussion(X_smote)
X_flip = helper.invert_array(X_train)

X_new = np.dstack([X_smote, X_gaussion, X_flip])
X_new = X_new.transpose((0, 2, 1))

X_new.shape

(16000, 3, 20)

In [9]:
# batch, channel, feature
X_train = np.expand_dims(X_train, axis=2).transpose((0, 2, 1))
X_train.shape

(16000, 1, 20)

In [10]:
org_dataset = dataset.NumpyDataset(X_train, y_train)
pos_dataset = dataset.NumpyDataset(X_new, y_train)

In [11]:
import time
import warnings
import optuna
from torch.utils.data import DataLoader
from pytorch_metric_learning.losses import SelfSupervisedLoss, NTXentLoss
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate, StratifiedKFold
from models import Extractor, Projector, ExtractorMLP
from models import ExtractorMLP
from itertools import cycle
from torch.utils.tensorboard import SummaryWriter


device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
warnings.filterwarnings('ignore')
verbose = False

def objective(trial: optuna.Trial):
    start_time = time.perf_counter()

    hidden_size = trial.suggest_int('hidden_size', 16, 64) 
    embedding_size = trial.suggest_int('embedding_size', 16, 64)
    n_hidden = trial.suggest_int('n_hidden', 2, 5)
    kernel_size = trial.suggest_int('kernel_size', 3, 15)
    stride = trial.suggest_int('stride', 1, 3)
    act = trial.suggest_categorical('act', ['relu', 'swish'])

    params = {
        'hidden_size': hidden_size,
        'embedding_size': embedding_size,
        'n_hidden': n_hidden,
        'kernel_size': kernel_size,
        'stride': stride,
    }

    print(params)

    extractor_1d = ExtractorMLP(n_features=pca__n_components, 
                            n_channels=1,
                            embedding_size=embedding_size,
                            hidden_size=hidden_size,
                            n_hidden=n_hidden,
                            kernel_size=kernel_size, 
                            stride=stride,
                            act=act
                            ).to(device)

    extractor_3d = ExtractorMLP(n_features=pca__n_components, 
                            n_channels=X_new.shape[1],
                            embedding_size=embedding_size,
                            hidden_size=hidden_size,
                            n_hidden=n_hidden,
                            kernel_size=kernel_size, 
                            stride=stride,
                            act=act,
                            ).to(device)

    projector = Projector(embedding_size=embedding_size).to(device)

    lr = 1e-3
    weight_decay = 1e-4
    batch_size = 1024
    epochs = 200

    org_dataloader = DataLoader(org_dataset, batch_size=batch_size,)
    pos_dataloader = DataLoader(pos_dataset, batch_size=batch_size)

    optimizer_1d = torch.optim.Adam(extractor_1d.parameters(), lr=lr, weight_decay=weight_decay)
    optimizer_3d = torch.optim.Adam(extractor_3d.parameters(), lr=lr, weight_decay=weight_decay)
    optimizer_projector = torch.optim.Adam(projector.parameters(), lr=lr, weight_decay=weight_decay)

    loss_fn = NTXentLoss()
    loss_fn = SelfSupervisedLoss(loss_fn, symmetric=False).to(device)

    writer = SummaryWriter(log_dir='./runs')

    cnt = 1

    for epoch in range(epochs):
        total_loss = 0
        for item1, item2 in zip(org_dataloader, cycle(pos_dataloader)):
            X_batch, _ = item1
            X_new_batch, _ = item2

            X_batch = X_batch.to(device)
            X_new_batch = X_new_batch.to(device)

            optimizer_1d.zero_grad()
            optimizer_3d.zero_grad()
            optimizer_projector.zero_grad()

            embedding_1d = extractor_1d(X_batch)
            embedding_3d = extractor_3d(X_new_batch)
            projected_1d = projector(embedding_1d)
            projected_3d = projector(embedding_3d)

            loss = loss_fn(projected_1d, projected_3d)
            loss.backward()

            total_loss += loss.item()
        
            optimizer_1d.step()
            optimizer_3d.step()
            optimizer_projector.step()

        cnt += 1

        if verbose: print(f'[{epoch}/{epochs}]: {total_loss}')
        
        writer.add_scalar(f'Loss/train-{(hidden_size, n_hidden, embedding_size, kernel_size, stride)}', total_loss, cnt)
                        
    extractor_1d.eval()
    X_val_tensor = torch.tensor(np.expand_dims(X_val, axis=2).transpose((0, 2, 1)), dtype=torch.float32, device=device)

    X_val_embedding = extractor_1d(X_val_tensor)
    X_val_embedding = X_val_embedding.cpu().detach().numpy()

    mlp = MLPClassifier(max_iter=750, verbose=0)
    results = cross_validate(mlp, X_val_embedding, y_val, cv=5, scoring='f1_macro')

    end_time = time.perf_counter()
    elapsed_time = end_time - start_time
    print(f'elapsed {elapsed_time//60}:{elapsed_time % 60}')

    return np.average(results['test_score'])
            

In [12]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

[I 2023-11-14 14:39:30,397] A new study created in memory with name: no-name-efa506b8-9217-4b9a-b132-df2e9dd92d45


{'hidden_size': 19, 'embedding_size': 27, 'n_hidden': 2, 'kernel_size': 4, 'stride': 1}


[I 2023-11-14 14:48:16,689] Trial 0 finished with value: 0.49807622462377477 and parameters: {'hidden_size': 19, 'embedding_size': 27, 'n_hidden': 2, 'kernel_size': 4, 'stride': 1, 'act': 'relu'}. Best is trial 0 with value: 0.49807622462377477.


elapsed 8.0:46.28999856700102
{'hidden_size': 58, 'embedding_size': 47, 'n_hidden': 5, 'kernel_size': 8, 'stride': 2}


[I 2023-11-14 14:56:27,699] Trial 1 finished with value: 0.08783164666659811 and parameters: {'hidden_size': 58, 'embedding_size': 47, 'n_hidden': 5, 'kernel_size': 8, 'stride': 2, 'act': 'swish'}. Best is trial 0 with value: 0.49807622462377477.


elapsed 8.0:11.007475680000425
{'hidden_size': 31, 'embedding_size': 17, 'n_hidden': 5, 'kernel_size': 6, 'stride': 2}


[I 2023-11-14 15:04:41,718] Trial 2 finished with value: 0.08783164666659811 and parameters: {'hidden_size': 31, 'embedding_size': 17, 'n_hidden': 5, 'kernel_size': 6, 'stride': 2, 'act': 'swish'}. Best is trial 0 with value: 0.49807622462377477.


elapsed 8.0:14.017650478001087
{'hidden_size': 64, 'embedding_size': 50, 'n_hidden': 2, 'kernel_size': 13, 'stride': 2}


[I 2023-11-14 15:13:23,512] Trial 3 finished with value: 0.5200893762085674 and parameters: {'hidden_size': 64, 'embedding_size': 50, 'n_hidden': 2, 'kernel_size': 13, 'stride': 2, 'act': 'swish'}. Best is trial 3 with value: 0.5200893762085674.


elapsed 8.0:41.79214929099908
{'hidden_size': 34, 'embedding_size': 51, 'n_hidden': 5, 'kernel_size': 15, 'stride': 2}


[I 2023-11-14 15:21:36,505] Trial 4 finished with value: 0.08783164666659811 and parameters: {'hidden_size': 34, 'embedding_size': 51, 'n_hidden': 5, 'kernel_size': 15, 'stride': 2, 'act': 'swish'}. Best is trial 3 with value: 0.5200893762085674.


elapsed 8.0:12.991294909999851
{'hidden_size': 54, 'embedding_size': 18, 'n_hidden': 5, 'kernel_size': 11, 'stride': 3}


[I 2023-11-14 15:30:07,966] Trial 5 finished with value: 0.183586205169287 and parameters: {'hidden_size': 54, 'embedding_size': 18, 'n_hidden': 5, 'kernel_size': 11, 'stride': 3, 'act': 'relu'}. Best is trial 3 with value: 0.5200893762085674.


elapsed 8.0:31.458900047000498
{'hidden_size': 46, 'embedding_size': 29, 'n_hidden': 2, 'kernel_size': 7, 'stride': 2}


[I 2023-11-14 15:38:54,594] Trial 6 finished with value: 0.528464718689946 and parameters: {'hidden_size': 46, 'embedding_size': 29, 'n_hidden': 2, 'kernel_size': 7, 'stride': 2, 'act': 'swish'}. Best is trial 6 with value: 0.528464718689946.


elapsed 8.0:46.626174796998384
{'hidden_size': 16, 'embedding_size': 46, 'n_hidden': 2, 'kernel_size': 12, 'stride': 2}


[I 2023-11-14 15:47:34,824] Trial 7 finished with value: 0.32517044665881223 and parameters: {'hidden_size': 16, 'embedding_size': 46, 'n_hidden': 2, 'kernel_size': 12, 'stride': 2, 'act': 'swish'}. Best is trial 6 with value: 0.528464718689946.


elapsed 8.0:40.228034413001296
{'hidden_size': 51, 'embedding_size': 26, 'n_hidden': 2, 'kernel_size': 10, 'stride': 3}


[I 2023-11-14 15:56:18,360] Trial 8 finished with value: 0.5798202281961586 and parameters: {'hidden_size': 51, 'embedding_size': 26, 'n_hidden': 2, 'kernel_size': 10, 'stride': 3, 'act': 'relu'}. Best is trial 8 with value: 0.5798202281961586.


elapsed 8.0:43.53411262699956
{'hidden_size': 59, 'embedding_size': 58, 'n_hidden': 4, 'kernel_size': 12, 'stride': 2}


[I 2023-11-14 16:04:28,866] Trial 9 finished with value: 0.08783164666659811 and parameters: {'hidden_size': 59, 'embedding_size': 58, 'n_hidden': 4, 'kernel_size': 12, 'stride': 2, 'act': 'swish'}. Best is trial 8 with value: 0.5798202281961586.


elapsed 8.0:10.504187264003122
{'hidden_size': 45, 'embedding_size': 32, 'n_hidden': 3, 'kernel_size': 9, 'stride': 3}


[I 2023-11-14 16:13:14,656] Trial 10 finished with value: 0.4342104249272367 and parameters: {'hidden_size': 45, 'embedding_size': 32, 'n_hidden': 3, 'kernel_size': 9, 'stride': 3, 'act': 'relu'}. Best is trial 8 with value: 0.5798202281961586.


elapsed 8.0:45.78837087899956
{'hidden_size': 47, 'embedding_size': 33, 'n_hidden': 3, 'kernel_size': 7, 'stride': 3}


[I 2023-11-14 16:21:59,326] Trial 11 finished with value: 0.3996741128504332 and parameters: {'hidden_size': 47, 'embedding_size': 33, 'n_hidden': 3, 'kernel_size': 7, 'stride': 3, 'act': 'relu'}. Best is trial 8 with value: 0.5798202281961586.


elapsed 8.0:44.667964407999534
{'hidden_size': 49, 'embedding_size': 27, 'n_hidden': 3, 'kernel_size': 5, 'stride': 1}


[I 2023-11-14 16:30:46,173] Trial 12 finished with value: 0.41234262878895045 and parameters: {'hidden_size': 49, 'embedding_size': 27, 'n_hidden': 3, 'kernel_size': 5, 'stride': 1, 'act': 'relu'}. Best is trial 8 with value: 0.5798202281961586.


elapsed 8.0:46.84529891499915
{'hidden_size': 40, 'embedding_size': 38, 'n_hidden': 2, 'kernel_size': 10, 'stride': 3}


[I 2023-11-14 16:39:31,788] Trial 13 finished with value: 0.501728769544843 and parameters: {'hidden_size': 40, 'embedding_size': 38, 'n_hidden': 2, 'kernel_size': 10, 'stride': 3, 'act': 'relu'}. Best is trial 8 with value: 0.5798202281961586.


elapsed 8.0:45.61302098899978
{'hidden_size': 39, 'embedding_size': 24, 'n_hidden': 4, 'kernel_size': 3, 'stride': 1}


[I 2023-11-14 16:47:54,375] Trial 14 finished with value: 0.16755391562849747 and parameters: {'hidden_size': 39, 'embedding_size': 24, 'n_hidden': 4, 'kernel_size': 3, 'stride': 1, 'act': 'swish'}. Best is trial 8 with value: 0.5798202281961586.


elapsed 8.0:22.584948515999713
{'hidden_size': 52, 'embedding_size': 38, 'n_hidden': 3, 'kernel_size': 8, 'stride': 3}


[I 2023-11-14 16:56:50,405] Trial 15 finished with value: 0.39097073310225344 and parameters: {'hidden_size': 52, 'embedding_size': 38, 'n_hidden': 3, 'kernel_size': 8, 'stride': 3, 'act': 'relu'}. Best is trial 8 with value: 0.5798202281961586.


elapsed 8.0:56.02818679200209
{'hidden_size': 31, 'embedding_size': 24, 'n_hidden': 2, 'kernel_size': 6, 'stride': 3}


[I 2023-11-14 17:05:43,037] Trial 16 finished with value: 0.44264491708866444 and parameters: {'hidden_size': 31, 'embedding_size': 24, 'n_hidden': 2, 'kernel_size': 6, 'stride': 3, 'act': 'swish'}. Best is trial 8 with value: 0.5798202281961586.


elapsed 8.0:52.63005767100185
{'hidden_size': 44, 'embedding_size': 32, 'n_hidden': 2, 'kernel_size': 10, 'stride': 1}


[I 2023-11-14 17:14:35,045] Trial 17 finished with value: 0.569318626718968 and parameters: {'hidden_size': 44, 'embedding_size': 32, 'n_hidden': 2, 'kernel_size': 10, 'stride': 1, 'act': 'relu'}. Best is trial 8 with value: 0.5798202281961586.


elapsed 8.0:52.006910266001796
{'hidden_size': 24, 'embedding_size': 34, 'n_hidden': 4, 'kernel_size': 14, 'stride': 1}


[I 2023-11-14 17:23:30,698] Trial 18 finished with value: 0.3209428212090625 and parameters: {'hidden_size': 24, 'embedding_size': 34, 'n_hidden': 4, 'kernel_size': 14, 'stride': 1, 'act': 'relu'}. Best is trial 8 with value: 0.5798202281961586.


elapsed 8.0:55.649752036999416
{'hidden_size': 42, 'embedding_size': 43, 'n_hidden': 3, 'kernel_size': 10, 'stride': 1}


[I 2023-11-14 17:32:24,956] Trial 19 finished with value: 0.5115163412331303 and parameters: {'hidden_size': 42, 'embedding_size': 43, 'n_hidden': 3, 'kernel_size': 10, 'stride': 1, 'act': 'relu'}. Best is trial 8 with value: 0.5798202281961586.


elapsed 8.0:54.25632870000118


In [13]:
from torch.utils.data import DataLoader
from pytorch_metric_learning.losses import SelfSupervisedLoss, NTXentLoss
from models import Extractor, Projector, ExtractorMLP

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# extractor_1d = Extractor(n_features=pca__n_components, n_channels=1).to(device)
# extractor_3d = Extractor(n_features=pca__n_components, n_channels=3).to(device)

extractor_1d = ExtractorMLP(n_features=pca__n_components, n_channels=1).to(device)
extractor_3d = Extractor(n_features=pca__n_components, n_channels=X_new.shape[1]).to(device)


projector = Projector().to(device)

lr = 1e-3
weight_decay = 1e-4
batch_size = 1024
epochs = 200

org_dataloader = DataLoader(org_dataset, batch_size=batch_size,)
pos_dataloader = DataLoader(pos_dataset, batch_size=batch_size)

optimizer_1d = torch.optim.Adam(extractor_1d.parameters(), lr=lr, weight_decay=weight_decay)
optimizer_3d = torch.optim.Adam(extractor_3d.parameters(), lr=lr, weight_decay=weight_decay)
optimizer_projector = torch.optim.Adam(projector.parameters(), lr=lr, weight_decay=weight_decay)

loss_fn = NTXentLoss()
loss_fn = SelfSupervisedLoss(loss_fn, symmetric=False).to(device)

In [14]:
from itertools import cycle
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir='./runs')

cnt = 1

for epoch in range(epochs):
    total_loss = 0
    for item1, item2 in zip(org_dataloader, cycle(pos_dataloader)):
        X_batch, _ = item1
        X_new_batch, _ = item2

        X_batch = X_batch.to(device)
        X_new_batch = X_new_batch.to(device)

        optimizer_1d.zero_grad()
        optimizer_3d.zero_grad()
        optimizer_projector.zero_grad()

        embedding_1d = extractor_1d(X_batch)
        embedding_3d = extractor_3d(X_new_batch)
        projected_1d = projector(embedding_1d)
        projected_3d = projector(embedding_3d)

        loss = loss_fn(projected_1d, projected_3d)
        loss.backward()

        total_loss += loss.item()
    
        optimizer_1d.step()
        optimizer_3d.step()
        optimizer_projector.step()

    cnt += 1

    print(f'[{epoch}/{epochs}]: {total_loss}')
    
    writer.add_scalar('Loss/train', total_loss, cnt)

[0/200]: 112.05561399459839
[1/200]: 110.48021125793457
[2/200]: 109.90722608566284
[3/200]: 107.56851625442505
[4/200]: 103.48046064376831
[5/200]: 98.32909202575684
[6/200]: 94.77883672714233
[7/200]: 91.1114730834961
[8/200]: 88.65277338027954
[9/200]: 85.65478372573853
[10/200]: 84.32544898986816
[11/200]: 80.29822444915771
[12/200]: 78.06781959533691
[13/200]: 76.78608655929565
[14/200]: 76.09594678878784
[15/200]: 75.09872150421143
[16/200]: 74.23813819885254
[17/200]: 73.74305152893066
[18/200]: 72.91334962844849
[19/200]: 72.11415243148804
[20/200]: 71.37584447860718
[21/200]: 71.2349591255188
[22/200]: 70.75021648406982
[23/200]: 70.38265824317932
[24/200]: 69.3975465297699
[25/200]: 69.01264643669128
[26/200]: 68.33845257759094
[27/200]: 68.07618451118469
[28/200]: 67.73077058792114
[29/200]: 67.36526417732239
[30/200]: 66.3752453327179
[31/200]: 66.33001828193665
[32/200]: 65.94526934623718
[33/200]: 65.87197661399841
[34/200]: 64.30150747299194
[35/200]: 63.66439986228943
[

In [15]:
X_val.shape

(4000, 20)

In [16]:
X_val_tensor = torch.tensor(np.expand_dims(X_val, axis=2).transpose((0, 2, 1)), dtype=torch.float32, device=device)

X_val_embedding = extractor_1d(X_val_tensor)

In [17]:
X_val_embedding = X_val_embedding.cpu().detach().numpy()
X_val_embedding.shape

(4000, 64)

In [23]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate, StratifiedKFold

random_state = 42

mlp = MLPClassifier(max_iter=500, random_state=random_state)

cv = StratifiedKFold(n_splits=5, random_state=random_state, shuffle=True)

results = cross_validate(mlp, X_val_embedding, y_val, cv=cv, scoring='f1_macro')

# mlp.fit(X_val_embedding, y_val)

print(np.average(results['test_score']))
results



0.4410090710321265




{'fit_time': array([8.09933853, 6.75865793, 5.93444586, 5.98762298, 5.97618055]),
 'score_time': array([0.00265503, 0.00236368, 0.00260377, 0.00256729, 0.00236034]),
 'test_score': array([0.57589735, 0.43172675, 0.42881156, 0.32170462, 0.44690507])}

In [24]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate

mlp = MLPClassifier(max_iter=500, random_state=42)

cv = StratifiedKFold(5, random_state=42, shuffle=True)

results = cross_validate(mlp, X_val, y_val, cv=cv, scoring='f1_macro')

print(np.average(results['test_score']))
results





0.5958775258811952




{'fit_time': array([6.04724741, 6.15044379, 5.53555751, 5.52676415, 6.43290496]),
 'score_time': array([0.00250936, 0.00274992, 0.0025034 , 0.00246501, 0.00248027]),
 'test_score': array([0.66201233, 0.59172573, 0.69101491, 0.49118071, 0.54345395])}