In [1]:
import gc
import os
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import pickle
from sklearn.utils import resample
from collections import Counter
import math
from sklearn.model_selection import train_test_split
import helper
import dataset
import torch
import torch.nn as nn
from resnet1d.net1d import Net1D

In [2]:
data_files = [name for name in os.listdir('./data')]

dfs = []

for i in range(len(data_files)):
    fname = './data/' + data_files[i]
    df = pd.read_csv(fname)
    # df.columns = [x.strip().lstrip() for x in df.columns]

    dfs.append(df)

df = pd.concat(dfs, axis=0, ignore_index=True)

df.shape

(2830743, 79)

In [3]:
# Remove spaces in the front and the end of the column names for better human reading
df.columns = [x.lstrip().strip().replace('�', '-') for x in df.columns]
df.shape

(2830743, 79)

In [4]:
# replace inf values to nan
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)
df.shape

(2827876, 79)

In [5]:
# remove this when experimenting on the whole dataset
df = resample(df, replace=False, n_samples=20000, stratify=df['Label'])

In [6]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

pca__n_components = 20

preprocessor = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("normalize", MinMaxScaler()),
    ("PCA", PCA(n_components=pca__n_components))
])

label_encoder = LabelEncoder()

In [7]:
X = df.drop(['Label'], axis=1)
y = df['Label']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y)

X_train = preprocessor.fit_transform(X_train)
X_val = preprocessor.transform(X_val)

# doing this since we need all labels in case there's any labels not found in test set
label_encoder.fit(y)

y_train = label_encoder.transform(y_train)
y_val = label_encoder.transform(y_val)

In [8]:
X_smote, _ = helper.smote_sampling(X_train, y_train)
X_gaussion = helper.add_gaussion(X_smote)
X_flip = helper.invert_array(X_train)

X_new = np.dstack([X_smote, X_gaussion, X_flip])
X_new = X_new.transpose((0, 2, 1))

X_new.shape

(16000, 3, 20)

In [9]:
# batch, channel, feature
X_train = np.expand_dims(X_train, axis=2).transpose((0, 2, 1))
X_train.shape

(16000, 1, 20)

In [10]:
org_dataset = dataset.NumpyDataset(X_train, y_train)
pos_dataset = dataset.NumpyDataset(X_new, y_train)

In [11]:
from torch.utils.data import DataLoader
from pytorch_metric_learning.losses import SelfSupervisedLoss, NTXentLoss
from models import Extractor, Projector, ExtractorMLP

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

extractor_1d = Extractor(n_features=pca__n_components, n_channels=1).to(device)
extractor_3d = Extractor(n_features=pca__n_components, n_channels=3).to(device)

# extractor_1d = ExtractorMLP(n_features=pca__n_components, n_channels=1).to(device)
# extractor_3d = Extractor(n_features=pca__n_components, n_channels=3).to(device)


projector = Projector().to(device)

lr = 1e-3
weight_decay = 1e-4
batch_size = 768
epochs = 200

org_dataloader = DataLoader(org_dataset, batch_size=batch_size,)
pos_dataloader = DataLoader(pos_dataset, batch_size=batch_size)

optimizer_1d = torch.optim.Adam(extractor_1d.parameters(), lr=lr, weight_decay=weight_decay)
optimizer_3d = torch.optim.Adam(extractor_3d.parameters(), lr=lr, weight_decay=weight_decay)
optimizer_projector = torch.optim.Adam(projector.parameters(), lr=lr, weight_decay=weight_decay)

loss_fn = NTXentLoss()
loss_fn = SelfSupervisedLoss(loss_fn, symmetric=False).to(device)

In [12]:
from itertools import cycle
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir='./runs')

cnt = 1

for epoch in range(epochs):
    total_loss = 0
    for item1, item2 in zip(org_dataloader, cycle(pos_dataloader)):
        X_batch, _ = item1
        X_new_batch, _ = item2

        X_batch = X_batch.to(device)
        X_new_batch = X_new_batch.to(device)

        optimizer_1d.zero_grad()
        optimizer_3d.zero_grad()
        optimizer_projector.zero_grad()

        embedding_1d = extractor_1d(X_batch)
        embedding_3d = extractor_3d(X_new_batch)
        projected_1d = projector(embedding_1d)
        projected_3d = projector(embedding_3d)

        loss = loss_fn(projected_1d, projected_3d)
        loss.backward()

        total_loss += loss.item()
    
        optimizer_1d.step()
        optimizer_3d.step()
        optimizer_projector.step()

    cnt += 1

    print(f'[{epoch}/{epochs}]: {total_loss}')
    
    writer.add_scalar('Loss/train', total_loss, cnt)

[0/200]: 138.25887727737427
[1/200]: 129.1943917274475
[2/200]: 112.87743520736694
[3/200]: 97.78681898117065
[4/200]: 87.58734321594238
[5/200]: 80.99812507629395
[6/200]: 77.1276319026947
[7/200]: 73.54132580757141
[8/200]: 70.65895175933838
[9/200]: 68.08123779296875
[10/200]: 65.85488414764404
[11/200]: 64.05440402030945
[12/200]: 62.77361512184143
[13/200]: 61.11018657684326
[14/200]: 59.63137483596802
[15/200]: 58.29786682128906
[16/200]: 57.53602623939514
[17/200]: 55.28964924812317
[18/200]: 54.52732014656067
[19/200]: 52.89133810997009
[20/200]: 51.78991341590881
[21/200]: 50.854888677597046
[22/200]: 49.62147259712219
[23/200]: 48.403154253959656
[24/200]: 48.74461889266968
[25/200]: 48.40158033370972
[26/200]: 47.0635529756546
[27/200]: 46.64481317996979
[28/200]: 49.47373044490814
[29/200]: 45.84053027629852
[30/200]: 45.90511131286621
[31/200]: 44.6131774187088
[32/200]: 46.40576481819153
[33/200]: 45.28226184844971
[34/200]: 44.606194376945496
[35/200]: 44.39935564994812


In [13]:
X_val.shape

(4000, 20)

In [14]:
X_val_tensor = torch.tensor(np.expand_dims(X_val, axis=2).transpose((0, 2, 1)), dtype=torch.float32, device=device)

X_val_embedding = extractor_1d(X_val_tensor)

In [15]:
X_val_embedding = X_val_embedding.cpu().detach().numpy()
X_val_embedding.shape

(4000, 64)

In [16]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate

mlp = MLPClassifier(max_iter=500)

cross_validate(mlp, X_val_embedding, y_val, cv=5, scoring='f1_macro')

# mlp.fit(X_val_embedding, y_val)





{'fit_time': array([3.65506697, 3.0108037 , 4.12173891, 4.20474267, 4.94763851]),
 'score_time': array([0.00244713, 0.00234199, 0.00295568, 0.00235939, 0.00236249]),
 'test_score': array([0.67292007, 0.61829282, 0.81724516, 0.68256309, 0.67819363])}

In [19]:
a = [0.67292007, 0.61829282, 0.81724516, 0.68256309, 0.67819363]
np.average(a)

0.693842954

In [17]:
X_val.shape

(4000, 20)

In [18]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate

mlp = MLPClassifier(max_iter=500)

cross_validate(mlp, X_val, y_val, cv=5, scoring='f1_macro')

# mlp.fit(X_val_embedding, y_val)







{'fit_time': array([5.38574505, 4.82814479, 6.06449866, 5.98018837, 6.25218582]),
 'score_time': array([0.00291109, 0.00246525, 0.00245214, 0.00243998, 0.00264978]),
 'test_score': array([0.71735317, 0.70128685, 0.79946408, 0.61093661, 0.57379517])}

In [20]:
a = [0.71735317, 0.70128685, 0.79946408, 0.61093661, 0.57379517]
np.average(a)

0.680567176