In [1]:
import gc
import os
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import pickle
from sklearn.utils import resample
from collections import Counter
import math
from sklearn.model_selection import train_test_split
import helper
import dataset
import torch
import torch.nn as nn
from resnet1d.net1d import Net1D

In [2]:
# Read every entry found in ./data as CSV and stack them into one DataFrame.
# NOTE(review): os.listdir returns names in arbitrary order and does not filter
# by extension. The row order of `df` (and therefore the seeded split made
# later) depends on that order -- confirm before sorting or filtering here.
data_files = os.listdir('./data')

dfs = [pd.read_csv('./data/' + name) for name in data_files]

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
df = pd.concat(dfs, axis=0, ignore_index=True)

df.shape

(2830743, 79)

In [3]:
# Trim leading/trailing whitespace from the column names and replace the
# '�' character (presumably an encoding artefact -- TODO confirm) with '-'.
df.columns = [name.strip().replace('�', '-') for name in df.columns]
df.shape

(2830743, 79)

In [4]:
# Treat +inf / -inf as missing values, then drop every row that has any
# missing value.
df = df.replace([np.inf, -np.inf], np.nan).dropna()
df.shape

(2827876, 79)

In [5]:
# Hold out 20% of the rows as the validation split, stratified on 'Label',
# with a fixed seed so the split is repeatable for the same input row order.
df_train, df_val = train_test_split(
    df,
    test_size=0.2,
    stratify=df['Label'],
    random_state=42,
)

In [6]:
# Separate the validation split into the target column and the remaining
# feature columns.
y = df_val['Label']
X = df_val.drop(columns=['Label'])

In [7]:
# Restore the label encoder and the feature preprocessor saved under ./dist.
# NOTE: pickle.load can execute arbitrary code from the file; only load
# artefacts from a trusted source.
with open('./dist/labelencoder-resnet.pkl', 'rb') as encoder_file:
    label_encoder = pickle.load(encoder_file)

with open('./dist/preprocessor-resnet.pkl', 'rb') as preprocessor_file:
    preprocessor = pickle.load(preprocessor_file)

In [8]:
# Display the loaded preprocessor for inspection.
preprocessor

In [9]:
# Display the loaded label encoder for inspection.
label_encoder

In [10]:
# Apply the loaded preprocessor and label encoder to the validation split.
# Both inputs are taken from df_val rather than from X / y themselves, so
# re-running this cell never transforms already-transformed data.
X = preprocessor.transform(df_val.drop(['Label'], axis=1))
y = label_encoder.transform(df_val['Label'])

In [11]:
# Shape of the preprocessed validation features.
X.shape

(565576, 20)

In [12]:
# Number of validation samples per encoded class label.
Counter(y)

Counter({0: 454265,
         4: 46025,
         10: 31761,
         2: 25605,
         3: 2059,
         7: 1587,
         11: 1180,
         6: 1159,
         5: 1100,
         1: 391,
         12: 301,
         14: 130,
         9: 7,
         13: 4,
         8: 2})

In [13]:
# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Load the pickled ResNet-based feature extractor.
# NOTE: pickle.load can execute arbitrary code; only load trusted artefacts.
with open('./dist/resnet/extractor1d-170.pkl', 'rb') as extractor_file:
    extractor = pickle.load(extractor_file)

extractor.to(device)
# Inference mode for layers such as Dropout / BatchNorm; the returned module
# is displayed as the cell output.
extractor.eval()

Extractor(
  (resnet): Net1D(
    (first_conv): MyConv1dPadSame(
      (conv): Conv1d(1, 20, kernel_size=(16,), stride=(2,))
    )
    (first_bn): BatchNorm1d(20, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (first_activation): Swish()
    (stage_list): ModuleList(
      (0): BasicStage(
        (block_list): ModuleList(
          (0): BasicBlock(
            (bn1): BatchNorm1d(20, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (activation1): Swish()
            (do1): Dropout(p=0.5, inplace=False)
            (conv1): MyConv1dPadSame(
              (conv): Conv1d(20, 64, kernel_size=(1,), stride=(1,))
            )
            (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (activation2): Swish()
            (do2): Dropout(p=0.5, inplace=False)
            (conv2): MyConv1dPadSame(
              (conv): Conv1d(64, 64, kernel_size=(16,), stride=(2,), groups=4)
            )
          

In [14]:
# Import the class by name: the assignment below rebinds `dataset` from the
# module to a NumpyDataset instance, which made `dataset.NumpyDataset(...)`
# fail when this cell was executed a second time. A from-import resolves the
# module through the import system regardless of what `dataset` points to.
from dataset import NumpyDataset

# (n_samples, n_features) -> (n_samples, 1, n_features). The result goes to a
# new name so that re-running this cell does not stack another axis onto X.
X_seq = np.expand_dims(X, axis=2).transpose((0, 2, 1))

dataset = NumpyDataset(X_seq, y)
len(dataset)

565576

In [15]:
from torch.utils.data import DataLoader

# Run the extractor over the whole dataset in fixed-size batches and collect
# the per-batch embeddings and labels.
embeddings = []
labels = []
batch_size = 1024
dataloader = DataLoader(dataset=dataset, batch_size=batch_size)

# Inference only: no_grad() stops autograd from recording a graph for every
# batch. The loop targets use their own names so the notebook-level X, y and
# X_embedding are not overwritten by the last batch.
with torch.no_grad():
    for batch_X, batch_y in dataloader:
        batch_embedding = extractor(batch_X.to(device))
        embeddings.append(batch_embedding.detach().cpu().numpy())
        labels.append(batch_y)

In [16]:
# Stack the per-batch results along the first axis into a single embedding
# matrix and a single label vector, then print both shapes.
X_embedding = np.concatenate(embeddings)
y = np.concatenate(labels)
for stacked in (X_embedding, y):
    print(stacked.shape)

(565576, 34)
(565576,)


In [17]:
from sklearn.neural_network import  MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Neighbour counts for which a k-NN classifier is created.
knn_list = [5, 10]

# Candidate classifiers: five fixed models followed by one k-NN per entry
# in knn_list.
estimators = [
    MLPClassifier(max_iter=1500),
    DecisionTreeClassifier(),
    GaussianNB(),
    LogisticRegression(),
    RandomForestClassifier(),
]
estimators.extend(KNeighborsClassifier(n_neighbors=k) for k in knn_list)

estimators

[MLPClassifier(max_iter=1500),
 DecisionTreeClassifier(),
 GaussianNB(),
 LogisticRegression(),
 RandomForestClassifier(),
 KNeighborsClassifier(),
 KNeighborsClassifier(n_neighbors=10)]

In [21]:
from sklearn.model_selection import cross_validate, cross_val_predict
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score, accuracy_score, confusion_matrix


def _averaged(metric, average):
    """Wrap ``metric`` as a scorer with the given ``average`` mode and ``zero_division=0``."""
    return make_scorer(metric, average=average, zero_division=0)


# Metrics computed for every estimator; insertion order is left unchanged.
scoring = {
    'f1_weighted': _averaged(f1_score, 'weighted'),
    'precision_weighted': _averaged(precision_score, 'weighted'),
    'recall_weighted': _averaged(recall_score, 'weighted'),
    'accuracy': make_scorer(accuracy_score),
    'f1_macro': _averaged(f1_score, 'macro'),
    'precision_macro': _averaged(precision_score, 'macro'),
    'recall_macro': _averaged(recall_score, 'macro'),
}

# Maps repr(estimator) -> cross_validate result dict; filled in by the
# per-estimator cells.
performance = {}

In [24]:
# 5-fold cross-validation of an MLP classifier on the current embeddings
# (from the ResNet-based extractor), using all available cores.
# NOTE(review): no random_state is set, so scores vary between runs.
model = MLPClassifier(max_iter=1500)
results = cross_validate(
    estimator=model,
    X=X_embedding,
    y=y,
    scoring=scoring,
    cv=5,
    n_jobs=-1,
)
performance[repr(model)] = results
results



{'fit_time': array([166.42514586, 157.03714609, 136.26153898, 197.37146068,
        171.93202519]),
 'score_time': array([0.20193934, 0.19396353, 0.20012093, 0.18774819, 0.19407964]),
 'test_f1_weighted': array([0.97883479, 0.97936683, 0.97634921, 0.98040365, 0.97574611]),
 'test_precision_weighted': array([0.97931893, 0.97945804, 0.97653705, 0.98076832, 0.97654373]),
 'test_recall_weighted': array([0.97948124, 0.97963135, 0.97705875, 0.98093091, 0.97628962]),
 'test_accuracy': array([0.97948124, 0.97963135, 0.97705875, 0.98093091, 0.97628962]),
 'test_f1_macro': array([0.58741459, 0.72630832, 0.63356629, 0.64073653, 0.63855176]),
 'test_precision_macro': array([0.70551311, 0.76129843, 0.73359411, 0.73829295, 0.74399737]),
 'test_recall_macro': array([0.5501699 , 0.71878893, 0.60628476, 0.61595616, 0.61176117])}

In [25]:
# 5-fold cross-validation of a decision tree on the current embeddings
# (from the ResNet-based extractor), using all available cores.
# NOTE(review): no random_state is set, so scores vary between runs.
model = DecisionTreeClassifier()
results = cross_validate(
    estimator=model,
    X=X_embedding,
    y=y,
    scoring=scoring,
    cv=5,
    n_jobs=-1,
)
performance[repr(model)] = results
results



{'fit_time': array([59.23611736, 59.32007384, 55.62628341, 51.6815443 , 67.35578918]),
 'score_time': array([0.13991618, 0.14107871, 0.13596034, 0.13622999, 0.13545132]),
 'test_f1_weighted': array([0.98738211, 0.98689009, 0.98709306, 0.98744561, 0.98648729]),
 'test_precision_weighted': array([0.98743627, 0.98692753, 0.98711517, 0.98743651, 0.98654608]),
 'test_recall_weighted': array([0.98734043, 0.98686293, 0.9870751 , 0.98745524, 0.98644742]),
 'test_accuracy': array([0.98734043, 0.98686293, 0.9870751 , 0.98745524, 0.98644742]),
 'test_f1_macro': array([0.69775054, 0.68601978, 0.73376079, 0.73712639, 0.73303053]),
 'test_precision_macro': array([0.69477127, 0.68631164, 0.7330138 , 0.73611664, 0.73213589]),
 'test_recall_macro': array([0.70146638, 0.68692004, 0.73463156, 0.7382074 , 0.73473118])}

In [26]:
# 5-fold cross-validation of Gaussian naive Bayes on the current embeddings
# (from the ResNet-based extractor), using all available cores.
model = GaussianNB()
results = cross_validate(
    estimator=model,
    X=X_embedding,
    y=y,
    scoring=scoring,
    cv=5,
    n_jobs=-1,
)
performance[repr(model)] = results
results



{'fit_time': array([0.28843355, 0.2874701 , 0.28731942, 0.2847755 , 0.2877965 ]),
 'score_time': array([0.79663825, 0.77650261, 0.79313207, 0.78408289, 0.80303144]),
 'test_f1_weighted': array([0.37457768, 0.37566932, 0.37461429, 0.37570854, 0.37430191]),
 'test_precision_weighted': array([0.83005099, 0.83051978, 0.83036688, 0.83144327, 0.83133973]),
 'test_recall_weighted': array([0.32760175, 0.32876276, 0.32866552, 0.32869204, 0.32845334]),
 'test_accuracy': array([0.32760175, 0.32876276, 0.32866552, 0.32869204, 0.32845334]),
 'test_f1_macro': array([0.19517622, 0.18621929, 0.20780963, 0.20854229, 0.20961254]),
 'test_precision_macro': array([0.19142612, 0.18708565, 0.20489538, 0.20618553, 0.20473574]),
 'test_recall_macro': array([0.56287797, 0.50044977, 0.53455748, 0.537485  , 0.53887541])}

In [27]:
# 5-fold cross-validation of logistic regression on the current embeddings
# (from the ResNet-based extractor), using all available cores.
# NOTE(review): the recorded output shows the solver stopping at its iteration
# limit, so these scores come from a non-converged fit; consider a larger
# max_iter or scaling the embeddings.
model = LogisticRegression()
results = cross_validate(
    estimator=model,
    X=X_embedding,
    y=y,
    scoring=scoring,
    cv=5,
    n_jobs=-1,
)
performance[repr(model)] = results
results

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'fit_time': array([21.58843017, 20.90731668, 21.16089344, 21.00897908, 20.91812921]),
 'score_time': array([0.14626908, 0.16722083, 0.14620519, 0.14614367, 0.16546154]),
 'test_f1_weighted': array([0.88479326, 0.88665244, 0.8875006 , 0.88645218, 0.8856178 ]),
 'test_precision_weighted': array([0.88699302, 0.88719584, 0.88873644, 0.88695207, 0.88667799]),
 'test_recall_weighted': array([0.89676085, 0.89721964, 0.89829819, 0.89778544, 0.89619414]),
 'test_accuracy': array([0.89676085, 0.89721964, 0.89829819, 0.89778544, 0.89619414]),
 'test_f1_macro': array([0.27331843, 0.30776314, 0.29689274, 0.29522812, 0.3098183 ]),
 'test_precision_macro': array([0.37646876, 0.40727586, 0.40912155, 0.35962943, 0.42569776]),
 'test_recall_macro': array([0.2338843 , 0.26497161, 0.2542205 , 0.25733392, 0.26607499])}

In [28]:
# 5-fold cross-validation of a random forest on the current embeddings
# (from the ResNet-based extractor), using all available cores.
# NOTE(review): no random_state is set, so scores vary between runs.
model = RandomForestClassifier()
results = cross_validate(
    estimator=model,
    X=X_embedding,
    y=y,
    scoring=scoring,
    cv=5,
    n_jobs=-1,
)
performance[repr(model)] = results
results



{'fit_time': array([371.1968255 , 361.92434216, 375.25205827, 361.18098545,
        372.52428627]),
 'score_time': array([1.435673  , 1.53056431, 1.40030289, 1.4600637 , 1.43673682]),
 'test_f1_weighted': array([0.98942371, 0.98906899, 0.98888174, 0.9890501 , 0.98835721]),
 'test_precision_weighted': array([0.98952668, 0.98914059, 0.98893951, 0.989118  , 0.98845755]),
 'test_recall_weighted': array([0.9894091 , 0.98905539, 0.98891394, 0.9890377 , 0.98836582]),
 'test_accuracy': array([0.9894091 , 0.98905539, 0.98891394, 0.9890377 , 0.98836582]),
 'test_f1_macro': array([0.77304196, 0.82784117, 0.74650134, 0.75438048, 0.73143057]),
 'test_precision_macro': array([0.79017984, 0.84244688, 0.76774687, 0.76683713, 0.74962728]),
 'test_recall_macro': array([0.76140863, 0.81733849, 0.7331253 , 0.7455819 , 0.72050299])}

In [29]:
# 5-fold cross-validation of 5-nearest-neighbours on the current embeddings
# (from the ResNet-based extractor), using all available cores.
model = KNeighborsClassifier(n_neighbors=5)
results = cross_validate(
    estimator=model,
    X=X_embedding,
    y=y,
    scoring=scoring,
    cv=5,
    n_jobs=-1,
)
performance[repr(model)] = results
results



{'fit_time': array([0.06561232, 0.087497  , 0.07546878, 0.08045745, 0.06833076]),
 'score_time': array([230.21682787, 230.31205821, 230.59812593, 230.83106637,
        230.46116471]),
 'test_f1_weighted': array([0.98729713, 0.98689041, 0.98704019, 0.98735877, 0.98628104]),
 'test_precision_weighted': array([0.98737504, 0.98693816, 0.98703637, 0.98736193, 0.98635942]),
 'test_recall_weighted': array([0.98736695, 0.98693365, 0.98713698, 0.98741104, 0.98633249]),
 'test_accuracy': array([0.98736695, 0.98693365, 0.98713698, 0.98741104, 0.98633249]),
 'test_f1_macro': array([0.68386304, 0.73272743, 0.71212853, 0.71947947, 0.71283132]),
 'test_precision_macro': array([0.71963431, 0.76187522, 0.73350771, 0.73312319, 0.73921754]),
 'test_recall_macro': array([0.66732386, 0.71698418, 0.69955436, 0.71138847, 0.70166256])}

In [30]:
# Keep a reference to the results gathered so far under a separate name.
resnet_performance = performance

In [31]:
# Replace the current extractor with the pickled MLP-based one.
# NOTE: pickle.load can execute arbitrary code; only load trusted artefacts.
with open('./dist/mlp/extractor1d-final.pkl', 'rb') as extractor_file:
    extractor = pickle.load(extractor_file)

extractor.to(device)
# Switch to inference mode; the returned module is displayed as the cell output.
extractor.eval()

ExtractorMLP(
  (conv1d): MyConv1dPadSame(
    (conv): Conv1d(1, 1, kernel_size=(1,), stride=(1,))
  )
  (linear_in): Linear(in_features=20, out_features=65, bias=True)
  (backbone): Sequential(
    (linear-0): Linear(in_features=65, out_features=65, bias=True)
    (act-0): ReLU()
  )
  (linear_out): Linear(in_features=65, out_features=39, bias=True)
)

## Evaluation of the MLP feature extractor

In [32]:
# Run the current extractor over the whole dataset in fixed-size batches and
# collect the per-batch embeddings and labels.
embeddings = []
labels = []
batch_size = 1024
dataloader = DataLoader(dataset=dataset, batch_size=batch_size)

# Inference only: no_grad() stops autograd from recording a graph for every
# batch. The loop targets use their own names so the notebook-level X, y and
# X_embedding are not overwritten by the last batch.
with torch.no_grad():
    for batch_X, batch_y in dataloader:
        batch_embedding = extractor(batch_X.to(device))
        embeddings.append(batch_embedding.detach().cpu().numpy())
        labels.append(batch_y)

In [33]:
# Stack the per-batch results along the first axis into a single embedding
# matrix and a single label vector, then print both shapes.
X_embedding = np.concatenate(embeddings)
y = np.concatenate(labels)
for stacked in (X_embedding, y):
    print(stacked.shape)

(565576, 39)
(565576,)


In [34]:
# Start a fresh results dict; this rebinds the name and leaves the previous
# dict object untouched.
performance = {}

In [35]:
# 5-fold cross-validation of an MLP classifier on the current embeddings
# (from the MLP-based extractor), using all available cores.
# NOTE(review): no random_state is set, so scores vary between runs.
model = MLPClassifier(max_iter=1500)
results = cross_validate(
    estimator=model,
    X=X_embedding,
    y=y,
    scoring=scoring,
    cv=5,
    n_jobs=-1,
)
performance[repr(model)] = results
results



{'fit_time': array([173.43660378, 250.37338257, 204.78393364, 230.25995612,
        218.28875375]),
 'score_time': array([0.19567418, 0.1888206 , 0.18545556, 0.19332814, 0.1881671 ]),
 'test_f1_weighted': array([0.97294301, 0.97311998, 0.96981871, 0.96993273, 0.97171521]),
 'test_precision_weighted': array([0.97298603, 0.97394676, 0.97038282, 0.97007345, 0.97211775]),
 'test_recall_weighted': array([0.97346971, 0.97329267, 0.97049905, 0.97052557, 0.97235557]),
 'test_accuracy': array([0.97346971, 0.97329267, 0.97049905, 0.97052557, 0.97235557]),
 'test_f1_macro': array([0.59529719, 0.70798253, 0.63686329, 0.63913642, 0.63539577]),
 'test_precision_macro': array([0.6353439 , 0.79069836, 0.72898806, 0.7227021 , 0.74871888]),
 'test_recall_macro': array([0.57164786, 0.68844148, 0.62080139, 0.61491127, 0.61107615])}

In [36]:
# 5-fold cross-validation of a decision tree on the current embeddings
# (from the MLP-based extractor), using all available cores.
# NOTE(review): no random_state is set, so scores vary between runs.
model = DecisionTreeClassifier()
results = cross_validate(
    estimator=model,
    X=X_embedding,
    y=y,
    scoring=scoring,
    cv=5,
    n_jobs=-1,
)
performance[repr(model)] = results
results



{'fit_time': array([59.69146943, 60.86335802, 60.53958392, 58.92786765, 58.45958686]),
 'score_time': array([0.15050602, 0.14560103, 0.13920021, 0.14548159, 0.14313459]),
 'test_f1_weighted': array([0.98541622, 0.98460092, 0.98455867, 0.98501738, 0.98434932]),
 'test_precision_weighted': array([0.98543979, 0.98466417, 0.98461457, 0.98504498, 0.98441759]),
 'test_recall_weighted': array([0.98540436, 0.9845467 , 0.98452018, 0.98499757, 0.98429916]),
 'test_accuracy': array([0.98540436, 0.9845467 , 0.98452018, 0.98499757, 0.98429916]),
 'test_f1_macro': array([0.68748369, 0.67738097, 0.75886651, 0.72737169, 0.71750545]),
 'test_precision_macro': array([0.68818464, 0.6715667 , 0.7393209 , 0.72678312, 0.71412234]),
 'test_recall_macro': array([0.68729999, 0.68397813, 0.80296057, 0.72814764, 0.72106488])}

In [37]:
# 5-fold cross-validation of Gaussian naive Bayes on the current embeddings
# (from the MLP-based extractor), using all available cores.
model = GaussianNB()
results = cross_validate(
    estimator=model,
    X=X_embedding,
    y=y,
    scoring=scoring,
    cv=5,
    n_jobs=-1,
)
performance[repr(model)] = results
results



{'fit_time': array([0.36845875, 0.33458734, 0.33143544, 0.33512092, 0.35239267]),
 'score_time': array([1.04530835, 1.04858422, 1.04656935, 1.05090714, 1.04216456]),
 'test_f1_weighted': array([0.27123848, 0.27369736, 0.26685644, 0.26905476, 0.26686778]),
 'test_precision_weighted': array([0.70860048, 0.70726864, 0.71122055, 0.70675038, 0.71329233]),
 'test_recall_weighted': array([0.22269175, 0.225284  , 0.22033329, 0.22156213, 0.22028909]),
 'test_accuracy': array([0.22269175, 0.225284  , 0.22033329, 0.22156213, 0.22028909]),
 'test_f1_macro': array([0.09684734, 0.10362612, 0.1046864 , 0.10583196, 0.10464445]),
 'test_precision_macro': array([0.12570968, 0.12876713, 0.13713622, 0.13513139, 0.1352716 ]),
 'test_recall_macro': array([0.38873486, 0.38953417, 0.34733323, 0.41517073, 0.3733317 ])}

In [38]:
# 5-fold cross-validation of logistic regression on the current embeddings
# (from the MLP-based extractor), using all available cores.
# NOTE(review): the recorded output shows the solver stopping at its iteration
# limit, so these scores come from a non-converged fit; consider a larger
# max_iter or scaling the embeddings.
model = LogisticRegression()
results = cross_validate(
    estimator=model,
    X=X_embedding,
    y=y,
    scoring=scoring,
    cv=5,
    n_jobs=-1,
)
performance[repr(model)] = results
results

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'fit_time': array([23.25376678, 22.78696346, 22.86691022, 22.31853104, 22.75242496]),
 'score_time': array([0.12863803, 0.14065766, 0.13071847, 0.15730619, 0.13449574]),
 'test_f1_weighted': array([0.75310883, 0.7501117 , 0.75470249, 0.75394247, 0.7503404 ]),
 'test_precision_weighted': array([0.72488134, 0.72033803, 0.72986602, 0.72696844, 0.72120301]),
 'test_recall_weighted': array([0.81034513, 0.80650665, 0.81311939, 0.81266852, 0.80760288]),
 'test_accuracy': array([0.81034513, 0.80650665, 0.81311939, 0.81266852, 0.80760288]),
 'test_f1_macro': array([0.1272218 , 0.11813455, 0.1252431 , 0.13029233, 0.12418836]),
 'test_precision_macro': array([0.22793583, 0.22518488, 0.25419168, 0.25952885, 0.24773003]),
 'test_recall_macro': array([0.11167787, 0.10788488, 0.11161699, 0.11493427, 0.11105216])}

In [39]:
# 5-fold cross-validation of a random forest on the current embeddings
# (from the MLP-based extractor), using all available cores.
# NOTE(review): no random_state is set, so scores vary between runs.
model = RandomForestClassifier()
results = cross_validate(
    estimator=model,
    X=X_embedding,
    y=y,
    scoring=scoring,
    cv=5,
    n_jobs=-1,
)
performance[repr(model)] = results
results



{'fit_time': array([449.3694284 , 452.0125289 , 458.23820972, 451.32117128,
        456.1089251 ]),
 'score_time': array([1.6756351 , 1.61664271, 1.65439057, 1.60502362, 1.61988711]),
 'test_f1_weighted': array([0.98787431, 0.98738314, 0.98781638, 0.9879079 , 0.98693816]),
 'test_precision_weighted': array([0.98797071, 0.98745838, 0.98786133, 0.9879532 , 0.9870056 ]),
 'test_recall_weighted': array([0.9878797 , 0.98738452, 0.98785307, 0.98792379, 0.98694249]),
 'test_accuracy': array([0.9878797 , 0.98738452, 0.98785307, 0.98792379, 0.98694249]),
 'test_f1_macro': array([0.69964425, 0.74917297, 0.74480161, 0.75056617, 0.7345489 ]),
 'test_precision_macro': array([0.72013778, 0.76558458, 0.76315061, 0.76867173, 0.74910663]),
 'test_recall_macro': array([0.6855506 , 0.73788132, 0.73343956, 0.73678061, 0.72398668])}

In [40]:
# 5-fold cross-validation of 5-nearest-neighbours on the current embeddings
# (from the MLP-based extractor), using all available cores.
model = KNeighborsClassifier(n_neighbors=5)
results = cross_validate(
    estimator=model,
    X=X_embedding,
    y=y,
    scoring=scoring,
    cv=5,
    n_jobs=-1,
)
performance[repr(model)] = results
results



{'fit_time': array([0.05596185, 0.09860778, 0.09392333, 0.08121467, 0.07886219]),
 'score_time': array([244.57812047, 244.20218039, 244.36461544, 243.87439919,
        244.22715616]),
 'test_f1_weighted': array([0.98289896, 0.98208473, 0.98268914, 0.98295472, 0.98150413]),
 'test_precision_weighted': array([0.98305189, 0.98220479, 0.9827039 , 0.98294762, 0.9815571 ]),
 'test_recall_weighted': array([0.98298207, 0.98221279, 0.98285815, 0.98306149, 0.98165584]),
 'test_accuracy': array([0.98298207, 0.98221279, 0.98285815, 0.98306149, 0.98165584]),
 'test_f1_macro': array([0.66302971, 0.69745916, 0.68955249, 0.69621496, 0.68163318]),
 'test_precision_macro': array([0.70975693, 0.74811792, 0.71863573, 0.71271544, 0.70868597]),
 'test_recall_macro': array([0.64664276, 0.6807488 , 0.67791724, 0.68592188, 0.67087058])}

In [41]:
# Keep a reference to the results gathered since `performance` was last reset.
mlp_performance = performance