In [3]:
import os
import random
import numpy as np
import torch
import pathlib
from tqdm import tqdm

from utils.common import load_file_lists

In [4]:
def create_features(split_path, emb_dir="../data/mtat/emb"):
    """Build the training feature matrix and label matrix for one dataset split.

    Label rows are read from ``binary.npy`` inside ``split_path``. For every
    ``(index, filename)`` entry that ``load_file_lists`` yields for
    ``train.npy`` and ``valid.npy``, the matching embedding file is loaded,
    flattened and paired with the label row stored under that index.

    Args:
        split_path: Directory holding ``binary.npy``, ``train.npy`` and
            ``valid.npy`` for the chosen split.
        emb_dir: Root directory of the per-clip ``.npy`` embeddings. Each
            listed filename is resolved relative to it with its suffix
            replaced by ``.npy``. Defaults to the previously hard-coded path.

    Returns:
        Tuple ``(X_train, Y_train)`` of NumPy arrays: one flattened embedding
        per row, and the corresponding label row from ``binary.npy``.
    """
    # NOTE: allow_pickle=True can execute code embedded in the file; only use
    # this on split/embedding files you produced yourself.
    label_rows = np.load(os.path.join(split_path, "binary.npy"), allow_pickle=True)
    # First element of each row is the clip index, the remainder is its labels.
    binary = {row[0]: row[1:] for row in label_rows}

    # Train and validation lists are merged: both are used for fitting here.
    train_valid_files = load_file_lists(
        [os.path.join(split_path, "train.npy"), os.path.join(split_path, "valid.npy")]
    )

    X_train = []
    Y_train = []
    for idx, filename in tqdm(train_valid_files):
        emb_path = os.path.join(emb_dir, str(pathlib.Path(filename).with_suffix(".npy")))
        X_train.append(np.load(emb_path, allow_pickle=True).flatten())
        Y_train.append(binary[int(idx)])
    return np.array(X_train), np.array(Y_train)

In [5]:
def set_seed(seed: int = 42) -> None:
    """Seed every random number generator used in this notebook.

    Applies ``seed`` to NumPy's global RNG, Python's ``random`` module and
    PyTorch (``torch.manual_seed`` and ``torch.cuda.manual_seed``), stores it
    in the ``PYTHONHASHSEED`` environment variable and prints a confirmation.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    seeders = (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)
    print(f"Random seed set as {seed}")

In [6]:
# Fix all RNG seeds once, before any data loading or model fitting.
set_seed(seed=123456)

Random seed set as 123456


In [7]:
# Build the training matrices for the mtat-20 split.
dataset_name = "mtat-20"
features_path = "../data/mtat/features/"

# The split files live in a per-dataset sub-folder of ../split.
split_path = os.path.join("../split", dataset_name)
os.makedirs(split_path, exist_ok=True)

X_train, Y_train = create_features(split_path)

100%|██████████| 17746/17746 [02:12<00:00, 134.38it/s]


In [8]:
# Display the feature matrix returned by create_features for the current split.
X_train

array([[2.6140246 , 1.8475785 , 4.3649316 , ..., 1.1230563 , 3.333457  ,
        2.867814  ],
       [2.4309678 , 2.4298744 , 2.9125767 , ..., 1.5540198 , 3.5779722 ,
        3.57896   ],
       [2.5092711 , 2.2790258 , 2.4283175 , ..., 0.48853248, 3.4483852 ,
        2.2871022 ],
       ...,
       [2.4309678 , 2.3061476 , 2.722368  , ..., 1.680212  , 3.0588841 ,
        2.6399963 ],
       [3.4527833 , 2.1705675 , 2.8483565 , ..., 1.5671974 , 3.0898876 ,
        2.31349   ],
       [2.4309678 , 3.3939528 , 3.5358424 , ..., 1.3863649 , 3.490217  ,
        4.095427  ]], dtype=float32)

In [9]:
# Display the label matrix returned by create_features for the current split.
Y_train

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0]])

In [10]:
import warnings

from sklearn.exceptions import UndefinedMetricWarning
from sklearn.neighbors import KNeighborsClassifier

from components.tester import SklearnTester
from utils.config import Config

# Silence sklearn's undefined-metric warnings for the sweep below.
warnings.simplefilter(action="ignore", category=UndefinedMetricWarning)

config = Config(
    model=KNeighborsClassifier(),
    model_filename_path="../models",
    data_path="../data",
    dataset_split_path="../split",
    dataset_name="mtat-20",
)
tester = SklearnTester(config)

# Sweep K = 1..15; each entry maps K -> [f1_score, roc_auc, pr_auc] from the tester.
results_20 = {}
for k in range(1, 16):
    print(f"K = {k}")
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, Y_train)
    stats = tester.test(model)
    results_20[k] = [stats.f1_score, stats.roc_auc, stats.pr_auc]

[2023-05-02 20:23:17] Tester initialised with CUDA: False and mode: TEST
Cannot use cuda for model, defaulting to cpu
K = 1


100%|██████████| 1972/1972 [00:14<00:00, 132.08it/s]


[2023-05-02 20:23:56] F1 Score: 0.5092
[2023-05-02 20:23:56] AUC/ROC: 0.7263
[2023-05-02 20:23:56] AUC/PR: 0.3431
K = 2


100%|██████████| 1972/1972 [00:13<00:00, 144.46it/s]


[2023-05-02 20:24:36] F1 Score: 0.5343
[2023-05-02 20:24:36] AUC/ROC: 0.7851
[2023-05-02 20:24:36] AUC/PR: 0.4208
K = 3


100%|██████████| 1972/1972 [00:13<00:00, 140.96it/s]


[2023-05-02 20:25:15] F1 Score: 0.5189
[2023-05-02 20:25:15] AUC/ROC: 0.8194
[2023-05-02 20:25:15] AUC/PR: 0.4650
K = 4


100%|██████████| 1972/1972 [00:14<00:00, 140.21it/s]


[2023-05-02 20:25:53] F1 Score: 0.5489
[2023-05-02 20:25:53] AUC/ROC: 0.8410
[2023-05-02 20:25:53] AUC/PR: 0.4929
K = 5


100%|██████████| 1972/1972 [00:13<00:00, 142.55it/s]


[2023-05-02 20:26:33] F1 Score: 0.5121
[2023-05-02 20:26:33] AUC/ROC: 0.8518
[2023-05-02 20:26:33] AUC/PR: 0.5084
K = 6


100%|██████████| 1972/1972 [00:13<00:00, 147.89it/s]


[2023-05-02 20:27:11] F1 Score: 0.5375
[2023-05-02 20:27:11] AUC/ROC: 0.8609
[2023-05-02 20:27:11] AUC/PR: 0.5194
K = 7


100%|██████████| 1972/1972 [00:13<00:00, 150.92it/s]


[2023-05-02 20:27:47] F1 Score: 0.4981
[2023-05-02 20:27:47] AUC/ROC: 0.8687
[2023-05-02 20:27:47] AUC/PR: 0.5297
K = 8


100%|██████████| 1972/1972 [00:13<00:00, 144.53it/s]


[2023-05-02 20:28:26] F1 Score: 0.5244
[2023-05-02 20:28:26] AUC/ROC: 0.8747
[2023-05-02 20:28:26] AUC/PR: 0.5375
K = 9


100%|██████████| 1972/1972 [00:15<00:00, 124.64it/s]


[2023-05-02 20:29:05] F1 Score: 0.4947
[2023-05-02 20:29:05] AUC/ROC: 0.8798
[2023-05-02 20:29:05] AUC/PR: 0.5463
K = 10


100%|██████████| 1972/1972 [00:13<00:00, 142.46it/s]


[2023-05-02 20:29:43] F1 Score: 0.5222
[2023-05-02 20:29:43] AUC/ROC: 0.8843
[2023-05-02 20:29:43] AUC/PR: 0.5531
K = 11


100%|██████████| 1972/1972 [00:12<00:00, 153.00it/s]


[2023-05-02 20:30:19] F1 Score: 0.4908
[2023-05-02 20:30:19] AUC/ROC: 0.8868
[2023-05-02 20:30:19] AUC/PR: 0.5573
K = 12


100%|██████████| 1972/1972 [00:12<00:00, 159.45it/s]


[2023-05-02 20:30:54] F1 Score: 0.5107
[2023-05-02 20:30:54] AUC/ROC: 0.8890
[2023-05-02 20:30:54] AUC/PR: 0.5568
K = 13


100%|██████████| 1972/1972 [00:12<00:00, 153.41it/s]


[2023-05-02 20:31:30] F1 Score: 0.4860
[2023-05-02 20:31:30] AUC/ROC: 0.8913
[2023-05-02 20:31:30] AUC/PR: 0.5606
K = 14


100%|██████████| 1972/1972 [00:15<00:00, 127.84it/s]


[2023-05-02 20:32:09] F1 Score: 0.5035
[2023-05-02 20:32:09] AUC/ROC: 0.8927
[2023-05-02 20:32:09] AUC/PR: 0.5622
K = 15


100%|██████████| 1972/1972 [00:13<00:00, 149.98it/s]


[2023-05-02 20:32:46] F1 Score: 0.4806
[2023-05-02 20:32:46] AUC/ROC: 0.8943
[2023-05-02 20:32:46] AUC/PR: 0.5638


In [11]:
# Build the training matrices for the mtat-10 split (replaces the mtat-20 arrays).
dataset_name = "mtat-10"
features_path = "../data/mtat/features/"

# The split files live in a per-dataset sub-folder of ../split.
split_path = os.path.join("../split", dataset_name)
os.makedirs(split_path, exist_ok=True)

X_train, Y_train = create_features(split_path)

100%|██████████| 14822/14822 [01:59<00:00, 124.28it/s]


In [12]:
# Display the feature matrix returned by create_features for the current split.
X_train

array([[2.4309678 , 2.0419495 , 2.9835021 , ..., 0.596608  , 3.3791773 ,
        2.851194  ],
       [2.4309678 , 2.2831523 , 3.713408  , ..., 2.5958302 , 3.2312474 ,
        2.98357   ],
       [2.4309678 , 2.447584  , 3.6947517 , ..., 1.0246372 , 3.075337  ,
        2.389415  ],
       ...,
       [2.4309678 , 2.4188912 , 2.992663  , ..., 1.7392542 , 3.2222848 ,
        2.2493355 ],
       [2.4584339 , 2.48222   , 4.454659  , ..., 2.301653  , 3.4233615 ,
        3.2996736 ],
       [2.4309678 , 3.1145184 , 2.6744354 , ..., 0.98018384, 3.6385753 ,
        3.348161  ]], dtype=float32)

In [13]:
# Display the label matrix returned by create_features for the current split.
Y_train

array([[0, 1, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 0, 0],
       [1, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 1]])

In [14]:
import warnings

from sklearn.exceptions import UndefinedMetricWarning
from sklearn.neighbors import KNeighborsClassifier

from components.tester import SklearnTester
from utils.config import Config

# Silence sklearn's undefined-metric warnings for the sweep below.
warnings.simplefilter(action="ignore", category=UndefinedMetricWarning)

config = Config(
    model=KNeighborsClassifier(),
    model_filename_path="../models",
    data_path="../data",
    dataset_split_path="../split",
    dataset_name="mtat-10",
)
tester = SklearnTester(config)

# Sweep K = 1..15; each entry maps K -> [f1_score, roc_auc, pr_auc] from the tester.
results_10 = {}
for k in range(1, 16):
    print(f"K = {k}")
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, Y_train)
    stats = tester.test(model)
    results_10[k] = [stats.f1_score, stats.roc_auc, stats.pr_auc]

[2023-05-02 20:34:47] Tester initialised with CUDA: False and mode: TEST
Cannot use cuda for model, defaulting to cpu
K = 1


100%|██████████| 1648/1648 [00:13<00:00, 126.54it/s]


[2023-05-02 20:35:16] F1 Score: 0.6058
[2023-05-02 20:35:16] AUC/ROC: 0.7591
[2023-05-02 20:35:16] AUC/PR: 0.4586
K = 2


100%|██████████| 1648/1648 [00:11<00:00, 148.11it/s]


[2023-05-02 20:35:44] F1 Score: 0.6370
[2023-05-02 20:35:44] AUC/ROC: 0.8221
[2023-05-02 20:35:44] AUC/PR: 0.5562
K = 3


100%|██████████| 1648/1648 [00:11<00:00, 149.28it/s]


[2023-05-02 20:36:11] F1 Score: 0.6206
[2023-05-02 20:36:11] AUC/ROC: 0.8470
[2023-05-02 20:36:11] AUC/PR: 0.5940
K = 4


100%|██████████| 1648/1648 [00:10<00:00, 152.71it/s]


[2023-05-02 20:36:37] F1 Score: 0.6469
[2023-05-02 20:36:37] AUC/ROC: 0.8639
[2023-05-02 20:36:37] AUC/PR: 0.6208
K = 5


100%|██████████| 1648/1648 [00:10<00:00, 162.27it/s]


[2023-05-02 20:37:02] F1 Score: 0.6262
[2023-05-02 20:37:02] AUC/ROC: 0.8744
[2023-05-02 20:37:02] AUC/PR: 0.6349
K = 6


100%|██████████| 1648/1648 [00:10<00:00, 164.61it/s]


[2023-05-02 20:37:28] F1 Score: 0.6449
[2023-05-02 20:37:28] AUC/ROC: 0.8797
[2023-05-02 20:37:28] AUC/PR: 0.6451
K = 7


100%|██████████| 1648/1648 [00:09<00:00, 176.66it/s]


[2023-05-02 20:37:53] F1 Score: 0.6201
[2023-05-02 20:37:53] AUC/ROC: 0.8851
[2023-05-02 20:37:53] AUC/PR: 0.6562
K = 8


100%|██████████| 1648/1648 [00:09<00:00, 166.58it/s]


[2023-05-02 20:38:18] F1 Score: 0.6370
[2023-05-02 20:38:18] AUC/ROC: 0.8876
[2023-05-02 20:38:18] AUC/PR: 0.6612
K = 9


100%|██████████| 1648/1648 [00:10<00:00, 157.06it/s]


[2023-05-02 20:38:44] F1 Score: 0.6123
[2023-05-02 20:38:44] AUC/ROC: 0.8906
[2023-05-02 20:38:44] AUC/PR: 0.6652
K = 10


100%|██████████| 1648/1648 [00:11<00:00, 148.32it/s]


[2023-05-02 20:39:12] F1 Score: 0.6355
[2023-05-02 20:39:12] AUC/ROC: 0.8949
[2023-05-02 20:39:12] AUC/PR: 0.6713
K = 11


100%|██████████| 1648/1648 [00:10<00:00, 153.08it/s]


[2023-05-02 20:39:40] F1 Score: 0.6130
[2023-05-02 20:39:40] AUC/ROC: 0.8976
[2023-05-02 20:39:40] AUC/PR: 0.6741
K = 12


100%|██████████| 1648/1648 [00:11<00:00, 139.46it/s]


[2023-05-02 20:40:09] F1 Score: 0.6298
[2023-05-02 20:40:09] AUC/ROC: 0.8986
[2023-05-02 20:40:09] AUC/PR: 0.6778
K = 13


100%|██████████| 1648/1648 [00:09<00:00, 166.05it/s]


[2023-05-02 20:40:35] F1 Score: 0.6134
[2023-05-02 20:40:35] AUC/ROC: 0.9000
[2023-05-02 20:40:35] AUC/PR: 0.6808
K = 14


100%|██████████| 1648/1648 [00:10<00:00, 155.24it/s]


[2023-05-02 20:41:03] F1 Score: 0.6290
[2023-05-02 20:41:03] AUC/ROC: 0.9018
[2023-05-02 20:41:03] AUC/PR: 0.6838
K = 15


100%|██████████| 1648/1648 [00:10<00:00, 154.83it/s]


[2023-05-02 20:41:29] F1 Score: 0.6153
[2023-05-02 20:41:29] AUC/ROC: 0.9025
[2023-05-02 20:41:29] AUC/PR: 0.6870


In [15]:
import pandas as pd

# One row per K (the dict key), one column per metric from the mtat-10 sweep.
df_10 = pd.DataFrame.from_dict(
    data=results_10,
    orient="index",
    columns=["f1_score", "auc_roc", "auc_pr"],
)
df_10

Unnamed: 0,f1_score,auc_roc,auc_pr
1,0.605759,0.759068,0.458636
2,0.636979,0.822133,0.556212
3,0.620557,0.847026,0.594049
4,0.64689,0.863885,0.620804
5,0.626241,0.874351,0.634899
6,0.644876,0.879716,0.645081
7,0.620055,0.885109,0.656168
8,0.637034,0.887627,0.661191
9,0.612339,0.890648,0.665248
10,0.635542,0.894942,0.671304


In [16]:
# One row per K (the dict key), one column per metric from the mtat-20 sweep.
df_20 = pd.DataFrame.from_dict(
    data=results_20,
    orient="index",
    columns=["f1_score", "auc_roc", "auc_pr"],
)
df_20

Unnamed: 0,f1_score,auc_roc,auc_pr
1,0.509241,0.726319,0.343053
2,0.534299,0.785071,0.420765
3,0.518905,0.819422,0.465024
4,0.548916,0.840968,0.492936
5,0.512107,0.851774,0.50835
6,0.53748,0.860853,0.51942
7,0.498075,0.868748,0.529737
8,0.524424,0.874681,0.537454
9,0.494672,0.879849,0.546305
10,0.522233,0.884281,0.553077


In [26]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# (DataFrame column, subplot title) for each panel, left to right.
panels = [
    ("f1_score", "F1 Score"),
    ("auc_roc", "Area under ROC curve"),
    ("auc_pr", "Area under PR curve"),
]
# (legend name, results frame, colour) for each dataset. Colours are pinned so a
# dataset is drawn in the same colour in every panel instead of cycling per trace.
datasets = [
    ("mtat-10", df_10, "#636EFA"),
    ("mtat-20", df_20, "#EF553B"),
]

fig = make_subplots(rows=1, cols=len(panels), subplot_titles=[title for _, title in panels])
for col, (metric, _) in enumerate(panels, start=1):
    for name, frame, colour in datasets:
        fig.add_trace(
            go.Scatter(
                x=frame.index,
                y=frame[metric],
                name=name,
                legendgroup=name,       # one legend click toggles the dataset in all panels
                showlegend=(col == 1),  # one legend entry per dataset rather than one per trace
                line={"color": colour},
                marker={"color": colour},
            ),
            row=1,
            col=col,
        )

fig.update_layout(
    title={"text": "Strojenie parametru K", "x": 0.5},
    showlegend=True,
    template="plotly_white",
)
fig.show()

# Make sure the output folder exists so the export does not fail on a fresh checkout.
os.makedirs("images", exist_ok=True)
fig.write_image("images/fig9.png", width=1000, height=500)

In [29]:
import pickle
from components.tester import SklearnTester
from utils.config import Config
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.neighbors import KNeighborsClassifier
import warnings
warnings.simplefilter(action='ignore', category=UndefinedMetricWarning)

# Train, persist and evaluate the final KNN model for each dataset variant.
for n in [10, 20]:
    dataset_name = f"mtat-{n}"
    config = Config(model=KNeighborsClassifier(), model_filename_path="../models", data_path='../data',
                    dataset_split_path="../split", dataset_name=dataset_name)
    tester = SklearnTester(config)

    features_path = "../data/mtat/features/"

    split_path = os.path.join("../split", dataset_name)
    os.makedirs(split_path, exist_ok=True)
    X_train, Y_train = create_features(split_path)

    # K = 4 gave the highest F1 score in both K sweeps earlier in this notebook.
    model = KNeighborsClassifier(n_neighbors=4)
    model.fit(X_train, Y_train)

    model_filename = os.path.join("../models", model.__class__.__name__, config.dataset_name, "model.bin")
    # Create the per-model / per-dataset folder if it is missing, and close the
    # file deterministically instead of leaking the handle returned by open().
    os.makedirs(os.path.dirname(model_filename), exist_ok=True)
    with open(model_filename, "wb") as model_file:
        pickle.dump(model, model_file)

    stats = tester.test(model)

[2023-05-02 21:00:29] Tester initialised with CUDA: False and mode: TEST
Cannot use cuda for model, defaulting to cpu


100%|██████████| 14822/14822 [01:51<00:00, 133.28it/s]
100%|██████████| 1648/1648 [00:12<00:00, 132.47it/s]


[2023-05-02 21:02:53] F1 Score: 0.6469
[2023-05-02 21:02:53] AUC/ROC: 0.8639
[2023-05-02 21:02:53] AUC/PR: 0.6208
[2023-05-02 21:02:53] Tester initialised with CUDA: False and mode: TEST
Cannot use cuda for model, defaulting to cpu


100%|██████████| 17746/17746 [02:20<00:00, 126.57it/s]
100%|██████████| 1972/1972 [00:14<00:00, 132.50it/s]


[2023-05-02 21:06:33] F1 Score: 0.5489
[2023-05-02 21:06:33] AUC/ROC: 0.8410
[2023-05-02 21:06:33] AUC/PR: 0.4929
