In [1]:
import os
import pathlib
import random

import numpy as np
import torch
from tqdm import tqdm

from utils.common import load_file_lists

In [2]:
def create_features(split_path, emb_path="../data/mtat/emb"):
    """Build the feature matrix and label matrix for one dataset split.

    Reads ``binary.npy`` from ``split_path`` to obtain the label rows, then
    loads the embedding file of every entry listed in ``train.npy`` and
    ``valid.npy`` and flattens it into a single feature vector.

    Args:
        split_path: Directory containing ``binary.npy``, ``train.npy`` and
            ``valid.npy``.
        emb_path: Directory holding the per-file embeddings as ``.npy`` files.
            Defaults to the location that used to be hard-coded here.

    Returns:
        Tuple ``(X_train, Y_train)`` of NumPy arrays. Row ``i`` of ``X_train``
        is the flattened embedding of the i-th listed file and row ``i`` of
        ``Y_train`` is the matching row of ``binary.npy`` without its first
        column.

    Raises:
        KeyError: If a listed index has no row in ``binary.npy``.
    """
    # NOTE(security): allow_pickle=True lets np.load unpickle arbitrary objects;
    # only point this function at trusted files.
    label_rows = np.load(os.path.join(split_path, "binary.npy"), allow_pickle=True)
    # The first column of each row is the lookup key, the rest is the label vector.
    binary = {row[0]: row[1:] for row in label_rows}

    # Training data is the union of the train and the validation file lists.
    train_files = load_file_lists(
        [os.path.join(split_path, "train.npy"), os.path.join(split_path, "valid.npy")]
    )

    X_train = []
    Y_train = []
    for idx, filename in tqdm(train_files):
        # Embeddings mirror the listed file's relative path, with a .npy suffix.
        emb_file = os.path.join(emb_path, str(pathlib.Path(filename).with_suffix(".npy")))
        X_train.append(np.load(emb_file, allow_pickle=True).flatten())
        Y_train.append(binary[int(idx)])
    return np.array(X_train), np.array(Y_train)

In [3]:
def set_seed(seed: int = 42) -> None:
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and
    all CUDA devices), exports ``PYTHONHASHSEED`` and prints a confirmation.

    Args:
        seed: Value used for all generators. Defaults to 42.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every CUDA device rather than only the current one; this call is
    # silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # PYTHONHASHSEED is read at interpreter start-up, so setting it here only
    # affects child processes started later, not hashing in the running kernel.
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")

In [4]:
# Fix all RNG seeds once, before any data loading or model fitting.
set_seed(seed=123456)

Random seed set as 123456


In [5]:
# Dataset selection and on-disk locations (paths are relative to the notebook).
dataset_name = "mtat-10"
features_path = "../data/mtat/features/"
split_path = os.path.join("../split", dataset_name)

# exist_ok=True makes this a no-op when the split directory already exists.
os.makedirs(split_path, exist_ok=True)

# Load the flattened embeddings and label rows for the train + valid file lists.
X_train, Y_train = create_features(split_path)

100%|██████████| 14822/14822 [02:15<00:00, 109.32it/s]


In [6]:
# Feature matrix returned by create_features: one flattened embedding per row.
X_train

array([[2.4309678 , 2.0419495 , 2.9835021 , ..., 0.596608  , 3.3791773 ,
        2.851194  ],
       [2.4309678 , 2.2831523 , 3.713408  , ..., 2.5958302 , 3.2312474 ,
        2.98357   ],
       [2.4309678 , 2.447584  , 3.6947517 , ..., 1.0246372 , 3.075337  ,
        2.389415  ],
       ...,
       [2.4309678 , 2.4188912 , 2.992663  , ..., 1.7392542 , 3.2222848 ,
        2.2493355 ],
       [2.4584339 , 2.48222   , 4.454659  , ..., 2.301653  , 3.4233615 ,
        3.2996736 ],
       [2.4309678 , 3.1145184 , 2.6744354 , ..., 0.98018384, 3.6385753 ,
        3.348161  ]], dtype=float32)

In [7]:
# Label matrix returned by create_features, row-aligned with X_train.
Y_train

array([[0, 1, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 0, 0],
       [1, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 1]])

In [9]:
import warnings

from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from components.tester import SklearnTester
from utils.config import Config

# Silence scikit-learn's UndefinedMetricWarning for the rest of the session.
warnings.simplefilter(action='ignore', category=UndefinedMetricWarning)

# Reuse dataset_name from the configuration cell so the tester is pointed at
# the same dataset the training features were built from.
config = Config(model=None, model_filename_path="../models", data_path='../data',
                dataset_split_path="../split", dataset_name=dataset_name)
tester = SklearnTester(config)

# Every estimator uses the library defaults. With random_state=None the
# tree-based models draw from NumPy's global RNG, so their results are only
# reproducible when the notebook runs top to bottom after set_seed().
# NOTE(review): RadiusNeighborsClassifier defaults to radius=1.0 and raises at
# predict time for samples with no neighbour in range — confirm that this
# radius suits the scale of the embeddings.
models = [
    RandomForestClassifier(),
    DecisionTreeClassifier(),
    KNeighborsClassifier(),
    RadiusNeighborsClassifier()
]

# Keep each model's result, keyed by class name, instead of overwriting it.
# `stats` is still assigned so later cells that read it see the last result.
model_stats = {}
for model in models:
    model_name = type(model).__name__
    # Label the tester's log lines, which do not mention the model themselves.
    print(f"Evaluating {model_name}")
    model.fit(X_train, Y_train)
    stats = tester.test(model)
    model_stats[model_name] = stats

[2023-05-04 13:24:06] Tester initialised with CUDA: False and mode: TEST
Cannot use cuda for model, defaulting to cpu


100%|██████████| 1648/1648 [00:11<00:00, 139.58it/s]


[2023-05-04 13:28:37] F1 Score: 0.4837
[2023-05-04 13:28:37] AUC/ROC: 0.8994
[2023-05-04 13:28:37] AUC/PR: 0.6859


KeyboardInterrupt: 