In [11]:
import os
import pathlib
import random

import numpy as np
import torch
from tqdm import tqdm

from utils.common import load_file_lists

In [12]:
def create_features(split_path, emb_path="../data/mtat/emb"):
    """Build the training feature and label matrices for one dataset split.

    Reads ``binary.npy`` from ``split_path`` and turns it into a lookup table
    whose key is the first element of every row and whose value is the rest
    of that row. The ``train.npy`` and ``valid.npy`` file lists are then
    loaded together through ``load_file_lists``; for every entry the matching
    embedding file is loaded from ``emb_path``, flattened, and paired with
    its label row.

    Args:
        split_path: Directory containing ``binary.npy``, ``train.npy`` and
            ``valid.npy``.
        emb_path: Directory holding the per-file ``.npy`` embeddings. The
            default is the location that used to be hard-coded here.

    Returns:
        A tuple ``(X_train, Y_train)`` of NumPy arrays with one entry per
        listed file: the flattened embedding and its label row.

    Raises:
        KeyError: If an index from the file lists has no row in
            ``binary.npy``.
    """
    # NOTE: allow_pickle=True lets NumPy unpickle arbitrary objects, which can
    # execute code on load. Only point this at trusted, locally produced files.
    binary_rows = np.load(os.path.join(split_path, "binary.npy"), allow_pickle=True)
    binary = {row[0]: row[1:] for row in binary_rows}

    # Train and valid lists are merged: both are used for fitting.
    # Each entry is unpacked below as an (index, filename) pair.
    train_valid_files = load_file_lists(
        [os.path.join(split_path, "train.npy"), os.path.join(split_path, "valid.npy")]
    )

    X_train = []
    Y_train = []
    for idx, filename in tqdm(train_valid_files):
        # The list names the original file; the embedding shares its relative
        # path but carries a ".npy" suffix.
        emb_file = os.path.join(emb_path, str(pathlib.Path(filename).with_suffix(".npy")))
        X_train.append(np.load(emb_file, allow_pickle=True).flatten())
        Y_train.append(binary[int(idx)])

    return np.array(X_train), np.array(Y_train)

In [13]:
def set_seed(seed: int = 42) -> None:
    """Seed every random number generator this notebook relies on.

    Seeds NumPy's global generator, Python's ``random`` module, and PyTorch
    (default generator plus the current CUDA device), then stores the seed in
    the ``PYTHONHASHSEED`` environment variable and prints a confirmation.

    Args:
        seed: Value handed to every generator. Defaults to 42.

    Note:
        Python reads ``PYTHONHASHSEED`` at interpreter start-up, so writing it
        here affects processes launched afterwards, not the running kernel.
    """
    seeders = (
        np.random.seed,
        random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Set a fixed value for the hash seed
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")

In [14]:
# Fix the NumPy, `random` and PyTorch seeds for this session.
set_seed(123456)

Random seed set as 123456


In [15]:
# Dataset selection and on-disk locations.
dataset_name = "mtat-20"
features_path = "../data/mtat/features/"
split_path = os.path.join("../split", dataset_name)

# NOTE(review): create_features reads binary.npy / train.npy / valid.npy from
# this directory, so it has to be populated already; makedirs only guarantees
# that the directory itself exists.
os.makedirs(split_path, exist_ok=True)

# Load the flattened embeddings and label rows for the train + valid lists.
X_train, Y_train = create_features(split_path)

100%|██████████| 17746/17746 [02:37<00:00, 113.02it/s]


In [16]:
# Display the feature matrix returned by create_features.
X_train

array([[2.6140246 , 1.8475785 , 4.3649316 , ..., 1.1230563 , 3.333457  ,
        2.867814  ],
       [2.4309678 , 2.4298744 , 2.9125767 , ..., 1.5540198 , 3.5779722 ,
        3.57896   ],
       [2.5092711 , 2.2790258 , 2.4283175 , ..., 0.48853248, 3.4483852 ,
        2.2871022 ],
       ...,
       [2.4309678 , 2.3061476 , 2.722368  , ..., 1.680212  , 3.0588841 ,
        2.6399963 ],
       [3.4527833 , 2.1705675 , 2.8483565 , ..., 1.5671974 , 3.0898876 ,
        2.31349   ],
       [2.4309678 , 3.3939528 , 3.5358424 , ..., 1.3863649 , 3.490217  ,
        4.095427  ]], dtype=float32)

In [17]:
# Display the label matrix returned by create_features.
Y_train

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0]])

In [20]:
import warnings

from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from components.tester import SklearnTester
from utils.config import Config

# Silence sklearn's UndefinedMetricWarning for the remainder of the session.
warnings.simplefilter(action='ignore', category=UndefinedMetricWarning)

# Test on the same dataset the training features were built from: reuse
# `dataset_name` rather than repeating the literal, so the two cannot drift
# apart when the dataset is changed in the configuration cell.
config = Config(model=None, model_filename_path="../models", data_path='../data',
                dataset_split_path="../split", dataset_name=dataset_name)
tester = SklearnTester(config)

# Baseline classifiers to compare. The tree-based models pin random_state so
# their results do not depend on the global seed.
models = [
    # NOTE(review): warm_start only matters when fit() is called repeatedly on
    # the same estimator; each model here is fit exactly once.
    RandomForestClassifier(bootstrap=True,
                           max_depth=20,
                           max_features='sqrt',
                           n_jobs=4,
                           random_state=1,
                           warm_start=True),
    DecisionTreeClassifier(max_depth=20,
                           max_features='sqrt',
                           random_state=1),
    KNeighborsClassifier(),
    RadiusNeighborsClassifier(radius=100.)
]

# Keep every model's result keyed by class name; `stats` on its own is
# overwritten each iteration and would only hold the last model's result.
stats_by_model = {}
for model in models:
    model.fit(X_train, Y_train)
    stats = tester.test(model)
    stats_by_model[type(model).__name__] = stats

[2023-05-04 13:09:37] Tester initialised with CUDA: False and mode: TEST
Cannot use cuda for model, defaulting to cpu


100%|██████████| 1972/1972 [00:13<00:00, 151.50it/s]


[2023-05-04 13:12:15] F1 Score: 0.3023
[2023-05-04 13:12:15] AUC/ROC: 0.9033
[2023-05-04 13:12:15] AUC/PR: 0.5749


100%|██████████| 1972/1972 [00:13<00:00, 147.99it/s]


[2023-05-04 13:12:35] F1 Score: 0.3547
[2023-05-04 13:12:35] AUC/ROC: 0.6436
[2023-05-04 13:12:35] AUC/PR: 0.2215


100%|██████████| 1972/1972 [00:10<00:00, 185.49it/s]


[2023-05-04 13:13:12] F1 Score: 0.5121
[2023-05-04 13:13:12] AUC/ROC: 0.8518
[2023-05-04 13:13:12] AUC/PR: 0.5084


100%|██████████| 1972/1972 [00:12<00:00, 151.72it/s]


[2023-05-04 13:14:01] F1 Score: 0.0138
[2023-05-04 13:14:01] AUC/ROC: 0.7805
[2023-05-04 13:14:01] AUC/PR: 0.3197
