In [1]:
import os
import pathlib
import random

import numpy as np
import torch
from tqdm import tqdm

from utils.common import load_file_lists

In [2]:
def create_features(split_path, emb_path="../data/mtat/emb"):
    """Build the feature and label matrices for the train + validation split.

    Reads ``binary.npy`` from ``split_path`` to obtain one label row per track
    index, then loads the stored embedding of every file listed in
    ``train.npy`` and ``valid.npy`` and flattens it into a 1-D feature vector.

    Args:
        split_path: Directory containing ``binary.npy``, ``train.npy`` and
            ``valid.npy``.
        emb_path: Directory holding the per-file ``.npy`` embeddings. Defaults
            to the location that was previously hard-coded in this function.

    Returns:
        A ``(X_train, Y_train)`` tuple of NumPy arrays: the flattened
        embeddings and the matching label rows, in file-list order.
    """
    # NOTE(review): allow_pickle=True will execute pickled objects on load;
    # only point this function at trusted, locally generated .npy files.
    label_rows = np.load(os.path.join(split_path, "binary.npy"), allow_pickle=True)
    # First column is the track index, the remaining columns are the labels.
    binary = {row[0]: row[1:] for row in label_rows}

    # Training uses both the train and the validation file lists.
    # load_file_lists is assumed to yield (index, filename) pairs, as the
    # unpacking in the loop below requires -- TODO confirm against utils.common.
    train_files = load_file_lists(
        [os.path.join(split_path, "train.npy"), os.path.join(split_path, "valid.npy")]
    )

    X_train = []
    Y_train = []
    for idx, filename in tqdm(train_files):
        # Embeddings are stored under the same relative name with a .npy suffix.
        emb_file = os.path.join(emb_path, str(pathlib.Path(filename).with_suffix(".npy")))
        X_train.append(np.load(emb_file, allow_pickle=True).flatten())
        Y_train.append(binary[int(idx)])

    return np.array(X_train), np.array(Y_train)

In [3]:
def set_seed(seed: int = 42) -> None:
    """Seed the random number generators used in this notebook.

    Applies ``seed`` to Python's ``random`` module, NumPy's global generator
    and PyTorch (``torch.manual_seed`` plus ``torch.cuda.manual_seed``),
    exports it as ``PYTHONHASHSEED`` and prints a confirmation message.

    Args:
        seed: Value handed to every seeding call.
    """
    # NOTE(review): PYTHONHASHSEED is normally read when the interpreter
    # starts, so setting it here presumably only affects child processes --
    # confirm whether that is the intent.
    os.environ["PYTHONHASHSEED"] = str(seed)

    # The seeding calls are independent of each other, so apply them in one pass.
    for seed_fn in (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    print(f"Random seed set as {seed}")

In [4]:
# Single named seed for the whole notebook so later cells can reuse it
# (e.g. as an estimator's random_state) instead of repeating the literal.
SEED = 123456
set_seed(SEED)

Random seed set as 123456


In [5]:
# Dataset selection and project-relative locations.
dataset_name = "mtat-20"
# NOTE(review): features_path is not referenced by any other visible cell.
features_path = "../data/mtat/features/"
split_path = os.path.join("../split", dataset_name)

# NOTE(review): create_features reads binary.npy, train.npy and valid.npy from
# this directory, so creating it when missing cannot make the load succeed.
os.makedirs(split_path, exist_ok=True)
X_train, Y_train = create_features(split_path)

100%|██████████| 17746/17746 [02:30<00:00, 117.89it/s]


In [6]:
# Display the feature matrix returned by create_features
# (one flattened embedding per row).
X_train

array([[2.6140246 , 1.8475785 , 4.3649316 , ..., 1.1230563 , 3.333457  ,
        2.867814  ],
       [2.4309678 , 2.4298744 , 2.9125767 , ..., 1.5540198 , 3.5779722 ,
        3.57896   ],
       [2.5092711 , 2.2790258 , 2.4283175 , ..., 0.48853248, 3.4483852 ,
        2.2871022 ],
       ...,
       [2.4309678 , 2.3061476 , 2.722368  , ..., 1.680212  , 3.0588841 ,
        2.6399963 ],
       [3.4527833 , 2.1705675 , 2.8483565 , ..., 1.5671974 , 3.0898876 ,
        2.31349   ],
       [2.4309678 , 3.3939528 , 3.5358424 , ..., 1.3863649 , 3.490217  ,
        4.095427  ]], dtype=float32)

In [7]:
# Display the label matrix returned by create_features
# (one label row taken from binary.npy per file).
Y_train

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0]])

In [8]:
import warnings

from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from components.tester import SklearnTester
from utils.config import Config

# Silence sklearn's UndefinedMetricWarning, presumably raised while the tester
# computes metrics for labels that are never predicted -- TODO confirm.
warnings.simplefilter(action='ignore', category=UndefinedMetricWarning)


# Reuse dataset_name from the configuration cell instead of repeating the literal.
config = Config(model_filename_path="../models", data_path='../data',
                dataset_split_path="../split", dataset_name=dataset_name)
tester = SklearnTester(config)

# Baseline classifiers, all with default hyper-parameters except the radius.
models = [
    RandomForestClassifier(),
    DecisionTreeClassifier(),
    KNeighborsClassifier(),
    # NOTE(review): the recorded run logged F1 0.0000 / AUC-ROC 0.5000 for this
    # model; radius=1000.0 may not suit the embedding scale -- confirm.
    RadiusNeighborsClassifier(radius=1000.0)
]

# Keep every model's evaluation result, keyed by class name, so the metrics do
# not have to be copied by hand from the log output afterwards.
results = {}
for model in models:
    model.fit(X_train, Y_train)
    stats = tester.test(model)
    results[type(model).__name__] = stats

[2023-05-04 14:31:05] Tester initialised with CUDA: False and mode: TEST


100%|██████████| 1972/1972 [00:16<00:00, 119.78it/s]


[2023-05-04 14:38:32] F1 Score: 0.3045
[2023-05-04 14:38:32] AUC/ROC: 0.9023
[2023-05-04 14:38:32] AUC/PR: 0.5641


100%|██████████| 1972/1972 [00:16<00:00, 116.35it/s]


[2023-05-04 14:57:40] F1 Score: 0.3616
[2023-05-04 14:57:40] AUC/ROC: 0.6420
[2023-05-04 14:57:40] AUC/PR: 0.2220


100%|██████████| 1972/1972 [00:15<00:00, 128.12it/s]


[2023-05-04 14:58:26] F1 Score: 0.5121
[2023-05-04 14:58:26] AUC/ROC: 0.8518
[2023-05-04 14:58:26] AUC/PR: 0.5084


100%|██████████| 1972/1972 [00:14<00:00, 134.90it/s]


[2023-05-04 14:59:09] F1 Score: 0.0000
[2023-05-04 14:59:09] AUC/ROC: 0.5000
[2023-05-04 14:59:09] AUC/PR: 0.1154


In [18]:
import pandas as pd

# Metrics copied by hand from the tester log output; each list holds
# [F1 score, AUC/ROC, AUC/PR] for one classifier. RadiusNeighborsClassifier
# is left out of this table.
data = {
    "RandomForestClassifier": [0.3045, 0.9023, 0.5641],
    "DecisionTreeClassifier": [0.3616, 0.6420, 0.2220],
    "KNeighborsClassifier": [0.5121, 0.8518, 0.5084],
}

# One row per classifier, one column per metric.
df = pd.DataFrame(
    list(data.values()),
    index=list(data.keys()),
    columns=["f1_score", "roc_auc", "pr_auc"],
)
print(df)

                        f1_score  roc_auc  pr_auc
RandomForestClassifier    0.3045   0.9023  0.5641
DecisionTreeClassifier    0.3616   0.6420  0.2220
KNeighborsClassifier      0.5121   0.8518  0.5084


In [26]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Horizontal bar chart of the F1 score per classifier, taken from the summary
# frame `df` (index = classifier name, column "f1_score").
fig = make_subplots(rows=1, cols=1, subplot_titles=['F1 Score'])
fig.add_trace(go.Bar(y=df.index, x=df["f1_score"], orientation="h", text=df["f1_score"]), row=1, col=1)
fig.update_layout({"title": {"text": "Wstępne porównanie skuteczności algorytmów końcowych (OpenL3)", "x": 0.5},
                   "showlegend": False,
                   "template": "plotly_white"})
fig.show()

# Make sure the target directory exists so the export does not fail on a
# checkout where images/ has not been created yet.
os.makedirs("images", exist_ok=True)
fig.write_image("images/openl3-algorithm.png", width=1000, height=500)