In [1]:
import os
import random
import numpy as np
import torch

from sklearn.model_selection import train_test_split
from models.common import load_file_lists
from models.preprocessor import OpenL3PreProcessor

In [2]:
split_path = "../split/mtat-20/"
os.makedirs(split_path, exist_ok=True)

In [3]:
def set_seed(seed: int = 42) -> None:
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Set a fixed value for the hash seed
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")

set_seed(123456)

Random seed set as 123456


In [4]:
p = OpenL3PreProcessor(input_path="../data/mtat/mp3",
                       output_path="../data/mtat/emb",
                       suffix="npy")
# print(load_file_lists(["../split/mtat/train.npy", "../split/mtat/valid.npy", "../split/mtat/test.npy"])[:, 1])
data = load_file_lists([
    os.path.join(split_path, "train.npy"),
    os.path.join(split_path, "valid.npy"),
    os.path.join(split_path, "test.npy")
])
p.run(files=data[:, 1])

100%|██████████| 19718/19718 [01:25<00:00, 230.66it/s]


In [5]:
binary = {row[0]: row[1:] for row in np.load(os.path.join(split_path, "binary.npy"), allow_pickle=True)}
tags = np.load(os.path.join(split_path, "tags.npy"), allow_pickle=True)

In [6]:
import pathlib

X = []
Y = []
for idx, filename in data:
    filename = os.path.join("../data/mtat/emb", str(pathlib.Path(filename).with_suffix(".npy")))
    file_data = np.load(filename, allow_pickle=True).flatten()
    X.append(file_data)
    Y.append(binary[int(idx)])

In [7]:
X_np = np.array(X)
np.save("X.npy", X_np)
X_np

[array([2.6140246, 1.8475785, 4.3649316, ..., 1.1230563, 3.333457 ,
        2.867814 ], dtype=float32),
 array([2.4309678, 2.4298744, 2.9125767, ..., 1.5540198, 3.5779722,
        3.57896  ], dtype=float32),
 array([2.5092711 , 2.2790258 , 2.4283175 , ..., 0.48853248, 3.4483852 ,
        2.2871022 ], dtype=float32),
 array([2.4309678 , 2.3749325 , 2.9955316 , ..., 0.37670907, 3.0354125 ,
        2.2796388 ], dtype=float32),
 array([2.4309678 , 1.6277139 , 2.884448  , ..., 0.99905354, 3.2566793 ,
        2.8214357 ], dtype=float32),
 array([2.4309678, 2.5410569, 3.0848236, ..., 1.0592364, 3.3978274,
        3.5770059], dtype=float32),
 array([2.4309678 , 2.489874  , 3.1051142 , ..., 0.73338133, 3.5630233 ,
        2.630619  ], dtype=float32),
 array([2.4309678, 1.9654976, 3.032537 , ..., 2.141547 , 3.1074781,
        2.63015  ], dtype=float32),
 array([2.4309678, 2.0541818, 3.188023 , ..., 1.6346337, 3.5005744,
        3.5917418], dtype=float32),
 array([2.4309678, 1.9988879, 3.6360612,

In [8]:
Y_np = np.array(Y)
np.save("y.npy", Y_np)
Y_np

[array([0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]),
 array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1]),
 array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0]),
 array([1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0]),
 array([0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]),
 array([0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 

In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=1)

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

model = RandomForestClassifier(bootstrap=True,
                             max_depth=20,
                             max_features='sqrt',
                             n_jobs=4,
                             random_state=1,
                             warm_start=True)
model.fit(X_train, Y_train)
y_pred = model.predict(X_test)
print(classification_report(y_pred, Y_test))

              precision    recall  f1-score   support

           0       0.47      0.92      0.62       612
           1       0.59      0.78      0.67       785
           2       0.07      0.61      0.13       101
           3       0.54      0.83      0.65       487
           4       0.14      0.60      0.22       149
           5       0.11      0.80      0.19        88
           6       0.07      0.57      0.13        81
           7       0.57      0.98      0.72       353
           8       0.08      0.66      0.14        68
           9       0.35      1.00      0.51       184
          10       0.32      0.90      0.47       176
          11       0.15      0.71      0.24       105
          12       0.36      0.78      0.49       195
          13       0.01      0.75      0.01         4
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       0.01      1.00      0.02         4
          17       0.39    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

model = DecisionTreeClassifier(max_depth=20,
                               max_features='sqrt',
                               random_state=1)
model.fit(X_train, Y_train)
y_pred = model.predict(X_test)
print(classification_report(y_pred, Y_test))

              precision    recall  f1-score   support

           0       0.53      0.51      0.52      1222
           1       0.55      0.53      0.54      1080
           2       0.28      0.28      0.28       894
           3       0.54      0.55      0.55       730
           4       0.37      0.36      0.37       687
           5       0.29      0.30      0.29       636
           6       0.32      0.31      0.32       648
           7       0.56      0.57      0.56       596
           8       0.26      0.26      0.26       566
           9       0.39      0.43      0.41       478
          10       0.39      0.40      0.39       484
          11       0.29      0.35      0.32       433
          12       0.43      0.40      0.41       448
          13       0.23      0.21      0.22       495
          14       0.18      0.18      0.18       411
          15       0.20      0.19      0.19       384
          16       0.20      0.20      0.20       351
          17       0.52    

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import ClassifierChain
from sklearn.metrics import classification_report

model = ClassifierChain(LogisticRegression(solver='saga',
                                           max_iter=1000,
                                           random_state=1),
                        random_state=1)
model.fit(X_train, Y_train)
y_pred = model.predict(X_test)
print(classification_report(y_pred, Y_test))



In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report

model = MultiOutputClassifier(LogisticRegression(solver='saga',
                                                 max_iter=1000,
                                                 random_state=1),
                              n_jobs=4)
model.fit(X_train, Y_train)
y_pred = model.predict(X_test)
print(classification_report(y_pred, Y_test))

In [None]:
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report

model = MultiOutputClassifier(SVC(max_iter=1000, random_state=1),
                              n_jobs=4)
model.fit(X_train, Y_train)
y_pred = model.predict(X_test)
print(classification_report(y_pred, Y_test))

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

model = KNeighborsClassifier()
model.fit(X_train, Y_train)
y_pred = model.predict(X_test)
print(classification_report(y_pred, Y_test))

In [None]:
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.metrics import classification_report

model = RadiusNeighborsClassifier()
model.fit(X_train, Y_train)
y_pred = model.predict(X_test)
print(classification_report(y_pred, Y_test))