In [281]:
from random import Random

import numpy as np
import pandas as pd
import os

from tqdm import tqdm as tqdm
from torch.utils.data import Dataset
import sklearn
from sklearn.feature_selection import mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import umap.umap_ as umap
from sklearn.manifold import TSNE
import seaborn as sns
import matplotlib.pyplot as plt



In [282]:
# Seed for reproducibility
SEED = 42
np.random.seed(SEED)

# Path to the .npz file
DATASET_PATH = "../MLPC2025_classification"

ANNOTATIONS_PATH = DATASET_PATH + "/annotations.csv"
#ANNOTATIONS_TEXT_EMBEDDINGS_PATH = DATASET_PATH + "/annotations_text_embeddings.npz"

METADATA_PATH = DATASET_PATH + "/metadata.csv"
#METADATA_TITLE_EMBEDDINGS_PATH = DATASET_PATH + "/metadata_title_embeddings.npz"
#METADATA_KEYWORDS_EMBEDDINGS_PATH = DATASET_PATH + "/metadata_keywords_embeddings.npz"

AUDIO_PATHS = DATASET_PATH + "/audio"
AUDIO_FEATURES_PATHS = DATASET_PATH + "/audio_features"

LABELS_PATH = DATASET_PATH + "/labels"

In [283]:
annotations = pd.read_csv(ANNOTATIONS_PATH)
annotations.head()

Unnamed: 0,task_id,filename,annotator,text,onset,offset,time,original_caption,categories
0,161976549,117126.mp3,1145579747015607221221744067969991550764671773...,An alarm is ringing loudly and repeatedly nearby.,0,10,345,Raw loud alarm sound repeatedly ringing nearby,['Alarm']
1,161976549,117126.mp3,1145579747015607221221744067969991550764671773...,An alarm is ringing repeatedly nearby.,12,23,345,Clean alarm sound repeatedly ringing nearby,['Alarm']
2,161976550,118234.mp3,5022633589939139634134314703519782680423201448...,An alarm clock is beeping continuously.,0,13,919,Alarm clock beeping continuesly,"['Alarm', 'Beep/Bleep']"
3,161976550,118234.mp3,5022633589939139634134314703519782680423201448...,An alarm clock is beeping continuously.,15,28,919,Alarm clock beeping continuesly,"['Alarm', 'Beep/Bleep']"
4,161976551,119173.mp3,8105077500224920444298835829881210427871190692...,A car alarm sounds loudly in a steady pattern.,0,20,2162,a car alarm sounds loudly in a steady pattern ...,"['Alarm', 'Car']"


In [284]:
# Remove unnecessary columns for this task
annotations = annotations.drop(columns=['task_id'])
annotations['original_index'] = annotations.index
annotations.head()

Unnamed: 0,filename,annotator,text,onset,offset,time,original_caption,categories,original_index
0,117126.mp3,1145579747015607221221744067969991550764671773...,An alarm is ringing loudly and repeatedly nearby.,0,10,345,Raw loud alarm sound repeatedly ringing nearby,['Alarm'],0
1,117126.mp3,1145579747015607221221744067969991550764671773...,An alarm is ringing repeatedly nearby.,12,23,345,Clean alarm sound repeatedly ringing nearby,['Alarm'],1
2,118234.mp3,5022633589939139634134314703519782680423201448...,An alarm clock is beeping continuously.,0,13,919,Alarm clock beeping continuesly,"['Alarm', 'Beep/Bleep']",2
3,118234.mp3,5022633589939139634134314703519782680423201448...,An alarm clock is beeping continuously.,15,28,919,Alarm clock beeping continuesly,"['Alarm', 'Beep/Bleep']",3
4,119173.mp3,8105077500224920444298835829881210427871190692...,A car alarm sounds loudly in a steady pattern.,0,20,2162,a car alarm sounds loudly in a steady pattern ...,"['Alarm', 'Car']",4


In [285]:
metadata = pd.read_csv(METADATA_PATH)
metadata.head()

Unnamed: 0,filename,keywords,freesound_id,sound_link,manufacturer,license,title,description,num_downloads,geotag,start_time_s,end_time_s
0,321771.mp3,"Interior, AMB, Italy, Distant, Speech, Reverb",321771,https://freesound.org/people/Skjor1/sounds/321...,Skjor1,http://creativecommons.org/publicdomain/zero/1.0/,Interior Ambience + Distant Reverberant Speech...,Interior Ambience + Distant Reverberant Speech...,120,,5,27
1,451371.mp3,"kids, throaty, crowd, India, distant, traffic,...",451371,https://freesound.org/people/kyles/sounds/451371/,kyles,http://creativecommons.org/publicdomain/zero/1.0/,election rally crowd and speech with distant t...,election rally crowd and speech with distant t...,122,,120,144
2,199414.mp3,"broadcast, speech, radio",199414,https://freesound.org/people/martinimeniscus/s...,martinimeniscus,http://creativecommons.org/publicdomain/zero/1.0/,"Old Radio Speech Background, higher FF125.aif",Background noise for an old radio broadcast sp...,391,,102,130
3,410952.mp3,"loop2017, atmos, dolby, speech, ableton",410952,https://freesound.org/people/lietoofine/sounds...,lietoofine,https://creativecommons.org/licenses/by/4.0/,dolby atmos speech.wav,dolby atmos speech @Loop2017,193,52.479543 13.500279,31,54
4,203908.mp3,"dr-40, project, speech, student, italian, reci...",203908,https://freesound.org/people/s9ames/sounds/203...,s9ames,http://creativecommons.org/licenses/by/3.0/,bologna speech Italian2,recorded with a tascam dr-40 in a sound studio...,526,,29,45


In [286]:
# Remove unnecessary columns for this task
metadata = metadata.drop(columns=['freesound_id', 'sound_link', 'manufacturer', 'license', 'num_downloads', 'geotag', 'start_time_s', 'end_time_s'])
metadata.head()

Unnamed: 0,filename,keywords,title,description
0,321771.mp3,"Interior, AMB, Italy, Distant, Speech, Reverb",Interior Ambience + Distant Reverberant Speech...,Interior Ambience + Distant Reverberant Speech...
1,451371.mp3,"kids, throaty, crowd, India, distant, traffic,...",election rally crowd and speech with distant t...,election rally crowd and speech with distant t...
2,199414.mp3,"broadcast, speech, radio","Old Radio Speech Background, higher FF125.aif",Background noise for an old radio broadcast sp...
3,410952.mp3,"loop2017, atmos, dolby, speech, ableton",dolby atmos speech.wav,dolby atmos speech @Loop2017
4,203908.mp3,"dr-40, project, speech, student, italian, reci...",bologna speech Italian2,recorded with a tascam dr-40 in a sound studio...


Idea for splitting the data:

Use some fixed percentages for training, validation, and test sets (e.g., 70% training, 15% validation, 15% test).
Then, for each label, ensure that the same percentage of files is allocated to each set. This way, you maintain the label distribution across all sets.

Additionally, some resampling (e.g. SMOTE), undersampling (e.g. RUS, TOMEK) or class weighting techniques can be applied to maintain balance during training.

In [287]:
class AudioClassificationDataset(Dataset):

    def __init__(self, data_dir, audio_features_subset, return_snippets, context_length, testing, testing_percentage):
        """
        data_dir: Path to the dataset directory
        audio_features_subset: List of audio features to be used
        return_snippets: If False, return whole files, if True, return snippets of size context_length
        context_length: Length of the snippets to be returned (ideally uneven number)
        testing: If True, use a small subset of the audio files for testing
        """

        self.return_snippets = return_snippets
        self.context_length = context_length

        audio_path = os.path.join(data_dir, 'audio')

        # For testing purposes, use a smaller subset of the audio files (just copy some audio files from the audio directory
        # to a new directory at the same level)
        #audio_path = os.path.join(data_dir, 'audio_subset_test')

        self.audio_file_basenames = [os.path.splitext(os.path.basename(name))[0] for name in os.listdir(audio_path) if name.endswith('.mp3')]
        if testing:
            testing_count = int(len(self.audio_file_basenames) * testing_percentage)
            self.audio_file_basenames = self.audio_file_basenames[:testing_count] # Use only a small subset of the audio files for testing purposes
        
        
        self.audio_file_features = []
        self.audio_file_labels = []

        for audio_file_idx, audio_file_basename in enumerate(tqdm(self.audio_file_basenames)):

            # Remove unwanted features and store remaining features
            audio_features_dict = dict(np.load(os.path.join(data_dir, 'audio_features', audio_file_basename + '.npz')))
            audio_features = [audio_features_dict[wanted_feature] for wanted_feature in audio_features_subset]

            n_frames = audio_features[0].shape[0]

            # store whole file at once
            if not self.return_snippets:
                self.audio_file_features.append(audio_features)
            # store single snippets
            else:
                # Pad so that edge-frames have context
                padding = ((context_length//2, context_length//2), (0, 0))
                audio_features = [np.pad(array=feature, pad_width=padding, mode='constant', constant_values=0) for feature in audio_features]

                for frame_idx in range(n_frames):
                    frame_features = [audio_feature[frame_idx:frame_idx + context_length] for audio_feature in audio_features]
                    self.audio_file_features.append((audio_file_idx, frame_idx, frame_features))

            # Store labels
            audio_labels_dict = dict(np.load(os.path.join(data_dir, 'labels', audio_file_basename + '_labels.npz')))

            n_labels = len(audio_labels_dict.keys())
            file_labels = np.zeros((n_frames, n_labels))
            for i, (label_name, label_values) in enumerate(audio_labels_dict.items()):

                # If there are multiple label sets -> multiple annotators
                if label_values.shape[1] != 1:
                    # Like discussed in the exercise class, we choose one at random
                    label_values = label_values[:, np.random.randint(0, label_values.shape[1]), np.newaxis]

                file_labels[:, i] = label_values[:, 0]

            # store labels for whole file at once
            if not self.return_snippets:
                self.audio_file_labels.append(file_labels)
            # store labels for single snippets
            else:
                for frame_idx, frame_labels in enumerate(file_labels):
                    self.audio_file_labels.append((audio_file_idx, frame_idx, frame_labels))

    def __len__(self):
            return len(self.audio_file_labels)

    def __getitem__(self, idx):
        if not self.return_snippets:
            return self.audio_file_basenames[idx], self.audio_file_features[idx], self.audio_file_labels[idx]
        else:
            file_idx, frame_idx, frame_features = self.audio_file_features[idx]
            file_idx, frame_idx, frame_labels = self.audio_file_labels[idx]
            return file_idx, self.audio_file_basenames[file_idx], frame_idx, frame_features, frame_labels

In [288]:
all_labels = ['Airplane', 'Alarm', 'Beep/Bleep', 'Bell', 'Bicycle', 'Bird Chirp', 'Bus', 'Car', 'Cat Meow', 'Chainsaw', 'Clapping', 'Cough', 'Cow Moo', 'Cowbell', 'Crying', 'Dog Bark', 'Doorbell', 'Drip', 'Drums', 'Fire', 'Footsteps', 'Guitar', 'Hammer', 'Helicopter', 'Hiccup', 'Horn Honk', 'Horse Neigh', 'Insect Buzz', 'Jackhammer', 'Laughter', 'Lawn Mower', 'Motorcycle', 'Piano', 'Pig Oink', 'Power Drill', 'Power Saw', 'Rain', 'Rooster Crow', 'Saxophone', 'Sewing Machine', 'Sheep/Goat Bleat', 'Ship/Boat', 'Shout', 'Singing', 'Siren', 'Sneeze', 'Snoring', 'Speech', 'Stream/River', 'Thunder', 'Train', 'Truck', 'Trumpet', 'Vacuum Cleaner', 'Violin', 'Washing Machine', 'Waves', 'Wind']

all_features = ['embeddings', 'melspectrogram', 'mfcc', 'mfcc_delta', 'mfcc_delta2', 'flatness', 'centroid', 'flux', 'energy', 'power', 'bandwidth', 'contrast', 'zerocrossingrate']

data = AudioClassificationDataset(
    data_dir = DATASET_PATH,
    audio_features_subset = ['embeddings', 'melspectrogram', 'mfcc', 'contrast'],
    return_snippets = True,
    context_length = 1,
    testing = True, # Set to False for full dataset
    testing_percentage = 0.1
)
print(f"Dataset size: {len(data)}")

100%|██████████| 823/823 [00:13<00:00, 61.02it/s]


Dataset size: 152317


In [289]:
print("This is what the data looks like:")
for i, example in enumerate(data):
    if not data.return_snippets:
        audio_name, features, labels = example
        print(f"Name: {audio_name}, Feature dim: {[f.shape for f in features]}, Label dim: {labels.shape}")
    elif data.return_snippets:
        file_idx, audio_name, frame_idx, features, labels = example
        print(f"Name: {audio_name}, Frame: {frame_idx}, Feature dim: {[f.shape for f in features]}, Label dim: {labels.shape}")

    if i > 300:
        break

This is what the data looks like:
Name: 100300, Frame: 0, Feature dim: [(1, 768), (1, 64), (1, 32), (1, 7)], Label dim: (58,)
Name: 100300, Frame: 1, Feature dim: [(1, 768), (1, 64), (1, 32), (1, 7)], Label dim: (58,)
Name: 100300, Frame: 2, Feature dim: [(1, 768), (1, 64), (1, 32), (1, 7)], Label dim: (58,)
Name: 100300, Frame: 3, Feature dim: [(1, 768), (1, 64), (1, 32), (1, 7)], Label dim: (58,)
Name: 100300, Frame: 4, Feature dim: [(1, 768), (1, 64), (1, 32), (1, 7)], Label dim: (58,)
Name: 100300, Frame: 5, Feature dim: [(1, 768), (1, 64), (1, 32), (1, 7)], Label dim: (58,)
Name: 100300, Frame: 6, Feature dim: [(1, 768), (1, 64), (1, 32), (1, 7)], Label dim: (58,)
Name: 100300, Frame: 7, Feature dim: [(1, 768), (1, 64), (1, 32), (1, 7)], Label dim: (58,)
Name: 100300, Frame: 8, Feature dim: [(1, 768), (1, 64), (1, 32), (1, 7)], Label dim: (58,)
Name: 100300, Frame: 9, Feature dim: [(1, 768), (1, 64), (1, 32), (1, 7)], Label dim: (58,)
Name: 100300, Frame: 10, Feature dim: [(1, 768

In [290]:
def custom_train_test_split(custom_dataset, val_size=0.2, test_size=0.2):
    # Creates a dataset split for training, validation and test set, while keeping contents from one file contained in one set,
    # like was discussed in the exercise session

    n_files = len(custom_dataset.audio_file_basenames)
    dataset_size = len(custom_dataset)

    if custom_dataset.return_snippets:

        # For each file get the index of the first frame
        file_start_end_index_dict = {}

        for idx, (file_idx, frame_idx, _) in enumerate(custom_dataset.audio_file_labels):
            if idx == 0 and frame_idx == 0:
                file_start_end_index_dict[file_idx] = [idx, None]
            elif idx != 0 and frame_idx == 0:
                file_start_end_index_dict[file_idx] = [idx, None]
                file_start_end_index_dict[file_idx-1][1] = idx
            elif idx == dataset_size - 1:
                file_start_end_index_dict[file_idx][1] = idx

        # Get indices of files for which to include the snippets in the splits
        file_indices = list(range(n_files))
        np.random.shuffle(file_indices)
        shuffled_file_indices = file_indices

        train_size_files = n_files - int(n_files * (val_size + test_size))
        val_size_files = int(n_files * val_size)

        train_file_indices = shuffled_file_indices[:train_size_files]
        val_file_indices = shuffled_file_indices[train_size_files:(train_size_files+val_size_files)]
        test_file_indices = shuffled_file_indices[(train_size_files+val_size_files):]

        # For each file index in the lists get corresponding indices of the snippets
        indices = []
        for i, file_indices in enumerate([train_file_indices, val_file_indices, test_file_indices]):
            indices.append([])
            for file_idx in file_indices:
                start_idx, end_idx = file_start_end_index_dict[file_idx]
                indices[i].extend(list(range(start_idx, end_idx)))

        train_indices = indices[0]
        val_indices = indices[1]
        test_indices = indices[2]


    else: # 1 to 1 correspondence between files and dataset entries
        indices = list(range(dataset_size))
        np.random.shuffle(indices)
        shuffled_indices = indices

        train_size = dataset_size - int(dataset_size * (val_size + test_size))
        val_size = int(dataset_size * val_size)

        train_indices = shuffled_indices[:train_size]
        val_indices = shuffled_indices[train_size:(train_size+val_size)]
        test_indices = shuffled_indices[(train_size+val_size):]


    return train_indices, val_indices, test_indices

In [291]:
train_set, test_set, val_set = custom_train_test_split(data, val_size=0.2, test_size=0.1)
print("Train, test and validation set sizes:")
print(f"-Train: {len(train_set)}")
print(f"-Test: {len(test_set)}")
print(f"-Validation: {len(val_set)}")

Train, test and validation set sizes:
-Train: 106970
-Test: 30212
-Validation: 15134


In [292]:
# Get feature dimensions+borders
feature_dims = [feature.shape[0] * feature.shape[1] for feature in data[0][3]]
total_feature_dim = np.sum(feature_dims, dtype=int)

feature_dim_borders = np.cumsum(feature_dims)
feature_dim_borders = np.insert(feature_dim_borders, 0, 0)

In [293]:
# Create data to train and evaluate our models
X_train = np.zeros((len(train_set), total_feature_dim))
Ys_train = np.zeros((len(train_set), len(all_labels)), dtype=int)
for i, idx in enumerate(train_set):
    file_idx, audio_name, frame_idx, features, labels = data[idx]
    X_train[i] = np.concatenate(features, axis=1).flatten()
    labels[labels > 1] = 1
    Ys_train[i] = labels
    
X_val = np.zeros((len(val_set), total_feature_dim))
Ys_val = np.zeros((len(val_set), len(all_labels)), dtype=int)
for i, idx in enumerate(val_set):
    file_idx, audio_name, frame_idx, features, labels = data[idx]
    X_val[i] = np.concatenate(features, axis=1).flatten()
    labels[labels > 1] = 1
    Ys_val[i] = labels

X_test = np.zeros((len(test_set), total_feature_dim))
Ys_test = np.zeros((len(test_set), len(all_labels)), dtype=int)
for i, idx in enumerate(test_set):
    file_idx, audio_name, frame_idx, features, labels = data[idx]
    X_test[i] = np.concatenate(features, axis=1).flatten()
    labels[labels > 1] = 1
    Ys_test[i] = labels

In [294]:
# Normalize the data using training set statistics (to avoid data leakage)
train_mean, train_std = np.mean(X_train, axis=0, keepdims=True), np.std(X_train, axis=0, keepdims=True)

X_train = (X_train - train_mean) / train_std
X_val = (X_val - train_mean) / train_std
X_test = (X_test - train_mean) / train_std

In [295]:
from sklearn.metrics import (f1_score, balanced_accuracy_score)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier

import itertools

#import h2o
#from h2o.automl import H2OAutoML
#from h2o.estimators import H2ORandomForestEstimator
#from h2o.frame import H2OFrame


import torch
if torch.cuda.is_available():
    print("CUDA is available. PyTorch will use the GPU.")
    print(f"CUDA version: {torch.version.cuda}")
    device = torch.device("cuda")
else:
    print("CUDA is NOT available. PyTorch will use the CPU.")
    device = torch.device("cpu")

SEED = 42

CUDA is available. PyTorch will use the GPU.
CUDA version: 12.4


In [296]:
param_grid_rf = {
    'n_estimators': [10, 30, 50],
    'max_depth': [5, 10, None],
    #'min_samples_split': [2, 5, 10, 20],
    'criterion': ['gini', 'entropy'],
}
best_score = 0
best_params = None
best_model = None

for n_estimators, max_depth, criterion in itertools.product(
        param_grid_rf['n_estimators'],
        param_grid_rf['max_depth'],
        param_grid_rf['criterion']):
    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, criterion=criterion, random_state=SEED)
    model = MultiOutputClassifier(model, n_jobs=-1)
    model.fit(X_train, Ys_train)
    Y_pred = model.predict(X_val)

    f1_scores = []
    balanced_accuracy_scores = []
    n_labels = len(all_labels)

    Ys_prediction_val = model.predict(X_val)
    for i in range(n_labels):
        if np.sum(Ys_val[:, i]) == 0: continue # if there is a label with no positive samples -> skip it (can be uncommented but will drag down overall score)

        f1 = f1_score(y_true=Ys_val[:, i], y_pred=Ys_prediction_val[:, i], zero_division=0)
        ba = balanced_accuracy_score(y_true=Ys_val[:, i], y_pred=Ys_prediction_val[:, i])

        f1_scores.append(f1)
        balanced_accuracy_scores.append(ba)

    # Macro-average across labels
    f1_score_macro = np.mean(f1_scores)
    balanced_accuracy_score_macro = np.mean(balanced_accuracy_scores)

    if f1_score_macro > best_score:
        best_score = f1_score_macro
        best_params = {
            'n_estimators': n_estimators,
            'max_depth': max_depth,
            #'min_samples_split': min_samples_split,
            'criterion': criterion
        }
        best_model = model

    print(f"Hyperparameters: n_estimators={n_estimators}, max_depth={max_depth}, criterion={criterion}")
    print(f"Macro-Averaged F1 score: {f1_score_macro}")
    print(f"Macro-Averaged Balanced Accuracy score:{balanced_accuracy_score_macro}")
    print(f"--------------------------------------------------------------------------")

print("\nBest Parameters:")
print(best_params)
print(f"Best f1 score: {best_score:.4f}")

Hyperparameters: n_estimators=10, max_depth=5, criterion=gini
Macro-Averaged F1 score: 0.26105353981490714
Macro-Averaged Balanced Accuracy score:0.596774465321511
--------------------------------------------------------------------------
Hyperparameters: n_estimators=10, max_depth=5, criterion=entropy
Macro-Averaged F1 score: 0.253772141458592
Macro-Averaged Balanced Accuracy score:0.5975521576720962
--------------------------------------------------------------------------
Hyperparameters: n_estimators=10, max_depth=10, criterion=gini
Macro-Averaged F1 score: 0.3310502946344165
Macro-Averaged Balanced Accuracy score:0.6303230151908324
--------------------------------------------------------------------------
Hyperparameters: n_estimators=10, max_depth=10, criterion=entropy
Macro-Averaged F1 score: 0.32593691487185883
Macro-Averaged Balanced Accuracy score:0.6258562586091321
--------------------------------------------------------------------------
Hyperparameters: n_estimators=10, ma

KeyboardInterrupt: 

In [264]:
param_grid_svm = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'degree': [2, 3, 4],
}
best_score = 0
best_params = None
best_model = None

# Ensure there are at least two classes for every target column
valid_columns = [i for i in range(Ys_train.shape[1]) if len(np.unique(Ys_train[:, i])) > 1]
Ys_train_filtered = Ys_train[:, valid_columns]
Ys_val_filtered = Ys_val[:, valid_columns]


for c, kernel, degree in itertools.product(
        param_grid_svm['C'],
        param_grid_svm['kernel'],
        param_grid_svm['degree']):
    svm = SVC(C=c, kernel=kernel, degree=degree, random_state=SEED)
    model = MultiOutputClassifier(svm, n_jobs=-1)
    model.fit(X_train, Ys_train_filtered)
    Y_pred = model.predict(X_val)

    # If you removed columns, handle predictions for constant columns
    constant_predictions = {i: np.unique(Ys_train[:, i])[0] for i in range(Ys_train.shape[1]) if i not in valid_columns}


    f1_scores = []
    balanced_accuracy_scores = []
    n_labels = len(all_labels)

    Ys_prediction_val = model.predict(X_val)
    for i in range(Ys_train_filtered.shape[1] if valid_columns else n_labels):
        if np.sum(Ys_val_filtered[:, i]) == 0: continue # if there is a label with no positive samples -> skip it (can be uncommented but will drag down overall score)

        f1 = f1_score(y_true=Ys_val_filtered[:, i], y_pred=Ys_prediction_val[:, i], zero_division=0)
        ba = balanced_accuracy_score(y_true=Ys_val_filtered[:, i], y_pred=Ys_prediction_val[:, i])

        f1_scores.append(f1)
        balanced_accuracy_scores.append(ba)

    # Macro-average across labels
    f1_score_macro = np.mean(f1_scores)
    balanced_accuracy_score_macro = np.mean(balanced_accuracy_scores)

    if f1_score_macro > best_score:
        best_score = f1_score_macro
        best_params = {
            'C': c,
            'kernel': kernel,
            'degree': degree,
        }
        best_model = model

    print(f"Hyperparameters: C={c}, kernel={kernel}, degree={degree}")
    print(f"Macro-Averaged F1 score: {f1_score_macro}")
    print(f"Macro-Averaged Balanced Accuracy score:{balanced_accuracy_score_macro}")
    print(f"--------------------------------------------------------------------------")

print("\nBest Parameters:")
print(best_params)
print(f"Best f1 score: {best_score:.4f}")

Hyperparameters: C=0.1, kernel=linear, degree=2
Macro-Averaged F1 score: 0.47258330126087833
Macro-Averaged Balanced Accuracy score:0.7314200017779149
--------------------------------------------------------------------------
Hyperparameters: C=0.1, kernel=linear, degree=3
Macro-Averaged F1 score: 0.47258330126087833
Macro-Averaged Balanced Accuracy score:0.7314200017779149
--------------------------------------------------------------------------
Hyperparameters: C=0.1, kernel=linear, degree=4
Macro-Averaged F1 score: 0.47258330126087833
Macro-Averaged Balanced Accuracy score:0.7314200017779149
--------------------------------------------------------------------------
Hyperparameters: C=0.1, kernel=rbf, degree=2
Macro-Averaged F1 score: 0.45648632808911305
Macro-Averaged Balanced Accuracy score:0.7149666231617566
--------------------------------------------------------------------------
Hyperparameters: C=0.1, kernel=rbf, degree=3
Macro-Averaged F1 score: 0.45648632808911305
Macro-Ave

In [244]:
# Train our classifier
clf = DecisionTreeClassifier(min_impurity_decrease=10e-4, min_samples_leaf=5, max_depth=50)
#clf = RandomForestClassifier(n_estimators=200, min_impurity_decrease=10e-4, min_samples_leaf=5)
#clf = SVC(kernel='linear', max_iter=5)

clf = MultiOutputClassifier(clf, n_jobs=-1)
clf.fit(X_train, Ys_train)

KeyboardInterrupt: 

In [18]:
# Evaluate the classifier on the validation set
f1_scores = []
balanced_accuracy_scores = []
n_labels = len(all_labels)

Ys_prediction_val = clf.predict(X_val)
for i in range(n_labels):
    if np.sum(Ys_val[:, i]) == 0: continue # if there is a label with no positive samples -> skip it (can be uncommented but will drag down overall score)
        
    f1 = f1_score(y_true=Ys_val[:, i], y_pred=Ys_prediction_val[:, i], zero_division=0)
    ba = balanced_accuracy_score(y_true=Ys_val[:, i], y_pred=Ys_prediction_val[:, i])

    f1_scores.append(f1)
    balanced_accuracy_scores.append(ba)

# Macro-average across labels
f1_score_macro = np.mean(f1_scores)
balanced_accuracy_score_macro = np.mean(balanced_accuracy_scores)

print("Macro-Averaged F1 score:", f1_score_macro)
print("Macro-Averaged Balanced Accuracy score:", balanced_accuracy_score_macro)

Macro-Averaged F1 score: 0.18675230801098355
Macro-Averaged Balanced Accuracy score: 0.5781976565967923
