In [1]:
import sys

import numpy as np
import pandas as pd

sys.path.append("../../")

from collections import Counter

from helpers.reduce import load_and_distill
from helpers.split import tag_label_feature_split
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE, RandomOverSampler

In [2]:
# Base genre label names in alphabetical order. The position of a label in
# this list is the index used by to_base_genre_label / to_base_genre_index.
base_genres = [
    "genre_blues", "genre_classical", "genre_country", "genre_disco", "genre_hiphop",
    "genre_jazz", "genre_metal", "genre_pop", "genre_reggae", "genre_rock",
]

In [3]:
def to_base_genre_label(index):
    """Look up the base genre label stored at position ``index`` of ``base_genres``."""
    label = base_genres[index]
    return label


def to_base_genre_index(label):
    """Look up the position of ``label`` within ``base_genres``.

    Raises ``ValueError`` (from ``list.index``) when ``label`` is not in the list.
    """
    position = base_genres.index(label)
    return position


def sparse_to_dense(labels):
    """Convert one-hot encoded label rows into dense integer class indices.

    Args:
        labels: 2-D table with one row per sample and one column per class.
            May be a pandas DataFrame or anything ``np.asarray`` can convert
            (ndarray, nested list).

    Returns:
        1-D ``numpy.ndarray`` holding, for each row, the column position of
        that row's maximum value (the first such position on ties).
    """
    # np.asarray accepts DataFrames as well as plain arrays/lists, so the
    # helper no longer requires a pandas object with a .to_numpy() method.
    return np.argmax(np.asarray(labels), axis=1)

In [4]:
# Load the dataset through the project helper with single-label mode and tags enabled.
# NOTE(review): the exact meaning of multi_label/tags is defined in helpers.reduce — confirm there.
df = load_and_distill(multi_label=False, tags=True)

### Undersample the 7 Most Represented Genres

Reconstitute the dataset by drawing 500 random samples from each of the seven most represented genres. Keep all rows from the three least represented genres (blues, country, and disco).

In [5]:
def undersample(data, count=500, random_state=None):
    """Cap the seven most represented genres at ``count`` rows each.

    Rows are selected from ``data`` by the one-hot genre columns. The seven
    capped genres are randomly sampled without replacement; all rows of
    blues, country and disco are kept.

    Args:
        data: DataFrame containing the one-hot ``genre_*`` columns.
        count: Maximum number of rows to keep for each capped genre. A genre
            with fewer rows than ``count`` is kept whole instead of raising.
        random_state: Forwarded to ``DataFrame.sample``. The default ``None``
            draws from NumPy's global random state.

    Returns:
        A new DataFrame: the sampled rows of each capped genre followed by
        all rows of the three uncapped genres, original index labels kept.
    """
    capped_genres = (
        "genre_pop",
        "genre_rock",
        "genre_classical",
        "genre_hiphop",
        "genre_jazz",
        "genre_metal",
        "genre_reggae",
    )
    kept_genres = ("genre_blues", "genre_country", "genre_disco")

    parts = []
    for genre in capped_genres:
        # Select from the `data` argument; the previous version read the
        # notebook-global `df` and silently ignored its parameter.
        genre_rows = data[data[genre] == 1]
        parts.append(
            genre_rows.sample(
                # Never request more rows than exist for this genre.
                n=min(count, len(genre_rows)),
                axis=0,
                random_state=random_state,
            )
        )
    parts.extend(data[data[genre] == 1] for genre in kept_genres)
    return pd.concat(parts)


# Seed NumPy's global RNG: DataFrame.sample draws from it when no
# random_state is given, so the same rows are selected on every
# Restart & Run All.
np.random.seed(0)
df_undersampled = undersample(df)
# Unpack the three parts returned by the project helper for the undersampled frame.
tags, one_hot_encoded_labels, X = tag_label_feature_split(df_undersampled)

In [6]:
# Column sums of the one-hot labels give the sample count per genre; show largest first.
genre_counts = one_hot_encoded_labels.sum(axis=0)
genre_counts.sort_values(ascending=False)

genre_classical    500
genre_hiphop       500
genre_jazz         500
genre_metal        500
genre_pop          500
genre_reggae       500
genre_rock         500
genre_blues        468
genre_country      409
genre_disco        361
dtype: int64

In [7]:
# Keep only the feature columns whose names start with "mfcc_mean_".
X = X.filter(regex="^mfcc_mean_", axis="columns")
# One integer class index per sample, derived from the one-hot labels.
y = np.array(sparse_to_dense(one_hot_encoded_labels))

In [8]:
# Preview the feature matrix after filtering to the mfcc_mean_* columns.
X

Unnamed: 0,mfcc_mean_0,mfcc_mean_1,mfcc_mean_2,mfcc_mean_3,mfcc_mean_4,mfcc_mean_5,mfcc_mean_6,mfcc_mean_7,mfcc_mean_8,mfcc_mean_9,mfcc_mean_10,mfcc_mean_11,mfcc_mean_12
21203,-731.881775,167.506622,29.992678,1.775042,1.936840,-9.039070,-9.526371,-4.563849,-16.687853,-7.122833,-15.314518,-16.636948,-13.811142
20220,-664.110291,105.939636,22.050243,38.953522,13.110357,10.277453,-1.718638,1.985022,-0.669255,-0.344212,-0.499223,-2.054155,-3.001413
16124,-723.657471,196.583786,24.963108,31.553154,18.826328,2.369676,6.779541,4.450876,-1.235345,-2.750925,-1.712977,6.077117,6.785930
25296,-742.324829,139.031128,52.591686,29.302963,12.369719,10.226247,-0.952145,4.378653,-3.542637,4.350736,0.654128,-2.043166,-1.407209
3264,-725.935974,147.901428,31.002018,-7.678528,-15.015887,-4.974175,-1.185524,2.995163,-0.389467,-2.269662,-0.995318,-1.642543,-2.220340
...,...,...,...,...,...,...,...,...,...,...,...,...,...
54236,-690.560669,98.972618,-15.824314,12.942833,5.728589,0.392425,1.604278,0.400300,-2.473933,-0.332941,4.158844,-1.676747,-1.073218
54368,-666.252075,79.289360,29.025909,30.546505,7.162126,19.828424,11.959628,11.099643,6.065190,8.011757,3.916399,1.223691,2.666147
54749,-698.271912,121.905960,-9.023279,9.455445,-2.001290,-6.252756,-3.672734,4.375753,2.454486,5.336717,2.882187,1.964112,-2.579552
54794,-744.579224,99.917603,33.313694,18.761524,11.070148,13.391991,3.485552,0.974179,1.767911,8.345790,8.030129,8.866822,8.322719


### Naive RandomOverSampler to Boost the 3 Least Represented Genres

In [9]:
# Seeded random over-sampling of the undersampled features and labels.
random_oversampler = RandomOverSampler(random_state=0)
X_resampled, y_resampled = random_oversampler.fit_resample(X, y)

# Per-class sample counts after resampling, ordered by class index.
resampled_counts = Counter(y_resampled)
print(sorted(resampled_counts.items()))

[(0, 500), (1, 500), (2, 500), (3, 500), (4, 500), (5, 500), (6, 500), (7, 500), (8, 500), (9, 500)]


### SMOTE to Boost the 3 Least Represented Genres

In [10]:
# Fixed seed so the synthetic samples are reproducible across runs, matching
# the random_state=0 used by the other resamplers in this notebook.
X_SMOTE, y_SMOTE = SMOTE(random_state=0).fit_resample(X, y)
# Per-class sample counts after resampling, ordered by class index.
print(sorted(Counter(y_SMOTE).items()))

[(0, 500), (1, 500), (2, 500), (3, 500), (4, 500), (5, 500), (6, 500), (7, 500), (8, 500), (9, 500)]


### SMOTETomek Combined Approach to Balance the Data
#### On the Full Dataset

In [11]:
# Rebuild labels and mfcc_mean_* features from the full dataset (df, not df_undersampled).
tags, one_hot_encoded_labels, X = tag_label_feature_split(df)
X = X.filter(regex="^mfcc_mean_", axis="columns")
y = np.array(sparse_to_dense(one_hot_encoded_labels))

# Seeded SMOTETomek resampling, then the per-class counts ordered by class index.
smote_tomek = SMOTETomek(random_state=0)
X_SMOTE_tomek, y_SMOTE_tomek = smote_tomek.fit_resample(X, y)
tomek_counts = Counter(y_SMOTE_tomek)
print(sorted(tomek_counts.items()))

[(0, 5327), (1, 5120), (2, 5340), (3, 5334), (4, 5298), (5, 5302), (6, 5335), (7, 5011), (8, 5324), (9, 5157)]


#### On the Undersampled Dataset

In [12]:
# Rebuild labels and mfcc_mean_* features from the undersampled dataset.
tags, one_hot_encoded_labels, X = tag_label_feature_split(df_undersampled)
X = X.filter(regex="^mfcc_mean_", axis="columns")
y = np.array(sparse_to_dense(one_hot_encoded_labels))

# Seeded SMOTETomek resampling. This assigns X_SMOTE_tomek / y_SMOTE_tomek,
# replacing any values those names held before.
smote_tomek = SMOTETomek(random_state=0)
X_SMOTE_tomek, y_SMOTE_tomek = smote_tomek.fit_resample(X, y)
tomek_counts = Counter(y_SMOTE_tomek)
print(sorted(tomek_counts.items()))

[(0, 404), (1, 439), (2, 451), (3, 434), (4, 385), (5, 373), (6, 427), (7, 357), (8, 386), (9, 384)]
