In [1]:
import sys

import numpy as np
import pandas as pd

sys.path.append("../../")

from collections import Counter

from helpers.reduce import load_and_distill
from helpers.split import tag_label_feature_split
from helpers.constants import BASE_GENRES, BASE_FEATURES
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE, RandomOverSampler

In [2]:
def to_base_genre_label(index, genres=None):
    """Return the base genre label for a dense genre index.

    Args:
        index: Integer position of the genre within the genre sequence.
        genres: Optional ordered sequence of genre labels to look up in.
            When omitted, the imported ``BASE_GENRES`` constant is used.

    Returns:
        The label stored at position ``index``.

    Raises:
        IndexError: If ``index`` is out of range (for list-like sequences).
    """
    # Only the upper-case BASE_GENRES constant is imported by this notebook;
    # the previous lower-case ``base_genres`` lookup raised NameError.
    # NOTE(review): assumes the BASE_GENRES order matches the one-hot label
    # column order consumed by sparse_to_dense -- confirm.
    lookup = BASE_GENRES if genres is None else genres
    return lookup[index]


def to_base_genre_index(label, genres=None):
    """Return the dense genre index for a base genre label.

    Args:
        label: Genre label to locate.
        genres: Optional ordered sequence of genre labels to search.
            When omitted, the imported ``BASE_GENRES`` constant is used.

    Returns:
        The position of the first occurrence of ``label`` in the sequence.

    Raises:
        ValueError: If ``label`` is not present (for list/tuple sequences).
    """
    # Only the upper-case BASE_GENRES constant is imported by this notebook;
    # the previous lower-case ``base_genres`` lookup raised NameError.
    lookup = BASE_GENRES if genres is None else genres
    return lookup.index(label)


def sparse_to_dense(labels):
    """Collapse a one-hot label frame into a 1-D array of column positions.

    Args:
        labels: Object exposing ``to_numpy()`` that yields a 2-D array with
            one row per sample (a pandas DataFrame in this notebook).

    Returns:
        A NumPy array holding, for every row, the position of its largest
        value. Ties resolve to the first such position.
    """
    one_hot_matrix = labels.to_numpy()
    return one_hot_matrix.argmax(axis=1)

In [3]:
# Load the project dataset restricted to the base genres, keeping all tags.
# NOTE(review): multi_label=False presumably yields exactly one genre per
# row -- confirm against helpers.reduce.load_and_distill.
df = load_and_distill(
    labels=BASE_GENRES,
    multi_label=False,
    tags="all",
)

### Undersample the 7 Most Represented Genres

Rebuild the dataset by drawing a fixed-size random sample (500 rows by default) from each of the seven most represented genres, and keep every row from the three least represented genres.

In [4]:
def undersample(
    data,
    count=500,
    random_state=None,
    sampled_genres=("pop", "rock", "classical", "hiphop", "jazz", "metal", "reggae"),
    retained_genres=("blues", "country", "disco"),
):
    """Undersample the over-represented genres of a one-hot encoded frame.

    Args:
        data: DataFrame with one ``genre_<name>`` indicator column for every
            genre named in ``sampled_genres`` and ``retained_genres``.
        count: Maximum number of rows to draw for each sampled genre.
        random_state: Forwarded to ``DataFrame.sample``. Pass an int for a
            reproducible draw; ``None`` keeps the previous unseeded behaviour.
        sampled_genres: Genres whose rows are randomly reduced to ``count``.
        retained_genres: Genres whose rows are all kept.

    Returns:
        A new DataFrame made of the sampled rows followed by the retained
        rows, in the order the genres are listed.

    Raises:
        KeyError: If an expected ``genre_<name>`` column is missing.
    """

    def rows_for(genre):
        # Read from the ``data`` argument; the previous body ignored it and
        # always used the notebook-level ``df``.
        return data[data["genre_" + genre] == 1]

    parts = []
    for genre in sampled_genres:
        genre_rows = rows_for(genre)
        # Cap at the available rows so a genre smaller than ``count`` is kept
        # whole instead of making ``sample`` raise ValueError.
        n_rows = min(count, len(genre_rows))
        parts.append(genre_rows.sample(n=n_rows, axis=0, random_state=random_state))
    parts.extend(rows_for(genre) for genre in retained_genres)
    return pd.concat(parts)


# Build the reduced frame using the default per-genre sample size, then split
# it into tags, one-hot genre labels, and the feature matrix.
df_undersampled = undersample(data=df)
tags, one_hot_encoded_labels, X = tag_label_feature_split(df_undersampled)

In [5]:
# Column totals of the one-hot label frame (rows per genre), largest first.
(
    one_hot_encoded_labels
    .sum(axis=0)
    .sort_values(ascending=False)
)

genre_classical    500
genre_hiphop       500
genre_jazz         500
genre_metal        500
genre_pop          500
genre_reggae       500
genre_rock         500
genre_blues        468
genre_country      409
genre_disco        361
dtype: int64

In [6]:
# Keep only the BASE_FEATURES columns of the feature matrix.
X = X[BASE_FEATURES]
# Target vector: one class index per row, derived from the one-hot labels.
y = np.array(sparse_to_dense(one_hot_encoded_labels))

In [7]:
# Display the feature matrix as this cell's output.
X

Unnamed: 0,mfcc_mean_0,mfcc_mean_1,mfcc_mean_2,mfcc_mean_3,mfcc_mean_4,mfcc_mean_5,mfcc_mean_6,mfcc_mean_7,mfcc_mean_8,mfcc_mean_9,mfcc_mean_10,mfcc_mean_11,mfcc_mean_12
18183,-845.949951,190.001877,40.886948,19.058676,7.475121,-5.069788,-11.156761,-9.279266,-10.838486,-7.980818,-11.176950,-6.671234,-10.689099
4376,-675.974121,147.425217,17.031549,17.740648,0.163708,11.081747,6.440608,6.823104,-1.570915,0.286370,-4.004776,0.173801,0.255744
377,-795.804504,178.116562,-23.244530,3.117461,10.202637,3.900532,-2.431671,-3.431746,-13.765501,1.795233,-4.988559,-9.001773,0.339130
39536,-685.986816,122.242622,33.153530,20.784653,7.018640,16.715908,8.795200,5.464747,2.879173,7.134067,2.685183,-1.243326,1.316974
8028,-726.523987,114.114700,-7.052966,13.107974,1.581876,-1.773843,9.136472,-3.089243,0.950243,4.520816,0.782044,3.289428,-1.136206
...,...,...,...,...,...,...,...,...,...,...,...,...,...
54236,-690.560669,98.972618,-15.824314,12.942833,5.728589,0.392425,1.604278,0.400300,-2.473933,-0.332941,4.158844,-1.676747,-1.073218
54368,-666.252075,79.289360,29.025909,30.546505,7.162126,19.828424,11.959628,11.099643,6.065190,8.011757,3.916399,1.223691,2.666147
54749,-698.271912,121.905960,-9.023279,9.455445,-2.001290,-6.252756,-3.672734,4.375753,2.454486,5.336717,2.882187,1.964112,-2.579552
54794,-744.579224,99.917603,33.313694,18.761524,11.070148,13.391991,3.485552,0.974179,1.767911,8.345790,8.030129,8.866822,8.322719


### Naive Random Oversampling to Boost the 3 Least Represented Genres

In [8]:
# Seeded random oversampler, so repeated runs produce the same resampled set.
oversampler = RandomOverSampler(random_state=0)
X_resampled, y_resampled = oversampler.fit_resample(X, y)
# Per-class sample counts, ordered by class index.
print(sorted(Counter(y_resampled).items()))

[(0, 500), (1, 500), (2, 500), (3, 500), (4, 500), (5, 500), (6, 500), (7, 500), (8, 500), (9, 500)]


### SMOTE to Boost the 3 Least Represented Genres

In [9]:
# Seed SMOTE like the other resamplers in this notebook (random_state=0);
# without a seed the synthetic minority samples differ on every run.
X_SMOTE, y_SMOTE = SMOTE(random_state=0).fit_resample(X, y)
# Per-class sample counts, ordered by class index.
print(sorted(Counter(y_SMOTE).items()))

[(0, 500), (1, 500), (2, 500), (3, 500), (4, 500), (5, 500), (6, 500), (7, 500), (8, 500), (9, 500)]


### SMOTETomek Combined Approach to Balance the Data
#### On the Full Dataset

In [10]:
# NOTE: this cell rebinds tags, one_hot_encoded_labels, X and y to values
# derived from the full (not undersampled) frame.
tags, one_hot_encoded_labels, X = tag_label_feature_split(df)
y = np.array(sparse_to_dense(one_hot_encoded_labels))
# Keep only the columns whose names start with "mfcc_mean_".
X = X.filter(regex="^mfcc_mean_", axis=1)

# SMOTETomek: SMOTE oversampling followed by Tomek-link cleaning, seeded.
smote_tomek = SMOTETomek(random_state=0)
X_SMOTE_tomek, y_SMOTE_tomek = smote_tomek.fit_resample(X, y)
# Per-class sample counts, ordered by class index.
print(sorted(Counter(y_SMOTE_tomek).items()))

[(0, 5327), (1, 5120), (2, 5340), (3, 5334), (4, 5298), (5, 5302), (6, 5335), (7, 5011), (8, 5324), (9, 5157)]


#### On the Undersampled Dataset

In [11]:
# NOTE: this cell rebinds tags, one_hot_encoded_labels, X, y and the
# X_SMOTE_tomek / y_SMOTE_tomek results, now derived from df_undersampled.
tags, one_hot_encoded_labels, X = tag_label_feature_split(df_undersampled)
y = np.array(sparse_to_dense(one_hot_encoded_labels))
# Keep only the columns whose names start with "mfcc_mean_".
X = X.filter(regex="^mfcc_mean_", axis=1)

# SMOTETomek: SMOTE oversampling followed by Tomek-link cleaning, seeded.
smote_tomek = SMOTETomek(random_state=0)
X_SMOTE_tomek, y_SMOTE_tomek = smote_tomek.fit_resample(X, y)
# Per-class sample counts, ordered by class index.
print(sorted(Counter(y_SMOTE_tomek).items()))

[(0, 389), (1, 431), (2, 457), (3, 430), (4, 389), (5, 383), (6, 421), (7, 366), (8, 376), (9, 378)]
