In [40]:
import sys

import pandas as pd
import numpy as np
import scipy.stats as stats

sys.path.append("../../")

from helpers.stitch import ReadFilesIntoDataframe
from helpers.constants import BASE_GENRES
from helpers.split import tag_label_feature_split


In [41]:
read_file = ReadFilesIntoDataframe()
df = read_file.read_mtg_jamendo_files()

In [42]:
df = df[df[BASE_GENRES].sum(axis=1) == 1]

In [43]:
_, labels, features = tag_label_feature_split(df)

In [44]:
print(features.shape)

(21351, 2743)


In [45]:
features = features.select_dtypes('float')

In [46]:
features.describe()

Unnamed: 0,lowlevel_average_loudness,barkbands_crest_dmean,barkbands_crest_dmean2,barkbands_crest_dvar,barkbands_crest_dvar2,barkbands_crest_max,barkbands_crest_mean,barkbands_crest_median,barkbands_crest_min,barkbands_crest_var,...,mfcc_icov_12_3,mfcc_icov_12_4,mfcc_icov_12_5,mfcc_icov_12_6,mfcc_icov_12_7,mfcc_icov_12_8,mfcc_icov_12_9,mfcc_icov_12_10,mfcc_icov_12_11,mfcc_icov_12_12
count,21351.0,21351.0,21351.0,21351.0,21351.0,21351.0,21351.0,21351.0,21351.0,21351.0,...,21351.0,21351.0,21351.0,21351.0,21351.0,21351.0,21351.0,21351.0,21351.0,21351.0
mean,0.7443498,2.462434,3.9289,6.210709,15.158013,25.854999,12.000211,11.31328,2.852339,21.667845,...,-0.162449,0.176953,-0.179994,0.167798,-0.142433,0.107581,-0.072594,0.044366,-0.027876,0.023342
std,0.271953,0.579051,0.960884,2.564386,5.955108,1.094183,2.210466,2.503172,0.689549,8.042571,...,11.435135,12.931136,14.051897,13.782604,11.896853,8.87151,5.578327,2.848559,1.106226,0.279874
min,6.453744e-07,0.351595,0.563872,0.569924,1.012607,16.006996,5.443738,4.688963,1.314467,3.992804,...,-1083.037964,-381.466125,-1511.472046,-463.868744,-1180.914551,-302.384247,-494.101471,-101.814491,-88.717659,-10.098447
25%,0.6517185,2.04605,3.256222,4.255442,10.712004,25.429896,10.443477,9.5922,2.348297,15.99255,...,-0.001121,-0.000865,-0.001194,-0.000675,-0.002348,-0.000553,-0.002841,0.000903,-0.01213,0.012467
50%,0.8695447,2.427489,3.89098,5.776865,14.360209,26.20726,11.861193,11.042865,2.692859,20.328146,...,-0.000381,-0.000125,-0.000117,0.000268,-0.001088,0.000597,-0.000988,0.002632,-0.007718,0.016239
75%,0.9376186,2.848913,4.573359,7.793416,18.890983,26.625534,13.385814,12.707327,3.21165,25.715657,...,0.000236,0.00072,0.000765,0.001394,-5.4e-05,0.002041,0.000321,0.00539,-0.004812,0.021521
max,0.9865745,5.275131,8.676435,22.468172,49.750587,26.999872,23.248888,25.616434,6.555689,78.48497,...,249.826416,1340.655151,458.798737,1441.866455,403.275208,830.281799,193.423141,240.288925,40.621906,24.246027


In [47]:
def p_val_plots(genre, features):
    genre_features = features[labels[genre] == 1]
    non_genre_features = features[labels[genre] == 0]

    p_vals = []

    for i in genre_features.columns:
        # more than 10 samples for the feature to assume normality for t-test
        if len(genre_features[genre_features[i] != 0].index) >= 10:
            p_vals.append(stats.ttest_ind(genre_features[i], non_genre_features[i], equal_var = False).pvalue)

    # keeping column where p value < 0.05
    features_p_val = pd.DataFrame(features.columns, columns=[genre])
    p_vals = pd.DataFrame(p_vals, columns=['p_val'])
    features_p_val = pd.concat([features_p_val, p_vals], axis=1)
    features_p_val = features_p_val[features_p_val['p_val'] <= 0.05]
    features_no_relation = features_p_val[features_p_val['p_val'] > 0.05]
    features_p_val = features_p_val.sort_values(by='p_val',ignore_index=True)

    # print top 12 features with the lowest p_val
    # print(features_p_val[:12])


    # fig, axes = plt.subplots(3, 4, facecolor='white')
    # fig.set_size_inches(25, 15)

    # for i in range(12):
    #     row = math.floor(i/4)
    #     col = i %4

    #     sns.scatterplot(data = df, x = genre, y = features_p_val['column'][i], ax=axes[row,col])

    #     p_val = features_p_val['p_val'][i]
    #     axes[row,col].title.set_text(f'p_value: {p_val}')

    return features_p_val, features_no_relation

In [53]:
features_ranked = None
features_no_relation = None

for i in BASE_GENRES:
    features_p_val, features_no_rel = p_val_plots(i, features)

    if features_ranked is not None:
        features_ranked = pd.concat([features_ranked, features_p_val[i]], axis = 1)
        features_no_relation = pd.concat([features_no_relation, features_no_rel[i]], axis = 1)

    else:
        features_ranked = pd.DataFrame(features_p_val[i].tolist(), columns=[i])
        features_no_relation = pd.DataFrame(features_no_rel[i].tolist(), columns=[i])

In [54]:
features_ranked

Unnamed: 0,genre_blues,genre_classical,genre_country,genre_disco,genre_hiphop,genre_jazz,genre_metal,genre_pop,genre_reggae,genre_rock
0,hpcp_max_11,lowlevel_average_loudness,hpcp_max_33,hpcp_max_12,gfcc_icov_1_1,melbands_median_23,dissonance_mean,spectral_contrast_valleys_max_5,gfcc_icov_1_12,lowlevel_average_loudness
1,spectral_kurtosis_mean,spectral_contrast_coeffs_dvar2_1,hpcp_max_31,hpcp_max_10,erbbands_skewness_dmean,erbbands_median_23,spectral_contrast_valleys_mean_2,spectral_kurtosis_median,gfcc_icov_0_1,spectral_flux_max
2,hpcp_max_12,spectral_contrast_coeffs_dvar2_0,spectral_kurtosis_mean,hpcp_max_18,gfcc_icov_2_0,erbbands_median_24,spectral_kurtosis_max,gfcc_icov_6_7,spectral_contrast_valleys_max_4,spectral_flux_mean
3,hpcp_dvar2_35,spectral_contrast_coeffs_dvar_2,spectral_skewness_dvar,gfcc_icov_0_0,beats_loudness_band_ratio_median_2,erbbands_median_25,spectral_contrast_valleys_max_3,gfcc_icov_7_6,chords_strength_var,spectral_flux_var
4,hpcp_max_14,spectral_contrast_coeffs_dvar_1,spectral_kurtosis_max,gfcc_icov_1_11,beats_loudness_band_ratio_median_1,erbbands_median_26,spectral_kurtosis_dmean,gfcc_icov_10_11,gfcc_icov_6_5,spectral_kurtosis_dmean
...,...,...,...,...,...,...,...,...,...,...
2277,,spectral_contrast_coeffs_min_0,,,,,,,,
2278,,hpcp_var_24,,,,,,,,
2279,,melbands_min_7,,,,,,,,
2280,,spectral_rolloff_max,,,,,,,,


features with p values bigger than 0.05, indicating that we reject the null hypothesis that this genre's feature is different than the same feature in other genre

In [55]:
features_no_relation

Unnamed: 0,genre_blues,genre_classical,genre_country,genre_disco,genre_hiphop,genre_jazz,genre_metal,genre_pop,genre_reggae,genre_rock


In [62]:
total_ranked = pd.DataFrame({
    'columns': features.columns, 
    'rankings': np.zeros(shape = len(features.columns))
})

In [63]:
for i in range(len(features.columns)):
    for g in BASE_GENRES:
        indexes = features_ranked[g][features_ranked[g] == features.columns[i]].index.to_list()
        if len(indexes) > 0:
            index = int(indexes[0])
        else:
            index = 2282

        total_ranked.iloc[[i], [1]] = int(total_ranked.iloc[i]['rankings']) + index


To rank usefulness of each feature using the t-test ranking, I summed the ranking of each feature from each genre. (the lower the better)

In [66]:
total_ranked.sort_values(by='rankings',ignore_index=True).head(10)

Unnamed: 0,columns,rankings
0,gfcc_mean_2,2271.0
1,gfcc_icov_1_1,2468.0
2,spectral_skewness_mean,2480.0
3,gfcc_icov_2_0,2828.0
4,spectral_kurtosis_mean,2938.0
5,lowlevel_average_loudness,2972.0
6,gfcc_icov_3_0,3123.0
7,gfcc_icov_4_1,3143.0
8,spectral_contrast_valleys_dmean2_2,3149.0
9,gfcc_icov_0_1,3241.0
