In [16]:
import pandas as pd

def extract_short_features(window_size, signals):

    print("extracting short features with window size " + str(window_size))
    short_features_with_headers = [feature_extraction(s, 44100, window_size, window_size) for s in signals]

    headers = short_features_with_headers[0][1]

    transposed = []
    for j in range(len(short_features_with_headers[0][0])):
        new_row = []
        for row in short_features_with_headers:
            new_row.append(row[0][j])
        transposed.append(new_row)

    return pd.DataFrame(transposed, index= headers).transpose(copy=False)


In [17]:
import numpy as np

def mean_features(short_features_df):

    means_df = short_features_df.copy()

    for j in range(0, means_df.shape[1]):
        for i in range(0, means_df.shape[0]):
            means_df.iloc[i, j] = np.mean(means_df.iloc[i, j])
            
    return means_df

In [None]:
import scipy.stats as stats

perfect = [1,2,3,4,5,6,7]
score = lambda x : stats.spearmanr(x, perfect)[1]

def seq_forward_selection(means_df):
    selected_attributes = []
    all_attributes = means_df.columns.to_list()
    scores = []
    for m in range(0, 10):
        scores.append({})

        for col in all_attributes:
            selected_attributes.append(col)
            scores[m][col] = score([np.linalg.norm(means_df.loc[i, selected_attributes] - means_df.loc[0, selected_attributes]) for i in range(1,8)])
            selected_attributes.pop()
        
        min = 2
        mincol = ""
        for col in scores[m]:
            if scores[m][col] == 0:
                mincol = col
                min = 0
                break
            if scores[m][col] < min:
                min = scores[m][col]
                mincol = col

        selected_attributes.append(mincol)
        all_attributes.remove(mincol)

        print(f"{m + 1}. Selected attributes are {", ".join(selected_attributes)}. score: {min}")
    return scores

In [None]:
from pyAudioAnalysis.pyAudioAnalysis.audioBasicIO import read_audio_file, stereo_to_mono
from pyAudioAnalysis.pyAudioAnalysis.ShortTermFeatures import feature_extraction

# récupération des signaux de chaque fichiers mixés (en mono)
signals = [stereo_to_mono(read_audio_file(f"data/{i}.mp3")[1]) for i in range(0, 8)]

for windowsize in [1000, 2000, 5000, 10000, 30000]:
    # extraction des 64 features courtes et moyenne sur chacune
    meanft = mean_features(extract_short_features(windowsize, signals))

    # selection sequentielle d'un ensemble d'attribut pour minimiser la valeur p du coefficient de spearman
    seq_forward_selection(meanft)



extracting short features with window size 1000
0. Selected attributes are spectral_centroid. score: 0.0004541491691941689
1. Selected attributes are spectral_centroid, chroma_10. score: 0
2. Selected attributes are spectral_centroid, chroma_10, spectral_flux. score: 0
3. Selected attributes are spectral_centroid, chroma_10, spectral_flux, chroma_2. score: 0
4. Selected attributes are spectral_centroid, chroma_10, spectral_flux, chroma_2, chroma_5. score: 0
5. Selected attributes are spectral_centroid, chroma_10, spectral_flux, chroma_2, chroma_5, chroma_8. score: 0
6. Selected attributes are spectral_centroid, chroma_10, spectral_flux, chroma_2, chroma_5, chroma_8, chroma_12. score: 0
7. Selected attributes are spectral_centroid, chroma_10, spectral_flux, chroma_2, chroma_5, chroma_8, chroma_12, chroma_std. score: 0
8. Selected attributes are spectral_centroid, chroma_10, spectral_flux, chroma_2, chroma_5, chroma_8, chroma_12, chroma_std, chroma_7. score: 0
9. Selected attributes are 

## Discussion

On peut voir que pour toutes les tailles de fenêtres, seuls 2 attributs suffisent à atteindre le score parfait de 0. (l'ordre est conservé avec ces atributs)
Seulement, pour la taille de fenêtre 5000, il en suffit que d'un seul: l'attribut `spectral_centroid`

On devrait donc pouvoir utiliser cette taille de fenêtre et cet attribut pour prédire la préférence de cet utilisateur, mais le problème est que 7 morceaux est vraiment trop peu pour être sûr de ce résultat. Comme indiqué sur [la page de la corrélation de Spearman sur la documentation de SciPy](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.spearmanr.html): 

> Although calculation of the p-value does not make strong assumptions about the distributions underlying the samples, it is only accurate for very large samples (>500 observations).

Il faudrait donc un plus grand nombre de morceau pour un résultat plus sûr.