In [1]:
import os
from os.path import join
from os import listdir
import pandas as pd
import numpy as np
import librosa

from itertools import product

import IPython.display as ipd
import matplotlib.pyplot as plt

from joblib import Parallel, delayed

from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from scipy.stats import skew

from utils.main_utils import get_readers, get_chapters_info, collect_paths_with_meta
from utils.scoring import cross_val

In [2]:
from typing import List, Tuple, Dict

In [3]:
audio_path = 'data/dev-clean/'
SEED = 0

N_CHROMA = 88
FRAME_LENGTH = 1024          
HOP_LENGTH = FRAME_LENGTH // 2

In [4]:
def get_features(sample_path: str) ->np.ndarray:
    y, sr = librosa.load(sample_path)
    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr, n_chroma=N_CHROMA, hop_length=HOP_LENGTH, n_fft=FRAME_LENGTH)
    zcr = librosa.feature.zero_crossing_rate(y, hop_length=HOP_LENGTH, frame_length=FRAME_LENGTH)
    rms = librosa.feature.rms(y, hop_length=HOP_LENGTH, frame_length=FRAME_LENGTH)

    features = []
    for el in [chroma_stft, zcr, rms]:
        features.append(el.mean(1))
        features.append(el.std(1))

    return np.hstack(features)

In [5]:
def process_batch(samples_paths: List[str]) -> Dict[str, np.ndarray]:
    res = {}
    for sample_path in samples_paths:
        res[sample_path] = get_features(sample_path)
    return res

In [6]:
def generate_features(meta_paths: pd.DataFrame, features_names: List[str],
                      n_jobs: int, batch_size: int) -> pd.DataFrame:
    
    batch_number = int(np.ceil(len(meta_paths)/batch_size))
    print(f'batch number: {batch_number}')
    jobs = []
    for i in range(batch_number):
        jobs.append(delayed(process_batch)(meta_paths.path[i*batch_size:(i+1)*batch_size].values))
    features = Parallel(n_jobs=n_jobs, verbose=10)(jobs)

    features_dict = {}
    for el in features:
        for k,v in el.items():
            features_dict[k] = v
    features_df = pd.DataFrame(features_dict).T
    features_df.columns = features_names
    features_df.index.name = 'path'
    
    return features_df

In [7]:
readers = get_readers('data/speakers.tsv', audio_path)
meta_paths = collect_paths_with_meta(audio_path, readers)

In [8]:
features_names = [f'chroma_stft_{i}_{aggr}' for i, aggr in product(range(N_CHROMA), ['mean', 'std'])]
features_names += [f'{method}_{aggr}' for method, aggr in product(['zcr', 'rms',],
                                                                 ['mean', 'std'])]
len(features_names)

180

In [9]:
features = generate_features(meta_paths, features_names, n_jobs=12, batch_size=100)
features.head()

batch number: 58


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   1 tasks      | elapsed:   35.2s
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:   42.2s
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:  1.1min
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:  1.4min
[Parallel(n_jobs=12)]: Done  41 out of  58 | elapsed:  1.8min remaining:   45.2s
[Parallel(n_jobs=12)]: Done  47 out of  58 | elapsed:  2.1min remaining:   29.4s
[Parallel(n_jobs=12)]: Done  53 out of  58 | elapsed:  2.2min remaining:   12.4s
[Parallel(n_jobs=12)]: Done  58 out of  58 | elapsed:  2.3min finished


Unnamed: 0_level_0,chroma_stft_0_mean,chroma_stft_0_std,chroma_stft_1_mean,chroma_stft_1_std,chroma_stft_2_mean,chroma_stft_2_std,chroma_stft_3_mean,chroma_stft_3_std,chroma_stft_4_mean,chroma_stft_4_std,...,chroma_stft_85_mean,chroma_stft_85_std,chroma_stft_86_mean,chroma_stft_86_std,chroma_stft_87_mean,chroma_stft_87_std,zcr_mean,zcr_std,rms_mean,rms_std
path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
data/dev-clean/1919/142785/1919_142785_000005_000002.wav,0.257667,0.260941,0.260627,0.260974,0.264403,0.268702,0.265345,0.25973,0.259609,0.261992,...,0.287626,0.277788,0.274903,0.274345,0.274439,0.278929,0.13836,0.139642,0.073803,0.054839
data/dev-clean/1919/142785/1919_142785_000118_000001.wav,0.351155,0.339509,0.331138,0.324826,0.317682,0.311728,0.3038,0.296491,0.297595,0.300528,...,0.306288,0.305334,0.298272,0.298105,0.300521,0.307202,0.149536,0.135147,0.040383,0.034122
data/dev-clean/1919/142785/1919_142785_000035_000001.wav,0.216299,0.212288,0.216035,0.21883,0.221038,0.224242,0.22555,0.229322,0.233747,0.234828,...,0.255618,0.248775,0.244253,0.246457,0.252221,0.260247,0.144588,0.14032,0.046175,0.032386
data/dev-clean/1919/142785/1919_142785_000064_000003.wav,0.30673,0.291617,0.290036,0.291296,0.291558,0.287382,0.28183,0.282077,0.286006,0.288771,...,0.300353,0.301541,0.298994,0.295679,0.29472,0.303719,0.182428,0.169379,0.055092,0.043736
data/dev-clean/1919/142785/1919_142785_000071_000000.wav,0.275765,0.27127,0.266984,0.269012,0.272405,0.262805,0.270713,0.274939,0.273037,0.268459,...,0.278703,0.276195,0.276083,0.281105,0.271816,0.268462,0.202299,0.201331,0.07304,0.048869


In [10]:
dataset = meta_paths.join(features, on='path')
dataset.gender = dataset.gender.apply(lambda x: int(x=='F'))
dataset.head()

Unnamed: 0,reader,gender,path,chroma_stft_0_mean,chroma_stft_0_std,chroma_stft_1_mean,chroma_stft_1_std,chroma_stft_2_mean,chroma_stft_2_std,chroma_stft_3_mean,...,chroma_stft_85_mean,chroma_stft_85_std,chroma_stft_86_mean,chroma_stft_86_std,chroma_stft_87_mean,chroma_stft_87_std,zcr_mean,zcr_std,rms_mean,rms_std
0,1919,1,data/dev-clean/1919/142785/1919_142785_000005_...,0.257667,0.260941,0.260627,0.260974,0.264403,0.268702,0.265345,...,0.287626,0.277788,0.274903,0.274345,0.274439,0.278929,0.13836,0.139642,0.073803,0.054839
1,1919,1,data/dev-clean/1919/142785/1919_142785_000118_...,0.351155,0.339509,0.331138,0.324826,0.317682,0.311728,0.3038,...,0.306288,0.305334,0.298272,0.298105,0.300521,0.307202,0.149536,0.135147,0.040383,0.034122
2,1919,1,data/dev-clean/1919/142785/1919_142785_000035_...,0.216299,0.212288,0.216035,0.21883,0.221038,0.224242,0.22555,...,0.255618,0.248775,0.244253,0.246457,0.252221,0.260247,0.144588,0.14032,0.046175,0.032386
3,1919,1,data/dev-clean/1919/142785/1919_142785_000064_...,0.30673,0.291617,0.290036,0.291296,0.291558,0.287382,0.28183,...,0.300353,0.301541,0.298994,0.295679,0.29472,0.303719,0.182428,0.169379,0.055092,0.043736
4,1919,1,data/dev-clean/1919/142785/1919_142785_000071_...,0.275765,0.27127,0.266984,0.269012,0.272405,0.262805,0.270713,...,0.278703,0.276195,0.276083,0.281105,0.271816,0.268462,0.202299,0.201331,0.07304,0.048869


In [11]:
dataset.shape

(5736, 183)

In [12]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
pca = PCA(n_components=50, random_state=SEED)
svd = TruncatedSVD(n_components=50, random_state=SEED)

In [13]:
first_feature_indx = dataset.columns.to_list().index('path') + 1

In [14]:
clf = SVC(gamma='scale', kernel='poly', degree=2, random_state=SEED)
_, mean_scores = cross_val(clf, readers, dataset, kf=skf, decomposer=pca, ffi=first_feature_indx)
print(mean_scores)

{'accuracy': 0.66, 'f1': 0.62, 'matthews': 0.37}


In [15]:
clf = SVC(gamma='scale', kernel='poly', degree=2, random_state=SEED)
_, mean_scores = cross_val(clf, readers, dataset, kf=skf, decomposer=svd, ffi=first_feature_indx)
print(mean_scores)

{'accuracy': 0.83, 'f1': 0.84, 'matthews': 0.68}


In [16]:
clf = SVC(gamma='scale', kernel='poly', degree=2, random_state=SEED)
_, mean_scores = cross_val(clf, readers, dataset, kf=skf, decomposer=None, ffi=first_feature_indx)
print(mean_scores)

{'accuracy': 0.78, 'f1': 0.77, 'matthews': 0.59}


In [17]:
clf = SVC(gamma='scale', kernel='rbf', random_state=SEED)
_, mean_scores = cross_val(clf, readers, dataset, kf=skf, decomposer=pca, ffi=first_feature_indx)
print(mean_scores)

{'accuracy': 0.84, 'f1': 0.85, 'matthews': 0.7}


In [18]:
clf = SVC(gamma='scale', kernel='rbf', random_state=SEED)
_, mean_scores = cross_val(clf, readers, dataset, kf=skf, decomposer=svd, ffi=first_feature_indx)
print(mean_scores)

{'accuracy': 0.84, 'f1': 0.85, 'matthews': 0.7}


In [19]:
clf = SVC(gamma='scale', kernel='rbf', random_state=SEED)
_, mean_scores = cross_val(clf, readers, dataset, kf=skf, decomposer=None, ffi=first_feature_indx)
print(mean_scores)

{'accuracy': 0.85, 'f1': 0.85, 'matthews': 0.71}


In [20]:
clf = KNeighborsClassifier(n_neighbors=3, p=2)
_, mean_scores = cross_val(clf, readers, dataset, kf=skf, decomposer=pca, ffi=first_feature_indx)
print(mean_scores)

{'accuracy': 0.78, 'f1': 0.78, 'matthews': 0.59}


In [21]:
clf = KNeighborsClassifier(n_neighbors=3, p=2)
_, mean_scores = cross_val(clf, readers, dataset, kf=skf, decomposer=None, ffi=first_feature_indx)
print(mean_scores)

{'accuracy': 0.78, 'f1': 0.78, 'matthews': 0.59}


In [22]:
rf = RandomForestClassifier(random_state=SEED)
_, mean_scores = cross_val(rf, readers, dataset, kf=skf, decomposer=pca, ffi=first_feature_indx)
print(mean_scores)

{'accuracy': 0.84, 'f1': 0.85, 'matthews': 0.69}


In [23]:
rf = RandomForestClassifier(random_state=SEED)
_, mean_scores = cross_val(rf, readers, dataset, kf=skf, decomposer=None, ffi=first_feature_indx)
print(mean_scores)

{'accuracy': 0.84, 'f1': 0.85, 'matthews': 0.69}


In [24]:
# Вычислим важность признаков
fi_args = np.argsort(rf.feature_importances_)[::-1]
[features_names[i] for i in fi_args][:10]

['chroma_stft_36_std',
 'chroma_stft_35_mean',
 'chroma_stft_34_mean',
 'chroma_stft_32_std',
 'chroma_stft_36_mean',
 'chroma_stft_33_mean',
 'chroma_stft_37_mean',
 'chroma_stft_34_std',
 'chroma_stft_35_std',
 'chroma_stft_31_std']

In [25]:
reduced_columns = [el for el in dataset.columns if not el.startswith('chroma')]
reduced_dataset = dataset.loc[:, reduced_columns].copy()

In [26]:
rf = RandomForestClassifier(random_state=SEED)
_, mean_scores = cross_val(rf, readers, reduced_dataset, kf=skf, decomposer=None, ffi=first_feature_indx)
print(mean_scores)

{'accuracy': 0.52, 'f1': 0.53, 'matthews': 0.1}


* Благодаря отказу от использования признаков, основанных на вычислении средних коээфициентов фурье в непересекающихся оконах заданной ширины, удалось выбросить ограничение на минимальную длину аудиофайла.
* Все еще самыми сильными признакми являются признаки, построенные на "нотах", то есть на тоне звука. Это поддтверждает исходную теорию, что существенное различие женского и мужского голоса приходится на высоту звучания. * Дополнительно хотелось бы обратить внимание на последние ячейки -- как сильно упало качество при отказе от признаков, основанных на нотах