In [1]:
import os
from os.path import join
from os import listdir
import pandas as pd
import numpy as np
from itertools import product

from joblib import Parallel, delayed

from sklearn.model_selection import KFold
from sklearn.svm import SVC

In [2]:
from typing import List, Tuple, Dict

In [3]:
import librosa

In [4]:
audio_path = 'data/dev-clean/'
SEED = 0

N_CHROMA = 88
FRAME_LENGTH = 1024
HOP_LENGTH = 512

In [5]:
speakers = pd.read_csv('data/speakers.tsv', sep='\t').reset_index()
speakers.columns = ['READER', 'GENDER', 'SUBSET', 'NAME']
speakers.head()

Unnamed: 0,READER,GENDER,SUBSET,NAME
0,14,F,train-clean-360,Kristin LeMoine
1,16,F,train-clean-360,Alys AtteWater
2,17,M,train-clean-360,Gord Mackenzie
3,19,F,train-clean-100,Kara Shallenberg
4,20,F,train-other-500,Gesine


In [6]:
speakers_in_data = list(map(int, listdir(audio_path)))
len(speakers_in_data)

40

In [7]:
speakers = speakers.merge(pd.Series(speakers_in_data, name='READER'), on='READER')
speakers.set_index('READER', inplace=True)
speakers.head()

Unnamed: 0_level_0,GENDER,SUBSET,NAME
READER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
84,F,dev-clean,Christie Nowak
174,M,dev-clean,Peter Eastman
251,M,dev-clean,Mark Nelson
422,M,dev-clean,President Lethe
652,M,dev-clean,Scott Walter


In [8]:
with open('./data/CHAPTERS.txt', 'r') as f:
    raw_text = list(map(lambda x: x.strip('\n'), f.readlines()))

header_end = 0
for i, line in enumerate(raw_text):
    if not line.startswith(';'):
        header_end = i
        break
        
table = [map(str.strip, line.split('|')) for line in raw_text[header_end:]]
columns = list(map(lambda x: x.strip(';').strip(), raw_text[header_end-1].split('|')))
chapters_df = pd.DataFrame(table, columns=columns).astype(dtype={'ID':int, 'READER': int, 'MINUTES': float})
chapters_df.head()

Unnamed: 0,ID,READER,MINUTES,SUBSET,PROJ.,BOOK ID,CH. TITLE,PROJECT TITLE
0,1,110,19.77,train-other-500,53,1023,In Chancery,Bleak House
1,2,110,10.3,train-other-500,53,1023,In Fashion,Bleak House
2,159,4174,7.67,train-other-500,68,2184,Letter XXV,Unbeaten Tracks in Japan
3,198,19,8.42,train-clean-100,219,121,Chapter 01,Northanger Abbey
4,199,98,11.68,train-clean-360,219,121,Chapter 02,Northanger Abbey


In [9]:
speakers.merge(chapters_df, on='READER')[['GENDER', 'MINUTES']].groupby('GENDER').sum()

Unnamed: 0_level_0,MINUTES
GENDER,Unnamed: 1_level_1
F,161.47
M,161.8


In [10]:
dataset_list = []
for reader in listdir(audio_path):
    chapters_path = join(audio_path, reader)
    for chapter in listdir(chapters_path):
        files_path = join(chapters_path, chapter)
        for file in listdir(files_path):
            if file.endswith('.wav'):
                reader_gender = speakers.loc[int(reader)].GENDER
                dataset_list.append((int(reader), reader_gender, join(files_path, file)))
dataset_df = pd.DataFrame(dataset_list, columns=['reader', 'gender', 'path'])
dataset_df.head()

Unnamed: 0,reader,gender,path
0,1919,F,data/dev-clean/1919/142785/1919_142785_000005_...
1,1919,F,data/dev-clean/1919/142785/1919_142785_000118_...
2,1919,F,data/dev-clean/1919/142785/1919_142785_000035_...
3,1919,F,data/dev-clean/1919/142785/1919_142785_000064_...
4,1919,F,data/dev-clean/1919/142785/1919_142785_000071_...


In [11]:
dataset_df.groupby('gender').count()

Unnamed: 0_level_0,reader,path
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
F,2976,2976
M,2760,2760


In [13]:
def get_features(sample_path: str) ->np.ndarray:
    y, sr = librosa.load(sample_path)
    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr, n_chroma=N_CHROMA, hop_length=HOP_LENGTH, n_fft=FRAME_LENGTH)
    zcr = librosa.feature.zero_crossing_rate(y, hop_length=HOP_LENGTH, frame_length=FRAME_LENGTH)
    rms = librosa.feature.rms(y, hop_length=HOP_LENGTH, frame_length=FRAME_LENGTH)

    features = []
    for el in [chroma_stft, zcr, rms]:
        features.append(el.mean(1))
        features.append(el.std(1))

    return np.hstack(features)

In [14]:
def process_batch(samples_paths: List[str]) -> Dict[str, np.ndarray]:
    res = {}
    for sample_path in samples_paths:
        res[sample_path] = get_features(sample_path)
    return res

In [15]:
features_name = [f'chroma_stft_{i}_{aggr}' for i, aggr in product(range(N_CHROMA), ['mean', 'std'])]
features_name += [f'{method}_{aggr}' for method, aggr in product(['zcr', 'rms'], ['mean', 'std'])]
features_name

['chroma_stft_0_mean',
 'chroma_stft_0_std',
 'chroma_stft_1_mean',
 'chroma_stft_1_std',
 'chroma_stft_2_mean',
 'chroma_stft_2_std',
 'chroma_stft_3_mean',
 'chroma_stft_3_std',
 'chroma_stft_4_mean',
 'chroma_stft_4_std',
 'chroma_stft_5_mean',
 'chroma_stft_5_std',
 'chroma_stft_6_mean',
 'chroma_stft_6_std',
 'chroma_stft_7_mean',
 'chroma_stft_7_std',
 'chroma_stft_8_mean',
 'chroma_stft_8_std',
 'chroma_stft_9_mean',
 'chroma_stft_9_std',
 'chroma_stft_10_mean',
 'chroma_stft_10_std',
 'chroma_stft_11_mean',
 'chroma_stft_11_std',
 'chroma_stft_12_mean',
 'chroma_stft_12_std',
 'chroma_stft_13_mean',
 'chroma_stft_13_std',
 'chroma_stft_14_mean',
 'chroma_stft_14_std',
 'chroma_stft_15_mean',
 'chroma_stft_15_std',
 'chroma_stft_16_mean',
 'chroma_stft_16_std',
 'chroma_stft_17_mean',
 'chroma_stft_17_std',
 'chroma_stft_18_mean',
 'chroma_stft_18_std',
 'chroma_stft_19_mean',
 'chroma_stft_19_std',
 'chroma_stft_20_mean',
 'chroma_stft_20_std',
 'chroma_stft_21_mean',
 'chroma_

In [16]:
n_jobs = 12
batch_size = 100
batch_number = int(np.ceil(len(dataset_df)/batch_size))
print(f'batch number: {batch_number}')
jobs = []
for i in range(batch_number):
    jobs.append(delayed(process_batch)(dataset_df.path[i*batch_size:(i+1)*batch_size].values))
features = Parallel(n_jobs=n_jobs, verbose=10)(jobs)

batch number: 58


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   1 tasks      | elapsed:   21.1s
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:   28.6s
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:   47.0s
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:  1.0min
[Parallel(n_jobs=12)]: Done  41 out of  58 | elapsed:  1.4min remaining:   34.7s
[Parallel(n_jobs=12)]: Done  47 out of  58 | elapsed:  1.7min remaining:   24.5s
[Parallel(n_jobs=12)]: Done  53 out of  58 | elapsed:  1.8min remaining:   10.3s
[Parallel(n_jobs=12)]: Done  58 out of  58 | elapsed:  1.9min finished


In [17]:
features_dict = {}
for el in features:
    for k,v in el.items():
        features_dict[k] = v
features_df = pd.DataFrame(features_dict).T
features_df.columns = features_name
features_df.index.name = 'path'
features_df.head()

Unnamed: 0_level_0,chroma_stft_0_mean,chroma_stft_0_std,chroma_stft_1_mean,chroma_stft_1_std,chroma_stft_2_mean,chroma_stft_2_std,chroma_stft_3_mean,chroma_stft_3_std,chroma_stft_4_mean,chroma_stft_4_std,...,chroma_stft_85_mean,chroma_stft_85_std,chroma_stft_86_mean,chroma_stft_86_std,chroma_stft_87_mean,chroma_stft_87_std,zcr_mean,zcr_std,rms_mean,rms_std
path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
data/dev-clean/1919/142785/1919_142785_000005_000002.wav,0.257667,0.260941,0.260627,0.260974,0.264403,0.268702,0.265345,0.25973,0.259609,0.261992,...,0.287626,0.277788,0.274903,0.274345,0.274439,0.278929,0.13836,0.139642,0.073803,0.054839
data/dev-clean/1919/142785/1919_142785_000118_000001.wav,0.351155,0.339509,0.331138,0.324826,0.317682,0.311728,0.3038,0.296491,0.297595,0.300528,...,0.306288,0.305334,0.298272,0.298105,0.300521,0.307202,0.149536,0.135147,0.040383,0.034122
data/dev-clean/1919/142785/1919_142785_000035_000001.wav,0.216299,0.212288,0.216035,0.21883,0.221038,0.224242,0.22555,0.229322,0.233747,0.234828,...,0.255618,0.248775,0.244253,0.246457,0.252221,0.260247,0.144588,0.14032,0.046175,0.032386
data/dev-clean/1919/142785/1919_142785_000064_000003.wav,0.30673,0.291617,0.290036,0.291296,0.291558,0.287382,0.28183,0.282077,0.286006,0.288771,...,0.300353,0.301541,0.298994,0.295679,0.29472,0.303719,0.182428,0.169379,0.055092,0.043736
data/dev-clean/1919/142785/1919_142785_000071_000000.wav,0.275765,0.27127,0.266984,0.269012,0.272405,0.262805,0.270713,0.274939,0.273037,0.268459,...,0.278703,0.276195,0.276083,0.281105,0.271816,0.268462,0.202299,0.201331,0.07304,0.048869


In [18]:
dataset = dataset_df.join(features_df, on='path')
dataset.gender = dataset.gender.apply(lambda x: int(x=='F'))
dataset.head()

Unnamed: 0,reader,gender,path,chroma_stft_0_mean,chroma_stft_0_std,chroma_stft_1_mean,chroma_stft_1_std,chroma_stft_2_mean,chroma_stft_2_std,chroma_stft_3_mean,...,chroma_stft_85_mean,chroma_stft_85_std,chroma_stft_86_mean,chroma_stft_86_std,chroma_stft_87_mean,chroma_stft_87_std,zcr_mean,zcr_std,rms_mean,rms_std
0,1919,1,data/dev-clean/1919/142785/1919_142785_000005_...,0.257667,0.260941,0.260627,0.260974,0.264403,0.268702,0.265345,...,0.287626,0.277788,0.274903,0.274345,0.274439,0.278929,0.13836,0.139642,0.073803,0.054839
1,1919,1,data/dev-clean/1919/142785/1919_142785_000118_...,0.351155,0.339509,0.331138,0.324826,0.317682,0.311728,0.3038,...,0.306288,0.305334,0.298272,0.298105,0.300521,0.307202,0.149536,0.135147,0.040383,0.034122
2,1919,1,data/dev-clean/1919/142785/1919_142785_000035_...,0.216299,0.212288,0.216035,0.21883,0.221038,0.224242,0.22555,...,0.255618,0.248775,0.244253,0.246457,0.252221,0.260247,0.144588,0.14032,0.046175,0.032386
3,1919,1,data/dev-clean/1919/142785/1919_142785_000064_...,0.30673,0.291617,0.290036,0.291296,0.291558,0.287382,0.28183,...,0.300353,0.301541,0.298994,0.295679,0.29472,0.303719,0.182428,0.169379,0.055092,0.043736
4,1919,1,data/dev-clean/1919/142785/1919_142785_000071_...,0.275765,0.27127,0.266984,0.269012,0.272405,0.262805,0.270713,...,0.278703,0.276195,0.276083,0.281105,0.271816,0.268462,0.202299,0.201331,0.07304,0.048869


In [19]:
dataset.shape

(5736, 183)

In [20]:
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

In [21]:
scores = []
for train_reader_indxs, test_reader_indxs in kf.split(speakers):
    train_readers = set(speakers.iloc[train_reader_indxs].index.values)
    test_readers = set(speakers.iloc[test_reader_indxs].index.values)
    
    train_dataset = dataset[[el in train_readers for el in dataset.reader]]
    test_dataset = dataset[[el in test_readers for el in dataset.reader]]
    
    X_train = train_dataset.loc[:, 'chroma_stft_0_mean':].values
    y_train = train_dataset.gender.values
    
    X_test = test_dataset.loc[:, 'chroma_stft_0_mean':].values
    y_test = test_dataset.gender.values
    
    clf = SVC(gamma='scale')
    clf.fit(X_train, y_train)
    scores.append(clf.score(X_test, y_test))
    
scores, np.mean(scores )    

([0.8622754491017964,
  0.744406779661017,
  0.8742985409652076,
  0.8805744520030234,
  0.8485193621867881],
 0.8420149167835664)