# Spectrogram embeddings

In [3]:
!pip install librosa

Collecting librosa
  Obtaining dependency information for librosa from https://files.pythonhosted.org/packages/e2/a2/4f639c1168d7aada749a896afb4892a831e2041bebdcf636aebfe9e86556/librosa-0.10.1-py3-none-any.whl.metadata
  Downloading librosa-0.10.1-py3-none-any.whl.metadata (8.3 kB)
Collecting audioread>=2.1.9 (from librosa)
  Obtaining dependency information for audioread>=2.1.9 from https://files.pythonhosted.org/packages/57/8d/30aa32745af16af0a9a650115fbe81bde7c610ed5c21b381fca0196f3a7f/audioread-3.0.1-py3-none-any.whl.metadata
  Downloading audioread-3.0.1-py3-none-any.whl.metadata (8.4 kB)
Collecting soundfile>=0.12.1 (from librosa)
  Obtaining dependency information for soundfile>=0.12.1 from https://files.pythonhosted.org/packages/c8/73/059c84343be6509b480013bf1eeb11b96c5f9eb48deff8f83638011f6b2c/soundfile-0.12.1-py2.py3-none-macosx_10_9_x86_64.whl.metadata
  Downloading soundfile-0.12.1-py2.py3-none-macosx_10_9_x86_64.whl.metadata (14 kB)
Collecting pooch>=1.0 (from librosa)
  O

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from numpy.lib import stride_tricks
import scipy.io.wavfile as wav
from scipy.fftpack import fft
from scipy.signal import get_window
import librosa
import librosa.display
import os
import glob
from tqdm import tqdm
import pandas as pd

In [2]:
def spectrogram_embedding(y, sr, n_mels=256):
    """Collapse a signal's mel spectrogram into one fixed-length vector.

    Parameters
    ----------
    y : audio samples, forwarded unchanged to librosa.
    sr : sample rate of ``y``, forwarded unchanged to librosa.
    n_mels : number of mel bands requested from librosa (default 256).

    Returns
    -------
    The mel spectrogram averaged along axis 1, i.e. one mean value per
    mel band.
    """
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    # Averaging over axis 1 removes the dependence on clip length.
    return mel_spec.mean(axis=1)

In [3]:
# Root folder that is searched recursively for .wav files.
WAV_DIRECTORY = '../data/wav/'

# Maps each file's path relative to WAV_DIRECTORY to its embedding vector.
embedding_map = {}

wav_files = glob.glob(os.path.join(WAV_DIRECTORY, '**', '*.wav'), recursive=True)

for wav_path in tqdm(wav_files):
    # librosa.load is called with its defaults, so librosa decides the sample
    # rate and channel layout of the returned signal; `sr` is whatever it reports.
    y, sr = librosa.load(wav_path)

    embedding = spectrogram_embedding(y=y, sr=sr)

    # Build the key with relpath instead of str.replace: replace() strips every
    # occurrence of the directory string and leaves a stray separator when
    # WAV_DIRECTORY has no trailing slash. For the current setting the keys
    # are unchanged (e.g. 'id10384/vxBFGKGXSFA/00006.wav').
    file_id = os.path.relpath(wav_path, WAV_DIRECTORY)
    embedding_map[file_id] = embedding

100%|███████████████████████████████████| 153516/153516 [46:15<00:00, 55.31it/s]


In [4]:
# One row per entry of embedding_map (keyed by file id), one column per
# embedding component; columns are renamed to x_0, x_1, ...
embedding_df = pd.DataFrame.from_dict(embedding_map, orient="index").add_prefix('x_')
embedding_df.index.name = 'file_id'

# Persist the table; the index is written as the 'file_id' column.
embedding_df.to_csv('spectrogram_embeddings.csv')

Unnamed: 0_level_0,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,...,x_246,x_247,x_248,x_249,x_250,x_251,x_252,x_253,x_254,x_255
file_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id10384/vxBFGKGXSFA/00006.wav,0.051118,0.07581,0.077973,0.87563,4.209989,1.813636,0.334108,0.165719,0.421833,0.602115,...,5.824187e-06,5.744669e-06,5.673987e-06,5.617007e-06,5.573508e-06,5.535287e-06,5.500909e-06,5.477081e-06,5.467958e-06,5.445129e-06
id10384/vxBFGKGXSFA/00007.wav,0.042835,0.062862,0.066601,0.406492,1.580761,0.658935,0.189841,0.1729,0.371853,0.575293,...,3.605826e-07,3.555249e-07,3.510333e-07,3.474087e-07,3.446305e-07,3.421867e-07,3.400112e-07,3.384883e-07,3.378854e-07,3.364562e-07
id10384/vxBFGKGXSFA/00005.wav,0.039275,0.060451,0.069024,0.217769,1.003179,1.240808,0.538822,0.14585,0.230882,0.424917,...,3.07561e-09,2.816975e-09,2.597574e-09,2.414698e-09,2.26426e-09,2.139767e-09,2.038914e-09,1.962609e-09,1.911178e-09,1.874295e-09
id10384/vxBFGKGXSFA/00004.wav,0.057395,0.07964,0.08075,0.435038,1.668968,0.724695,0.21315,0.17201,0.33464,0.582857,...,1.256977e-07,1.234602e-07,1.214354e-07,1.197388e-07,1.183725e-07,1.17167e-07,1.161054e-07,1.153295e-07,1.149353e-07,1.143309e-07
id10384/vxBFGKGXSFA/00001.wav,0.046767,0.06362,0.062323,0.60849,2.282021,0.864586,0.282813,0.196223,0.39059,0.653713,...,1.233509e-07,1.210096e-07,1.189437e-07,1.172473e-07,1.159059e-07,1.147468e-07,1.137367e-07,1.13011e-07,1.126547e-07,1.120826e-07


# X-vector embeddings

In [1]:
!pip install speechbrain

Collecting speechbrain
  Obtaining dependency information for speechbrain from https://files.pythonhosted.org/packages/90/ee/c8669b57ebdbeac0530538725caa02cd226e2623b725f1e216ae59b54a1f/speechbrain-1.0.0-py3-none-any.whl.metadata
  Downloading speechbrain-1.0.0-py3-none-any.whl.metadata (23 kB)
Collecting hyperpyyaml (from speechbrain)
  Obtaining dependency information for hyperpyyaml from https://files.pythonhosted.org/packages/33/c9/751b6401887f4b50f9307cc1e53d287b3dc77c375c126aeb6335aff73ccb/HyperPyYAML-1.2.2-py3-none-any.whl.metadata
  Downloading HyperPyYAML-1.2.2-py3-none-any.whl.metadata (7.6 kB)
Collecting ruamel.yaml>=0.17.28 (from hyperpyyaml->speechbrain)
  Obtaining dependency information for ruamel.yaml>=0.17.28 from https://files.pythonhosted.org/packages/73/67/8ece580cc363331d9a53055130f86b096bf16e38156e33b1d3014fffda6b/ruamel.yaml-0.18.6-py3-none-any.whl.metadata
  Downloading ruamel.yaml-0.18.6-py3-none-any.whl.metadata (23 kB)
Collecting ruamel.yaml.clib>=0.2.7 (from

In [6]:
from speechbrain.inference.speaker import EncoderClassifier
from tqdm import tqdm
import numpy as np
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from numpy.lib import stride_tricks
import scipy.io.wavfile as wav
from scipy.fftpack import fft
from scipy.signal import get_window
import librosa
import librosa.display
import os
import glob
from tqdm import tqdm
import pandas as pd

In [4]:
# Load the pretrained x-vector speaker encoder. The recorded output shows the
# model files being downloaded on first use; presumably they are kept under
# `savedir` afterwards.
classifier = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",
    savedir="pretrained_models/spkrec-xvect-voxceleb",
)

Downloading hyperparams.yaml:   0%|          | 0.00/2.04k [00:00<?, ?B/s]

Downloading embedding_model.ckpt:   0%|          | 0.00/16.9M [00:00<?, ?B/s]

Downloading mean_var_norm_emb.ckpt:   0%|          | 0.00/3.20k [00:00<?, ?B/s]

Downloading classifier.ckpt:   0%|          | 0.00/15.9M [00:00<?, ?B/s]

Downloading label_encoder.txt:   0%|          | 0.00/129k [00:00<?, ?B/s]

In [20]:
# torchaudio.load is used below, but the notebook's import cell never imports
# torchaudio, so the loop raises NameError on a fresh kernel. Import it here so
# this cell runs after "Restart & Run All".
import torchaudio

# Root folder that is searched recursively for .wav files.
WAV_DIRECTORY = '../data/wav/'

# Maps each file's path relative to WAV_DIRECTORY to its embedding vector.
embedding_map = {}

wav_files = glob.glob(os.path.join(WAV_DIRECTORY, '**', '*.wav'), recursive=True)

for wav_path in tqdm(wav_files):
    # NOTE(review): `fs` is never checked. The SpeechBrain model card asks for
    # 16 kHz input when calling encode_batch directly — confirm that every
    # file under WAV_DIRECTORY is sampled at that rate.
    signal, fs = torchaudio.load(wav_path)
    embedding = classifier.encode_batch(signal)

    # Build the key with relpath instead of str.replace: replace() strips every
    # occurrence of the directory string and leaves a stray separator when
    # WAV_DIRECTORY has no trailing slash. For the current setting the keys
    # are unchanged (e.g. 'id10384/vxBFGKGXSFA/00006.wav').
    file_id = os.path.relpath(wav_path, WAV_DIRECTORY)
    # Keep only the vector at position [0, 0] of the encoder output
    # (presumably the first channel's single embedding — TODO confirm for
    # multi-channel files) and store it as a NumPy array.
    embedding_map[file_id] = embedding[0, 0].numpy()

100%|█████████████████████████████████| 153516/153516 [1:27:38<00:00, 29.20it/s]


In [21]:
# One row per entry of embedding_map (keyed by file id), one column per
# embedding component; columns are renamed to x_0, x_1, ...
embedding_df = pd.DataFrame.from_dict(embedding_map, orient="index").add_prefix('x_')
embedding_df.index.name = 'file_id'

# Persist the table; the index is written as the 'file_id' column.
embedding_df.to_csv('x_vector_embeddings.csv')

Unnamed: 0_level_0,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,...,x_502,x_503,x_504,x_505,x_506,x_507,x_508,x_509,x_510,x_511
file_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id10384/vxBFGKGXSFA/00006.wav,-23.354128,-5.221647,12.24058,15.430451,1.791511,-0.519118,-20.50967,-12.18144,11.994558,5.631124,...,-26.862387,6.17073,-5.671959,-3.804694,-16.238195,11.366379,17.722548,7.378318,11.428824,-19.350277
id10384/vxBFGKGXSFA/00007.wav,-24.505468,-8.043333,8.051411,11.658402,-0.395556,-8.682104,-19.021248,-10.961925,12.948086,5.389829,...,-26.868261,6.515625,-3.996622,-4.034167,-16.338655,8.985258,14.625571,11.528162,8.261323,-19.290039
id10384/vxBFGKGXSFA/00005.wav,-25.886944,-6.220237,12.843323,12.866706,5.178396,-1.025675,-19.521687,-9.459463,12.362989,5.665018,...,-24.904192,3.411289,-3.564738,-3.262426,-13.783454,6.580886,14.320321,9.41615,11.227883,-17.160492
id10384/vxBFGKGXSFA/00004.wav,-26.685034,-7.005956,10.387636,13.525033,0.91744,-6.970324,-21.471519,-10.747233,9.679843,6.068498,...,-24.744322,8.596687,-11.971527,-4.305787,-19.009026,11.050454,15.358821,6.720806,9.944366,-21.631676
id10384/vxBFGKGXSFA/00001.wav,-28.967779,-8.30089,12.400032,11.168025,1.29022,-7.722721,-17.906452,-8.237776,11.43465,5.865917,...,-22.42664,3.674269,-6.918116,-3.619047,-10.661541,9.406233,12.912198,11.07388,9.938466,-20.352184
