In [None]:
# Mount Google Drive into the Colab filesystem; the dataset path used below
# lives under /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
import librosa
import numpy as np
from numpy import asarray
from numpy import save
import os
import random

from sklearn.manifold import TSNE
from numpy import reshape
import seaborn as sns
import pandas as pd

In [None]:
# Root directory of the training audio. The cells below build file paths as
# <base>/<speaker_id>/<file>, i.e. they expect one sub-directory per speaker.
base = '/content/drive/MyDrive/THESIS/LibriSpeech_training_data'

In [None]:
def extract_features(file_name, duration=2.97, n_mfcc=40, verbose=True):
  """Return the time-averaged MFCC vector of one audio file.

  The file is loaded with `librosa.load` (librosa defaults apart from
  `duration`), `n_mfcc` MFCCs are computed per frame, and the coefficients
  are averaged over all frames so every file yields a vector of the same
  length regardless of how many frames it has.

  Args:
    file_name: Path of the audio file to load.
    duration: Maximum number of seconds read from the start of the file.
    n_mfcc: Number of MFCC coefficients, i.e. the length of the result.
    verbose: When True, print `file_name` before loading it. Pass False to
      avoid one output line per file on large datasets.

  Returns:
    1-D numpy array of length `n_mfcc` holding the mean of each coefficient.

  Raises:
    Whatever `librosa.load` raises for a missing or unreadable file; errors
    are deliberately not swallowed so a bad file is noticed immediately.
  """
  if verbose:
    print(file_name)

  audio, sample_rate = librosa.load(file_name, duration=duration)
  mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)

  # Transposing puts frames on axis 0, so the mean collapses the time axis
  # and leaves one value per coefficient.
  mfccsscaled = np.mean(mfccs.T, axis=0)

  return mfccsscaled


In [None]:
# feature extract for tsne-plot

## Number of speakers (classes) shown in the t-SNE plot and number of
## utterances sampled from each of them.
TSNE_NUM_SPEAKERS = 10
TSNE_FILES_PER_SPEAKER = 10

## Rows are collected in lists and stacked once at the end; this also removes
## the need for a dummy all-zero first row.
tsne_feature_rows = []
tsne_speaker_labels = []
speakers_collected = 0

for dirpath, dirnames, files in os.walk(base):

  ## The base path itself is skipped: only its sub-directories (one per
  ## speaker) contain the audio files we want.
  if dirpath == base:
    continue

  ## Stop walking the tree as soon as enough speakers were collected.
  if speakers_collected >= TSNE_NUM_SPEAKERS:
    break

  ## Nothing to sample from an empty directory.
  if not files:
    continue

  ## File names are expected to look like "<speaker_id>-...", so the id is
  ## read once per directory from the first file.
  speaker_id = files[0].split('-')[0]

  ## Sample without replacement so the same utterance is never picked twice,
  ## and never ask for more files than the speaker has.
  sample_size = min(TSNE_FILES_PER_SPEAKER, len(files))
  for random_file in random.sample(files, sample_size):
    tsne_feature_rows.append(extract_features(os.path.join(base, speaker_id, random_file)))
    tsne_speaker_labels.append(speaker_id)

  speakers_collected += 1

## One row per sampled utterance, 40 MFCC means per row.
tsne_array = np.vstack(tsne_feature_rows) if tsne_feature_rows else np.empty((0, 40))

## Speaker label of every row of tsne_array. The t-SNE plotting cell reads its
## hue labels from `y_train`, so the labels are published under that name.
y_train = np.asarray(tsne_speaker_labels)



In [None]:
# tsne-plot

## `y_train` must hold one speaker label per row of `tsne_array`; fail fast
## with a clear message instead of a pandas length error further down.
if len(y_train) != len(tsne_array):
  raise ValueError(
      "y_train has %d labels but tsne_array has %d rows" % (len(y_train), len(tsne_array)))

## scikit-learn requires perplexity < number of samples. 30 is its default, so
## the usual 10 x 10 sample behaves exactly as before; smaller samples no
## longer raise.
tsne_perplexity = min(30, max(1, len(tsne_array) - 1))

tsne = TSNE(n_components=2, verbose=1, random_state=123, perplexity=tsne_perplexity)
z = tsne.fit_transform(tsne_array)
df = pd.DataFrame()
df["y"] = y_train
df["comp-1"] = z[:,0]
df["comp-2"] = z[:,1]

## One colour per speaker actually present, rather than a fixed 10.
sns.scatterplot(x="comp-1", y="comp-2", hue=df.y.tolist(),
                palette=sns.color_palette("hls", df["y"].nunique()),
                data=df).set(title="dataset T-SNE projection")

In [None]:
## Builds the anchor / positive / negative feature matrices.
##   * anchor   : first half of every speaker's files
##   * positive : second half of the same speaker's files (same row order)
##   * negative : one random file of a different, randomly chosen speaker
##
## Feature rows are collected in Python lists and stacked once at the end.
## Growing a numpy array with np.vstack inside the loop copies every previous
## row again on each iteration, which is very slow for tens of thousands of
## files.
first_iteration = True

anchor_feature_rows = []
positive_feature_rows = []
negative_feature_rows = []
anchor_speaker_ids = []

## all directories list from the basepath. Used to later find the negative
## samples for each corresponding anchor samples.
all_dir_for_neg_samples = []


##For Debug
anchor_debug = []
positive_debug = []
negative_debug = []

for dirpath, dirnames, files in os.walk(base):

  ## Just ignoring the first iteration because, we do not need the base path's
  ## files. we need the base path's directory's(speaker's) audio files.
  ## Its sub-directory names are the speaker ids used for negative sampling.
  if first_iteration:
    all_dir_for_neg_samples += dirnames
    first_iteration = False
    continue

  if not files:
    continue

  ## File names look like "<speaker_id>-...", so the id is taken once per
  ## directory from the first file.
  speaker_id = files[0].split('-')[0]

  ## First half -> anchors, second half -> positives. With an odd number of
  ## files the last one is left out so both halves have the same size.
  half = len(files) // 2

  for file in files[:half]:
    anchor_feature_rows.append(extract_features(os.path.join(base, speaker_id, file)))

    ## One speaker id per anchor row; used below to pick a different speaker
    ## for the negative sample.
    anchor_speaker_ids.append(speaker_id)

    ## using the speaker_id for debugging
    anchor_debug.append(speaker_id)

  for file in files[half:2 * half]:
    positive_feature_rows.append(extract_features(os.path.join(base, speaker_id, file)))

    ## using the speaker_id for debugging
    positive_debug.append(speaker_id)


anchor_mfcc_scaled_samples = (
    np.vstack(anchor_feature_rows) if anchor_feature_rows else np.empty((0, 40)))
positive_mfcc_scaled_samples = (
    np.vstack(positive_feature_rows) if positive_feature_rows else np.empty((0, 40)))

## (n, 1) column of speaker-id strings, one row per anchor sample. This is the
## same layout the previous vstack-based code produced.
samples_speaker_id = np.asarray(anchor_speaker_ids, dtype=str).reshape(-1, 1)


## This loop is to get the feature vectors of the negative samples for corresponding anchor samples
rows = samples_speaker_id.shape[0] # the number of rows

## With fewer than two speaker directories there is no "other" speaker, and
## the rejection loop below would never terminate.
if rows and len(set(all_dir_for_neg_samples)) < 2:
  raise ValueError(
      "Need at least two speaker directories under %s to draw negative samples" % base)

## Directory listings are cached so each speaker directory is listed once
## instead of once per anchor sample.
negative_directory_listing = {}

for index in range(rows):
  anchor_speaker = anchor_speaker_ids[index]

  ## Picking random directories until one differs from the anchor speaker id.
  random_directory = random.choice(all_dir_for_neg_samples)
  while random_directory == anchor_speaker:
    random_directory = random.choice(all_dir_for_neg_samples)

  negative_debug.append(random_directory)

  ## Getting the directory location of the random_directory which was found earlier
  negative_directory = os.path.join(base, random_directory)

  ## Getting the list of files from that random_directory
  if random_directory not in negative_directory_listing:
    negative_directory_listing[random_directory] = os.listdir(negative_directory)
  negative_directory_files = negative_directory_listing[random_directory]

  ## Getting a Random file from the negative_directory
  random_file = random.choice(negative_directory_files)

  negative_feature_rows.append(extract_features(os.path.join(negative_directory, random_file)))

negative_mfcc_scaled_samples = (
    np.vstack(negative_feature_rows) if negative_feature_rows else np.empty((0, 40)))


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
/content/drive/MyDrive/THESIS/LibriSpeech_training_data/7286/7286-92930-0016.flac
/content/drive/MyDrive/THESIS/LibriSpeech_training_data/2196/2196-170151-0025.flac
/content/drive/MyDrive/THESIS/LibriSpeech_training_data/6574/6574-120583-0008.flac
/content/drive/MyDrive/THESIS/LibriSpeech_training_data/5126/5126-27504-0010.flac
/content/drive/MyDrive/THESIS/LibriSpeech_training_data/1482/1482-140067-0002.flac
/content/drive/MyDrive/THESIS/LibriSpeech_training_data/6701/6701-71401-0017.flac
/content/drive/MyDrive/THESIS/LibriSpeech_training_data/6694/6694-70837-0035.flac
/content/drive/MyDrive/THESIS/LibriSpeech_training_data/5750/5750-100289-0058.flac
/content/drive/MyDrive/THESIS/LibriSpeech_training_data/8838/8838-298545-0036.flac
/content/drive/MyDrive/THESIS/LibriSpeech_training_data/2007/2007-132570-0003.flac
/content/drive/MyDrive/THESIS/LibriSpeech_training_data/6426/6426-64290-0021.flac
/content/drive/MyDrive/THES

In [None]:
## Sanity check of the triplet bookkeeping: at every index the anchor and the
## positive must carry the same speaker id, and the negative a different one.
lengths_agree = len(anchor_debug) == len(positive_debug) == len(negative_debug)

if not lengths_agree:
  print("Lengths are different")

else:
  if anchor_debug != positive_debug:
    print("Some indices do not match between Anchor and Positive")
  else:
    print("All indices match between Anchor and Positive")

  ## True as soon as one position has the same speaker for anchor and negative.
  check = any(
      anchor_id == negative_id
      for anchor_id, negative_id in zip(anchor_debug, negative_debug))

  if not check:
    print("No index matches between Anchor and Negative")
  else:
    print("Some indices match between Anchor and Negative")

All indices match between Anchor and Positive
No index matches between Anchor and Negative


In [None]:
## Sizes of the three debug lists and of the speaker-id array, one per line;
## all four are expected to be equal.
for collection in (anchor_debug, positive_debug, negative_debug, samples_speaker_id):
  print(len(collection))

33675
33675
33675
33675


In [None]:
# Shape of the anchor feature matrix: (number of anchor samples, MFCC means per sample).
anchor_mfcc_scaled_samples.shape

(33675, 40)

In [None]:
# Shape of the positive feature matrix; expected to equal the anchor matrix's shape.
positive_mfcc_scaled_samples.shape

(33675, 40)

In [None]:
# Shape of the negative feature matrix; one negative row is drawn per anchor row.
negative_mfcc_scaled_samples.shape

(33675, 40)

In [None]:
# Spot check: feature vector of the anchor sample at index 1.
anchor_mfcc_scaled_samples[1]

array([-4.38506042e+02,  6.16738129e+01,  7.99811554e+00,  5.37212906e+01,
       -1.43555508e+01,  3.20328617e+00, -2.83175659e+01,  9.20974922e+00,
       -1.71878929e+01, -8.51651287e+00, -6.58206081e+00, -1.13818955e+00,
       -1.31066048e+00, -8.07521820e+00,  6.86008501e+00, -1.28607101e+01,
        3.68439984e+00, -1.02860012e+01, -1.85287678e+00, -2.58285499e+00,
       -4.93689537e+00, -4.90523529e+00, -5.66874170e+00,  8.72467875e-01,
       -3.43097997e+00,  5.35144329e+00, -5.33776999e-01,  7.75018883e+00,
        4.36687469e+00,  4.61056709e+00, -3.45989168e-02,  1.67759454e+00,
        5.63966370e+00,  1.21249962e+00,  5.08640432e+00,  1.87036729e+00,
        4.95489359e+00, -1.48943782e+00,  3.16615015e-01, -1.61731231e+00])

In [None]:
# Spot check: feature vector of the positive sample at index 1 (same speaker as anchor index 1).
positive_mfcc_scaled_samples[1]

array([-3.83090210e+02,  8.71394958e+01, -1.57678194e+01,  3.32040253e+01,
       -1.73363228e+01,  1.56117928e+00, -2.28554001e+01, -1.11321473e+00,
       -1.27033100e+01, -5.74067545e+00, -7.88111591e+00, -2.31858182e+00,
       -7.91300964e+00, -2.50361538e+00,  5.22616196e+00, -1.94596329e+01,
        8.49137497e+00, -1.17112617e+01, -3.01079178e+00, -1.49214077e+00,
       -6.79152441e+00,  1.87976229e+00, -7.63947010e+00,  1.74341953e+00,
       -3.02447987e+00, -2.67849296e-01, -9.69307065e-01,  4.60756016e+00,
        4.19127512e+00,  9.12496758e+00,  9.07171631e+00,  8.58738518e+00,
        1.02260103e+01,  2.70673752e+00,  5.08431196e+00,  9.94921029e-01,
        4.47814178e+00,  2.21798539e-01,  1.32624462e-01, -1.18378773e-01])

In [None]:
# Spot check: feature vector of the negative sample at index 1 (a different speaker than anchor index 1).
negative_mfcc_scaled_samples[1]

array([-3.45811035e+02,  1.04266357e+02, -2.40751991e+01,  4.95007629e+01,
       -5.03006172e+00,  7.27886200e+00, -5.25660801e+00, -1.44637120e+00,
       -1.41303921e+01, -5.22911787e+00, -2.08406878e+00, -4.95688105e+00,
       -2.24943542e+00, -4.22679567e+00,  6.50574207e+00, -8.91066849e-01,
       -5.99530697e-01, -3.40216279e+00,  4.43440533e+00, -7.62123919e+00,
       -5.82524109e+00,  4.69675362e-01, -4.52787828e+00, -2.80453229e+00,
       -3.83745289e+00, -1.59681129e+00, -5.40812111e+00, -3.13694954e-01,
       -4.87440395e+00, -1.33888113e+00, -2.55615377e+00, -1.43516016e+00,
       -1.01084387e+00, -3.86926365e+00,  6.46865726e-01, -1.67125678e+00,
        1.26112831e+00, -3.59205222e+00,  1.15791297e+00, -2.99591041e+00])

In [None]:
# Directory on Drive that receives the extracted feature arrays.
feature_save_dir = '/content/drive/MyDrive/Colab Notebooks/thesis_code_main/feature_extracted_saved_arrays'

# numpy's save() does not create missing directories; create the target first
# so the long extraction run is not lost to a missing folder.
os.makedirs(feature_save_dir, exist_ok=True)

# Saving the anchor sample array to .npy format
anchor_samples_array_saved = asarray(anchor_mfcc_scaled_samples)
save(os.path.join(feature_save_dir, 'anchor.npy'), anchor_samples_array_saved)

# Saving the positive sample array to .npy format
positive_samples_array_saved = asarray(positive_mfcc_scaled_samples)
save(os.path.join(feature_save_dir, 'positive.npy'), positive_samples_array_saved)

# Saving the negative sample array to .npy format
negative_samples_array_saved = asarray(negative_mfcc_scaled_samples)
save(os.path.join(feature_save_dir, 'negative.npy'), negative_samples_array_saved)

# Saving the speaker_id array (one id per anchor row) to .npy format
samples_speaker_id_array_saved = asarray(samples_speaker_id)
save(os.path.join(feature_save_dir, 'speaker_id.npy'), samples_speaker_id_array_saved)
