# Data processing and generation of TFRecords

In [1]:
#Tensorflow for TfRecords, neural network architecture and training
import tensorflow as tf

#Librosa for parsing audio files into numpy and audio processing tools
import librosa

#Pandas for DataFrame handling
import pandas as pd

#other useful libraries
import os
import datetime
import matplotlib.pyplot as plt
import numpy as np
import scipy
import scipy.signal  # explicit: `import scipy` alone does not guarantee the signal submodule is loaded
import glob
import math
from sklearn.utils import shuffle
import zipfile
from sklearn.preprocessing import StandardScaler

# Seed TensorFlow's and NumPy's global random generators so that the shuffles
# and random picks below are repeatable from a fresh kernel.
tf.random.set_seed(999)
np.random.seed(999)

## Filenames and paths of Clean audio are found and split into Train:Validation:Test sets.

In [2]:
#Base directories (relative to the notebook) of the clean speech dataset
#(Mozilla Common Voice, English) and the noise dataset (UrbanSound8K)
mozilla_basepath = 'en'
urbansound_basepath = 'UrbanSound8K'

In [3]:
#Load the tab-separated training metadata of the clean audio (Mozilla Common Voice dataset)
train_tsv_path = mozilla_basepath + '/train.tsv'
mozilla_data = pd.read_csv(train_tsv_path, sep='\t')
mozilla_data.head()

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accent,locale,segment
0,f1f6414c04e74453065e1b7fc1639c6f728dc03ed95890...,common_voice_en_20009651.mp3,It just didn't seem fair.,2,1,,,,en,
1,f1f6414c04e74453065e1b7fc1639c6f728dc03ed95890...,common_voice_en_20009653.mp3,The anticipated synergies of the two modes of ...,2,0,,,,en,
2,f1f6414c04e74453065e1b7fc1639c6f728dc03ed95890...,common_voice_en_20009655.mp3,"The fossil fuels include coal, petroleum and n...",3,1,,,,en,
3,f1f6414c04e74453065e1b7fc1639c6f728dc03ed95890...,common_voice_en_20009654.mp3,"Eventually, they named the complex after him.",2,0,,,,en,
4,f208e11e4b036d4728602fef34b9f1158faa601f20c63a...,common_voice_en_19684651.mp3,He was the grandfather of Arent S. Crowninshield.,2,0,,,,en,


In [4]:
# Take a private copy of the file-name column before shuffling. `.values` can
# hand back a view of the DataFrame's own storage, and np.random.shuffle works
# in place: shuffling the view would scramble `mozilla_data['path']` relative
# to the other columns (or fail outright if the array is read-only).
clean_files = mozilla_data['path'].values.copy()

#Randomly shuffle all the file names
np.random.shuffle(clean_files)

print("Number of files: ", len(clean_files))

Number of files:  435947


In [5]:
#Uncomment in case of some missing files after download and extraction of the dataset.

#files_present = os.listdir('en/clips')
#print(files_present[:5])

In [6]:
#clean_paths = []
#c = 0
#for a in clean_files:
#    if a in files_present:
#        clean_paths.append(os.path.join(mozilla_basepath, 'clips', a))
#    c += 1
#    if c%2000==0:
#        print(c)
#import gc
#files_present = []
#gc.collect()
#files = pd.DataFrame(clean_paths)
#files.to_csv("cleanpaths.csv")

In [7]:
#The files actually present on disk were cross-checked earlier and their paths saved to a csv file;
#reload that list here (the paths are stored in the column named '0').

files = pd.read_csv('cleanpaths.csv')
clean_paths = files.loc[:, '0'].values

In [8]:
# Hold out the first 1000 files as the validation set and keep the remaining ones for training

num_val_files = 1000
clean_val_paths, clean_train_paths = clean_paths[:num_val_files], clean_paths[num_val_files:]

In [9]:
# Report the sizes of the clean-audio training and validation splits
print("NUmber of training files: ", len(clean_train_paths))
print("Number of validation files: ", len(clean_val_paths))

NUmber of training files:  427053
Number of validation files:  1000


In [10]:
# #### There are about 427,000 training files; the next cell can truncate them to 100,000 files (the truncation is currently commented out)

In [11]:
#clean_train_paths = clean_train_paths[:100000]

In [12]:
#The test set of clean audio is built the same way: read the file names from test.tsv
#and prefix each one with the clips directory.

clean_test_files = pd.read_csv(mozilla_basepath + '/test.tsv', sep = '\t')['path'].values
clean_test_paths = [
    os.path.join(mozilla_basepath, 'clips', clip_name)
    for clip_name in clean_test_files
]

In [13]:
# Report the size of the clean-audio test split
print("Number of testing files", len(clean_test_paths))

Number of testing files 16029


## File names and paths of noise audio are also extracted from metadata files provided along with the dataset.

In [14]:
#Load the metadata table shipped with UrbanSound8K (one row per noise clip)
urbansound_csv_path = os.path.join(urbansound_basepath, 'metadata', 'UrbanSound8K.csv')
urbansound_metadata = pd.read_csv(urbansound_csv_path)
urbansound_metadata.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing


In [15]:
# All the files in the noise dataset are split into 10 folds.
# Folds 1-9 are used for training/validation and the 10th fold is kept as the test set.
urbansound_metadata['fold'].value_counts()

4     990
5     936
3     925
2     888
1     873
7     838
10    837
6     823
9     816
8     806
Name: fold, dtype: int64

In [16]:
# Keep every row that does not belong to the 10th fold (fold 10 is reserved for testing)

is_train_fold = urbansound_metadata['fold'] != 10
urbansound_train = urbansound_metadata[is_train_fold]
urbansound_train

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.000000,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.500000,62.500000,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.500000,64.500000,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.000000,67.000000,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.500000,72.500000,1,5,2,children_playing
...,...,...,...,...,...,...,...,...
8727,99812-1-2-0.wav,99812,159.522205,163.522205,2,7,1,car_horn
8728,99812-1-3-0.wav,99812,181.142431,183.284976,2,7,1,car_horn
8729,99812-1-4-0.wav,99812,242.691902,246.197885,2,7,1,car_horn
8730,99812-1-5-0.wav,99812,253.209850,255.741948,2,7,1,car_horn


In [17]:
# The noise clips come from 10 sound classes, as described on the dataset website

class_id_column = urbansound_metadata['classID'].values
class_ids = np.unique(class_id_column)
print("Number of classses present: ", len(class_ids))

Number of classses present:  10


In [18]:
# Collect the training noise paths class by class and print how many files each class
# contributes, to check whether the classes are roughly equally represented.

all_files = []
for class_id in class_ids:
    class_rows = urbansound_train[urbansound_train['classID']==class_id]
    per_class_files = [
        os.path.join(urbansound_basepath, 'audio', 'fold'+str(fold), file_name)
        for file_name, fold in class_rows[['slice_file_name', 'fold']].values
    ]
    print("Class ID ",class_id," has", len(per_class_files), " files")
    all_files.extend(per_class_files)

Class ID  0  has 900  files
Class ID  1  has 396  files
Class ID  2  has 900  files
Class ID  3  has 900  files
Class ID  4  has 900  files
Class ID  5  has 907  files
Class ID  6  has 342  files
Class ID  7  has 904  files
Class ID  8  has 846  files
Class ID  9  has 900  files


In [19]:
# Peek at the first few collected noise paths
all_files[:5]

['UrbanSound8K\\audio\\fold5\\100852-0-0-0.wav',
 'UrbanSound8K\\audio\\fold5\\100852-0-0-1.wav',
 'UrbanSound8K\\audio\\fold5\\100852-0-0-10.wav',
 'UrbanSound8K\\audio\\fold5\\100852-0-0-11.wav',
 'UrbanSound8K\\audio\\fold5\\100852-0-0-12.wav']

In [20]:
# Shuffle the list of noise paths in place, then hold out the first 200 files
# for validation and use the rest for training.

np.random.shuffle(all_files)

num_val_noise_files = 200
ub_val_paths, ub_train_paths = all_files[:num_val_noise_files], all_files[num_val_noise_files:]
print("Training Noise number of files: ", len(ub_train_paths))
print("Validation Noise number of files: ", len(ub_val_paths))

Training Noise number of files:  7695
Validation Noise number of files:  200


In [21]:
# The 10th fold of the noise dataset is used as the test set; build its file paths class by class

ub_test_data = urbansound_metadata[urbansound_metadata['fold']==10]
all_files = []
for class_id in class_ids:
    class_rows = ub_test_data[ub_test_data['classID']==class_id]
    per_class_files = [
        os.path.join(urbansound_basepath, 'audio', 'fold'+str(fold), file_name)
        for file_name, fold in class_rows[['slice_file_name', 'fold']].values
    ]
    all_files.extend(per_class_files)
all_files[:5]

['UrbanSound8K\\audio\\fold10\\119067-0-0-0.wav',
 'UrbanSound8K\\audio\\fold10\\119067-0-0-1.wav',
 'UrbanSound8K\\audio\\fold10\\119067-0-0-2.wav',
 'UrbanSound8K\\audio\\fold10\\167464-0-0-0.wav',
 'UrbanSound8K\\audio\\fold10\\167464-0-0-1.wav']

In [22]:
# all_files was rebuilt in the previous cell from the fold-10 rows only,
# so at this point it holds the test noise paths.
ub_test_paths = all_files
print("Testing noise number of files: ", len(ub_test_paths))

Testing noise number of files:  837


## Data processing phase: with the file paths for the Train, Validation and Test sets collected, the audio can now be processed

In [23]:
#Important parameters

window_length = 256                      # STFT window / FFT size in samples
overlap = round(0.25*window_length)      # passed to librosa as the hop length (64 samples)
fs = 16000                               # sampling rate in Hz
max_duration = 0.8                       # maximum clip length in seconds
# Periodic (sym=False) Hamming window. scipy.signal.hamming is no longer available
# in recent SciPy releases; get_window(..., fftbins=True) builds the same window.
window = scipy.signal.get_window('hamming', window_length, fftbins=True)

In [24]:
#The function takes the path of a clean audio file and a list of noise file paths, and creates one example.
#One noise file is picked at random from the list and added to the clean audio.

#1. Removes silent frames from both the noise and the clean audio.
#2. Brings both audio clips to the same length, which is at most the maximum duration (fixed by us).
#3. Scales the noise relative to the signal, so that the SNR (signal-to-noise ratio) is not unrealistic,
# and then adds the two signals.

def _strip_silence(signal, hop_length, top_db=20):
    """Return ``signal`` with its silent stretches removed.

    Only the intervals that ``librosa.effects.split`` reports as non-silent are kept
    and joined end to end. If no such interval is reported, the input is returned
    unchanged, so a non-empty clip never turns into an empty array.
    """
    intervals = librosa.effects.split(signal, hop_length=hop_length, top_db=top_db)
    if len(intervals) == 0:
        return signal
    return np.concatenate([signal[start:end] for start, end in intervals])


def process(file, noise_files):
    """Create one noisy/clean training example from a clean clip and a random noise clip.

    Steps:
      1. Load the clean clip at 16 kHz, normalise it with ``librosa.util.normalize``
         and remove its silent stretches.
      2. Pick one entry of ``noise_files`` at random, load it at 16 kHz and remove
         its silent stretches.
      3. If the clean clip is longer than ``max_duration`` seconds, crop a random
         window of exactly that length.
      4. Repeat the noise until it is longer than the clean clip and cut a random
         segment of the clean clip's length.
      5. Scale the noise segment to the same energy as the clean clip and add it.
      6. Compute the STFT of the noisy and the clean signal and standardise both
         magnitudes with a ``StandardScaler`` fitted on the noisy magnitude.

    Parameters
    ----------
    file : str
        Path of the clean audio file.
    noise_files : sequence of str
        Paths of candidate noise files; one is drawn with ``np.random.choice``.

    Returns
    -------
    tuple
        ``(noise_magnitude, clean_magnitude, noise_phase)``: the standardised
        magnitude of the noisy STFT, the clean STFT magnitude standardised with the
        same scaler, and the phase of the noisy STFT.

    Raises
    ------
    ValueError
        If the clean file or the chosen noise file contains no samples.
    """
    #Important parameters
    window_length = 256
    overlap = round(0.25*window_length)   # used as the hop length
    fs = 16000
    max_duration = 0.8
    # Periodic Hamming window; scipy.signal.hamming is gone from recent SciPy releases.
    window = scipy.signal.get_window('hamming', window_length, fftbins=True)

    #Load the clean audio file and drop its silent frames
    audio, _ = librosa.load(file, sr=fs)
    if audio.size == 0:
        raise ValueError(f"Clean audio file contains no samples: {file}")
    audio = librosa.util.normalize(audio)
    clean_audio = _strip_silence(audio, overlap)

    #Load a randomly picked noise file (sr must be passed by keyword) and drop its silent frames
    noise_filename = np.random.choice(noise_files)
    noise_audio, _ = librosa.load(noise_filename, sr=fs)
    if noise_audio.size == 0:
        raise ValueError(f"Noise audio file contains no samples: {noise_filename}")
    noise_audio = _strip_silence(noise_audio, overlap)

    #Crop a random fixed-length snippet when the clean audio is too long.
    #Sample counts are compared directly to avoid a seconds <-> samples float round trip.
    max_samples = math.floor(max_duration*fs)
    if clean_audio.size > max_samples:
        idx = np.random.randint(0, clean_audio.size - max_samples)
        clean_audio = clean_audio[idx: idx + max_samples]

    #Repeat the noise until it is strictly longer than the clean audio.
    #noise_audio is guaranteed non-empty here, so the doubling always terminates.
    while noise_audio.size <= clean_audio.size:
        noise_audio = np.append(noise_audio, noise_audio)

    #Pick a random segment of the noise with the same length as the clean audio
    ind = np.random.randint(0, noise_audio.size - clean_audio.size)
    noiseSegment = noise_audio[ind : ind + clean_audio.size]

    #Mix the two signals: the noise is scaled to the energy of the clean audio.
    #A silent noise segment gets a gain of 0 instead of triggering a division by zero.
    clean_power = np.sum(clean_audio ** 2)
    noise_power = np.sum(noiseSegment ** 2)
    noise_gain = np.sqrt(clean_power / noise_power) if noise_power > 0 else 0.0
    noisyAudio = clean_audio + noise_gain*noiseSegment

    #Extract STFT features from the noisy audio
    noisy_spectrogram = librosa.stft(noisyAudio, n_fft=window_length, win_length=window_length,
                                     hop_length=overlap, window=window, center=True)
    noise_phase = np.angle(noisy_spectrogram)
    noise_magnitude = np.abs(noisy_spectrogram)

    #Extract STFT features from the clean audio
    clean_spectrogram = librosa.stft(clean_audio, n_fft=window_length, win_length=window_length,
                                     hop_length=overlap, window=window, center=True)
    clean_magnitude = np.abs(clean_spectrogram)

    #Standardise both magnitudes with statistics fitted on the noisy magnitude only
    scaler = StandardScaler(copy=False, with_mean=True, with_std=True)
    noise_magnitude = scaler.fit_transform(noise_magnitude)
    clean_magnitude = scaler.transform(clean_magnitude)

    return noise_magnitude, clean_magnitude, noise_phase

In [25]:
# Helper functions for creating TFRecords, as described in the TensorFlow TFRecord documentation

def _bytes_feature(value):
    """Wrap a string / bytes value in a tf.train.Feature holding a BytesList."""
    eager_tensor_type = type(tf.constant(0))
    if isinstance(value, eager_tensor_type):
        # BytesList cannot unpack a string from an EagerTensor, so extract the raw value first.
        value = value.numpy()
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
    """Wrap a single float / double in a tf.train.Feature holding a FloatList."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
    """Wrap a single bool / enum / int / uint in a tf.train.Feature holding an Int64List."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

def get_tf_feature(noise_stft_mag_features, clean_stft_magnitude, noise_stft_phase):
    """Pack the three arrays of one training example into a ``tf.train.Example``.

    Each array is cast to float32 and stored as its raw bytes under the keys
    ``'noise_stft_mag_features'``, ``'clean_stft_magnitude'`` and
    ``'noise_stft_phase'``. Only the bytes are stored, not the array shapes.

    Parameters
    ----------
    noise_stft_mag_features, clean_stft_magnitude, noise_stft_phase : numpy.ndarray
        Arrays to serialise.

    Returns
    -------
    tf.train.Example
        The example holding the three byte features.
    """
    def as_float32_bytes(array):
        # ndarray.tostring() is a deprecated alias that newer NumPy no longer provides;
        # tobytes() returns the same bytes.
        return array.astype(np.float32).tobytes()

    example = tf.train.Example(features=tf.train.Features(feature={
        'noise_stft_phase': _bytes_feature(as_float32_bytes(noise_stft_phase)),
        'noise_stft_mag_features': _bytes_feature(as_float32_bytes(noise_stft_mag_features)),
        'clean_stft_magnitude': _bytes_feature(as_float32_bytes(clean_stft_magnitude))}))
    return example

In [26]:
import warnings
import gc
warnings.filterwarnings(action='ignore')

#Create tfrecords combining Noise and clean audio, both for validation and train data

def _context_windows(noise_stft_magnitude, num_segments):
    """Pair every STFT frame with the ``num_segments - 1`` frames before it.

    ``noise_stft_magnitude`` is indexed as (feature, frame). Its first
    ``num_segments - 1`` frames are prepended to the spectrogram so that every
    original frame gets a full window. The result is indexed as
    (frame, feature, segment).
    """
    padded = np.concatenate(
        [noise_stft_magnitude[:, 0:num_segments - 1], noise_stft_magnitude], axis=1)
    num_windows = padded.shape[1] - num_segments + 1
    windows = np.zeros((padded.shape[0], num_segments, num_windows))
    for index in range(num_windows):
        windows[:, :, index] = padded[:, index:index + num_segments]
    return np.transpose(windows, (2, 0, 1))


def tf_record(clean_files, noise_files, size, type_):
    """Write TFRecord shards that combine clean audio with random noise.

    ``clean_files`` is processed in consecutive chunks of ``size`` files. Chunk
    number ``k`` is written to ``records/<type_><k>.tfrecords``; every STFT frame of
    every clip in the chunk becomes one serialized example. A shard whose file
    already exists is skipped, so an interrupted run can be resumed by calling the
    function again.

    Each shard is first written to ``<shard>.partial`` and renamed only after it has
    been written completely. An interrupted run therefore cannot leave behind a
    truncated shard that a later run would skip as if it were finished.

    Parameters
    ----------
    clean_files : sequence of str
        Paths of the clean audio files.
    noise_files : sequence of str
        Paths of the noise files handed to ``process``.
    size : int
        Number of clean files per shard; must be positive.
    type_ : str
        Prefix of the shard file names (for example ``"train"`` or ``"val"``).

    Raises
    ------
    ValueError
        If ``size`` is not positive.
    """
    if size <= 0:
        raise ValueError(f"size must be a positive number of files per shard, got {size}")

    # Number of consecutive STFT frames stacked into one network input
    numSegments = 8

    for count, i in enumerate(range(0, len(clean_files), size)):

        tfrecord_filename = 'records/' + type_ + str(count) + '.tfrecords'

        if os.path.isfile(tfrecord_filename):
            print(f"Skipping {tfrecord_filename}")
            continue

        # Create the output directory on first use instead of failing when it is missing
        os.makedirs(os.path.dirname(tfrecord_filename), exist_ok=True)
        print(f"{type_} Processing files from: {i} to {i+size}")

        partial_filename = tfrecord_filename + '.partial'
        # The context manager closes the writer even if processing a file raises
        with tf.io.TFRecordWriter(partial_filename) as writer:
            # Process and write one clip at a time instead of buffering the whole shard in memory
            for file in clean_files[i:i+size]:
                noise_stft_magnitude, clean_stft_magnitude, noise_stft_phase = process(file, noise_files)

                noise_stft_mag_features = _context_windows(noise_stft_magnitude, numSegments)

                # Put the frame axis first so that zip() below walks frame by frame
                clean_stft_magnitude = np.transpose(clean_stft_magnitude, (1, 0))
                noise_stft_phase = np.transpose(noise_stft_phase, (1, 0))
                noise_stft_mag_features = np.expand_dims(noise_stft_mag_features, axis=3)
                clean_stft_magnitude = np.expand_dims(clean_stft_magnitude, axis=2)

                for x, y, p in zip(noise_stft_mag_features, clean_stft_magnitude, noise_stft_phase):
                    y = np.expand_dims(y, 2)
                    example = get_tf_feature(x, y, p)
                    writer.write(example.SerializeToString())

        # Publish the shard only once it is complete
        os.replace(partial_filename, tfrecord_filename)
        gc.collect()

In [None]:
# Functions are called to create respective tfrecord files
# Shards that already exist under records/ are skipped, so this cell can be re-run to resume.
# The validation set has 1000 files, so a shard size of 2000 yields a single validation shard.

tf_record(clean_val_paths,ub_val_paths, 2000, "val")
tf_record(clean_train_paths,ub_train_paths,10000,"train")

Skipping records/val0.tfrecords
Skipping records/train0.tfrecords
Skipping records/train1.tfrecords
Skipping records/train2.tfrecords
Skipping records/train3.tfrecords
Skipping records/train4.tfrecords
Skipping records/train5.tfrecords
Skipping records/train6.tfrecords
Skipping records/train7.tfrecords
Skipping records/train8.tfrecords
Skipping records/train9.tfrecords
Skipping records/train10.tfrecords
Skipping records/train11.tfrecords
Skipping records/train12.tfrecords
train Processing files from: 130000 to 140000
train Processing files from: 140000 to 150000
train Processing files from: 150000 to 160000
train Processing files from: 160000 to 170000
train Processing files from: 170000 to 180000
train Processing files from: 180000 to 190000
train Processing files from: 190000 to 200000
train Processing files from: 200000 to 210000
train Processing files from: 210000 to 220000
