In [None]:
import IPython.display as ipd
import librosa
import librosa.display
import matplotlib.pyplot as plt

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Flatten, Dense, MaxPool2D, Dropout
from tensorflow.keras.utils import to_categorical 

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import tensorflow as tf

In [None]:
# Dog bark: plot the waveform of one sample clip and embed an audio player.
filename = '../input/urbansound8k/fold1/101415-3-0-2.wav'
plt.figure(figsize=(12,4))
data,sample_rate = librosa.load(filename)
_ = librosa.display.waveplot(data,sr=sample_rate)
# The last expression of the cell is what the notebook renders: an audio player.
# (A bare `print` expression used to precede it; it had no effect and was removed.)
ipd.Audio(filename)

In [None]:
#Car_Horn
filename = '../input/urbansound8k/fold10/100648-1-1-0.wav'
plt.figure(figsize=(12,4))
data,sample_rate = librosa.load(filename)
_ = librosa.display.waveplot(data,sr=sample_rate)
ipd.Audio(filename)

In [None]:
import pandas as pd
metadata = pd.read_csv('../input/urbansound8k/UrbanSound8K.csv')
metadata.head()

In [None]:
print(metadata.classID.value_counts())

In [None]:
import struct

class WavFileHelper():
    """Reads basic format properties directly from a WAV file header."""

    def read_file_properties(self, filename):
        """Return ``(num_channels, sample_rate, bit_depth)`` for a WAV file.

        The first 12 bytes of the file are skipped and the following 36 bytes
        are decoded at fixed offsets as little-endian unsigned integers.

        NOTE(review): this presumes the ``fmt `` chunk immediately follows the
        12-byte RIFF descriptor; files with other chunks in between would be
        decoded incorrectly — confirm for the dataset in use.

        Args:
            filename: Path of the WAV file to inspect.

        Returns:
            Tuple ``(num_channels, sample_rate, bit_depth)`` of ints.
        """
        # The context manager closes the handle even if a read fails; the
        # previous version left one open file per inspected clip.
        with open(filename, "rb") as wave_file:
            wave_file.read(12)        # leading descriptor, not needed here
            fmt = wave_file.read(36)  # header bytes holding the format fields

        num_channels = struct.unpack('<H', fmt[10:12])[0]
        sample_rate = struct.unpack("<I", fmt[12:16])[0]
        bit_depth = struct.unpack("<H", fmt[22:24])[0]

        return (num_channels, sample_rate, bit_depth)

In [None]:
# Load various imports 

import pandas as pd
import os
import librosa
import librosa.display

#from helpers.wavfilehelper import WavFileHelper

# Read the header properties (channels, sample rate, bit depth) of every clip
# listed in the metadata table.
wavfilehelper = WavFileHelper()
dataset_root = os.path.abspath('../input/urbansound8k/')

audiodata = [
    wavfilehelper.read_file_properties(
        os.path.join(dataset_root, 'fold' + str(fold) + '/', str(slice_name))
    )
    for fold, slice_name in zip(metadata["fold"], metadata["slice_file_name"])
]

# One row per clip, one column per header property
audiodf = pd.DataFrame(audiodata, columns=['num_channels','sample_rate','bit_depth'])

In [None]:
#Information about num of channels 

print(audiodf.num_channels.value_counts(normalize=True))

In [None]:
#Information about sample rates 

print(audiodf.sample_rate.value_counts(normalize=True))

In [None]:
#Information about bit depth

print(audiodf.bit_depth.value_counts(normalize=True))

In [None]:
#Sample rate conversion to same value for all the audios 

import librosa 
from scipy.io import wavfile as wav
import numpy as np

filename = '../input/urbansound8k/fold1/101415-3-0-2.wav'

librosa_audio, librosa_sample_rate = librosa.load(filename) 
scipy_sample_rate, scipy_audio = wav.read(filename) 

print('Original sample rate:', scipy_sample_rate) 
print('Librosa sample rate:', librosa_sample_rate)

In [None]:
#Bit_Depth_Normalisation 
print('Original audio file min~max range:', np.min(scipy_audio), 'to', np.max(scipy_audio))
print('Librosa audio file min~max range:', np.min(librosa_audio), 'to', np.max(librosa_audio))


In [None]:
#No need for conversion to mono, as the signal is already single-channel
import matplotlib.pyplot as plt

# Original audio with 1 channels 
plt.figure(figsize=(12, 4))
plt.plot(scipy_audio)

In [None]:
# 40 MFCC values are collected over 173 frames
mfccs = librosa.feature.mfcc(y=librosa_audio, sr=librosa_sample_rate, n_mfcc=40)
print(mfccs.shape)

In [None]:
import librosa.display
librosa.display.specshow(mfccs, sr=librosa_sample_rate, x_axis='time')

In [None]:
# Fixed number of frames (columns) every MFCC matrix is brought to.
max_pad_len = 174

def extract_features(file_name):
    """Load an audio file and return its MFCCs as a fixed-width matrix.

    Forty MFCCs are computed per frame. The frame axis is then zero-padded on
    the right, or truncated, so that the result always has exactly
    ``max_pad_len`` columns. Previously a clip with more frames than
    ``max_pad_len`` produced a negative pad width, which made ``np.pad`` raise
    and the whole clip was reported as unparseable.

    Args:
        file_name: Path of the audio file to load.

    Returns:
        Array of shape ``(40, max_pad_len)``, or ``None`` when the file could
        not be loaded or processed (the error is printed).
    """
    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
        n_frames = mfccs.shape[1]
        if n_frames >= max_pad_len:
            # Too long (or exact): keep the first max_pad_len frames.
            mfccs = mfccs[:, :max_pad_len]
        else:
            # Too short: zero-pad on the right of the frame axis.
            pad_width = max_pad_len - n_frames
            mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')

    except Exception as e:
        # Best-effort by design: report the failure (including its cause) and
        # let the caller decide what to do with the missing clip.
        print("Error encountered while parsing file: ", file_name, "-", e)
        return None

    return mfccs

In [None]:
# Load various imports 
import pandas as pd
import os
import librosa

# Set the path to the full UrbanSound dataset 
fulldatasetpath = '../input/urbansound8k/'

metadata = pd.read_csv('../input/urbansound8k/UrbanSound8K.csv')

features = []
skipped_files = []
dataset_root = os.path.abspath(fulldatasetpath)

# Iterate through each sound file and extract the features
for index, row in metadata.iterrows():

    file_name = os.path.join(dataset_root, 'fold'+str(row["fold"])+'/', str(row["slice_file_name"]))

    class_label = row["classID"]
    data = extract_features(file_name)

    # extract_features returns None on failure; keeping such rows would make
    # the later np.array(featuresdf.feature.tolist()) conversion ragged.
    if data is None:
        skipped_files.append(file_name)
        continue

    features.append([data ,class_label])

# Convert into a Panda dataframe
featuresdf = pd.DataFrame(features, columns=['feature','class_label'])

print('Finished feature extraction from ', len(featuresdf) , ' files' )
if skipped_files:
    print('Skipped ', len(skipped_files), ' files that could not be parsed')

In [None]:
#All the 8732 files are converted into the 40 mfcc's at 174 frames 
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

# Convert features and corresponding classification labels into numpy arrays
X = np.array(featuresdf.feature.tolist())
y = np.array(featuresdf.class_label.tolist())

print(len(X))
print(X.shape)

# Encode the classification labels
le = LabelEncoder()
yy = to_categorical(le.fit_transform(y)) 

# split the dataset 
from sklearn.model_selection import train_test_split 

x_train, x_test, y_train, y_test = train_test_split(X, yy, test_size=0.2, random_state = 42)

In [None]:
# Generate/extract Log-MEL Spectrogram coefficients with LibRosa in case we want to experiment 
def get_mel_spectrogram(file_path, mfcc_max_padding=0, n_fft=2048, hop_length=512, n_mels=128):
    """Compute a normalised log-mel spectrogram for one audio file.

    Args:
        file_path: Path of the audio file to load.
        mfcc_max_padding: If positive and larger than the number of frames,
            the spectrogram is zero-padded symmetrically along the frame axis
            up to this width. Spectrograms that are already at least this
            wide are returned unpadded.
        n_fft: FFT window size forwarded to the mel spectrogram.
        hop_length: Hop length forwarded to the mel spectrogram.
        n_mels: Number of mel bands.

    Returns:
        2-D array with ``n_mels`` rows, or ``None`` if loading or processing
        failed (the error is printed).
    """
    try:
        # Load audio file
        y, sr = librosa.load(file_path)

        # Peak-normalise the waveform
        normalized_y = librosa.util.normalize(y)

        # Generate mel scaled filterbanks. The audio is passed by keyword, and
        # n_fft / hop_length are now actually forwarded (they used to be
        # accepted but silently ignored).
        mel = librosa.feature.melspectrogram(
            y=normalized_y, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels
        )

        # Convert sound intensity to log amplitude.
        # NOTE(review): melspectrogram output may be a power spectrogram, in
        # which case power_to_db would be the matching conversion — confirm.
        mel_db = librosa.amplitude_to_db(abs(mel))

        # Normalize the dB values
        normalized_mel = librosa.util.normalize(mel_db)

        # Should we require padding. `and` is required here: with `&` the
        # expression parsed as `mfcc_max_padding > (0 & shape) < mfcc_max_padding`,
        # which is true for every positive mfcc_max_padding and led to a
        # negative pad width for long clips.
        shape = normalized_mel.shape[1]
        if mfcc_max_padding > 0 and shape < mfcc_max_padding:
            xDiff = mfcc_max_padding - shape
            xLeft = xDiff//2
            xRight = xDiff-xLeft
            normalized_mel = np.pad(normalized_mel, pad_width=((0,0), (xLeft, xRight)), mode='constant')

    except Exception as e:
        print("Error parsing wavefile: ", e)
        return None
    return normalized_mel

In [None]:
x_train.shape

In [None]:
y_train.shape

In [None]:
x_train1 = x_train 
x_test1 = x_test
y_train1 = y_train
y_test1 = y_test

In [None]:
from keras import backend as keras_backend
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Flatten, LeakyReLU, SpatialDropout2D, Activation, Conv2D, MaxPooling2D, BatchNormalization, GlobalAveragePooling2D
from keras.optimizers import Adam
from keras.utils import np_utils, to_categorical, plot_model
from keras.callbacks import ModelCheckpoint 
from keras.regularizers import l2
num_rows = 40
num_columns = 174
num_channels = 1
num_labels = y_train.shape[1]


In [None]:
def create_model(spatial_dropout_rate_1=0, spatial_dropout_rate_2=0, l2_rate=0):
    """Assemble the four-convolution CNN classifier.

    Every convolution stage is Conv2D(3x3, L2-regularised) -> LeakyReLU(0.1)
    -> BatchNormalization. The stages use 32, 32, 64 and 64 filters, with one
    2x2 max-pooling step after the second stage, spatial dropout in front of
    stages two to four, and global average pooling feeding a softmax layer.

    Args:
        spatial_dropout_rate_1: Dropout rate used before stages two and three.
        spatial_dropout_rate_2: Dropout rate used before stage four.
        l2_rate: L2 penalty applied to every convolution kernel.

    Returns:
        The (uncompiled) Sequential model. Input shape and output width come
        from the notebook-level num_rows, num_columns, num_channels and
        num_labels.
    """

    def conv_stage(n_filters, **conv_kwargs):
        # Convolution followed by its activation and batch normalisation
        return [
            Conv2D(filters=n_filters,
                   kernel_size=(3, 3),
                   kernel_regularizer=l2(l2_rate),
                   **conv_kwargs),
            LeakyReLU(alpha=0.1),
            BatchNormalization(),
        ]

    layer_stack = [
        # Stage 1 carries the input shape of the whole network
        *conv_stage(32, input_shape=(num_rows, num_columns, num_channels)),

        SpatialDropout2D(spatial_dropout_rate_1),
        *conv_stage(32),

        # Max Pooling #1
        MaxPooling2D(pool_size=(2, 2)),

        SpatialDropout2D(spatial_dropout_rate_1),
        *conv_stage(64),

        SpatialDropout2D(spatial_dropout_rate_2),
        *conv_stage(64),

        # Collapse each h×w feature map to its average value
        GlobalAveragePooling2D(),

        # Softmax output
        Dense(num_labels, activation='softmax'),
    ]

    network = Sequential()
    for layer in layer_stack:
        network.add(layer)

    return network

# Regularization rates
spatial_dropout_rate_1 = 0.07
spatial_dropout_rate_2 = 0.14
l2_rate = 0.001

model = create_model(spatial_dropout_rate_1, spatial_dropout_rate_2, l2_rate)

In [None]:
# Adam with a small step size. `learning_rate` is the supported keyword; the
# older `lr` spelling is deprecated and rejected by recent Keras releases.
adam = Adam(learning_rate=1e-4, beta_1=0.99, beta_2=0.999)
model.compile(
    loss='categorical_crossentropy', 
    metrics=['accuracy'], 
    optimizer=adam)

# Model architecture summary 
model.summary()

In [None]:
models_path = os.path.abspath('./models')

In [None]:
#you need to tell Conv2D that there is only 1 feature map, and add an extra dimension to the input vector
x_train = x_train.reshape(x_train.shape + (1,))
print(x_train.shape)  

In [None]:
#adam = Adam(lr=1e-4, beta_1=0.99, beta_2=0.999)
#model.compile(
 #   loss='categorical_crossentropy', 
  #  metrics=['accuracy'], 
   # optimizer=adam)
def get_apply_grad_fn():
    """Create a new ``tf.function``-compiled single training step.

    Each call returns a separately decorated function, so every caller gets
    its own ``tf.function`` object.

    Returns:
        ``apply_grad(X, Y, model, loss_fn, optimizer)``, which runs one
        forward pass, computes ``loss_fn(Y, output)``, applies the gradients
        to ``model.trainable_weights`` via ``optimizer`` and returns the loss.
    """
    @tf.function
    def apply_grad(X, Y, model, loss_fn, optimizer):
        with tf.GradientTape() as t:

            # training=True: this is a training step, so layers such as
            # Dropout and BatchNormalization must run in training mode.
            # Without the flag the model is called in inference mode.
            output = model(X, training=True)

            loss = loss_fn(Y, output)
            # NOTE(review): regularisation penalties collected in
            # model.losses are not added to this loss — confirm whether the
            # caller expects them to be included.

        grads = t.gradient(loss, model.trainable_weights)

        optimizer.apply_gradients(zip(grads, model.trainable_weights))

        return loss
    return apply_grad

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, Conv2D, MaxPooling2D, GlobalAveragePooling2D
from keras.optimizers import Adam
from keras.utils import np_utils
from sklearn import metrics 

num_rows = 40
num_columns = 174
num_channels = 1

x_train = x_train.reshape(x_train.shape[0], num_rows, num_columns, num_channels)
x_test = x_test.reshape(x_test.shape[0], num_rows, num_columns, num_channels)
print(x_train.shape)

num_labels = yy.shape[1]
filter_size = 2

# Construct model 
model = Sequential()
model.add(Conv2D(filters=16, kernel_size=2, input_shape=(num_rows, num_columns, num_channels), activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))

model.add(Conv2D(filters=32, kernel_size=2, activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))

model.add(Conv2D(filters=64, kernel_size=2, activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))

model.add(Conv2D(filters=128, kernel_size=2, activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))
model.add(GlobalAveragePooling2D())

model.add(Dense(num_labels, activation='softmax'))

In [None]:
# Compile the model
model.compile(loss = 'categorical_crossentropy', metrics=['accuracy'], optimizer='adam')

In [None]:
model.summary()

# Calculate pre-training accuracy 
score = model.evaluate(x_test, y_test, verbose=1)
accuracy = 100*score[1]

print("Pre-training accuracy: %.4f%%" % accuracy)

In [None]:
from keras.callbacks import ModelCheckpoint 
from datetime import datetime 

# Training hyper-parameters (an earlier configuration used 12 epochs with a
# batch size of 128).
num_epochs = 100
num_batch_size = 256

# The checkpoint directory has to exist before the first save; create it up
# front so the run cannot fail when the first checkpoint is written.
checkpoint_path = 'saved_models/weights.best.basic_cnn.hdf5'
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

checkpointer = ModelCheckpoint(filepath=checkpoint_path, 
                               verbose=1, save_best_only=True)
start = datetime.now()

# NOTE(review): the held-out test split doubles as validation data here, so
# checkpoint selection sees the test set — confirm this is intended.
model.fit(x_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_data = (x_test, y_test), callbacks=[checkpointer], verbose=1)

duration = datetime.now() - start
print("Training completed in time: ", duration)

In [None]:
# Evaluating the model on the training and testing set
score = model.evaluate(x_train, y_train, verbose=0)
print("Training Accuracy: ", score[1])

score = model.evaluate(x_test, y_test, verbose=0)
print("Testing Accuracy: ", score[1])

In [None]:
#Redefining prediction function
def print_prediction(file_name):
    """Print the model's probability for every class for one audio file.

    Features are extracted with ``extract_features``, reshaped to a single
    sample of shape ``(1, num_rows, num_columns, num_channels)`` and passed
    through ``model``. One line per class is printed: the original label
    (via the label encoder ``le``) and its probability.

    Args:
        file_name: Path of the audio file to classify.
    """
    prediction_feature = extract_features(file_name)

    # extract_features signals failure with None; there is nothing to predict.
    if prediction_feature is None:
        print("No prediction: features could not be extracted from", file_name)
        return

    prediction_feature = prediction_feature.reshape(1, num_rows, num_columns, num_channels)

    # Sequential.predict_proba / predict_classes were removed from Keras; for
    # a softmax output layer, predict() already returns the probabilities.
    predicted_proba = model.predict(prediction_feature)[0]

    categories = le.inverse_transform(np.arange(len(predicted_proba)))
    for category, proba in zip(categories, predicted_proba):
        print(category, "\t\t : ", format(proba, '.32f') )


In [None]:
#Sciffer Wav File splitting 
#Audio Splitting for better analysis 
from scipy.io import wavfile
import os
import numpy as np
import argparse
from tqdm import tqdm

# Utility functions

def windows(signal, window_size, step_size):
    """Yield successive fixed-size slices of `signal`.

    Slices start every `step_size` items and are `window_size` items long.
    Iteration stops at the first slice whose end index reaches or passes the
    end of the signal, so a slice ending exactly on the last item is not
    produced.

    Raises:
        AttributeError: if `window_size` or `step_size` is not a plain int.
    """
    for label, value in (("Window", window_size), ("Step", step_size)):
        if type(value) is not int:
            raise AttributeError(label + " size must be an integer.")

    signal_length = len(signal)
    for start in np.arange(0, signal_length, step_size):
        stop = start + window_size
        if stop < signal_length:
            yield signal[start:stop]
        else:
            return

def energy(samples):
    """Return the mean of the squared values in `samples`."""
    squared = np.power(samples, 2.)
    n_samples = float(len(samples))
    return np.sum(squared) / n_samples

def rising_edges(binary_signal):
    """Yield each index at which `binary_signal` turns from falsy to truthy.

    The value before the first item is treated as falsy, so a signal that
    starts truthy yields index 0.
    """
    was_high = 0
    for position, is_high in enumerate(binary_signal):
        if is_high and not was_high:
            yield position
        was_high = is_high




# Splitter configuration. This cell looks adapted from a command-line script
# (note the commented-out argparse call), so its options are plain variables.
#args = parser.parse_args()
input_filename =  '../input/test-audio/Test_Video.wav' 
output_dir = '/kaggle/working/'
step_duration = None          # seconds between window starts; None -> window_duration / 10
min_silence_length = 0.01     # seconds; also used as the analysis window length
silence_threshold = 0.000001  # window energy relative to the full-scale energy

window_duration = min_silence_length
if step_duration is None:
    step_duration = window_duration / 10.
output_filename_prefix = os.path.splitext(os.path.basename(input_filename))[0]
dry_run = 0


print("Splitting {} where energy is below {}% for longer than {}s.".format(
    input_filename,
    silence_threshold * 100.,
    window_duration
)
     )
# Read and split the file

input_data = wavfile.read(filename=input_filename, mmap=True)
sample_rate, samples = input_data

# Energy of a full-scale sample, used to normalise the window energies.
# np.iinfo only accepts integer dtypes, so float WAV data needs its own case.
if np.issubdtype(samples.dtype, np.integer):
    max_amplitude = np.iinfo(samples.dtype).max
else:
    max_amplitude = 1.0  # NOTE(review): assumes float samples are scaled to [-1, 1]
max_energy = energy([max_amplitude])

window_size = int(window_duration * sample_rate)
step_size = int(step_duration * sample_rate)

signal_windows = windows(
    signal=samples,
    window_size=window_size,
    step_size=step_size
)

window_energy = (energy(w) / max_energy for w in tqdm(
    signal_windows,
    total=int(len(samples) / float(step_size))
))

# True for windows whose relative energy is ABOVE the threshold, so a rising
# edge marks the point where the signal becomes audible.
window_silence = (e > silence_threshold for e in window_energy)

cut_times = (r * step_duration for r in rising_edges(window_silence))

# This is the step that takes long, since we force the generators to run.
print("Finding silences...")
cut_samples = [int(t * sample_rate) for t in cut_times]
# Close the last segment at the true end of the signal. The previous sentinel
# of -1 made the final slice stop one sample early.
cut_samples.append(len(samples))

cut_ranges = [(i, cut_samples[i], cut_samples[i+1]) for i in range(len(cut_samples) - 1)]

for i, start, stop in tqdm(cut_ranges):
    output_file_path = "{}_{:03d}.wav".format(
        os.path.join(output_dir, output_filename_prefix),
        i
    )
    if not dry_run:
        print ("Writing file {}".format(output_file_path))
        wavfile.write(
            filename = output_file_path,
            rate=sample_rate,
            data=samples[start:stop]
        )
    else:
        print("Not writing file {}".format(output_file_path))

In [None]:
#DoG Bark
filename = '/kaggle/working/Test_Video_001.wav'
plt.figure(figsize=(12,4))
data,sample_rate = librosa.load(filename)
_ = librosa.display.waveplot(data,sr=sample_rate)
ipd.Audio(filename)

In [None]:
#Dog Barking
filename = '/kaggle/working/Test_Video_001.wav'
print_prediction(filename)

In [None]:
#Traffic
filename = '/kaggle/working/Test_Video_002.wav'
plt.figure(figsize=(12,4))
data,sample_rate = librosa.load(filename)
_ = librosa.display.waveplot(data,sr=sample_rate)
ipd.Audio(filename)

In [None]:
#Redefining prediction function
def prediction(file_name):
    """Return the model's class probabilities for one audio file.

    Features are extracted with ``extract_features``, reshaped to a single
    sample of shape ``(1, num_rows, num_columns, num_channels)`` and passed
    through ``model``.

    The previous implementation returned from inside its per-class loop and
    called ``list()`` on a single scalar, so it raised ``TypeError`` on every
    call instead of returning the probabilities.

    Args:
        file_name: Path of the audio file to classify.

    Returns:
        List of floats, one probability per class, ordered by encoded class
        index (position ``i`` corresponds to ``le.inverse_transform([i])``).

    Raises:
        ValueError: if no features could be extracted from the file.
    """
    prediction_feature = extract_features(file_name)
    if prediction_feature is None:
        raise ValueError("Could not extract features from {}".format(file_name))

    prediction_feature = prediction_feature.reshape(1, num_rows, num_columns, num_channels)

    # predict() on the softmax model yields the probabilities directly
    # (predict_proba / predict_classes no longer exist in current Keras).
    predicted_proba = model.predict(prediction_feature)[0]

    return [float(proba) for proba in predicted_proba]

In [None]:
from pydub import AudioSegment

# Voting strategy: cut the start of the clip into consecutive 500 ms segments
# so each one can be classified on its own.
# (Original note: classifying the whole clip gives the most weight to street music.)
count = 0
# List of lists to store values of predictions
val = [[] for i in range(11)]

SEGMENT_MS = 500   # pydub slices AudioSegment objects in milliseconds
NUM_SEGMENTS = 6

# Load the clip once instead of re-reading it for every segment.
source_audio = AudioSegment.from_wav('/kaggle/working/Test_Video_002.wav')

for segment_index in range(NUM_SEGMENTS):
    t1 = segment_index * SEGMENT_MS
    # Every segment is exactly SEGMENT_MS long. The copy-pasted version ended
    # the third segment at 15000 ms instead of 1500 ms.
    t2 = t1 + SEGMENT_MS
    newAudio = source_audio[t1:t2]
    # File names match the later cells: new.wav, new1.wav, ..., new5.wav
    suffix = '' if segment_index == 0 else str(segment_index)
    newAudio.export('new' + suffix + '.wav', format="wav") #Exports to a wav file in the current path.



In [None]:
filename = '/kaggle/working/new.wav'
plt.figure(figsize=(12,4))
data,sample_rate = librosa.load(filename)
_ = librosa.display.waveplot(data,sr=sample_rate)
ipd.Audio(filename)

In [None]:
val = [[] for i in range(11)]

In [None]:

filename = '/kaggle/working/new.wav'
print_prediction(filename)


In [None]:

filename = '/kaggle/working/new1.wav'
print_prediction(filename)


In [None]:

filename = '/kaggle/working/new2.wav'
print_prediction(filename)


In [None]:

filename = '/kaggle/working/new3.wav'
print_prediction(filename)


In [None]:

filename = '/kaggle/working/new4.wav'
print_prediction(filename)


In [None]:

filename = '/kaggle/working/new5.wav'
print_prediction(filename)


In [None]:
#Max Mean average prediction : 
#Car_Horn ~ 40% 
#Which is different as compared to original prediction which came out to be Street Noise 

In [None]:
#FFT BASED NOISE REDUCTION METHOD
pip install noisereduce

In [None]:
import noisereduce as nr
# load data
filename = ("/kaggle/working/Test_Video_002.wav")
data,sample_rate = librosa.load(filename)
print(len(data))
# select section of data that is noise
noisy_part = data
#print(data)
# perform noise reduction
reduced_noise = nr.reduce_noise(audio_clip=data, noise_clip=noisy_part, verbose=True)

#reduced_noise.export('hope.wav', format="wav") #Exports to a wav file in the current path.

In [None]:
reduced_noise
_ = librosa.display.waveplot(reduced_noise,sr=sample_rate)

In [None]:
#Prediction on this part of the signal gives us --->
from pydub import AudioSegment
# Cut the 0.9 s - 2.0 s stretch out of the original recording.
t1 = 1 * 900 #Works in milliseconds
t2 = 1 * 2000
# A paste slip previously made this line `AudioSegment.frnewAudio = ...`,
# which also attached a stray attribute to the AudioSegment class.
newAudio = AudioSegment.from_wav("../input/test-audio/Test_Video.wav")
newAudio = newAudio[t1:t2]
print(type(newAudio))
newAudio.export('significant.wav', format="wav") #Exports to a wav file in the current path.

In [None]:
filename = '/kaggle/working/significant.wav'
print_prediction(filename)

#Here it shows the car horn highest probability 

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
# fix random seed for reproducibility
np.random.seed(7)

In [None]:
print(x_train.shape)

In [None]:
# Sequence view of the training data: (samples, frames, coefficients).
# x_train holds one (num_rows x num_columns) MFCC matrix per sample, plus a
# trailing channel axis once the CNN reshape has run. Drop that axis and swap
# the last two axes with a transpose; a bare reshape to (n, 174, 40) keeps
# the memory order and therefore mixes coefficients and frames. The sample
# count is taken from the data instead of being hard-coded.
k = np.transpose(
    x_train.reshape(x_train.shape[0], num_rows, num_columns),
    (0, 2, 1)
)

# y_train is left untouched: it was split from `yy`, which is already
# one-hot encoded, so running to_categorical on it again would add a third
# axis (and a further one on every re-run of this cell).

In [None]:
print(k.shape)

AUGMENTATION

In [None]:
import os
import librosa
import librosa.display
import pandas as pd
import numpy as np
import random
import matplotlib as plt
import matplotlib.pyplot as plt
from IPython.display import clear_output, display
import librosa.display
from scipy.io import wavfile


In [None]:
#path to the dataset
#us8k_path = os.path.abspath('../input/urbansound8k/')

metadata = pd.read_csv('../input/urbansound8k/UrbanSound8K.csv')
audio_path = '../input/urbansound8k/'
augmented_path = '/kaggle/working/'

# Metadata
#metadata_path = os.path.join(us8k_path, 'metadata/UrbanSound8K.csv')
#metadata_augmented_path = os.path.abspath('data/augmented-data.csv')

In [None]:
import pandas as pd
metadata = pd.read_csv('../input/urbansound8k/UrbanSound8K.csv')
metadata.head()

In [None]:
#Time Scaling
# Writes one time-stretched copy of every clip per rate into
# <augmented_path>/fold<N>/speed_<rate*100>/<original file name>.
rates = [0.81, 1.07]
total = len(metadata) * len(rates)
count = 0
for rate in rates: 
    # Generate new stretched audio file
    for index, row in metadata.iterrows(): 
        curr_fold = str(row['fold'])
        curr_file_path = audio_path + '/fold' + curr_fold + '/' + row['slice_file_name']
        
        # Speed sub-dir inside current fold dir. The fold number is part of
        # the path (as in the pitch and noise cells); without it every fold
        # was written to the same directory and the fold could not be
        # recovered from the path afterwards.
        curr_rate_path = augmented_path + '/fold' + curr_fold + '/speed_' + str(int(rate*100))

        # Create sub-dir if it does not exist
        os.makedirs(curr_rate_path, exist_ok=True)
                    
        output_path = curr_rate_path + '/' + row['slice_file_name']
        
        # Skip when file already exists
        if (os.path.isfile(output_path)):
            count += 1 
            continue
        
        y, sr = librosa.load(curr_file_path)  
        y_changed = librosa.effects.time_stretch(y, rate=rate)
        wavfile.write(
            filename = output_path,
            rate = sr,
            data = y_changed
        )
        
        count += 1 
        
        clear_output(wait=True)
        print("Progress: {}/{}".format(count, total))
        print("Last file: ", row['slice_file_name'])
In [None]:
#Pitch_Shifting 
# Writes one pitch-shifted copy of every clip per step into
# <augmented_path>/fold<N>/pitch_<step>/<original file name>.
tone_steps = [-1, -2, 1, 2]
total = len(metadata) * len(tone_steps)
count = 0
for tone_step in tone_steps:
    # Generate new pitched audio
    for index, row in metadata.iterrows():        
        curr_fold = str(row['fold'])
        curr_file_path = audio_path + '/fold' + curr_fold + '/' + row['slice_file_name']

        # Pitch Shift sub-dir inside current fold dir
        curr_ps_path = augmented_path + '/fold' + curr_fold + '/pitch_' + str(tone_step)

        # Create sub-dir if it does not exist
        os.makedirs(curr_ps_path, exist_ok=True)
        
        output_path = curr_ps_path + '/' + row['slice_file_name']
        
        # Skip when file already exists
        if (os.path.isfile(output_path)):
            count += 1 
            continue
        
        y, sr = librosa.load(curr_file_path)  
        # sr is passed by keyword, which every librosa version accepts.
        y_changed = librosa.effects.pitch_shift(y, sr=sr, n_steps=tone_step)
        # Written with scipy, like the time-stretch cell: the
        # librosa.output module is no longer part of librosa.
        wavfile.write(
            filename = output_path,
            rate = sr,
            data = y_changed
        )
        
        count += 1 
        
        clear_output(wait=True)
        print("Progress: {}/{}".format(count, total))
        print("Last file: ", row['slice_file_name'])

In [None]:
#Noise addition 
import random

def add_noise(data):
    """Return `data` with scaled uniform random noise added to every sample.

    The noise is drawn with np.random.rand (one value per sample) and scaled
    by a single amplitude drawn from random.uniform(0.005, 0.008).
    """
    base_noise = np.random.rand(len(data))
    amplitude = random.uniform(0.005, 0.008)
    scaled_noise = amplitude * base_noise
    return data + scaled_noise

total = len(metadata)
count = 0

# Generate new noised audio: one noisy copy of every clip, written to
# <augmented_path>/fold<N>/noise/<original file name>.
for index, row in metadata.iterrows():        
    curr_fold = str(row['fold'])
    curr_file_path = audio_path + '/fold' + curr_fold + '/' + row['slice_file_name']
    
    # Noised sub-dir inside current fold dir
    curr_noise_path = augmented_path + '/fold' + curr_fold + '/noise'

    # Create sub-dir if it does not exist
    os.makedirs(curr_noise_path, exist_ok=True)
        
    output_path = curr_noise_path + '/' + row['slice_file_name']
        
    # Skip when file already exists
    if (os.path.isfile(output_path)):
        count += 1 
        continue
        
    y, sr = librosa.load(curr_file_path)  
    y_changed = add_noise(y)
    # Written with scipy, like the time-stretch cell: the librosa.output
    # module is no longer part of librosa. Cast to 32-bit float so the file
    # is not stored as a 64-bit float WAV.
    wavfile.write(
        filename = output_path,
        rate = sr,
        data = np.asarray(y_changed, dtype=np.float32)
    )
    
    count += 1 

    clear_output(wait=True)
    print("Progress: {}/{}".format(count, total))
    print("Last file: ", row['slice_file_name'])

In [None]:
def get_files_recursive(path):
    """Return the paths of all non-directory entries below `path`.

    Entries are visited in os.listdir order; a sub-directory contributes its
    own (recursively collected) files at the position where it is listed.
    """
    collected = []
    for entry_name in os.listdir(path):
        candidate = os.path.join(path, entry_name)
        if not os.path.isdir(candidate):
            # Plain entry: record its full path
            collected.append(candidate)
            continue
        # Directory: descend and take over everything found inside
        collected.extend(get_files_recursive(candidate))
    return collected

In [None]:
# Get every single file within the tree
files = get_files_recursive(augmented_path)

# Define metadata columns
names = []
classes = []
folds = []
augmentations = []
skipped = 0

# Iterate and collect name, fold and class.
# Only paths shaped like .../fold<N>/<augment>/<fsID>-<classID>-....wav are
# augmented clips. The working directory also holds other files (split test
# clips, exported segments, checkpoints), which previously crashed the
# file-name parsing below.
for file_path in files:
    pieces = file_path.split("/")
    if len(pieces) < 3:
        skipped += 1
        continue

    file = pieces[-1]
    augment = pieces[-2]
    fold = pieces[-3]
    fold_num = fold[4:]
    name_parts = file.split("-")

    is_augmented_clip = (
        fold.startswith("fold")
        and fold_num.isdigit()
        and file.endswith(".wav")
        and len(name_parts) > 1
        and name_parts[1].isdigit()
    )
    if not is_augmented_clip:
        skipped += 1
        continue

    # Push records. The fold is stored as an int so the column has the same
    # type as the numeric fold column read from the dataset CSV.
    names.append(file)
    folds.append(int(fold_num))
    classes.append(name_parts[1])
    augmentations.append(augment)

# Create a dataframe with the new augmented data
new_meta = pd.DataFrame({'file': names, 'fold': folds, 'class_id': classes, 'augment': augmentations })

# Make sure class_id is int
new_meta['class_id'] = new_meta['class_id'].astype(np.int64)

print(len(new_meta), "new entries")
if skipped:
    print(skipped, "files ignored (not augmented clips)")

In [None]:
classes = pd.DataFrame({
    'class_id': range(0,10),
    'class': [
        'air_conditioner',
        'car_horn',
        'children_playing',
        'dog_bark',
        'drilling',
        'engine_idling',
        'gun_shot',
        'jackhammer',
        'siren',
        'street_music'
    ]
})

new_meta = pd.merge(new_meta, classes, on='class_id')

In [None]:
new_meta.tail()

In [None]:
# Modify original data to fit the new structure
del metadata['fsID'], metadata['start'], metadata['end'], metadata['salience']
metadata.columns = ['file', 'fold', 'class_id', 'class']
metadata['augment'] = 'none'

In [None]:
# Concat the two dataframes
full_meta = pd.concat([metadata, new_meta])

# Verify lengths
if (len(full_meta) == len(metadata) + len(new_meta)):
    print("Dataframes merged correctly!")
else:
    print("Error! Lengths do not match.")

print("Initial data:", len(metadata))
print("New data:", len(new_meta))
print("Merged data:", len(full_meta))

In [None]:
# Save the new metadata.
# The target path is defined here because its only other definition in the
# notebook is commented out, which made this cell fail with a NameError. It
# lives outside the augmented-audio tree, so the CSV is not picked up when
# that tree is scanned for clips.
metadata_augmented_path = os.path.abspath('data/augmented-data.csv')
os.makedirs(os.path.dirname(metadata_augmented_path), exist_ok=True)
full_meta.to_csv(metadata_augmented_path, index=False, encoding="utf-8")

In [None]:
# Couldn't complete the augmentation because the available memory filled up here. Might try the same in Google Colab in the future.