In [161]:
import librosa
import json
from os import listdir
from os.path import isfile, join

import torch
import IPython.display as ipd
import os
import soundfile as sf
# Import the data augmentation component from ASR collection
from nemo.collections.asr.parts import perturb, segment

# Load wav files

In [162]:
resampled_path = "data/label1/"


def _is_augmented_output(filename):
    """Tell whether `filename` is named like an output of the augmentation cells.

    Those cells save their results next to the source clips as
    `<stem>_aug<N>.wav` and `<stem>_noise_aug<N>.wav`; both stems end in
    `_aug<N>`, which is what this checks for.
    """
    stem, _ext = os.path.splitext(filename)
    _head, marker, index = stem.rpartition("_aug")
    return bool(marker) and index.isdigit()


# Every regular file in the data directory (sub-directories are ignored).
onlyfiles = [f for f in listdir(resampled_path) if isfile(join(resampled_path, f))]

# Source clips only. The augmented wavs are written into this same directory,
# so without the filter a second run of the notebook would list them as
# originals and augment them again. Sorted because listdir() order is arbitrary.
training_files = sorted(f for f in onlyfiles if not _is_augmented_output(f))

# Create manifest file

In [163]:
manifest_path = 'manifest_label1.jsonl'

# Write the manifest as JSON Lines: one object per training file holding its
# path, its duration as reported by librosa, and the fixed label "label_1".
with open(manifest_path, 'w', encoding="utf-8") as fout:
    for file in training_files:
        # join() works whether or not `resampled_path` ends with a slash;
        # plain string concatenation silently produces a wrong path without it.
        audio_filepath = join(resampled_path, file)
        # NOTE(review): newer librosa releases rename `filename=` to `path=` —
        # confirm against the installed version before upgrading.
        duration = librosa.core.get_duration(filename=audio_filepath)
        metadata = {
            "audio_filepath": audio_filepath,
            "duration": duration,
            "command": "label_1"
        }
        json.dump(metadata, fout, ensure_ascii=False)
        fout.write('\n')

# Change speech speed and pitch, add white noise and gain

In [164]:
def load_audio(filepath) -> segment.AudioSegment:
    """Read the audio file at `filepath` into a NeMo `AudioSegment`.

    The module-level `sr` is passed as `target_sr`. It is looked up when the
    function is called, so `sr` must be defined before the first call.
    """
    return segment.AudioSegment.from_file(filepath, target_sr=sr)

# --- Shared settings ---------------------------------------------------------
sr = 16000  # sample rate passed to load_audio() and used when writing wav files
resample_type = 'kaiser_best'  # Can be ['kaiser_best', 'kaiser_fast', 'fft', 'scipy']

##########
WHITE_NOISE_PROB = 0.7
##########
GAIN_PROB = 0.5

# --- Individual perturbations (each one is built exactly once) ---------------
fast_speed = perturb.SpeedPerturbation(sr, resample_type, min_speed_rate=0.7, max_speed_rate=0.9, num_rates=-1)
slow_speed = perturb.SpeedPerturbation(sr, resample_type, min_speed_rate=1.1, max_speed_rate=1.2, num_rates=-1)
tstretch = perturb.TimeStretchPerturbation()
white_noise = perturb.WhiteNoisePerturbation(min_level=-60, max_level=-55)

# Two gain ranges are in play: 0-20 dB for the fast/stretch augmentors and
# 0-50 dB for the pipelines collected in `augmentors`.
# NOTE(review): up to +50 dB will very likely push samples far outside the
# usual [-1, 1] range — confirm that clipping in the written wav files is acceptable.
mild_gain = perturb.GainPerturbation(min_gain_dbfs=0, max_gain_dbfs=20)
gain = perturb.GainPerturbation(min_gain_dbfs=0, max_gain_dbfs=50)

# --- Stand-alone augmentors --------------------------------------------------
# Not referenced by any other cell visible in this notebook; kept so the names
# stay defined for anything that relies on them.
fast_augmentor = perturb.AudioAugmentor([(1.0, fast_speed), (GAIN_PROB, mild_gain)])
stretch_augmentor = perturb.AudioAugmentor([(1.0, tstretch), (GAIN_PROB, mild_gain)])

# --- Pipelines used to generate the augmented dataset ------------------------
# Each pipeline is a list of (probability, perturbation) pairs.
first_probas = [1.0, WHITE_NOISE_PROB, GAIN_PROB]
first_augmentor = [
    fast_speed,
    white_noise,
    gain
]

second_probas = [1.0, WHITE_NOISE_PROB, GAIN_PROB]
second_augmentor = [
    slow_speed,
    white_noise,
    gain
]

augmentors = [
    perturb.AudioAugmentor(list(zip(first_probas, first_augmentor))),
    perturb.AudioAugmentor(list(zip(second_probas, second_augmentor))),
]

# Display the configured pipelines as the cell output.
[pipeline_owner._pipeline for pipeline_owner in augmentors]

[[(1.0,
   <nemo.collections.asr.parts.perturb.SpeedPerturbation at 0x7fb9d5bd5f70>),
  (0.7,
   <nemo.collections.asr.parts.perturb.WhiteNoisePerturbation at 0x7fb9d7908a00>),
  (0.5,
   <nemo.collections.asr.parts.perturb.GainPerturbation at 0x7fb9d5c57580>)],
 [(1.0,
   <nemo.collections.asr.parts.perturb.SpeedPerturbation at 0x7fbb064c8be0>),
  (0.7,
   <nemo.collections.asr.parts.perturb.WhiteNoisePerturbation at 0x7fb9d7908a00>),
  (0.5,
   <nemo.collections.asr.parts.perturb.GainPerturbation at 0x7fb9d5c57580>)]]

# Add background noise

In [165]:
def load_audio(filepath) -> segment.AudioSegment:
    """Read the audio file at `filepath` into a NeMo `AudioSegment`.

    NOTE: this repeats the `load_audio` definition from the earlier
    augmentation cell. The module-level `sr` is passed as `target_sr` and is
    looked up when the function is called.
    """
    return segment.AudioSegment.from_file(filepath, target_sr=sr)

sr = 16000  # sample rate passed to load_audio() and used when writing wav files

noise_manifest_path = "noise_files/noise_manifest.jsonl"
NUM_NOISE_VARIANTS = 12  # number of noisy copies generated per input clip

# min_snr_db and max_snr_db are both 5, so the SNR range is a single value.
noise = perturb.NoisePerturbation(manifest_path=noise_manifest_path,
                                  min_snr_db=5, max_snr_db=5,
                                  max_gain_db=300.0)

# The same perturbation object repeated NUM_NOISE_VARIANTS times.
# Presumably each perturb() call draws its own noise sample from the manifest —
# TODO confirm against the NeMo documentation.
aug_funcs = [noise] * NUM_NOISE_VARIANTS

[NeMo I 2022-07-14 15:46:08 collections:173] Dataset loaded with 12 files totalling 0.22 hours
[NeMo I 2022-07-14 15:46:08 collections:174] 0 files were filtered totalling 0.00 hours


# Play audio

In [151]:
# Load one clip without any augmentation and play it as a reference.
filepath = 'data/label1/speech1.wav'
sample_segment = load_audio(filepath)
ipd.Audio(data=sample_segment.samples, rate=sr)

In [156]:
# Reload the reference clip, run it through the second pipeline in
# `augmentors` (slow_speed, white_noise, gain) and play the result.
# perturb()'s return value is not used; the samples are read back from the
# same segment object.
sample_segment = load_audio(filepath)
augmentors[1].perturb(sample_segment)
ipd.Audio(data=sample_segment.samples, rate=sr)

In [46]:
# Reload the reference clip, apply the `noise` perturbation and play the result.
# perturb()'s return value is not used; the samples are read back from the
# same segment object.
sample_segment = load_audio(filepath)
noise.perturb(sample_segment)
ipd.Audio(data=sample_segment.samples, rate=sr)

# Create augmented manifest and wav files with speed/pitch augmentation

In [166]:
manifest_path = 'manifest_label1.jsonl'
aug_manifest_path = 'manifest_label1_aug.jsonl'

# Copy every entry of the input manifest to the new manifest and, for each
# pipeline in `augmentors`, write one perturbed wav next to the source file
# together with its manifest entry.
# Both files are opened as UTF-8: the manifests are written with
# ensure_ascii=False, so the platform default encoding may fail to read them.
with open(manifest_path, 'r', encoding="utf-8") as fin, \
        open(aug_manifest_path, 'w', encoding="utf-8") as fout:
    for line in fin:
        if not line.strip():
            continue  # tolerate blank lines instead of failing in json.loads

        original_file_json = json.loads(line)
        json.dump(original_file_json, fout, ensure_ascii=False)
        fout.write('\n')

        source_path = original_file_json['audio_filepath']
        # splitext() strips the extension whatever its length; a fixed
        # `[:-4]` slice only fits three-letter extensions such as ".wav".
        source_stem, _ext = os.path.splitext(source_path)

        for i, augmentor in enumerate(augmentors):
            # Reload for every pipeline so each one starts from the clean clip.
            sample_segment = load_audio(source_path)
            augmentor.perturb(sample_segment)

            # save the augmented file
            new_filepath = f"{source_stem}_aug{i}.wav"
            sf.write(new_filepath, sample_segment.samples, sr)

            # add a new line to the manifest, with the duration measured on
            # the file that was just written
            new_json = original_file_json.copy()
            new_json['audio_filepath'] = new_filepath
            duration = librosa.core.get_duration(filename=new_filepath)
            new_json['duration'] = duration

            json.dump(new_json, fout, ensure_ascii=False)
            fout.write('\n')

# Create augmented manifest and wav files with background noise


In [170]:
manifest_path = 'manifest_label1_aug.jsonl'
aug_manifest_path = 'manifest_label1_aug_noise.jsonl'

# Copy every entry of the input manifest to the new manifest and, for each
# perturbation in `aug_funcs`, write one noisy wav next to the source file
# together with its manifest entry.
# Both files are opened as UTF-8: the manifests are written with
# ensure_ascii=False, so the platform default encoding may fail to read them.
with open(manifest_path, 'r', encoding="utf-8") as fin, \
        open(aug_manifest_path, 'w', encoding="utf-8") as fout:
    for line in fin:
        if not line.strip():
            continue  # tolerate blank lines instead of failing in json.loads

        original_file_json = json.loads(line)
        json.dump(original_file_json, fout, ensure_ascii=False)
        fout.write('\n')

        source_path = original_file_json['audio_filepath']
        # splitext() strips the extension whatever its length; a fixed
        # `[:-4]` slice only fits three-letter extensions such as ".wav".
        source_stem, _ext = os.path.splitext(source_path)

        for i, aug_func in enumerate(aug_funcs):
            # Reload for every variant so each one starts from the same clip.
            sample_segment = load_audio(source_path)
            aug_func.perturb(sample_segment)

            # save the noisy file
            new_filepath = f"{source_stem}_noise_aug{i}.wav"
            sf.write(new_filepath, sample_segment.samples, sr)

            # add a new line to the manifest, with the duration measured on
            # the file that was just written
            new_json = original_file_json.copy()
            new_json['audio_filepath'] = new_filepath
            duration = librosa.core.get_duration(filename=new_filepath)
            new_json['duration'] = duration

            json.dump(new_json, fout, ensure_ascii=False)
            fout.write('\n')
