In [None]:
from fastaudio.augment.all import *
from fastaudio.core.all import *
from fastai.data.all import *
from scipy.ndimage import minimum_filter1d
from tqdm.notebook import tqdm
import IPython.display as ipd

In [None]:
# Seed every random number generator this notebook draws from so that a fresh
# "Restart & Run All" reproduces the same samples:
#   * NumPy's global generator (pandas' `.sample` falls back to it),
#   * Python's `random` module (used by `random.sample` when picking words to listen to).
SEED = 2022
np.random.seed(SEED)
random.seed(SEED)

In [None]:
# Read the training table and the test table (in that order) from the data folder
train, test = (pd.read_csv(csv_path) for csv_path in ('data/Train.csv', 'data/Test.csv'))

In [None]:
# Preview the first rows of the training table
train.head()

In [None]:
# Gather the Word_id values of both tables into one list (train entries first,
# then test entries) so every recording can be processed in a single pass
train_test_files = [*train['Word_id'].tolist(), *test['Word_id'].tolist()]



In [None]:
def remove_short_sounds_and_filter_words(audio, filter_time=1.0, min_sound_samples=1000):
    """
    Crop `audio` in place to a window around its loudest sustained section.

    A minimum filter is run over the absolute amplitude, so a sample only keeps
    a high value when all of its neighbours inside the filter window are loud
    as well; short sounds are suppressed this way. The position of the largest
    filtered value in the first channel is taken as the centre of the section
    to keep (usually the spoken word), and the signal is cut to
    `filter_time` seconds around it. When the centre lies closer than half a
    window to either end of the signal, the window is clipped at that end, so
    the result can be shorter than `filter_time`.

    input:
    audio: audio tensor with a sample rate `sr` and data laid out as
        (channels, samples); only channel 0 is used to locate the centre
    filter_time: length of the output in seconds (needs to be longer than
        the spoken words); the default of 1.0 keeps half a second on each side
    min_sound_samples: size of the minimum filter in samples; sounds shorter
        than this are ignored when locating the centre

    output:
    the same `audio` object, with `audio.data` replaced by the cropped signal
    """
    if filter_time <= 0:
        raise ValueError("filter_time must be positive")

    # Zero padding at the borders (mode='constant') makes the filtered signal
    # drop to zero near both ends of the recording.
    sustained_level = minimum_filter1d(abs(audio), size=min_sound_samples, mode='constant')
    centre = int(sustained_level[0].argmax())

    half_window = audio.sr * filter_time / 2
    n_samples = int(audio.shape[1])

    # Clamp the window to the valid sample range
    start = max(0, int(centre - half_window))
    stop = min(n_samples, int(centre + half_window))

    audio.data = audio.data[:, start:stop]
    return audio

In [None]:
# Create the folder that will hold the filtered audio clips.
# `exist_ok=True` keeps this cell re-runnable when the folder already exists,
# and `makedirs` also creates the parent `data` folder if it is missing.
os.makedirs('data/Swahili_words_filtered', exist_ok=True)

In [None]:
# Function to generate a filtered audio clip (not a spectrogram) from one recording
def gen_filtered_signal(path, out_dir='data/Swahili_words_filtered'):
    """
    Load the recording at `path`, isolate the spoken word and save the result.

    Steps:
    1. drop the first half second of the recording,
    2. trim silence with `RemoveSilence`,
    3. crop around the loudest sustained section,
    4. trim silence again on the cropped signal,
    5. resize the signal with `ResizeSignal(duration=1000)`,
    6. save it under `out_dir`, keeping the original file name.

    input:
    path: path of the source audio file
    out_dir: folder the filtered file is written to (must already exist)
    """
    audio = AudioTensor.create(path)
    # Discard the first half second (sr / 2 samples) of the recording
    audio.data = audio.data[:, int(audio.sr / 2):]

    # Use the value each transform returns rather than relying on the
    # transforms mutating `audio` in place.
    tfm = RemoveSilence()
    audio = tfm(audio)
    audio = remove_short_sounds_and_filter_words(audio)
    audio = tfm(audio)
    # NOTE(review): duration is presumably in milliseconds (i.e. one second) - confirm against fastaudio
    audio = ResizeSignal(duration=1000)(audio)

    # basename works for source paths of any depth, unlike indexing the
    # third '/'-separated component
    filter_path = out_dir + '/' + os.path.basename(path)
    audio.save(filter_path)

In [None]:
# Produce a filtered clip for every recording that does not have one yet
for word_id in tqdm(train_test_files):
    source_path = 'data/Swahili_words/' + word_id
    filtered_path = 'data/Swahili_words_filtered/' + word_id
    needs_filtering = not os.path.isfile(filtered_path)
    if needs_filtering:
        gen_filtered_signal(source_path)
    # Wipe the cell output after each file so it does not pile up
    ipd.clear_output(wait=True)

In [None]:
# Sanity check: the filtered folder should contain as many files as the
# original folder. Displays (counts match?, number of filtered files).
n_original = len(os.listdir('data/Swahili_words'))
n_filtered = len(os.listdir('data/Swahili_words_filtered'))
n_original == n_filtered, n_filtered

In [None]:
# Manual counter: the sampling cell below increments it once per run, so it
# tracks how many batches of samples have been listened to since this reset
i = 0

In [None]:
# Pick six distinct words at random and, for each, play one randomly chosen
# filtered recording of it
unique_words = train['Swahili_word'].unique().tolist()
for word in random.sample(unique_words, 6):
    recordings = train.loc[train['Swahili_word'] == word, 'Word_id']
    sample = recordings.sample(1).iloc[0]
    display(word, sample, ipd.Audio('data/Swahili_words_filtered/' + sample))
# Count this run of the cell (incremented once per run, not once per word)
i += 1

In [None]:
# Show how many times the sampling cell has been run since the counter was reset
i

### wrongly filtered (fourth try)

i=32

* id_ke9iku0q76f1 kumi, end cut off, background noise at beginning
* id_6ljcok2j07f9 end cut off
* id_cdxl1wbdn9pn end cut off
* id_cdxl1wbdn9pn hapana end cut off
* id_e82jojic8yrp laughing

### wrongly filtered (third try)

i=27

* id_y0ofpylebult hapana, end cut off
* id_8yceuwbm7ack children screaming
* id_idbws40cm1jq babbling, word starting at the end
* id_1izzii125db0 hapana, end cut off
* id_z8spxgllon1a bird
* id_zlfye2qpp812 background speaker
* id_qp2exgpeewvo hapana, beginning cut off
* id_rf6o0w1uinfy phone ringing at end, in original, the word was repeated
* id_nt2tl73rh373 sizzling
* id_0wxcl0zehbjh cut short
* id_q2drmnfcncwr cut short
* id_j6hsq1dcl82k cut short

### wrongly filtered (second try)

i=16

* id_k6x7ae632gsi (sizzling)
* id_q8ublattuf6h (a little off)
* id_u2nwo5hc15gg (hapana, last a cut off)
* id_o3kdutdq4o8d (silent)
* id_zg5r8d3w0vls (silent)
* id_edwy74cljdl3 (silent)
* id_toljkyjcpo2g (wrong data)

### wrongly filtered (First try)
* id_8yceuwbm7ack (children screaming)
* id_q8ublattuf6h (plop)
* id_xybg1e3runiu (very quiet sizzling)
* id_yd5ohcfe3mam (very quiet)
* id_e27kxvgw6h0o (very quiet)
* id_lrduy4kd0l9m (very quiet)
* id_8c3z26adlmsq (very quiet)
* id_710tsclefl7o (plop)
* id_vcr1v5ff5ytr (half word cut off)
* id_rf6o0w1uinfy (word only half spoken, then interrupted. In original, the word was repeated)

In [None]:
# Load one original (unfiltered) recording and display it for inspection
full_audio = AudioTensor.create('data/Swahili_words/id_k6x7ae632gsi.wav')
full_audio.show()

In [None]:
# Apply RemoveSilence with its default settings to the original recording and
# display the result.
# NOTE(review): the transform may modify `full_audio` in place - confirm before reusing it
tfm = RemoveSilence()
tfm(full_audio).show()

In [None]:
# Load the filtered version of the same recording and display it for comparison
filt_audio = AudioTensor.create('data/Swahili_words_filtered/id_k6x7ae632gsi.wav')
filt_audio.show()