# Swahili Audio Classification EDA


|Swahili|English|
|---|---|
| ndio | yes |
| hapana | no |
| moja | one |
| mbili | two |
| tatu | three |
| nne | four |
| tano | five |
| sita | six |
| saba | seven |
| nane | eight |
| tisa | nine |
| kumi | ten |

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
import random
import IPython.display as ipd

In [None]:
# Load the training metadata table from the CSV file.
train_df = pd.read_csv('data/Train.csv')

In [None]:
# Peek at the first rows to see which columns are available.
train_df.head()

In [None]:
# Count how many rows exist for each Swahili word.
train_df['Swahili_word'].value_counts()
# Observation: the classes are perfectly balanced.

In [None]:
# Listen to one randomly drawn recording for each of six randomly chosen words.
all_words = train_df['Swahili_word'].unique().tolist()
for word in random.sample(all_words, 6):
    # Word_id values are used as file names below data/Swahili_words/.
    word_ids = train_df.loc[train_df['Swahili_word'] == word, 'Word_id']
    sample = word_ids.sample(1).values[0]
    display(word, sample, ipd.Audio(f'data/Swahili_words/{sample}'))

## Wrong Data 

* id_p0w83k2cxaq7 (tatu) contains completely wrong words; nothing that is part of the challenge
* id_qbkjs5jaji4s says 'yes'
* id_bkbq9rsmuq8j says 'seven'
* id_toljkyjcpo2g is someone randomly talking
* id_v2okjqke4zmb says nne instead of nane

## Compare 3 samples for each word

In [None]:
# Map every Swahili word to three randomly drawn Word_id values for that word.
dict_samples = {
    word: train_df.loc[train_df['Swahili_word'] == word, 'Word_id'].sample(3).values
    for word in train_df['Swahili_word'].unique().tolist()
}

In [None]:
# Show which recordings were drawn for each word.
dict_samples

In [None]:
# Show three waveforms per word in the time domain, side by side with a shared
# y-axis, for easy comparison between the words.
# librosa.display.waveplot was removed in librosa 0.10 in favour of waveshow
# (available since 0.9), so use whichever function this installation provides.
plot_wave = getattr(librosa.display, 'waveshow', None) or librosa.display.waveplot
for word, audiofiles in dict_samples.items():
    fig, axes = plt.subplots(nrows=1, ncols=3, sharey=True)
    fig.set_size_inches(10, 5)
    fig.suptitle(word)
    # zip pairs each recording with its own subplot (replaces a manual counter).
    for axis, audiofile in zip(axes, audiofiles):
        x, sr = librosa.load('data/Swahili_words/'+audiofile)
        plot_wave(x, sr=sr, ax=axis)


In [None]:
# # show the three files of each word time-centered in frequency domain
# for word in dict_samples:
#     i=0
#     fig, ax = plt.subplots(nrows=1, ncols=3, sharey=True)
#     fig.set_size_inches(10, 5)
#     fig.suptitle(word)
#     for audiofile in dict_samples[word]:
#         x, sr = librosa.load('data/Swahili_words/'+audiofile)
#         X = librosa.amplitude_to_db(np.abs(librosa.stft(x)), ref=np.max)
#         img = librosa.display.specshow(X, y_axis='log', x_axis='time', sr=sr, ax=ax[i])
#         i+=1

## Preprocessing

In [None]:
from fastaudio.augment.all import *
from fastaudio.core.all import *
from fastai.data.all import *

In [None]:
# Load one example recording as an AudioTensor and display it.
audio = AudioTensor.create('data/Swahili_words/id_lrduy4kd0l9m.wav')
audio.show()

In [None]:
# Build a spectrogram transform from the AudioConfig.Voice() configuration,
# apply it to the example recording and display the result.
spectrogram = AudioToSpec.from_cfg(AudioConfig.Voice())(audio)
spectrogram.show()

In [None]:
# Apply the RemoveSilence transform with its default arguments and display
# the transformed recording.
tfm = RemoveSilence()
tfm(audio).show()

In [None]:
from scipy.ndimage import minimum_filter1d

In [None]:
# Sliding minimum of the absolute amplitude over a 1000-sample window.
# mode='constant' pads beyond the edges with a constant fill value
# (scipy's default cval is 0.0).
y = minimum_filter1d(abs(audio), size=1000, mode='constant')

In [None]:
# Plot the filtered signal for the first row of y
# (presumably the first audio channel - confirm).
plt.plot(y[0])

In [None]:
# Inspect the tensor shape before cropping.
audio.shape

In [None]:
# Crop the recording to a window around the maximum of the filtered signal,
# reaching at most window_range samples to each side.
ind_max = y[0].argmax()
# Assuming audio.sr is the sample rate in Hz, this is half a second of samples.
window_range = audio.sr/2

# Clamp the left edge at the start of the recording ...
ind_lrange = max(0, int(ind_max - window_range))
# ... and the right edge at its end.
ind_rrange = min(int(audio.shape[1]), int(ind_max + window_range))

audio.data = audio.data[:, ind_lrange:ind_rrange]

In [None]:
# Size of the second dimension after cropping (the one that was sliced above).
audio.shape[1]

In [None]:
# Display the cropped recording.
audio.show()

In [None]:
# Re-apply RemoveSilence to the cropped recording for comparison.
tfm(audio).show()

In [None]:
# Write the cropped recording to the filtered-data folder
# (NOTE(review): the target directory presumably has to exist - confirm).
audio.save('data/Swahili_words_filtered/id_lrduy4kd0l9m.wav')

In [None]:
# Show the sr attribute of the recording (presumably its sample rate).
audio.sr