In [None]:
# Import libraries
import pandas as pd
import IPython.display as ipd
from matplotlib import pyplot as plt
import librosa
import librosa.display
import os
import shutil
import random
import numpy as np
from tqdm.notebook import tqdm

In [None]:
# Set seed
# One constant drives every RNG that is already imported at this point, so the
# stochastic steps further down draw the same numbers on every fresh run.
SEED = 2022
random.seed(SEED)     # Python's built-in RNG (imported above, previously left unseeded)
np.random.seed(SEED)  # NumPy's legacy global RNG

In [None]:
# Load files
# Both tables are read from the local data/ folder.
train_csv, test_csv = 'data/Train.csv', 'data/Test.csv'
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

In [None]:
# Going from a sound to an image: spectrograms

# Function to generate spectrogram
def gen_spectrogram(path, out_dir='data/spectrograms_filter'):
    """Render the dB-scaled STFT spectrogram of one audio file to a PNG.

    The image is drawn on an axis-free, frameless 8x8 inch figure and saved at
    64 dpi, i.e. 512x512 pixels, with a log frequency axis.

    Parameters
    ----------
    path : str
        Path of the audio file to load with ``librosa.load`` (default
        sample-rate handling, since no ``sr`` is passed).
    out_dir : str, optional
        Folder that receives the PNG. It is created if it does not exist.

    Returns
    -------
    str
        Path of the PNG that was written: ``<out_dir>/<audio file stem>.png``.

    Notes
    -----
    The figure is left open on purpose so a notebook cell can still display
    it; callers that render many files should close it themselves.
    """
    x, sr = librosa.load(path)
    # Disabled alternative from the original: crop to +-1 s around the loudest sample
    # ind_max = x.argmax()
    # x_2sec = x[ind_max-sr:ind_max+sr]
    X = librosa.stft(x)  # librosa.stft(x_2sec)
    Xdb = librosa.amplitude_to_db(np.abs(X), ref=np.max)

    # A single axes covering the whole canvas: no frame, ticks or margins end up in the image.
    fig = plt.figure(frameon=False)
    fig.set_size_inches(8, 8)
    ax = plt.Axes(fig, [0., 0., 1., 1.])
    ax.set_axis_off()
    fig.add_axes(ax)
    # plt.title('Swahili Word - Nane')
    # Draw on the axes created above instead of relying on pyplot's implicit current axes.
    librosa.display.specshow(Xdb, y_axis='log', x_axis='time', sr=sr, ax=ax)

    # Derive the file stem from the basename so any directory depth and any
    # extension length work (previously: path.split('/')[2][:-4]).
    stem = os.path.splitext(os.path.basename(path))[0]
    os.makedirs(out_dir, exist_ok=True)  # a fresh checkout has no output folder yet
    spec_path = os.path.join(out_dir, stem + '.png')
    fig.savefig(spec_path, dpi=512//8)  # 8 in * 64 dpi = 512 px per side
    return spec_path

# Try the function on a single recording and play the clip it was generated from.
demo_wav = 'data/Swahili_words_filtered/id_pwvzavl2dl6q.wav'
gen_spectrogram(demo_wav)
display(ipd.Audio(demo_wav))


# show the three files of each word time-centered in frequency domain
# for word in dict_samples:
#     i=0
#     fig, ax = plt.subplots(nrows=1, ncols=3, sharey=True)
#     fig.set_size_inches(10, 5)
#     fig.suptitle(word)
#     for audiofile in dict_samples[word]:
#         x, sr = librosa.load('data/Swahili_words/'+audiofile)
#         # filtering +-1sec around the loudest part, centering the wavefiles to spoken word 
#         ind_max = x.argmax()
#         x_2sec = x[ind_max-sr:ind_max+sr]
#         X = librosa.amplitude_to_db(np.abs(librosa.stft(x_2sec)), ref=np.max)
#         img = librosa.display.specshow(X, y_axis='log', x_axis='time', sr=sr, ax=ax[i])
#         i+=1

In [None]:
# Concatenate train with test for easy data manipulation
train_test_files = train.Word_id.tolist() + test.Word_id.tolist()

for word_id in tqdm(train_test_files):
    # Check if we've already generated a spectrogram, and if not, make one.
    # splitext drops the extension whatever its length (previously word_id[:-4]).
    spec_path = 'data/spectrograms_filter/' + os.path.splitext(word_id)[0] + '.png'
    if os.path.isfile(spec_path):
        continue  # nothing was rendered, so there is nothing to clear or close

    gen_spectrogram('data/Swahili_words_filtered/' + word_id)
    # Drop whatever the rendering step printed for this file.
    ipd.clear_output(wait=True)
    # Release the figure right away. The former plt.clf() in front of the call
    # opened an extra blank figure whenever none existed, and a bare
    # plt.close() only closed the most recent one.
    plt.close('all')

In [None]:
# Checking that the spectrograms were generated successfully:
# the audio folder and the spectrogram folder should hold the same number of entries.
n_audio_files = len(os.listdir('data/Swahili_words_filtered'))
n_spectrograms = len(os.listdir('data/spectrograms_filter'))
n_audio_files == n_spectrograms, n_spectrograms

In [None]:
# Add spectrogram path to train set:
# keep everything before the first '.' of the audio file name and append '.png'
train['spec_name'] = train['Word_id'].map(lambda word_id: word_id.split('.')[0] + '.png')

# Preview train
train.head()

In [None]:
from fastai.vision.all import *

In [None]:
# Create the dataloaders
# `seed` pins the random train/validation split so the reported metrics are
# comparable between runs (np.random.seed alone does not control this split).
# Images are resized to 400 px per item, then augmented per batch down to 224 px
# with rotation and warping switched off.
# NOTE(review): aug_transforms still uses its default flip setting, which would
# mirror a spectrogram's time axis - confirm that this is intended.
dls = ImageDataLoaders.from_df(train, fn_col='spec_name', label_col='Swahili_word',
                               folder='data/spectrograms_filter', seed=2022,
                               item_tfms=Resize(400),
                               batch_tfms=aug_transforms(max_rotate=0, max_warp=0, size=224))
dls.show_batch() # Note the augmentation applied to the images!

In [None]:
# Build a ResNet-34 learner on the spectrogram dataloaders, tracking the error rate,
# and fine-tune it for 10 epochs.
learn = vision_learner(dls, arch=resnet34, metrics=error_rate)
learn.fine_tune(epochs=10)

In [None]:
# Inspect the trained learner: show the 9 samples with the highest loss.
interp = ClassificationInterpretation.from_learner(learn)
interp.plot_top_losses(k=9, figsize=(15, 10))

In [None]:
# Confusion matrix of the interpretation built above, drawn on a 6x6 inch figure.
interp.plot_confusion_matrix(figsize=(6,6))

In [None]:
# Reload the test table and point each row at its spectrogram image.
test = pd.read_csv('data/Test.csv')
test['spec_name'] = test['Word_id'].map(lambda word_id: word_id.split('.')[0] + '.png')

# Build a test dataloader from the training pipeline and predict class probabilities.
test_dl = dls.test_dl(test)
preds, _ = learn.get_preds(dl=test_dl)
preds.shape

In [None]:
# The labels in the order used by the model (alphabetical).
# The submission cell below maps column i of `preds` to the i-th entry of this vocab.
learn.dls.vocab

In [None]:
# One probability column per label: column i of `preds` belongs to the i-th vocab entry.
prob_columns = {
    label: preds[:, col_idx].numpy()
    for col_idx, label in enumerate(learn.dls.vocab)
}
# Word ids first, then the label columns in vocab order.
submission = pd.DataFrame({'Word_id': test['Word_id'], **prob_columns})
submission.head()

In [None]:
# Write the submission table without the DataFrame index column.
submission_path = 'data/submission_filter2.csv'
submission.to_csv(submission_path, index=False)