In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import dill as pickle
import os
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, LSTM, Dense

%matplotlib inline
%load_ext autoreload
%autoreload 2

### Download and Preprocess Audio Files in Surah Fatihah

This will take quite a bit of time, but the good news is that you only need to do it once! After you've done this once, the files will be saved locally and you can skip the cells in this section.

In [None]:
# Runs the project's download script inside this notebook's namespace (-i).
# NOTE(review): "-s 1" presumably restricts the download to surah 1 (Al-Fatihah) — confirm against download.py.
%run -i "../download.py" -s 1

In [None]:
# Runs the feature-generation script inside this notebook's namespace (-i).
# NOTE(review): presumably reads audio from ../.audio and writes MFCC features for surah 1 under ../.outputs — confirm against generate_features.py.
%run -i "../audio_preprocessing/generate_features.py" -f mfcc -s 1 --local_download_dir "../.audio" --output_dir "../.outputs" 

In [None]:
# Runs the one-hot-encoding script inside this notebook's namespace (-i).
# NOTE(review): presumably reads ../data/data-uthmani.json (-i) and writes ../data/one-hot.pkl (-o) — confirm against generate_one_hot_encoding.py.
%run -i "../audio_preprocessing/generate_one_hot_encoding.py" -i "../data/data-uthmani.json" -o "../data/one-hot.pkl" 

### Write Methods to Load the Dataset

In [None]:
"""
Inspired by: https://github.com/ruohoruotsi/LSTM-Music-Genre-Classification/blob/master/lstm_genre_classifier_keras.py
"""

def convert_list_of_arrays_to_padded_array(list_varying_sizes, pad_value=0):
    '''
    Stacks arrays of varying sizes into a single padded numpy array.

    Each input array is copied into the leading corner of its own slot along a new
    first axis; every position not covered by an input array holds ``pad_value``.
    Works for arrays of any rank (1D, 2D, 3D, ...), as long as all inputs share the
    same number of dimensions.

    :param list_varying_sizes: non-empty sequence of arrays with the same number of dimensions
    :param pad_value: value used for the positions that an input array does not cover (default 0)
    :return: float64 array of shape ``(len(list_varying_sizes), *max_shape)``, where
        ``max_shape`` is the per-axis maximum over all input shapes
    :raises ValueError: if the sequence is empty or the arrays differ in number of dimensions
    '''
    if len(list_varying_sizes) == 0:
        raise ValueError('Cannot pad an empty list of arrays.')

    arrays = [np.asarray(arr) for arr in list_varying_sizes]
    n_dims = arrays[0].ndim
    if any(arr.ndim != n_dims for arr in arrays):
        raise ValueError('All arrays must have the same number of dimensions.')

    # first pass to compute the max size along every axis
    max_shape = [max(sizes) for sizes in zip(*(arr.shape for arr in arrays))]
    # float64 matches the dtype the previous `pad_value * np.ones(...)` produced
    padded_array = np.full((len(arrays), *max_shape), pad_value, dtype=np.float64)

    # second pass to fill in the values in the array:
    for a, arr in enumerate(arrays):
        # one slice per axis selects the leading corner that `arr` occupies
        region = (a, *(slice(0, size) for size in arr.shape))
        padded_array[region] = arr

    return padded_array


def preprocess_encoder_input(arr):
    '''
    Reduces the raw MFCC array produced during preprocessing to the part used as encoder input.

    Two steps are applied:
    1. All length-1 (empty) dimensions are collapsed.
    2. Only the first entry along the leading remaining axis is kept
       (for now, this discards the other channel of the MFCC coefs).
    '''
    without_empty_dims = arr.squeeze()
    first_entry = without_empty_dims[0]
    return first_entry

    
# Load the CSV file with URLs and Gender information.
# The file handle is only needed for the read itself, so everything else happens after it is closed.
with open('../.cache/tarteel_v1.0.csv', 'rb') as tarteel_csv:
    tarteel_df = pd.read_csv(tarteel_csv)

# Keep only the two columns this notebook uses.
tarteel_df = tarteel_df[['URL to Recording', 'Gender']]
recording_urls = tarteel_df['URL to Recording']
# Pull the recording file name (digits and underscores) out of each URL; column 0 of the
# resulting frame holds the captured name, NaN where the URL does not match.
# Raw string: `\d` and `\.` are regex escapes, not Python string escapes.
recording_filenames_from_csv = recording_urls.str.extract(
    r'https://tarteel-data.s3.amazonaws.com/media/([_\d]+)\.wav.+')
    
    
def get_gender_of_recitation(recording_filename):
    '''
    Looks up the reciter's gender for a local recording file and returns it one-hot encoded.

    The last 10 characters of ``recording_filename`` are dropped and the remainder is matched
    as a prefix against the file names extracted from the CSV URLs
    (``recording_filenames_from_csv``). The 'Gender' value of the first matching row of
    ``tarteel_df`` determines the result.

    :param recording_filename: file name of a locally stored recording / coefficient file
    :return: ``np.array([1, 0])`` for 'male', ``np.array([0, 1])`` for 'female', and ``None``
        when the name is too short to yield a prefix, no CSV row matches, or the gender has
        any other value
    '''
    # NOTE(review): the 10 stripped characters are presumably a fixed-length suffix plus
    # extension added by the preprocessing scripts — confirm against generate_features.py.
    filename_prefix = recording_filename[:-10]
    if not filename_prefix:
        # An empty prefix is a prefix of every name and would silently return the
        # gender of the first CSV row for an unrelated recording.
        return None

    # na=False treats URLs whose file name could not be extracted as non-matching.
    matching_audio_file = recording_filenames_from_csv[0].str.startswith(filename_prefix, na=False)
    tarteel_df_out = tarteel_df[matching_audio_file]
    if tarteel_df_out.empty:
        return None

    gender = tarteel_df_out['Gender'].iloc[0]
    if gender == 'male':
        return np.array([1, 0])
    if gender == 'female':
        return np.array([0, 1])
    return None
    

def build_dataset(local_coefs_dir='../.outputs/mfcc', surahs=(1,), n=100):
    '''
    Builds a dataset of (coefficients, gender) pairs for the gender classifier.

    The directory layout read is ``<local_coefs_dir>/s<surah>/<ayah directory>/<recording file>``.
    Directories and files are visited in sorted order so that the subset selected by ``n``
    is the same on every run. Recordings whose gender cannot be determined are skipped
    without being loaded from disk.

    :param local_coefs_dir: a string with the path of the coefficients for prediction
    :param surahs: iterable of surah numbers whose recordings should be included
    :param n: collection stops as soon as this many labelled recordings have been gathered
    :return: tuple ``(encoder_input_data, gender_data)``: the padded array of preprocessed
        coefficients and the stacked one-hot gender labels, one row per recording
    :raises ValueError: if no recording with a known gender was found
    '''

    def list_top_level(directory):
        # Sub-directories and files directly inside `directory`, sorted; a missing
        # directory yields two empty lists instead of raising.
        _, directories, filenames = next(os.walk(directory), (directory, [], []))
        return sorted(directories), sorted(filenames)

    def get_encoder_and_decoder_data(n=100):
        encoder_input_data = []
        gender_data = []
        for surah_num in surahs:
            local_surah_dir = os.path.join(local_coefs_dir, "s" + str(surah_num))
            ayah_directories, _ = list_top_level(local_surah_dir)
            for ayah_directory in ayah_directories:
                local_ayah_dir = os.path.join(local_surah_dir, ayah_directory)
                _, recording_filenames = list_top_level(local_ayah_dir)
                for recording_filename in recording_filenames:
                    # Check the label first: files without a known gender are never used,
                    # so there is no point in reading them from disk.
                    gender = get_gender_of_recitation(recording_filename)
                    if gender is None:
                        continue

                    local_coefs_path = os.path.join(local_ayah_dir, recording_filename)
                    encoder_input = preprocess_encoder_input(np.load(local_coefs_path))

                    encoder_input_data.append(encoder_input)
                    gender_data.append(gender)
                    if len(encoder_input_data) == n:
                        return encoder_input_data, gender_data

        return encoder_input_data, gender_data

    encoder_input_data, gender_data = get_encoder_and_decoder_data(n=n)
    if not encoder_input_data:
        raise ValueError(
            'No recordings with a known gender were found under {!r} for surahs {!r}.'.format(
                local_coefs_dir, list(surahs)))

    encoder_input_data = convert_list_of_arrays_to_padded_array(encoder_input_data)
    gender_data = np.stack(gender_data)
    return encoder_input_data, gender_data

In [None]:
# Disabled scratch check: repeats the prefix lookup done in get_gender_of_recitation for the prefix '1_1'.
# matching_audio_file = recording_filenames_from_csv[0].str.startswith('1_1').fillna(False)
# tarteel_df_out = tarteel_df[matching_audio_file]

In [None]:
# Hyperparameters and dataset size used by the model and training cells below.
batch_size = 10  # Batch size for training.
epochs = 25  # Number of epochs to train for.
n_units_1 = 128  # number of units in the first LSTM layer
n_units_2 = 32 # number of units in the second LSTM layer
n = 100  # build_dataset stops once this many gender-labelled recordings are collected

# Reads the stored coefficients from disk and pairs each recording with its one-hot gender label.
encoder_input_data, gender_data = build_dataset(n=n)

In [None]:
# Show the shapes of the feature array and the label array.
print(encoder_input_data.shape)
print(gender_data.shape)

# Every recording must come with exactly one label.
assert encoder_input_data.shape[0] == gender_data.shape[0], "features and labels are misaligned"

# Per-recording input shape for the first LSTM layer: the two trailing axes of the padded array.
input_shape = encoder_input_data.shape[1], encoder_input_data.shape[2]
# Width of the one-hot gender vectors.
num_classes = gender_data.shape[1]

Even though we asked for a dataset of up to 100 recordings, only 30 of the available recordings had a gender label, so the dataset contains just 30 examples.

### Build a Keras Model for Training

In [None]:
# Two stacked LSTM layers followed by a softmax layer over the gender classes.
# The first LSTM returns the full sequence so the second LSTM can consume it;
# the second returns only its final output, which feeds the classifier.
model = Sequential([
    LSTM(
        units=n_units_1,
        dropout=0.05,
        recurrent_dropout=0.35,
        return_sequences=True,
        input_shape=input_shape,
    ),
    LSTM(
        units=n_units_2,
        dropout=0.05,
        recurrent_dropout=0.35,
        return_sequences=False,
    ),
    Dense(units=num_classes, activation='softmax'),
])

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train the classifier; the returned History object keeps the per-epoch metrics that are plotted below.
# NOTE(review): per the Keras docs, validation_split takes the validation samples from the END of
# the arrays before shuffling, so the held-out 20% depends on the order build_dataset produced — confirm this is intended.
history = model.fit(encoder_input_data, gender_data,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.2)

In [None]:
# Plot training and validation loss per epoch on labelled axes so the two curves
# can be told apart and the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(range(epochs), history.history['loss'], label='Training loss')
ax.plot(range(epochs), history.history['val_loss'], label='Validation loss')
ax.set(title='Loss per epoch', xlabel='Epoch', ylabel='Categorical cross-entropy loss')
ax.legend();

The drop in the loss curves suggests that the model is learning something. It seems to have started to overfit, likely because our training set is so small that it's picking up on irrelevant patterns.