In [3]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import librosa
import keras
from keras import layers
import tensorflow as tf

We will evaluate the quality of the model using Word Error Rate (WER). WER is obtained by adding up the substitutions, insertions, and deletions that occur in a sequence of recognized words and dividing that sum by the total number of words originally spoken. To compute the WER score you need the `jiwer` package, which you can install with `pip install jiwer`.

In [4]:
import jiwer
from jiwer import wer

## Dataset Setup

In [7]:
# Root folder of the LJSpeech-1.1 dataset, relative to the notebook's working directory.
data_path = 'LJSpeech-1.1'

# Keep the trailing slash: audio paths are later built by plain string
# concatenation (wavs + <file stem> + ".wav").
wavs = data_path + '/wavs/'
# Path to the dataset's metadata file (not read in the cells shown here).
metadata = data_path + '/metadata.csv'

## Preprocessing

In [2]:
# Output alphabet of the recogniser: lowercase letters, apostrophe and space.
characters = list("abcdefghijklmnopqrstuvwxyz' ")
print(f"Number of characters: {len(characters)}")
print(f"Characters: {characters}")

Number of characters: 28
Characters: ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', "'", ' ']


In [5]:
# Character -> integer id. With oov_token="", index 0 is the empty-string OOV
# entry (see the printed vocabulary below), so real characters start at 1.
char_to_num = keras.layers.StringLookup(
    vocabulary=characters, oov_token=""
)

# Integer id -> character. Built from the forward layer's vocabulary so both
# directions share exactly the same indexing.
num_to_char = keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)

In [6]:
# Show the lookup vocabulary; it has one more entry than `characters`
# because the OOV token "" is placed at index 0.
print(
    f"The vocabulary is: {char_to_num.get_vocabulary()} "
    f"(size ={char_to_num.vocabulary_size()})"
)

The vocabulary is: ['', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', "'", ' '] (size =29)


In [30]:
# Spot check: "b" maps to id 2 ("a" is 1, and 0 is the OOV entry).
char_to_num(["b"])

<tf.Tensor: shape=(1,), dtype=int64, numpy=array([2])>

In [7]:
# STFT window length, in samples (passed to tf.signal.stft as frame_length).
frame_length = 256
# Hop between consecutive STFT windows, in samples.
frame_step = 160
# FFT size applied to each window. The model's input_dim is derived from it
# as fft_length // 2 + 1.
fft_length = 384

In [9]:
def encode_single_sample(wav_file, label):
    """Turn one (file stem, transcript) pair into (spectrogram, label ids).

    ``wav_file`` is a file name without directory or ".wav" extension; it is
    resolved against the module-level ``wavs`` folder. ``label`` is the
    transcript string. STFT settings come from the module-level
    ``frame_length``, ``frame_step`` and ``fft_length``.

    Returns a 2-tuple ``(spectrogram, label_ids)``.
    """
    # --- Audio -> normalised magnitude spectrogram ---
    raw_bytes = tf.io.read_file(wavs + wav_file + ".wav")
    waveform, _ = tf.audio.decode_wav(raw_bytes)
    # Drop the last (channel) axis, then make sure we work in float32.
    waveform = tf.cast(tf.squeeze(waveform, axis=-1), tf.float32)
    stft = tf.signal.stft(
        waveform,
        frame_length=frame_length,
        frame_step=frame_step,
        fft_length=fft_length,
    )
    # Keep only the magnitude and compress it with a square root.
    features = tf.math.pow(tf.abs(stft), 0.5)
    # Standardise along axis 1; the epsilon guards against division by zero.
    mu = tf.math.reduce_mean(features, 1, keepdims=True)
    sigma = tf.math.reduce_std(features, 1, keepdims=True)
    features = (features - mu) / (sigma + 1e-10)

    # --- Transcript -> integer ids ---
    chars = tf.strings.unicode_split(
        tf.strings.lower(label), input_encoding="UTF-8"
    )
    token_ids = char_to_num(chars)
    return features, token_ids

In [33]:
batch_size = 32

# NOTE(review): df_train / df_val are not defined in the visible part of this
# notebook (presumably a train/validation split of the metadata file); they
# must exist before this cell is run.

# Define the training dataset
train_dataset = tf.data.Dataset.from_tensor_slices(
    (df_train["file_name"], df_train["normalized_transcription"])
)

# Order matters: cache the decoded samples first and shuffle afterwards.
# Shuffling before cache() would store the first epoch's order in the cache and
# replay that same order on every later epoch.
train_dataset = (
    train_dataset.map(encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE)
    .cache()  # Cache the decoded samples (unshuffled)
    .shuffle(buffer_size=len(df_train))  # Re-shuffled on every epoch
    .padded_batch(batch_size)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

# Define the validation dataset (no shuffling needed for evaluation)
validation_dataset = tf.data.Dataset.from_tensor_slices(
    (df_val["file_name"], df_val["normalized_transcription"])
)
validation_dataset = (
    validation_dataset.map(encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE)
    .cache()  # Cache the dataset after mapping if possible
    .padded_batch(batch_size)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

+ Displaying sample data

In [34]:
from IPython import display

In [10]:
# Show one training sample: its spectrogram (titled with the transcript),
# a waveform plot and an audio player.
fig = plt.figure(figsize=(8, 5))
for batch in train_dataset.take(1):
    spectrogram = batch[0][0].numpy()
    # Transpose so frequency runs along the rows, and strip the zero padding
    # added by padded_batch.
    spectrogram = np.array([np.trim_zeros(x) for x in np.transpose(spectrogram)])
    label = batch[1][0]
    # Spectrogram
    label = tf.strings.reduce_join(num_to_char(label)).numpy().decode("utf-8")
    ax = plt.subplot(2, 1, 1)
    ax.imshow(spectrogram, vmax=1)
    ax.set_title(label)
    ax.axis("off")
    # Wav
    # NOTE(review): this loads the first file listed in df_train, whereas the
    # spectrogram above comes from a shuffled batch, so the two panels are not
    # guaranteed to show the same utterance.
    file = tf.io.read_file(wavs + list(df_train["file_name"])[0] + ".wav")
    audio, sample_rate = tf.audio.decode_wav(file)
    audio = audio.numpy()
    ax = plt.subplot(2, 1, 2)
    ax.plot(audio)
    ax.set_title("Signal Wave")
    ax.set_xlim(0, len(audio))
    # Play back at the rate stored in the file instead of a hard-coded 16000,
    # which would distort speed and pitch for files recorded at another rate.
    display.display(
        display.Audio(np.transpose(audio), rate=int(sample_rate.numpy()))
    )
plt.show()

NameError: name 'train_dataset' is not defined

<Figure size 800x500 with 0 Axes>

# Model Code and Loss functions

+ What is DeepSpeech ?
- + DeepSpeech is an open-source speech recognition system released by Mozilla in 2017 and based on the algorithm of the same name published by Baidu.

## Why have we used CTC Loss Function
+  CTC is better suited to transcribing whole sentences (e.g. "Please close the door.") than to classifying single word commands (e.g. "close"): it learns the structure of a sentence (spaces, end of sentence) efficiently and, because it predicts character by character, it can also recognize words that never appeared in the training set.

+ Example:
- + A classical classification model has 5 classes: 4 are commands ("open", "close", "enter", "exit") and the fifth is a garbage class that absorbs everything that is not a command.
- + A CTC-loss model "translates" every word letter by letter, and we then check whether the result is a command.


In [8]:
def CTCLoss(y_true, y_pred):
    """CTC loss for a padded batch.

    Every sample is assigned the full (padded) sequence lengths: the number of
    prediction steps is ``y_pred.shape[1]`` and the label length is
    ``y_true.shape[1]``. Returns the result of
    ``tf.keras.backend.ctc_batch_cost``.
    """
    n_samples = tf.cast(tf.shape(y_true)[0], dtype="int64")
    # Column of ones used to broadcast a scalar length to shape (batch, 1).
    per_sample = tf.ones(shape=(n_samples, 1), dtype="int64")

    pred_steps = tf.cast(tf.shape(y_pred)[1], dtype="int64") * per_sample
    label_steps = tf.cast(tf.shape(y_true)[1], dtype="int64") * per_sample

    return tf.keras.backend.ctc_batch_cost(y_true, y_pred, pred_steps, label_steps)

In [9]:
def build_model(input_dim, output_dim, rnn_layers=5, rnn_units=128):
    """Model similar to DeepSpeech2.

    Architecture: two Conv2D + BatchNorm + ReLU blocks, then ``rnn_layers``
    bidirectional GRU layers (dropout between them), a Dense + ReLU + Dropout
    block and a softmax classification layer.

    Args:
        input_dim: Size of the last axis of each input frame (feature count).
        output_dim: Vocabulary size; the output layer has ``output_dim + 1``
            units (the extra unit is presumably the CTC blank class).
        rnn_layers: Number of bidirectional GRU layers.
        rnn_units: Units per GRU direction; the dense layer uses twice as many.

    Returns:
        A compiled ``keras.Model`` (Adam, learning rate 1e-4, ``CTCLoss``).
    """
    # Model's input
    input_spectrogram = layers.Input((None, input_dim), name="input")
    # Expand the dimension to use 2D CNN.
    x = layers.Reshape((-1, input_dim, 1), name="expand_dim")(input_spectrogram)
    # Convolution layer 1 (stride 2 on both axes, so the sequence axis is halved)
    x = layers.Conv2D(
        filters=32,
        kernel_size=[11, 41],
        strides=[2, 2],
        padding="same",
        use_bias=False,
        name="conv_1",
    )(x)
    x = layers.BatchNormalization(name="conv_1_bn")(x)
    x = layers.ReLU(name="conv_1_relu")(x)
    # Convolution layer 2 (stride 1 on the sequence axis, 2 on the feature axis)
    x = layers.Conv2D(
        filters=32,
        kernel_size=[11, 21],
        strides=[1, 2],
        padding="same",
        use_bias=False,
        name="conv_2",
    )(x)
    x = layers.BatchNormalization(name="conv_2_bn")(x)
    x = layers.ReLU(name="conv_2_relu")(x)
    # Reshape the resulted volume to feed the RNNs layers:
    # merge the feature and channel axes into one vector per step.
    x = layers.Reshape((-1, x.shape[-2] * x.shape[-1]))(x)
    # RNN layers
    for i in range(1, rnn_layers + 1):
        recurrent = layers.GRU(
            units=rnn_units,
            activation="tanh",
            recurrent_activation="sigmoid",
            use_bias=True,
            return_sequences=True,
            reset_after=True,
            name=f"gru_{i}",
        )
        x = layers.Bidirectional(
            recurrent, name=f"bidirectional_{i}", merge_mode="concat"
        )(x)
        # Dropout between recurrent layers, but not after the last one.
        if i < rnn_layers:
            x = layers.Dropout(rate=0.5)(x)
    # Dense layer
    x = layers.Dense(units=rnn_units * 2, name="dense_1")(x)
    x = layers.ReLU(name="dense_1_relu")(x)
    x = layers.Dropout(rate=0.5)(x)
    # Classification layer: one unit per vocabulary entry plus one extra class.
    output = layers.Dense(units=output_dim + 1, activation="softmax")(x)
    # Model
    model = keras.Model(input_spectrogram, output, name="Custom")
    # Optimizer
    opt = keras.optimizers.Adam(learning_rate=1e-4)
    # Compile the model and return
    model.compile(optimizer=opt, loss=CTCLoss)
    return model


In [10]:
# Build the network: the input width is fft_length // 2 + 1 to match the
# spectrograms, and there is one output class per vocabulary entry
# (build_model adds one more).
model = build_model(
    input_dim=fft_length // 2 + 1,
    output_dim=char_to_num.vocabulary_size(),
    rnn_units=512,
)

model.summary()

In [11]:
# Render the architecture diagram to model.png. This needs graphviz installed;
# without it the call only prints the installation hint shown below.
from tensorflow.keras.utils import plot_model
plot_model(model, to_file="model.png", show_shapes=True, show_layer_names=True)

You must install graphviz (see instructions at https://graphviz.gitlab.io/download/) for `plot_model` to work.


# Training

In [39]:
# A utility function to decode the output of the network
def decode_batch_predictions(pred):
    """Greedy-CTC-decode a batch of network outputs into a list of strings.

    Every sequence in ``pred`` is treated as having the full length
    ``pred.shape[1]``. Decoded ids are mapped back to text with ``num_to_char``.
    """
    seq_lengths = np.ones(pred.shape[0]) * pred.shape[1]
    # Use greedy search. For complex tasks, you can use beam search
    decoded = tf.keras.backend.ctc_decode(
        pred, input_length=seq_lengths, greedy=True
    )
    sequences = decoded[0][0]
    # Join the characters of each decoded sequence into one Python string.
    return [
        tf.strings.reduce_join(num_to_char(seq)).numpy().decode("utf-8")
        for seq in sequences
    ]


# A callback class to output a few transcriptions during training
class CallbackEval(keras.callbacks.Callback):
    """Prints the word error rate and two random transcriptions after every epoch.

    Args:
        dataset: Iterable of ``(inputs, labels)`` batches to evaluate on.
    """

    def __init__(self, dataset):
        super().__init__()
        self.dataset = dataset

    def on_epoch_end(self, epoch: int, logs=None):
        """Transcribe ``self.dataset`` with the model being trained and report WER."""
        predictions = []
        targets = []
        for X, y in self.dataset:
            # Use the model this callback is attached to (set by Keras), not
            # whatever the notebook-global `model` happens to point at.
            batch_predictions = self.model.predict(X)
            batch_predictions = decode_batch_predictions(batch_predictions)
            predictions.extend(batch_predictions)
            for label in y:
                label = (
                    tf.strings.reduce_join(num_to_char(label)).numpy().decode("utf-8")
                )
                targets.append(label)
        # Nothing to score or sample from: avoid failing in wer()/randint().
        if not predictions:
            print("CallbackEval: evaluation dataset is empty, skipping.")
            return
        wer_score = wer(targets, predictions)
        print("-" * 100)
        print(f"Word Error Rate: {wer_score:.4f}")
        print("-" * 100)
        # Show two randomly chosen target/prediction pairs.
        for i in np.random.randint(0, len(predictions), 2):
            print(f"Target    : {targets[i]}")
            print(f"Prediction: {predictions[i]}")
            print("-" * 100)


# Loading the trained model and testing it with real .wav files

In [40]:
from keras.models import load_model

In [41]:
# The model was compiled with the custom CTCLoss function, so it has to be
# supplied through custom_objects for deserialisation.
model = load_model('models/deepspeechv2_custom.keras', custom_objects={'CTCLoss': CTCLoss})

In [42]:
def ctc_decode(pred):
    """Greedy-CTC-decode a batch of model outputs into a list of strings.

    Args:
        pred: Array of per-step class scores, indexed (batch, step, class).
            Every sequence is treated as having the full length
            ``pred.shape[1]``.

    Returns:
        One decoded string per batch item.
    """
    input_len = np.ones(pred.shape[0]) * pred.shape[1]
    results = tf.keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]

    # Id -> character table taken from the lookup layer; entry 0 is the
    # empty-string OOV token. Using the table directly avoids one layer call
    # per character.
    vocab = num_to_char.get_vocabulary()
    vocab_size = len(vocab)

    decoded_text = []
    for result in results.numpy():
        # Keep real character ids only. This drops id 0 (OOV / label padding),
        # negative padding values in the decoded output, and any id at or
        # beyond the vocabulary size (e.g. the extra blank class) without
        # hard-coding its index.
        kept = [vocab[idx] for idx in result if 0 < idx < vocab_size]
        decoded_text.append("".join(kept))

    return decoded_text


In [43]:
def preprocess_wav(file_path):
    """Load a WAV file and convert it into a batched, normalised spectrogram.

    Mirrors the audio half of ``encode_single_sample`` and uses the same
    module-level STFT settings (``frame_length``, ``frame_step``,
    ``fft_length``), so inference features cannot drift from training
    features if those settings are changed.

    Args:
        file_path: Path to the .wav file.

    Returns:
        The spectrogram with a leading batch axis of size 1.
    """
    file = tf.io.read_file(file_path)
    audio, _ = tf.audio.decode_wav(file)
    audio = tf.squeeze(audio, axis=-1)  # Remove channel dimension
    audio = tf.cast(audio, tf.float32)

    # Compute STFT-based spectrogram with the shared training settings
    spectrogram = tf.signal.stft(
        audio, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length
    )
    # Magnitude only, compressed with a square root
    spectrogram = tf.abs(spectrogram)
    spectrogram = tf.math.pow(spectrogram, 0.5)

    # Normalize the spectrogram along axis 1; epsilon avoids division by zero
    means = tf.math.reduce_mean(spectrogram, 1, keepdims=True)
    stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True)
    spectrogram = (spectrogram - means) / (stddevs + 1e-10)

    # Expand dims to mimic batch shape
    spectrogram = tf.expand_dims(spectrogram, axis=0)  # Shape: (1, time_steps, features)

    return spectrogram

def predict_wav(file_path, model):
    """Transcribe a single WAV file with ``model``.

    Returns the list produced by ``ctc_decode`` (one string per batch item;
    the batch built here contains just this one file).
    """
    # File -> batched spectrogram -> network output -> decoded text.
    batch = preprocess_wav(file_path)
    raw_output = model.predict(batch)
    return ctc_decode(raw_output)


In [44]:
# Example usage: transcribe one file with the loaded model.
# predict_wav returns a list of strings, as the printed output below shows.
file_path = "audios/LJ050-0278.wav"  # Path to your WAV file
predicted_text = predict_wav(file_path, model)
print("Predicted Transcription:", predicted_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
Predicted Transcription: ['the recommendations we have here suggested would greatly advance the security of the office without any impairment of our fundamental libertis']


# Finetuning on Svarah

In [2]:
import pandas as pd

In [3]:
# Load a single Svarah parquet file to inspect its columns.
svarah = pd.read_parquet("Svarah/data/0001.parquet")
svarah.head(3)

Unnamed: 0,audio_filepath,duration,text,gender,age-group,primary_language,native_place_state,native_place_district,highest_qualification,job_category,occupation_domain
0,{'bytes': b'RIFF|\x9e\x04\x00WAVEfmt \x10\x00\...,9.45875,"some in the starting, then again I poured and ...",Female,45-60,Maithili,Bihar,Darbhanga,Graduate,Full Time,Education and Research
1,{'bytes': b'RIFF\x02\xae\x02\x00WAVEfmt \x10\x...,5.486937,"North 24 Parganas, South 24 Parganas, Murshida...",Female,18-30,Bengali,West Bengal,South 24 Parganas,Post Graduate,Full Time,Information and Media
2,{'bytes': b'RIFF\xd6\x96\x01\x00WAVEfmt \x10\x...,3.253563,Breast cancers can be classified by different ...,Female,30-45,Konkani,Goa,North Goa,Post Graduate,Full Time,Education and Research


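- As the `RIFF…WAVE` header in the `bytes` field shows, every `audio_filepath` entry holds a complete .wav file stored as raw bytes inside the parquet column (it is a dict, not JSON text). This tells us how to decode it: the bytes can be written to disk as-is or passed directly to a WAV decoder.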

In [4]:
# Narrow the frame to the embedded audio, its transcript and the speaker's
# primary language.
svarah = svarah[["audio_filepath", "text", "primary_language"]]
svarah.head(3)

Unnamed: 0,audio_filepath,text,primary_language
0,{'bytes': b'RIFF|\x9e\x04\x00WAVEfmt \x10\x00\...,"some in the starting, then again I poured and ...",Maithili
1,{'bytes': b'RIFF\x02\xae\x02\x00WAVEfmt \x10\x...,"North 24 Parganas, South 24 Parganas, Murshida...",Bengali
2,{'bytes': b'RIFF\xd6\x96\x01\x00WAVEfmt \x10\x...,Breast cancers can be classified by different ...,Konkani


In [4]:
import json

In [11]:
# Write the first two clips to disk to check that the embedded bytes are
# usable audio files. head(2) stops after two rows instead of iterating over
# the whole frame and discarding the rest (same rows as `index < 2` on the
# default 0..n-1 index).
for audio_data in svarah["audio_filepath"].head(2):
    audio_bytes = audio_data["bytes"]
    # The file name stored with the clip; written relative to the working directory.
    file_name = audio_data["path"]

    with open(file_name, "wb") as f:
        f.write(audio_bytes)

# once verified that file extraction is working well
- We will now
+ + Confirm audio Hz
+ + Confirm files are clear
+ + Combine all parquet files into one for easy dataset manipulation

In [8]:
import glob
import pandas as pd
import os

def combine_parquet_files(directory='./', recursive=True):
    """Read every ``*.parquet`` file under ``directory`` into one DataFrame.

    Args:
        directory: Folder to search; a trailing slash is optional.
        recursive: Passed to ``glob``; when true, ``**`` matches nested
            sub-directories as well.

    Returns:
        The row-wise concatenation (fresh 0..n-1 index) of all readable files,
        taken in sorted path order, or an empty DataFrame if none were read.
        Files that fail to load are reported and skipped.
    """
    # os.path.join keeps the pattern valid whether or not `directory` ends
    # with a separator.
    pattern = os.path.join(directory, '**', '*.parquet')

    # Collect the frames and concatenate once: concatenating inside the loop
    # re-copies all previously read rows on every iteration.
    frames = []
    # Sorted so the row order of the result is reproducible across runs.
    for filename in sorted(glob.iglob(pathname=pattern, recursive=recursive)):
        if os.path.isfile(filename):
            try:
                frames.append(pd.read_parquet(filename))
            except Exception as e:
                # Best effort: report the unreadable file and keep going.
                print(f'Skipping {filename} due to error: {e}')
                continue
        else:
            print(f'Not a file: {filename}')

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

# Folder holding the Svarah parquet files.
directory = './Svarah/data/'

# Merge every parquet file found under that folder into one frame.
df = combine_parquet_files(directory=directory, recursive=True)

# Written to the working directory, i.e. outside `directory`, so a re-run of
# this cell does not pick the combined file up as an input.
df.to_parquet('./combined.parquet', index=False)


- Verify new generated parquet file

In [5]:
# Read the combined file back and keep the same three columns as before
# to confirm that it round-trips.
final_df = pd.read_parquet('./combined.parquet')
final_df = final_df[["audio_filepath", "text", "primary_language"]]
final_df.head(3)

Unnamed: 0,audio_filepath,text,primary_language
0,{'bytes': b'RIFF|\x9e\x04\x00WAVEfmt \x10\x00\...,"some in the starting, then again I poured and ...",Maithili
1,{'bytes': b'RIFF\x02\xae\x02\x00WAVEfmt \x10\x...,"North 24 Parganas, South 24 Parganas, Murshida...",Bengali
2,{'bytes': b'RIFF\xd6\x96\x01\x00WAVEfmt \x10\x...,Breast cancers can be classified by different ...,Konkani


# Post Validation
- After verifying that the wav files are sampled at 16 kHz, we can now create a folder laid out like the LJSpeech dataset so that the training code above can be reused with minimal changes.