In [1]:
!pip install jiwer
!pip install tensorflow
!pip install keras



In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import keras 
from keras import layers
import matplotlib.pyplot as plt
from IPython import display
from jiwer import wer
from tensorflow.python.keras import activations

In [2]:
# LJSpeech dataset download page: https://keithito.com/LJ-Speech-Dataset/
# Both locations are relative to the notebook's working directory.
wavs_path = "dataset/wavs/"                # directory holding the .wav clips (trailing slash required)
metadata_path = "dataset/metadata.csv"     # pipe-delimited transcription index

In [3]:
# The metadata file has no header row and separates fields with "|".
# quoting=3 is csv.QUOTE_NONE: quote characters inside the transcriptions
# are treated as ordinary text instead of field delimiters.
metadata_df = pd.read_csv(
    metadata_path,
    delimiter="|",
    header=None,
    quoting=3,
)
metadata_df

Unnamed: 0,0,1,2
0,LJ001-0001,"Printing, in the only sense with which we are ...","Printing, in the only sense with which we are ..."
1,LJ001-0002,in being comparatively modern.,in being comparatively modern.
2,LJ001-0003,For although the Chinese took impressions from...,For although the Chinese took impressions from...
3,LJ001-0004,"produced the block books, which were the immed...","produced the block books, which were the immed..."
4,LJ001-0005,the invention of movable metal letters in the ...,the invention of movable metal letters in the ...
...,...,...,...
13095,LJ050-0274,made certain recommendations which it believes...,made certain recommendations which it believes...
13096,LJ050-0275,materially improve upon the procedures in effe...,materially improve upon the procedures in effe...
13097,LJ050-0276,"As has been pointed out, the Commission has no...","As has been pointed out, the Commission has no..."
13098,LJ050-0277,with the active cooperation of the responsible...,with the active cooperation of the responsible...


In [4]:
# Show the last five rows to check that the end of the file was parsed.
metadata_df.tail(n=5)

Unnamed: 0,0,1,2
13095,LJ050-0274,made certain recommendations which it believes...,made certain recommendations which it believes...
13096,LJ050-0275,materially improve upon the procedures in effe...,materially improve upon the procedures in effe...
13097,LJ050-0276,"As has been pointed out, the Commission has no...","As has been pointed out, the Commission has no..."
13098,LJ050-0277,with the active cooperation of the responsible...,with the active cooperation of the responsible...
13099,LJ050-0278,the recommendations we have here suggested wou...,the recommendations we have here suggested wou...


In [5]:
# Fixed seed so the shuffle, and therefore the train/validation split that
# is sliced from it, is the same on every fresh run.
SHUFFLE_SEED = 42

# read_csv(header=None) labels the columns 0, 1 and 2. Renaming through a
# mapping (instead of assigning to `.columns`) ignores labels that are no
# longer present, so re-running this cell on the already-reduced frame does
# not raise a length-mismatch error.
metadata_df = metadata_df.rename(
    columns={0: "file_name", 1: "transcription", 2: "normalized_transcription"}
)
# Only the file stem and the normalized transcription are used downstream.
metadata_df = metadata_df[["file_name", "normalized_transcription"]]
# frac=1 returns every row in random order; reset_index drops the old labels.
# NOTE: re-running this cell without reloading the CSV shuffles the already
# shuffled frame again, so reload first if the exact order matters.
metadata_df = metadata_df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
metadata_df

Unnamed: 0,file_name,normalized_transcription
0,LJ017-0244,The murders were perpetrated on the tenth Sept...
1,LJ026-0083,"Starch, however, contains potential energy, si..."
2,LJ018-0345,"As in that case, the grave had been dug long i..."
3,LJ004-0145,"At Guildford prison, which Mr. Buxton also vis..."
4,LJ018-0244,The effect of establishing the forgeries would...
...,...,...
13095,LJ044-0179,He apparently based his claim for a visa in tr...
13096,LJ031-0089,considerable time which at this juncture was n...
13097,LJ034-0119,Approximately seven or eight minutes later
13098,LJ021-0077,"There is no magic formula,"


Preprocessing

In [6]:
# Hold out the last 30% of the (already shuffled) rows for validation.
split = int(0.70 * len(metadata_df))
df_train, df_val = metadata_df.iloc[:split], metadata_df.iloc[split:]
print("Size of the training dataset :- ", len(df_train))
print("Size of validation dataset :- ", len(df_val))

Size of the training dataset :-  9170
Size of validation dataset :-  3930


### Mapping each English character to an integer so transcriptions can be fed into the deep learning model

In [7]:
# Characters the model is allowed to emit: lowercase letters, apostrophe,
# question mark, exclamation mark and space.
characters = list("abcdefghijklmnopqrstuvwxyz'?! ")

# Character -> integer id. Anything outside `characters` maps to the
# out-of-vocabulary token "", which sits at index 0 of the vocabulary.
to_num = keras.layers.StringLookup(vocabulary=characters, oov_token="")

# Integer id -> character (inverse of `to_num`).
# The decoding and evaluation cells below call this lookup as `num_to_char`;
# previously only `nums_to_char` was defined, so they failed with NameError.
num_to_char = keras.layers.StringLookup(
    vocabulary=to_num.get_vocabulary(), oov_token="", invert=True
)
# Keep the original spelling available for any code that already uses it.
nums_to_char = num_to_char

print("The vocabulary is :- ", to_num.get_vocabulary())
print("The size is :- ", to_num.vocabulary_size())

The vocabulary is :-  ['', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', "'", '?', '!', ' ']
The size is :-  31


In [8]:
# Short-time Fourier transform settings shared by preprocessing and the model:
#   frame_length - window length in samples
#   frame_step   - hop between consecutive windows in samples
#   fft_length   - FFT size; the spectrogram has fft_length // 2 + 1 frequency bins
frame_length, frame_step, fft_length = 256, 160, 384

In [9]:
def encode_single_sample(wav_file, label):
    """Convert one (file stem, transcription) pair into model-ready tensors.

    Args:
        wav_file: File name without extension; the clip is read from
            ``wavs_path + wav_file + '.wav'``.
        label: Transcription text for that clip.

    Returns:
        A ``(spectrogram, label)`` tuple: the normalized magnitude
        spectrogram of the clip and the transcription encoded as integer
        ids by ``to_num``.
    """
    # ---- Audio ----
    raw_bytes = tf.io.read_file(wavs_path + wav_file + '.wav')
    waveform, _ = tf.audio.decode_wav(raw_bytes)
    # Drop the trailing channel axis (this only works for single-channel
    # audio) and make sure the samples are float32.
    waveform = tf.cast(tf.squeeze(waveform, axis=-1), tf.float32)
    stft = tf.signal.stft(
        waveform,
        frame_length=frame_length,
        frame_step=frame_step,
        fft_length=fft_length,
    )
    # Keep only the magnitude, then compress its range with a square root.
    spectrogram = tf.math.pow(tf.abs(stft), 0.5)
    # Standardize along axis 1; the epsilon avoids division by zero.
    mean = tf.math.reduce_mean(spectrogram, 1, keepdims=True)
    stddev = tf.math.reduce_std(spectrogram, 1, keepdims=True)
    spectrogram = (spectrogram - mean) / (stddev + 1e-10)

    # ---- Text ----
    # Lowercase, split into individual characters, then map to integer ids.
    chars = tf.strings.unicode_split(tf.strings.lower(label), input_encoding="UTF-8")
    return spectrogram, to_num(chars)

### Building batched `tf.data.Dataset` pipelines for the training and validation splits

In [10]:
batch_size = 32


def _to_batched_dataset(frame):
    """Build a padded, prefetched dataset of (spectrogram, label) batches from a metadata frame."""
    pairs = tf.data.Dataset.from_tensor_slices(
        (list(frame["file_name"]), list(frame["normalized_transcription"]))
    )
    # Decode audio and text in parallel; let tf.data pick the parallelism.
    encoded = pairs.map(encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE)
    # Clips and transcriptions vary in length, so each batch is padded to
    # the longest element it contains.
    batched = encoded.padded_batch(batch_size)
    return batched.prefetch(buffer_size=tf.data.AUTOTUNE)


train_dataset = _to_batched_dataset(df_train)
validation_dataset = _to_batched_dataset(df_val)

In [11]:
def CTCLoss(y_true, y_pred):
    """Compute the CTC loss for a batch.

    Args:
        y_true: Integer label ids; axis 0 is the batch, axis 1 the label positions.
        y_pred: Model output; axis 0 is the batch, axis 1 the time steps.

    Returns:
        The result of ``keras.backend.ctc_batch_cost`` for the batch.

    Note:
        Every sample is given the full padded length of ``y_pred`` and
        ``y_true`` as its input/label length; per-sample true lengths are
        not tracked.
    """
    num_samples = tf.cast(tf.shape(y_true)[0], dtype='int64')
    time_steps = tf.cast(tf.shape(y_pred)[1], dtype='int64')
    label_steps = tf.cast(tf.shape(y_true)[1], dtype='int64')

    # One row per sample, so each scalar length broadcasts to shape (batch, 1).
    per_sample = tf.ones(shape=(num_samples, 1), dtype='int64')
    input_length = time_steps * per_sample
    label_length = label_steps * per_sample

    return keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)

In [12]:
def build_model(input_dim, output_dim, rnn_layers = 5, rnn_units = 128):
    """Build and compile the "DeepSpeech_2" speech-to-text model.

    Architecture: two Conv2D/BatchNorm/ReLU blocks, a stack of bidirectional
    GRUs with dropout between them, a dense ReLU layer, and a softmax
    classifier.

    Args:
        input_dim: Size of the last axis of the input spectrogram.
        output_dim: Number of label classes; the classifier emits
            ``output_dim + 1`` probabilities per time step.
        rnn_layers: Number of bidirectional GRU layers.
        rnn_units: Units per GRU direction; the dense layer uses twice this.

    Returns:
        A ``keras.Model`` compiled with Adam (learning rate 1e-4) and ``CTCLoss``.
    """

    def conv_block(tensor, index, kernel_size, strides):
        # Conv2D (no bias, since batch norm follows) -> BatchNorm -> ReLU.
        tensor = layers.Conv2D(
            filters=32,
            kernel_size=kernel_size,
            strides=strides,
            padding="same",
            use_bias=False,
            name=f"conv_{index}",
        )(tensor)
        tensor = layers.BatchNormalization(name=f"conv_{index}_bn")(tensor)
        return layers.ReLU(name=f"conv_{index}_relu")(tensor)

    input_spectrogram = layers.Input((None, input_dim), name="input")
    # Add a trailing axis of size 1 so the 2D convolutions can be applied.
    x = layers.Reshape((-1, input_dim, 1), name="expand_dim")(input_spectrogram)

    # Convolutional front end.
    x = conv_block(x, 1, kernel_size=[11, 41], strides=[2, 2])
    x = conv_block(x, 2, kernel_size=[11, 21], strides=[1, 2])
    # Merge the last two axes so the recurrent layers get one vector per step.
    x = layers.Reshape((-1, x.shape[-2] * x.shape[-1]))(x)

    # Recurrent stack; dropout goes between GRU layers but not after the last.
    for i in range(1, rnn_layers + 1):
        gru = layers.GRU(
            units=rnn_units,
            activation="tanh",
            recurrent_activation='sigmoid',
            use_bias=True,
            return_sequences=True,
            reset_after=True,
            name=f"gru_{i}",
        )
        x = layers.Bidirectional(gru, name=f"bidirectional_{i}", merge_mode="concat")(x)
        if i < rnn_layers:
            x = layers.Dropout(rate=0.5)(x)

    # Dense head.
    x = layers.Dense(units=rnn_units * 2, name="dense_1")(x)
    x = layers.ReLU(name="dense_1_relu")(x)
    x = layers.Dropout(rate=0.5)(x)

    # Per-time-step class probabilities.
    output = layers.Dense(units=output_dim + 1, activation="softmax")(x)

    model = keras.Model(input_spectrogram, output, name="DeepSpeech_2")
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-4), loss=CTCLoss)
    return model

In [13]:
# The spectrogram has fft_length // 2 + 1 frequency bins, and the model
# predicts one class per vocabulary entry (build_model adds one more itself).
model = build_model(
    rnn_units=512,
    output_dim=to_num.vocabulary_size(),
    input_dim=fft_length // 2 + 1,
)
model.summary(line_length=110)

Model: "DeepSpeech_2"
______________________________________________________________________________________________________________
 Layer (type)                                    Output Shape                                Param #          
 input (InputLayer)                              [(None, None, 193)]                         0                
                                                                                                              
 expand_dim (Reshape)                            (None, None, 193, 1)                        0                
                                                                                                              
 conv_1 (Conv2D)                                 (None, None, 97, 32)                        14432            
                                                                                                              
 conv_1_bn (BatchNormalization)                  (None, None, 97, 32)                     

In [14]:
def decode_batch_predictions(pred):
    """Decode a batch of model outputs into text with greedy CTC decoding.

    Args:
        pred: Array of per-time-step class probabilities; axis 0 is the
            batch and axis 1 the time steps.

    Returns:
        A list with one decoded Python string per batch element.
    """
    # Every sample is decoded over the full (padded) number of time steps.
    input_len = np.ones(pred.shape[0]) * pred.shape[1]
    results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]
    # The inverse lookup is defined as `nums_to_char` in the vocabulary cell;
    # calling the undefined `num_to_char` raised NameError.
    return [
        tf.strings.reduce_join(nums_to_char(result)).numpy().decode("utf-8")
        for result in results
    ]

class CallbackEval(keras.callbacks.Callback):
    """Print the word error rate and two sample transcriptions after each epoch.

    Args:
        dataset: Iterable yielding ``(X, y)`` batches, where ``X`` is the
            model input and ``y`` the integer-encoded target labels.
    """

    def __init__(self, dataset):
        super().__init__()
        self.dataset = dataset

    def on_epoch_end(self, epoch: int, logs = None):
        """Transcribe every batch of ``self.dataset`` and report the results."""
        predictions = []
        targets = []
        for X, y in self.dataset:
            # Use the model Keras attached to this callback rather than a
            # global `model`, so the callback evaluates whatever is being fitted.
            batch_predictions = self.model.predict(X)
            predictions.extend(decode_batch_predictions(batch_predictions))
            # The inverse lookup is defined as `nums_to_char` in the vocabulary
            # cell; calling the undefined `num_to_char` raised NameError.
            targets.extend(
                tf.strings.reduce_join(nums_to_char(label)).numpy().decode("utf-8")
                for label in y
            )
        if not predictions:
            # Nothing to score or sample from (np.random.randint(0, 0) would raise).
            return
        wer_score = wer(targets, predictions)
        print("-" * 100)
        print(f"Word Error Rate : {wer_score:.4f}")
        print("-" * 100)
        # Show two randomly chosen target/prediction pairs.
        for i in np.random.randint(0, len(predictions), 2):
            print(f"Target      :{targets[i]}")
            print(f"Predictions :{predictions[i]}")
            print("-" * 100)

In [None]:
# Number of passes over the training set.
epochs = 1

# Reports WER and sample transcriptions on the validation set after each epoch.
validation_callback = CallbackEval(validation_dataset)

history = model.fit(
    train_dataset,
    epochs=epochs,
    validation_data=validation_dataset,
    callbacks=[validation_callback],
)

In [None]:
# Transcribe the whole validation set with the trained model.
predictions = []
targets = []
for X, y in validation_dataset:
    batch_predictions = model.predict(X)
    predictions.extend(decode_batch_predictions(batch_predictions))
    # The inverse lookup is defined as `nums_to_char` in the vocabulary cell;
    # calling the undefined `num_to_char` raised NameError.
    targets.extend(
        tf.strings.reduce_join(nums_to_char(label)).numpy().decode("utf-8")
        for label in y
    )

# Report the overall word error rate (it was computed but never shown).
wer_score = wer(targets, predictions)
print("-" * 100)
print(f"Word Error Rate : {wer_score:.4f}")
print("-" * 100)

# Show five randomly chosen target/prediction pairs.
for i in np.random.randint(0, len(predictions), 5):
    print(f"Target      :{targets[i]}")
    print(f"Predictions :{predictions[i]}")
    print("-" * 100)