In [2]:
import os
import pandas as pd
import keras
from keras import layers
import tensorflow as tf
import wave
import numpy as np
import matplotlib.pyplot as plt
import librosa
import jiwer
from jiwer import wer

In [3]:
from keras.models import load_model

In [4]:
characters = [x for x in "abcdefghijklmnopqrstuvwxyz' "]
char_to_num = keras.layers.StringLookup(
    vocabulary=characters, oov_token=""
)

num_to_char = keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)

In [5]:
def CTCLoss(y_true, y_pred):
    # Compute the training-time loss value
    batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
    input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
    label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")

    input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
    label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")

    loss = tf.keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)
    return loss

In [6]:
model = load_model("../models/deepspeechv2_finetuned_svarah.keras", custom_objects={"CTCLoss": CTCLoss})

  saveable.load_own_variables(weights_store.get(inner_path))


In [7]:
import re

In [8]:
def ctc_decode(pred):
    input_len = np.ones(pred.shape[0]) * pred.shape[1]
    results = tf.keras.backend.ctc_decode(pred, input_length=input_len, greedy=False, beam_width=20, top_paths=1)[0][0]
    
    decoded_text = []
    for result in results.numpy():
        filtered = [num_to_char(idx).numpy().decode("utf-8") for idx in result if idx not in [0, 29]]  
        decoded_text.append("".join(filtered))  
    
    return decoded_text

In [9]:
def preprocess_wav(file_path):
    """Loads a WAV file and converts it into a normalized log-mel spectrogram (same as training)."""
    file = tf.io.read_file(file_path)
    audio, _ = tf.audio.decode_wav(file)
    audio = tf.squeeze(audio, axis=-1)
    audio = tf.cast(audio, tf.float32)

    # STFT parameters
    frame_length = 256
    frame_step = 160
    fft_length = 384
    sample_rate = 16000

    # 1. STFT
    stft = tf.signal.stft(audio, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length)
    spectrogram = tf.abs(stft)

    # 2. Power Spectrogram
    power_spectrogram = tf.square(spectrogram)

    # 3. Mel Filterbank
    num_mel_bins = 80
    lower_edge_hertz, upper_edge_hertz = 80.0, 7600.0
    mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins, fft_length // 2 + 1, sample_rate, lower_edge_hertz, upper_edge_hertz
    )
    mel_spectrogram = tf.tensordot(power_spectrogram, mel_weight_matrix, 1)
    mel_spectrogram.set_shape(power_spectrogram.shape[:-1].concatenate([num_mel_bins]))

    # 4. Log-mel
    log_mel_spectrogram = tf.math.log(mel_spectrogram + 1e-6)

    # 5. Normalize
    means = tf.math.reduce_mean(log_mel_spectrogram, axis=0, keepdims=True)
    stddevs = tf.math.reduce_std(log_mel_spectrogram, axis=0, keepdims=True)
    normalized_log_mel = (log_mel_spectrogram - means) / (stddevs + 1e-10)

    # 6. Add batch dimension
    normalized_log_mel = tf.expand_dims(normalized_log_mel, axis=0)  # (1, time_steps, 80)

    return normalized_log_mel


def predict_wav(file_path, model):
    """Runs prediction on a single WAV file and decodes the output."""
    # Preprocess the WAV file
    spectrogram = preprocess_wav(file_path)

    # Run inference
    predictions = model.predict(spectrogram)

    # Decode output using CTC decoding
    decoded_text = ctc_decode(predictions)

    return decoded_text  # Return the first prediction


In [10]:
import soundfile as sf

In [33]:
def convert_to_wav(input_file, output_file, target_sr=16000):
    """Loads an audio file, resamples it to the target sampling rate, and saves it as a WAV file."""
    # Load the audio file with librosa
    audio, sr = librosa.load(input_file, sr=target_sr)

    # Save the audio as a WAV file
    sf.write(output_file, audio, sr)

    print(f"Converted {input_file} to {output_file} with sampling rate {target_sr} Hz.")

# Example usage
input_file = "../audios/fth.m4a" #demo files are imp.m4a and mumisthe.wav and fth.m4a
output_file = "../audios/output_audio.wav"
convert_to_wav(input_file, output_file)

Converted ../audios/fth.m4a to ../audios/output_audio.wav with sampling rate 16000 Hz.


  audio, sr = librosa.load(input_file, sr=target_sr)


In [34]:
# playing audio with IPython
from IPython.display import Audio
Audio(output_file)

In [35]:
file_path = "../audios/output_audio.wav" 
predicted_text = predict_wav(file_path, model)
print("Predicted Transcription:", predicted_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step   
Predicted Transcription: ['my farthe one stold me that in order to be the best version of yourself you must dive op a part of yourselfs']


In [36]:
#demo 1
from jiwer import wer
wer("wmy father once told me that in order to be the best version of yourself you must give up a part of yourself", predicted_text[0])

0.30434782608695654

In [15]:
# demo 2
wer("where are you from i am from india", predicted_text[0])

0.125

# Using a spelling corrector 
- look into custom language models
- Auxiallary loss for words to make model learn how to spell them more "holisically"

In [16]:
from symspellpy.symspellpy import SymSpell, Verbosity
import os

In [17]:
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

# Load dictionary
dictionary_path = "../correctors/frequency_dictionary_en_82_765.txt"  # txt from github
term_index = 0  # column with the word
count_index = 1  # column with frequency
sym_spell.load_dictionary(dictionary_path, term_index, count_index)

True

In [18]:
def symspell_correct(text):
    corrected = []
    for word in text.split():
        suggestions = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
        if suggestions:
            corrected.append(suggestions[0].term)
        else:
            corrected.append(word)
    return ' '.join(corrected)

In [38]:
corr = symspell_correct(predicted_text[0])
print(corr)

my farther one told me that in order to be ﻿the best version of yourself you must dive of a part of yourself


In [37]:
# Demo 1
wer("my father once told me that in order to be the best version of yourself you must give up a part of yourself", corr)

0.21739130434782608

In [21]:
# Demo 2
wer("where are you from i am from india", corr)

0.125