In [1]:
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential, Model
from keras.layers import Conv2D, MaxPool2D, BatchNormalization, Bidirectional
from keras.layers import Activation, Dropout, Flatten, Dense, Reshape, LSTM, Input, Lambda
from keras.callbacks import ModelCheckpoint
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from keras import backend as K
import tensorflow as tf
#import cv2
#import time
import numpy as np
import matplotlib.pyplot as plt
import keras
import keras.utils
from keras import utils as np_utils


In [2]:
# Root folder of the training images; flow_from_directory is pointed at it below.
train_data_dir = "E:/New folder/train"
# Disabled: location of a separate test set (not used in this notebook).
#test_data_dir = 'E:/testing dataset'

# Number of images per mini-batch.
batch_size = 32

In [3]:
# Image generator that multiplies every pixel by 1/255 (maps 0-255 inputs to 0-1).
# No augmentation is configured.
train_datagen = ImageDataGenerator(rescale=1. /255)

In [4]:
# Stream training batches from the directory tree, resizing every image to
# 128x128. class_mode='categorical' asks Keras for one-hot label rows.
# The batch size comes from the configuration cell instead of a repeated literal.
train = train_datagen.flow_from_directory(
    train_data_dir,
    target_size=(128, 128),
    batch_size=batch_size,
    class_mode='categorical',
)

Found 4592 images belonging to 39 classes.


In [5]:
# Class names in the order the directory iterator indexed them.
classes = train.class_indices
label = list(classes)
print(label)

# The set of symbols accepted in the transcription (one entry per class folder).
characters = [
    'aa', 'ae', 'ah', 'aw', 'axr', 'ay', 'b', 'bcl', 'ch', 'd',
    'dh', 'dx', 'eh', 'em', 'ey', 'f', 'g', 'hh', 'ih', 'iy',
    'jh', 'k', 'l', 'n', 'ng', 'ow', 'oy', 'p', 'r', 's',
    'sh', 't', 'th', 'uh', 'uw', 'v', 'w', 'y', 'z',
]

# Lookup layer turning each symbol into an integer id; the printed vocabulary
# shows the extra "[UNK]" entry the layer puts at index 0.
char_to_num = keras.layers.StringLookup(vocabulary=characters)
print(f"The vocabulary is: {char_to_num.get_vocabulary()} (size ={char_to_num.vocabulary_size()})")

# Encode the class names as UTF-8 bytes and map them through the lookup layer.
label = [name.encode('utf-8') for name in label]
labels = char_to_num(label)
print(labels)

['aa', 'ae', 'ah', 'aw', 'axr', 'ay', 'b', 'bcl', 'ch', 'd', 'dh', 'dx', 'eh', 'em', 'ey', 'f', 'g', 'hh', 'ih', 'iy', 'jh', 'k', 'l', 'n', 'ng', 'ow', 'oy', 'p', 'r', 's', 'sh', 't', 'th', 'uh', 'uw', 'v', 'w', 'y', 'z']
The vocabulary is: ['[UNK]', 'aa', 'ae', 'ah', 'aw', 'axr', 'ay', 'b', 'bcl', 'ch', 'd', 'dh', 'dx', 'eh', 'em', 'ey', 'f', 'g', 'hh', 'ih', 'iy', 'jh', 'k', 'l', 'n', 'ng', 'ow', 'oy', 'p', 'r', 's', 'sh', 't', 'th', 'uh', 'uw', 'v', 'w', 'y', 'z'] (size =40)
tf.Tensor(
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39], shape=(39,), dtype=int64)


In [6]:
# Validation iterator, configured like the training iterator (same rescaling,
# 128x128 target size, one-hot labels). The batch size comes from the
# configuration cell instead of a repeated literal.
# NOTE(review): this reads train_data_dir, i.e. the very images used for
# training, so the reported validation loss is not a held-out estimate.
# Point it at a separate folder (or use validation_split/subset on both
# iterators) before trusting it.
validation = train_datagen.flow_from_directory(
    train_data_dir,
    target_size=(128, 128),
    batch_size=batch_size,
    class_mode='categorical',
)

Found 4592 images belonging to 39 classes.


In [7]:
# Reset Keras' global state before the model is built in the cells below.
keras.backend.clear_session()

In [8]:
def CTCLoss(y_true, y_pred):
    """Per-sample CTC loss usable as a Keras ``loss`` callable.

    Every sample in the batch is given the same input length (``y_pred``'s
    second axis) and the same label length (``y_true``'s second axis).

    NOTE(review): ``model.fit`` feeds this from an iterator created with
    class_mode='categorical', which presumably yields one-hot rows rather than
    integer label sequences -- confirm that is what CTC should receive.

    Args:
        y_true: label tensor; axis 0 is the batch, axis 1 the label positions.
        y_pred: model output; axis 0 is the batch, axis 1 the time steps.

    Returns:
        Whatever ``keras.backend.ctc_batch_cost`` returns for the batch.
    """
    n_samples = tf.cast(tf.shape(y_true)[0], dtype="int64")
    # Column of ones used to broadcast the scalar lengths to shape (batch, 1).
    per_sample = tf.ones(shape=(n_samples, 1), dtype="int64")

    input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64") * per_sample
    labels_length = tf.cast(tf.shape(y_true)[1], dtype="int64") * per_sample

    return keras.backend.ctc_batch_cost(y_true, y_pred, input_length, labels_length)

In [9]:
def build_model(output_dim, rnn_layers=1, rnn_units=128):
    """Build and compile the CNN + bidirectional-GRU network trained with CTCLoss.

    Args:
        output_dim: number of output symbols; the softmax layer is created with
            ``output_dim + 1`` units.
        rnn_layers: how many bidirectional GRU layers to stack.
        rnn_units: units per GRU direction; the dense layer before the softmax
            uses ``rnn_units * 2`` units.

    Returns:
        A ``keras.Model`` named "Model" that takes (128, 128, 3) inputs and is
        compiled with the "Adam" optimizer and ``CTCLoss``.
    """
    input_spectrogram = layers.Input((128, 128, 3), name="input")

    # Three Conv2D -> BatchNorm -> ReLU stages with 32, 64 and 128 filters.
    # 'same' padding with the default stride keeps the 128x128 spatial size.
    x = input_spectrogram
    for stage, n_filters in enumerate((32, 64, 128), start=1):
        x = layers.Conv2D(
            filters=n_filters,
            kernel_size=[2, 2],
            padding="same",
            use_bias=False,
            name=f"conv_{stage}",
        )(x)
        x = layers.BatchNormalization(name=f"conv_{stage}_bn")(x)
        x = layers.ReLU(name=f"conv_{stage}_relu")(x)

    # Fold the last two axes into a single feature axis so the first spatial
    # axis becomes the sequence the recurrent layers iterate over.
    features_per_step = x.shape[-2] * x.shape[-1]
    x = layers.Reshape((-1, features_per_step))(x)

    # Stacked bidirectional GRUs; dropout goes between layers, not after the last.
    for i in range(1, rnn_layers + 1):
        x = layers.Bidirectional(
            layers.GRU(
                units=rnn_units,
                activation="tanh",
                recurrent_activation="sigmoid",
                use_bias=True,
                return_sequences=True,
                reset_after=True,
                name=f"gru_{i}",
            ),
            name=f"bidirectional_{i}",
            merge_mode="concat",
        )(x)
        if i < rnn_layers:
            x = layers.Dropout(rate=0.5)(x)

    # Dense head applied at every step, followed by the per-step softmax.
    x = layers.Dense(units=rnn_units * 2, name="dense_1")(x)
    x = layers.ReLU(name="dense_1_relu")(x)
    x = layers.Dropout(rate=0.5)(x)
    output = layers.Dense(units=output_dim + 1, activation="softmax")(x)

    model = keras.Model(input_spectrogram, output, name="Model")
    model.compile(optimizer="Adam", loss=CTCLoss)
    return model
# Build the model.
# The softmax has to cover every index the StringLookup layer can emit
# (index 0 is "[UNK]", the 39 symbols are 1..39) plus the CTC blank, which the
# Keras CTC ops take to be the LAST class. build_model adds the "+ 1" for the
# blank, so output_dim must be the full vocabulary size (40). With the raw
# class count (39) the blank would share index 39 with the symbol 'z'.
model = build_model(
    output_dim=char_to_num.vocabulary_size(),
    rnn_units=128,
)
model.summary()

Model: "Model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, 128, 128, 3)]     0         
                                                                 
 conv_1 (Conv2D)             (None, 128, 128, 32)      384       
                                                                 
 conv_1_bn (BatchNormalizati  (None, 128, 128, 32)     128       
 on)                                                             
                                                                 
 conv_1_relu (ReLU)          (None, 128, 128, 32)      0         
                                                                 
 conv_2 (Conv2D)             (None, 128, 128, 64)      8192      
                                                                 
 conv_2_bn (BatchNormalizati  (None, 128, 128, 64)     256       
 on)                                                         

In [None]:
# Number of passes over the training iterator.
epochs = 30
# Disabled: per-epoch transcription check on the validation set.
# CallbackEval is not defined anywhere in the visible notebook.
#validation_callback = CallbackEval(validation)
# Train on the directory iterators; the returned History object is kept in `history`.
history = model.fit(
    train,
    validation_data=validation,
    epochs=epochs
    #callbacks=[validation_callback],
)

Epoch 1/30

In [None]:
import IPython

# Path to the .wav clip used for the inference walkthrough in the cells below.
speech_file = "E:/anveshan/archive/timit/timit/dr8-mbcg0/sa1.wav"

# Last expression of the cell: renders an inline audio player for the clip.
IPython.display.Audio(speech_file)

In [None]:
import IPython.display as ipd
import librosa
import librosa.display

In [None]:
plt.figure(figsize=(13,5))
# Load the clip as samples plus the sampling rate librosa used for them.
data,sample_rate=librosa.load(speech_file)
# Pass the rate by keyword: `sr` is keyword-only in current librosa releases,
# so the positional form raises a TypeError there.
waveform = librosa.display.waveshow(data, sr=sample_rate)
# Last expression of the cell: inline audio player for the same clip.
ipd.Audio(speech_file)

In [None]:
# Display the sampling rate returned by librosa.load for the clip.
sample_rate

In [None]:
# STFT settings read by spectrogram(): frame_size is passed as n_fft,
# hop_size as hop_length.
frame_size=2048
hop_size=512

In [None]:
def spectrogram(data,path):
    """Plot the dB-scaled STFT magnitude of ``data`` and save the figure to ``path``.

    Reads the notebook-level ``frame_size``, ``hop_size`` and ``sample_rate``.

    Args:
        data: audio samples, passed to ``librosa.stft`` as ``y``.
        path: destination handed to ``plt.savefig``.

    Returns:
        None. The figure is written to ``path`` and shown inline.
    """
    stft = librosa.stft(y=data, hop_length=hop_size, n_fft=frame_size)
    magnitude = np.abs(stft)
    # |STFT| is an amplitude, not a power: power_to_db would apply 10*log10 and
    # halve every dB value. amplitude_to_db applies the matching 20*log10 scale.
    # NOTE(review): if the training images were rendered with the old
    # power_to_db call, regenerate them so training and inference inputs agree.
    magnitude_db = librosa.amplitude_to_db(magnitude, ref=np.max)
    plt.figure(figsize=(8, 4))
    # NOTE(review): y_axis='mel' labels the axis in mel although the rows are
    # linear STFT bins -- confirm whether 'linear' or 'log' was intended.
    librosa.display.specshow(
        magnitude_db,
        sr=sample_rate,
        x_axis='time',
        y_axis='mel',
        hop_length=hop_size,
    )
    plt.savefig(path)
    plt.show()

In [None]:
# Render the spectrogram of the loaded clip and save it to disk. The path has
# no extension; the later cell reads "ddddd_new.png", so savefig presumably
# appends ".png" -- confirm.
spectrogram(data, "C:/Users/HP/Desktop/ddddd_new")

In [None]:

import cv2

# Read back the spectrogram figure saved by the previous cell.
image_path = "C:/Users/HP/Desktop/ddddd_new.png"
img = cv2.imread(image_path)
# cv2.imread signals failure by returning None rather than raising, which would
# otherwise only surface as an obscure error in a later cell.
if img is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")
# OpenCV decodes to BGR channel order. The training images come from Keras'
# directory iterator (presumably RGB), so convert to match.
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)


In [None]:
# Inspect the dimensions of the image array returned by cv2.imread.
img.shape

In [None]:
# Inspect the element type of the image array before conversion to float32.
img.dtype

In [None]:
# Convert to a float32 tensor. For integer inputs convert_image_dtype also
# rescales values into [0, 1], in line with the 1/255 rescale used for training.
img = tf.image.convert_image_dtype(img, tf.float32)

In [None]:
# Resize to the 128x128 input size the model was built with.
img = tf.image.resize(img, [128, 128])
# No height/width transpose here: the training batches from
# flow_from_directory are never transposed, so swapping the axes only at
# inference would feed the network a rotated view of what it was trained on.
# Add the leading batch axis expected by model.predict.
img = tf.expand_dims(img, axis=0)

In [None]:
# Inverse lookup layer: maps integer ids back to symbols, built from the same
# vocabulary as char_to_num so both directions agree.
num_to_char = keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), invert=True
)
def decode_predictions(y_pred):
    """Decode a batch of per-step softmax outputs into strings.

    Runs CTC beam search (beam width 100), keeps the best path for each sample
    and maps its ids back to symbols with the notebook-level ``num_to_char``.

    Args:
        y_pred: model output; axis 0 is the batch, axis 1 the time steps.

    Returns:
        A list with one decoded string per sample, symbols concatenated
        without a separator.
    """
    # Every sample uses the full number of time steps as its input length.
    input_len = np.ones(y_pred.shape[0]) * y_pred.shape[1]
    # Beam search rather than greedy decoding. Only the best path is consumed
    # below, so request a single path instead of computing three.
    results = keras.backend.ctc_decode(
        y_pred, input_length=input_len, greedy=False, beam_width=100, top_paths=1
    )[0][0]

    output_text = []
    for res in results:
        # ctc_decode pads shorter sequences with -1; drop the padding so it is
        # not handed to the lookup layer as if it were a symbol id.
        ids = res.numpy()
        ids = ids[ids != -1]
        text = tf.strings.reduce_join(num_to_char(ids)).numpy().decode("utf-8")
        output_text.append(text)

    return output_text

In [None]:
# Reference sentence for the clip, printed next to the model's output.
targets = "She had your dark suit in greasy wash water all year"
# Disabled: splitting the reference into words.
#targets = targets.split()

# Run the network on the single prepared image and decode its output.
preds = model.predict(img)
pred_text = decode_predictions(preds)
print("Target:", targets)
# Only one image was fed in, so the first decoded string is the whole result.
print("predictions:", pred_text[0])