In [1]:
import os 
import imageio 
import tensorflow as tf 

In [9]:
import os 
from tensorflow import keras
from keras.models import Sequential 
from keras.layers import Conv3D, LSTM, Dense, Dropout, Bidirectional, MaxPool3D, Activation, Reshape, SpatialDropout3D, BatchNormalization, TimeDistributed, Flatten

def load_model() -> Sequential: 
    model = Sequential()

    model.add(Conv3D(128, 3, input_shape=(75,46,140,1), padding='same'))
    model.add(Activation('relu'))
    model.add(MaxPool3D((1,2,2)))

    model.add(Conv3D(256, 3, padding='same'))
    model.add(Activation('relu'))
    model.add(MaxPool3D((1,2,2)))

    model.add(Conv3D(75, 3, padding='same'))
    model.add(Activation('relu'))
    model.add(MaxPool3D((1,2,2)))

    model.add(TimeDistributed(Flatten()))

    model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
    model.add(Dropout(.5))

    model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
    model.add(Dropout(.5))
    
    model.add(Dense(41, kernel_initializer='he_normal', activation='softmax'))

    model.load_weights(os.path.join('models - ~100 cp','checkpoint'))
        
    return model

In [3]:
from typing import List
import cv2

vocab = [x for x in "abcdefghijklmnopqrstuvwxyz'?!123456789 "]
char_to_num = tf.keras.layers.StringLookup(vocabulary=vocab, oov_token="")
# Mapping integers back to original characters
num_to_char = tf.keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)

def load_video(path:str) -> List[float]: 
    cap = cv2.VideoCapture(path)
    frames = []
    for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))): 
        ret, frame = cap.read()
        frame = tf.image.rgb_to_grayscale(frame)
        frames.append(frame[190:236,80:220,:])
    cap.release()
    
    mean = tf.math.reduce_mean(frames)
    std = tf.math.reduce_std(tf.cast(frames, tf.float32))
    return tf.cast((frames - mean), tf.float32) / std
    
def load_alignments(path:str) -> List[str]: 
    with open(path, 'r') as f: 
        lines = f.readlines() 
    tokens = []
    for line in lines:
        line = line.split()
        if line[2] != 'sil': 
            tokens = [*tokens,' ',line[2]]
    return char_to_num(tf.reshape(tf.strings.unicode_split(tokens, input_encoding='UTF-8'), (-1)))[1:]

def load_data(path: str): 
    path = bytes.decode(path.numpy())
    file_name = path.split('/')[-1].split('.')[0]
    # File name splitting for windows
    file_name = path.split('\\')[-1].split('.')[0]
    video_path = os.path.join('data','s1',f'{file_name}.mpg')
    alignment_path = os.path.join('data','alignments','s1',f'{file_name}.align')
    frames = load_video(video_path) 
    alignments = load_alignments(alignment_path)
    
    return frames, alignments

In [4]:
def calculate_wer(reference, hypothesis):
    references = reference.split()
    hypothesis_ = hypothesis.split()
    ref_words = reference.split()
    hyp_words = hypothesis.split()
    
    
    substitutions = sum(1 for ref, hyp in zip(ref_words, hyp_words) if ref != hyp)

    deletions = len(ref_words) - len(hyp_words)

    insertions = len(hyp_words) - len(ref_words)

    

    wer = (substitutions + deletions + insertions)
    return wer

In [10]:
import gradio as gr
import cv2
import numpy as np

def main(vid, actual):
#     videoFile = open(vid, 'rb')
#     video_bytes = videoFile.read() 
#     video, annotations = load_data(tf.convert_to_tensor(vid))
#     imageio.mimsave('animation.gif', video, fps=10)

    cap = cv2.VideoCapture(vid)

    if (cap.isOpened()== False): 
        print("Error opening video stream or file")

    ret, frame = cap.read()
    
    frame = np.expand_dims(frame, axis=-1)
    

    # Resize the array to match the target shape
    frame = np.resize(frame, (75, 46, 140, 1))

    # If the original array is larger, we need to crop the excess parts
    crop_x = (frame.shape[0] - 75) // 2
    crop_y = (frame.shape[1] - 46) // 2
    crop_z = (frame.shape[2] - 140) // 2

    cropped_array = frame[crop_x:crop_x+75, crop_y:crop_y+46, crop_z:crop_z+140, :]

    
    model = load_model()
    yhat = model.predict(tf.expand_dims(tf.convert_to_tensor(cropped_array), axis=0))
    decoder = tf.keras.backend.ctc_decode(yhat, [75], greedy=True)[0][0].numpy()
    converted_prediction = tf.strings.reduce_join(num_to_char(decoder)).numpy().decode('utf-8')
    return converted_prediction , calculate_wer(actual, converted_prediction)

# main('bbaf2n.mp4')
def gmain(video, text):
    prediction = main(video, text)
    return prediction 


demo = gr.Interface(gmain, inputs= ["playable_video", "text"], outputs=["text", "text"])

# demo.launch(debug=True, share=True, auth=("username", "password"))

demo.launch(debug=True)

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.






Keyboard interruption in main thread... closing server.




bin blue at f two now
place white by q seven again




# 50 cp
bin blue at f two now - bin white biu seie pleaien
place white by q seven again - bin whue i ine ageain


# ~100 cp
bin blue at f two now - bin whte biu s sie lain
place white by q seven again - bin white ie e leasin
