In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from IPython.display import HTML
from base64 import b64encode

## play_video function based on: https://colab.research.google.com/drive/1bNXkfpHiVHzXQH8WjGhzQ-fsDxolpUjD

def play_video(video_path, width=200):
  """Build an inline HTML5 player for an MP4 file.

  The whole file is read and embedded in the markup as a base64 data URL,
  so the returned object carries the video bytes itself.

  Args:
    video_path: Path of the MP4 file to embed.
    width: Value written into the ``width`` attribute of the <video> tag.

  Returns:
    An ``HTML`` display object wrapping the <video> element.
  """
  # Read through a context manager so the file handle is closed right away
  # instead of lingering until garbage collection.
  with open(video_path, 'rb') as video_file:
    mp4 = video_file.read()
  data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
  return HTML(f"""
  <video width={width} controls>
        <source src="{data_url}" type="video/mp4">
  </video>
  """)

In [3]:
play_video('/content/drive/MyDrive/FinalAL/1/2.mp4', width=300)

In [4]:
# Clone the visual speech recognition repository and make it the working
# directory; the later `from pipelines...` imports are resolved from there.
# NOTE(review): no commit or tag is pinned, so a fresh run fetches whatever
# the repository's default branch currently contains.
%cd "/content/"
!git clone https://github.com/mpc001/Visual_Speech_Recognition_for_Multiple_Languages.git
%cd "Visual_Speech_Recognition_for_Multiple_Languages"

/content
Cloning into 'Visual_Speech_Recognition_for_Multiple_Languages'...
remote: Enumerating objects: 277, done.[K
remote: Counting objects: 100% (100/100), done.[K
remote: Compressing objects: 100% (74/74), done.[K
remote: Total 277 (delta 33), reused 81 (delta 22), pack-reused 177[K
Receiving objects: 100% (277/277), 69.77 MiB | 26.51 MiB/s, done.
Resolving deltas: 100% (58/58), done.
/content/Visual_Speech_Recognition_for_Multiple_Languages


In [5]:
!pip install av
!pip install mediapipe
!pip install ffmpeg-python

Collecting av
  Downloading av-11.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (32.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: av
Successfully installed av-11.0.0
Collecting mediapipe
  Downloading mediapipe-0.10.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.5/34.5 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
Collecting sounddevice>=0.4.4 (from mediapipe)
  Downloading sounddevice-0.4.6-py3-none-any.whl (31 kB)
Installing collected packages: sounddevice, mediapipe
Successfully installed mediapipe-0.10.9 sounddevice-0.4.6
Collecting ffmpeg-python
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Installing collected packages: ffmpeg-python
Successfully installed ffmpeg-python-0.2.0


In [6]:
import os
import torch
from pipelines.model import AVSR
from pipelines.data.data_module import AVSRDataLoader
from pipelines.detectors.mediapipe.detector import LandmarksDetector

class InferencePipeline(torch.nn.Module):
    """Inference wrapper that ties together data loading, landmark detection and the AVSR model.

    Args:
        modality: "audio", "video" or "audiovisual"; forwarded to
            ``AVSRDataLoader`` and ``AVSR``.
        model_path: Path of the model weights, forwarded to ``AVSR``.
        model_conf: Path of the model configuration, forwarded to ``AVSR``.
        detector: Detector name forwarded to ``AVSRDataLoader``.
        face_track: When true and the modality uses video, a
            ``LandmarksDetector`` is created; otherwise no detector exists.
        device: Device string passed to ``AVSR`` and used when moving inputs
            in ``extract_features``.
    """

    def __init__(self, modality, model_path, model_conf, detector="mediapipe", face_track=False, device="cuda:0"):
        super(InferencePipeline, self).__init__()
        self.device = device
        # modality configuration
        self.modality = modality
        self.dataloader = AVSRDataLoader(modality, detector=detector)
        self.model = AVSR(modality, model_path, model_conf, rnnlm=None, rnnlm_conf=None, penalty=0.0, ctc_weight=0.1, lm_weight=0.0, beam_size=40, device=device)
        if face_track and self.modality in ["video", "audiovisual"]:
            self.landmarks_detector = LandmarksDetector()
        else:
            self.landmarks_detector = None


    def process_landmarks(self, data_filename, landmarks_filename):
        """Return landmarks for ``data_filename``, or None for audio-only input.

        Args:
            data_filename: Path of the media file handed to the detector.
            landmarks_filename: Accepted for interface compatibility; this
                implementation does not read it and always runs the detector.

        Returns:
            Whatever the landmarks detector returns for video modalities,
            otherwise None.

        Raises:
            RuntimeError: If the modality needs landmarks but the pipeline
                was built without a detector (``face_track=False``).
        """
        if self.modality == "audio":
            return None
        if self.modality in ["video", "audiovisual"]:
            if self.landmarks_detector is None:
                # Previously this fell through to calling None, which failed
                # with an unhelpful "'NoneType' object is not callable".
                raise RuntimeError(
                    "No landmarks detector is available for modality "
                    f"'{self.modality}'; construct InferencePipeline with face_track=True."
                )
            return self.landmarks_detector(data_filename)
        return None


    def _load_data(self, data_filename, landmarks_filename):
        """Check that the input file exists, then load it together with its landmarks."""
        assert os.path.isfile(data_filename), f"data_filename: {data_filename} does not exist."
        landmarks = self.process_landmarks(data_filename, landmarks_filename)
        return self.dataloader.load_data(data_filename, landmarks)


    def forward(self, data_filename, landmarks_filename=None):
        """Run the model on one media file and return the result of ``AVSR.infer``."""
        data = self._load_data(data_filename, landmarks_filename)
        transcript = self.model.infer(data)
        return transcript

    def extract_features(self, data_filename, landmarks_filename=None, extract_resnet_feats=False):
        """Return the encoder output for one media file, computed without gradients.

        ``extract_resnet_feats`` is passed straight through to the model's
        ``encode`` method. When the loader returns a tuple, its first two items
        are moved to the device and encoded together; otherwise the single
        loaded object is encoded.
        """
        data = self._load_data(data_filename, landmarks_filename)
        with torch.no_grad():
            if isinstance(data, tuple):
                enc_feats = self.model.model.encode(data[0].to(self.device), data[1].to(self.device), extract_resnet_feats)
            else:
                enc_feats = self.model.model.encode(data.to(self.device), extract_resnet_feats)
        return enc_feats

In [7]:
# Download the pretrained checkpoint archive and unpack it under
# /content/data/ (the recorded run extracted model.json and model.pth).
# wget -O rewrites the archive on every run and unzip -o overwrites without
# prompting, so the cell can be re-run.
# NOTE(review): the archive is fetched over plain HTTP.
%mkdir -p /content/data/
!wget http://www.doc.ic.ac.uk/~pm4115/autoAVSR/LRS3_V_WER19.1.zip -O /content/data/LRS3_V_WER19.1.zip
!unzip -o /content/data/LRS3_V_WER19.1.zip -d /content/data/

--2024-01-05 14:53:15--  http://www.doc.ic.ac.uk/~pm4115/autoAVSR/LRS3_V_WER19.1.zip
Resolving www.doc.ic.ac.uk (www.doc.ic.ac.uk)... 146.169.13.6
Connecting to www.doc.ic.ac.uk (www.doc.ic.ac.uk)|146.169.13.6|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 937274463 (894M) [application/zip]
Saving to: ‘/content/data/LRS3_V_WER19.1.zip’


2024-01-05 14:54:19 (14.1 MB/s) - ‘/content/data/LRS3_V_WER19.1.zip’ saved [937274463/937274463]

Archive:  /content/data/LRS3_V_WER19.1.zip
  inflating: /content/data/LRS3_V_WER19.1/model.json  
  inflating: /content/data/LRS3_V_WER19.1/model.pth  


In [8]:
# Locations of the checkpoint unpacked by the download cell.
modality = "video"
model_conf = "/content/data/LRS3_V_WER19.1/model.json"
model_path = "/content/data/LRS3_V_WER19.1/model.pth"
# InferencePipeline defaults to "cuda:0"; pick the CPU when no GPU is
# attached so the cell does not fail on a CPU-only runtime.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
pipeline = InferencePipeline(modality, model_path, model_conf, face_track=True, device=device)

In [9]:
import cv2
import torchvision
from pipelines.data.data_module import AVSRDataLoader
from pipelines.detectors.mediapipe.detector import LandmarksDetector

def save2vid(filename, vid, frames_per_second):
    """Write ``vid`` to ``filename`` with torchvision, creating the parent directory first.

    Args:
        filename: Output video path.
        vid: Frame data handed unchanged to ``torchvision.io.write_video``.
        frames_per_second: Frame rate handed unchanged to ``write_video``.
    """
    parent_dir = os.path.dirname(filename)
    # A bare file name has an empty dirname and os.makedirs("") raises
    # FileNotFoundError, so only create a directory when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    torchvision.io.write_video(filename, vid, frames_per_second)

def preprocess_video(src_filename, dst_filename):
    """Run landmark detection and the data loader on one video and save the result.

    Uses the module-level ``landmarks_detector`` and ``dataloader``. The
    output is written by ``save2vid`` at the frame rate reported by OpenCV
    for the source file.

    Args:
        src_filename: Path of the source video.
        dst_filename: Path of the video to write.
    """
    landmarks = landmarks_detector(src_filename)
    data = dataloader.load_data(src_filename, landmarks)
    capture = cv2.VideoCapture(src_filename)
    try:
        fps = capture.get(cv2.CAP_PROP_FPS)
    finally:
        # Release the decoder handle; the batch loop opens one per clip.
        capture.release()
    save2vid(dst_filename, data, fps)
    return

# Module-level helpers read by preprocess_video(). This loader is separate
# from the one created inside InferencePipeline and is configured with
# transform=False and convert_gray=False.
# NOTE(review): presumably so the saved clips stay untransformed colour
# frames — confirm against AVSRDataLoader.
dataloader = AVSRDataLoader(modality="video", speed_rate=1, transform=False, detector="mediapipe", convert_gray=False)
landmarks_detector = LandmarksDetector()

In [10]:
!mkdir /content/data/roi_clips/

In [11]:
import os
import os.path

# Walk the Drive dataset and write one processed clip per source video,
# named "<parent folder>_<file name without .mp4>.mp4".
for folder, _subfolders, names in os.walk("/content/drive/MyDrive/FinalAL/"):
    for name in names:
        if not name.endswith(".mp4"):
            continue
        src_vid_path = os.path.join(folder, name)
        path_parts = src_vid_path.split("/")
        clip_path = "/content/data/roi_clips/{}_{}.mp4".format(path_parts[-2], path_parts[-1][:-4])
        preprocess_video(src_filename=src_vid_path, dst_filename=clip_path)

In [13]:
# Bundle the generated clips into a single archive.
# NOTE(review): the later cells read roi_clips_clean.zip, which this notebook
# does not create — presumably this archive is downloaded and cleaned by hand.
!zip /content/data/roi_clips.zip -r /content/data/roi_clips

  adding: content/data/roi_clips/ (stored 0%)
  adding: content/data/roi_clips/9_9.mp4 (deflated 1%)
  adding: content/data/roi_clips/23_1.mp4 (deflated 1%)
  adding: content/data/roi_clips/4_1.mp4 (deflated 1%)
  adding: content/data/roi_clips/10_8.mp4 (deflated 1%)
  adding: content/data/roi_clips/21_2.mp4 (deflated 1%)
  adding: content/data/roi_clips/8_3.mp4 (deflated 1%)
  adding: content/data/roi_clips/7_7.mp4 (deflated 1%)
  adding: content/data/roi_clips/28_8.mp4 (deflated 1%)
  adding: content/data/roi_clips/2_4.mp4 (deflated 1%)
  adding: content/data/roi_clips/26_1.mp4 (deflated 1%)
  adding: content/data/roi_clips/15_7.mp4 (deflated 1%)
  adding: content/data/roi_clips/16_4.mp4 (deflated 1%)
  adding: content/data/roi_clips/25_9.mp4 (deflated 1%)
  adding: content/data/roi_clips/19_3.mp4 (deflated 1%)
  adding: content/data/roi_clips/21_3.mp4 (deflated 1%)
  adding: content/data/roi_clips/28_7.mp4 (deflated 1%)
  adding: content/data/roi_clips/14_4.mp4 (deflated 1%)
  addin

In [15]:
!unzip /content/data/roi_clips_clean.zip -d /content/data/

Archive:  /content/data/roi_clips_clean.zip
  inflating: /content/data/roi_clips_clean/1_10.mp4  
  inflating: /content/data/roi_clips_clean/1_5.mp4  
  inflating: /content/data/roi_clips_clean/1_6.mp4  
  inflating: /content/data/roi_clips_clean/1_7.mp4  
  inflating: /content/data/roi_clips_clean/1_9.mp4  
  inflating: /content/data/roi_clips_clean/10_1.mp4  
  inflating: /content/data/roi_clips_clean/10_4.mp4  
  inflating: /content/data/roi_clips_clean/10_5.mp4  
  inflating: /content/data/roi_clips_clean/10_6.mp4  
  inflating: /content/data/roi_clips_clean/10_7.mp4  
  inflating: /content/data/roi_clips_clean/10_8.mp4  
  inflating: /content/data/roi_clips_clean/11_10.mp4  
  inflating: /content/data/roi_clips_clean/11_4.mp4  
  inflating: /content/data/roi_clips_clean/11_5.mp4  
  inflating: /content/data/roi_clips_clean/11_7.mp4  
  inflating: /content/data/roi_clips_clean/11_8.mp4  
  inflating: /content/data/roi_clips_clean/12_1.mp4  
  inflating: /content/data/roi_clips_clea

In [16]:
play_video("/content/data/roi_clips_clean/9_2.mp4", width=300)

In [1]:
import os
import os.path
import numpy as np
import cv2
from tensorflow.keras.preprocessing.sequence import pad_sequences

def _read_clip_frames(video_path, frame_size):
    """Decode every frame of ``video_path`` and resize each one to ``frame_size``."""
    capture = cv2.VideoCapture(video_path)
    clip_frames = []
    try:
        while True:
            ret, frame = capture.read()
            if not ret:
                break
            # Resize once to a fixed size so all clips stack into one array.
            clip_frames.append(cv2.resize(frame, frame_size))
    finally:
        # Release the decoder even if reading or resizing fails.
        capture.release()
    return clip_frames


data = []
labels = []
max_frames = 30
frame_size = (112, 112)
for dirpath, dirnames, filenames in os.walk("/content/data/roi_clips_clean/"):
    # os.walk yields entries in filesystem order; sorting fixes the sample
    # order so the seeded train/test split later selects the same clips.
    dirnames.sort()
    for filename in sorted(f for f in filenames if f.endswith(".mp4")):
        src_vid_path = os.path.join(dirpath, filename)
        frames = _read_clip_frames(src_vid_path, frame_size)
        if len(frames) > 0:
            # Zero-pad or cut at the end so every clip has max_frames frames.
            frames = pad_sequences([frames], maxlen=max_frames, padding='post', truncating='post')[0]
            data.append(frames)
            # The label is the part of the file name before the first "_".
            labels.append(str(filename.split("_")[0]))

In [2]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, TimeDistributed
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model

# Stack the collected per-clip frames and labels into NumPy arrays.
data, labels = (np.array(collected) for collected in (data, labels))


In [3]:
# Preprocess the data
# Scale pixel values from 0..255 to 0..1. The dtype guard keeps the cell
# idempotent: a second run would otherwise divide by 255 again. Converting
# straight to float32 also avoids one extra full copy of the array.
if data.dtype != np.float32:
    data = np.asarray(data, dtype='float32') / 255.0

# One-hot encode the labels
# Only encode while the labels are still a 1-D vector of class ids, so that
# re-running the cell does not one-hot encode an already encoded matrix.
if np.ndim(labels) == 1:
    labels = to_categorical(labels, 29)

In [4]:
# Total number of elements in the one-hot label matrix, i.e. clips x 29
# classes (the recorded 4553 equals 157 x 29).
labels.size

4553

In [7]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, TimeDistributed, GlobalAveragePooling1D, Dropout, BatchNormalization
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout masks
# and batch shuffling repeat between runs as far as the backend allows
# (the train/test split below is already seeded through random_state).
tf.keras.utils.set_random_seed(42)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.1, random_state=42, shuffle=True)
max_frames = 30

# The arrays must already have the model's input layout; check it explicitly
# instead of reshaping, which would silently reinterpret a wrong layout.
expected_clip_shape = (max_frames, 112, 112, 3)
assert X_train.shape[1:] == expected_clip_shape, f"unexpected training clip shape: {X_train.shape[1:]}"
assert X_test.shape[1:] == expected_clip_shape, f"unexpected test clip shape: {X_test.shape[1:]}"

# Build a simple CNN model: per-frame convolutions, averaged over time
model = Sequential()
model.add(TimeDistributed(Conv2D(32, (3, 3), activation='relu'), input_shape=(max_frames, 112, 112, 3)))
model.add(TimeDistributed(MaxPooling2D((2, 2))))
model.add(TimeDistributed(Conv2D(64, (3, 3), activation='relu')))
model.add(Dropout(0.5))  # Add dropout layer
model.add(TimeDistributed(MaxPooling2D((2, 2))))
model.add(TimeDistributed(Flatten()))
model.add(Dropout(0.5))  # Add dropout layer
model.add(GlobalAveragePooling1D())  # Global average pooling over frames
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))  # Add dropout layer
model.add(Dense(29, activation='softmax'))  # Adjust to the number of classes (29 in this case)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
# NOTE(review): the held-out test split doubles as the validation set here,
# so the reported validation accuracy is not an independent estimate.
model.fit(X_train, y_train, epochs=200, batch_size=16, shuffle=True, validation_data=(X_test, y_test))

# Save the model (HDF5 file read back by the prediction cell)
model.save('/content/alphabet_recognition_model.h5')

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

  saving_api.save_model(


In [16]:
import cv2
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the trained model
# Reads the HDF5 file written by model.save() in the training cell and
# rebinds the global `model`, so prediction does not depend on the
# in-memory model from training.
model = load_model('/content/alphabet_recognition_model.h5')

# Function to preprocess a single video clip
def preprocess_video_clip(video_path, max_frames=30, frame_size=(112, 112)):
    """Turn one video file into a single-item batch for the classifier.

    Every decoded frame is resized to ``frame_size``. The sequence is then
    zero-padded or cut at the end to ``max_frames`` frames and scaled to the
    0..1 range.

    Args:
        video_path: Path of the clip to read.
        max_frames: Number of frames in the returned sequence.
        frame_size: ``(width, height)`` passed to ``cv2.resize``.

    Returns:
        A float32 array of shape ``(1, max_frames, height, width, 3)``, or
        None when no frame could be read.
    """
    frames = []
    video_capture = cv2.VideoCapture(video_path)
    try:
        while True:
            ret, frame = video_capture.read()
            if not ret:
                break
            # Resize once; repeating it on already-resized frames was a no-op.
            frames.append(cv2.resize(frame, frame_size))
    finally:
        # Release the decoder even if reading or resizing fails.
        video_capture.release()

    if not frames:
        return None

    frames = pad_sequences([frames], maxlen=max_frames, padding='post', truncating='post')[0]
    frames = frames.astype('float32') / 255.0
    width, height = frame_size
    return frames.reshape(1, max_frames, height, width, 3)

# Clip to classify
video_path_for_prediction = "/content/data/roi_clips_clean/23_8.mp4"

# Convert the clip into a one-item batch for the network
preprocessed_clip = preprocess_video_clip(video_path_for_prediction)

if preprocessed_clip is None:
    print("No frames found in the video clip.")
else:
    # Class scores for the single clip in the batch
    predictions = model.predict(preprocessed_clip)

    # Index of the highest score (flat argmax over the returned scores)
    predicted_class = np.argmax(predictions)

    # Report the result
    print("Predicted Class:", predicted_class)


Predicted Class: 23
