In [1]:
import os
import torch
from pipelines.model import AVSR
from pipelines.data.data_module import AVSRDataLoader
from pipelines.detectors.mediapipe.detector import LandmarksDetector

class InferencePipeline(torch.nn.Module):
    """Single-callable wrapper around data loading, landmark detection and AVSR decoding.

    The pipeline owns three collaborators:

    * ``self.dataloader`` - an ``AVSRDataLoader`` built for ``modality``.
    * ``self.landmarks_detector`` - a ``LandmarksDetector`` when ``face_track`` is
      true and the modality is ``"video"`` or ``"audiovisual"``; otherwise ``None``.
    * ``self.model`` - an ``AVSR`` instance created with fixed decoding settings
      (no RNN LM, ``penalty=0.0``, ``ctc_weight=0.1``, ``lm_weight=0.0``,
      ``beam_size=40``).

    Args:
        modality: One of ``"audio"``, ``"video"`` or ``"audiovisual"`` (the values
            this class branches on).
        model_path: Forwarded unchanged to ``AVSR``.
        model_conf: Forwarded unchanged to ``AVSR``.
        detector: Detector name forwarded to ``AVSRDataLoader``.
        face_track: Whether to build a landmarks detector for video modalities.
        device: Device identifier used for the model and for moving inputs in
            ``extract_features``. The legacy default ``"metal"`` is not a torch
            device string, so it is translated (see ``_resolve_device``).
    """

    def __init__(self, modality, model_path, model_conf, detector="mediapipe", face_track=False, device="metal"):
        super(InferencePipeline, self).__init__()
        # Normalise first so the model and later ``.to(...)`` calls agree on the device.
        self.device = self._resolve_device(device)
        # modality configuration
        self.modality = modality
        self.dataloader = AVSRDataLoader(modality, detector=detector)
        self.model = AVSR(
            modality,
            model_path,
            model_conf,
            rnnlm=None,
            rnnlm_conf=None,
            penalty=0.0,
            ctc_weight=0.1,
            lm_weight=0.0,
            beam_size=40,
            device=self.device,
        )
        if face_track and self.modality in ["video", "audiovisual"]:
            self.landmarks_detector = LandmarksDetector()
        else:
            self.landmarks_detector = None

    @staticmethod
    def _resolve_device(device):
        """Translate the legacy ``"metal"`` alias into a device string torch accepts.

        torch rejects ``"metal"`` as a device type; its Apple-GPU backend is named
        ``"mps"``. The alias resolves to ``"mps"`` when that backend is available
        and to ``"cpu"`` otherwise. Every other value is returned untouched.
        """
        if isinstance(device, str) and device == "metal":
            # ``torch.backends.mps`` is absent on older torch builds, hence getattr.
            mps_backend = getattr(torch.backends, "mps", None)
            if mps_backend is not None and mps_backend.is_available():
                return "mps"
            return "cpu"
        return device

    def process_landmarks(self, data_filename, landmarks_filename):
        """Return landmarks for ``data_filename``, or ``None`` when not applicable.

        Args:
            data_filename: Path handed to the landmarks detector.
            landmarks_filename: Accepted for interface compatibility; this
                implementation does not read it.

        Returns:
            ``None`` for the ``"audio"`` modality (and for any modality this class
            does not recognise); otherwise whatever the landmarks detector returns.

        Raises:
            RuntimeError: If the modality needs landmarks but no detector was
                built (the pipeline was created with ``face_track=False``).
        """
        if self.modality == "audio":
            return None
        if self.modality in ["video", "audiovisual"]:
            if self.landmarks_detector is None:
                # Previously this fell through to calling ``None`` and surfaced as
                # an opaque "'NoneType' object is not callable".
                raise RuntimeError(
                    f"modality '{self.modality}' requires landmarks, but no landmarks detector "
                    "is available; construct InferencePipeline with face_track=True."
                )
            return self.landmarks_detector(data_filename)
        return None

    def _load_data(self, data_filename, landmarks_filename):
        """Validate the input path, obtain landmarks and load the model input."""
        assert os.path.isfile(data_filename), f"data_filename: {data_filename} does not exist."
        landmarks = self.process_landmarks(data_filename, landmarks_filename)
        return self.dataloader.load_data(data_filename, landmarks)

    def forward(self, data_filename, landmarks_filename=None):
        """Load ``data_filename`` and return ``self.model.infer`` applied to it."""
        data = self._load_data(data_filename, landmarks_filename)
        transcript = self.model.infer(data)
        return transcript

    def extract_features(self, data_filename, landmarks_filename=None, extract_resnet_feats=False):
        """Return encoder features for ``data_filename`` without tracking gradients.

        When the loader yields a tuple, its first two elements are moved to
        ``self.device`` and passed to the encoder together; otherwise the single
        loaded object is moved and passed. ``extract_resnet_feats`` is forwarded
        to ``encode`` unchanged.
        """
        data = self._load_data(data_filename, landmarks_filename)
        with torch.no_grad():
            if isinstance(data, tuple):
                enc_feats = self.model.model.encode(data[0].to(self.device), data[1].to(self.device), extract_resnet_feats)
            else:
                enc_feats = self.model.model.encode(data.to(self.device), extract_resnet_feats)
        return enc_feats

In [2]:
# Configure a video-only run. Both model files live under "LRS3_V_WER19.1/",
# resolved relative to the notebook's working directory.
modality = "video"
model_conf, model_path = "LRS3_V_WER19.1/model.json", "LRS3_V_WER19.1/model.pth"

# face_track=True makes the pipeline build its own landmarks detector;
# the model is placed on the CPU.
pipeline = InferencePipeline(
    modality=modality,
    model_path=model_path,
    model_conf=model_conf,
    face_track=True,
    device='cpu',
)

I0000 00:00:1716719901.566971  686705 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 88), renderer: Apple M1
I0000 00:00:1716719901.572232  686705 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 88), renderer: Apple M1
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


In [3]:
# Run the pipeline on a local test clip (machine-specific absolute path).
transcript = pipeline(
    data_filename="/Users/mohammedthansheer/Desktop/Test data/Audioless/Azzhan.mov"
)

In [4]:
# NOTE(review): a pasted error fragment was attached to this line
# ("Expected one of cpu, cuda, ..., mps, ..."); presumably it came from an earlier
# run that passed an unsupported device string - confirm, it does not describe this print.
print(transcript)

MONTH SO I ENCOURAGE YOU TO EMBRACE YOUR CURIOSITY TO APPROACH EACH DAY AND WONDER AND OPENNESS WHO KNOWS WHAT AMAZING DISCOVERIES HAVE ENTERED


In [5]:
import os
import torch
from pipelines.model import AVSR
from pipelines.data.data_module import AVSRDataLoader
from pipelines.detectors.mediapipe.detector import LandmarksDetector

# Rebuild the inference pipeline: video-only modality, model files under
# "LRS3_V_WER19.1/", own landmarks detector (face_track=True), running on CPU.
modality = "video"
model_conf, model_path = "LRS3_V_WER19.1/model.json", "LRS3_V_WER19.1/model.pth"
pipeline = InferencePipeline(
    modality=modality,
    model_path=model_path,
    model_conf=model_conf,
    face_track=True,
    device='cpu',
)

# Transcribe the sample clip (machine-specific absolute path).
transcript = pipeline(
    data_filename="/Users/mohammedthansheer/Desktop/Test data/Audioless/Azzhan.mov"
)

I0000 00:00:1716720328.078724  686705 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 88), renderer: Apple M1
I0000 00:00:1716720328.089694  686705 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 88), renderer: Apple M1


In [8]:
import time

# Record the start time with a monotonic, high-resolution clock.
# time.time() follows the wall clock, which can jump if the system clock is
# adjusted mid-run; time.perf_counter() is meant for measuring elapsed time.
start_time = time.perf_counter()

# Perform inference on sample input data
transcript = pipeline("/Users/mohammedthansheer/Desktop/Test data/Audioless/Azzhan.mov")

# Calculate the time taken for inference (difference of two perf_counter readings, in seconds)
end_time = time.perf_counter()
processing_time = end_time - start_time

# Print the time taken
print("Time taken for inference:", processing_time, "seconds")

Time taken for inference: 51.98213195800781 seconds


In [9]:
# Display the transcript returned by the most recent pipeline call
# (the timed inference run, when the cells are executed in order).
print(transcript)

MONTH SO I ENCOURAGE YOU TO EMBRACE YOUR CURIOSITY TO APPROACH EACH DAY AND WONDER AND OPENNESS WHO KNOWS WHAT AMAZING DISCOVERIES HAVE ENTERED
