In [21]:
import cv2
import torch
import time
import numpy as np
import pandas as pd
from VideoLoader import KeypointExtractor, read_video
from VideoDataset import process_keypoints
from model import SLR

import torch._dynamo
torch._dynamo.config.suppress_errors = True

In [8]:

def record_video_tensor(prep_time=3, record_time=3, fps=30):
    """Record a short clip from the default webcam and return it as a tensor.

    A preview window first shows a countdown for ``prep_time`` seconds, then
    up to ``int(record_time * fps)`` frames are captured. Pressing ``q``
    during the countdown cancels the recording; pressing it while recording
    stops early and keeps the frames captured so far.

    Args:
        prep_time: Length of the countdown, in seconds.
        record_time: Target clip length, in seconds.
        fps: Target capture rate. Pacing is done by sleeping between reads;
            the camera itself is not reconfigured.

    Returns:
        A float32 tensor of shape ``(frames, height, width, 3)`` in RGB
        channel order, holding the raw (un-normalised) pixel values and no
        on-screen overlay, or ``None`` if the user cancelled during the
        countdown.

    Raises:
        ValueError: If ``record_time * fps`` amounts to less than one frame.
        RuntimeError: If the webcam cannot be opened, or no frame could be
            read from it once recording started.
    """
    # Validate before touching the camera so a bad argument fails immediately
    # instead of after the countdown.
    num_frames = int(record_time * fps)
    if num_frames <= 0:
        raise ValueError(
            f"record_time * fps must allow at least one frame "
            f"(got {record_time} s at {fps} FPS)"
        )

    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        raise RuntimeError("Cannot access the webcam")

    frames = []
    try:
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        print(f"Preparing to record at {width}x{height}, {fps} FPS...")

        # Preparation countdown
        start_prep = time.time()
        while time.time() - start_prep < prep_time:
            ret, frame = cap.read()
            if not ret:
                continue

            # Countdown overlay (these frames are never stored)
            time_left = prep_time - int(time.time() - start_prep)
            cv2.putText(frame, f"Recording in {time_left}", (50, 100),
                        cv2.FONT_HERSHEY_SIMPLEX, 2, (0, 0, 255), 4)

            cv2.imshow("Preview", frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                return None  # cleanup happens in the finally block

        # Start recording
        print("Recording started!")
        start_record = time.time()

        while len(frames) < num_frames:
            ret, frame = cap.read()
            if not ret:
                break

            # Convert and store the clean frame BEFORE drawing on it, so the
            # "time left" text never ends up in the data fed to the model.
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

            # Time-remaining overlay, for the preview window only
            remaining = max(0, record_time - (time.time() - start_record))
            cv2.putText(frame, f"{remaining:.1f}s left", (10, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 0), 2)
            cv2.imshow("Preview", frame)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

            # FPS control: re-read the clock so the time spent on conversion,
            # display and key polling is not slept a second time.
            delay = len(frames) / fps - (time.time() - start_record)
            if delay > 0:
                time.sleep(delay)
    finally:
        # Always free the camera and close the preview, even on an exception
        # or a kernel interrupt; otherwise the device stays locked.
        cap.release()
        cv2.destroyAllWindows()

    if not frames:
        raise RuntimeError("No frames could be read from the webcam")

    video_tensor = torch.from_numpy(np.stack(frames)).float()

    print(f"Captured video tensor with shape {video_tensor.shape}")
    return video_tensor


In [9]:
# Gloss vocabulary: one row per sign, read from the 'idx' and 'word' columns.
gloss_info = pd.read_csv('./gloss.csv')

# Lookup tables in both directions, built row by row in file order.
idx_to_word = {
    gloss_info['idx'][row]: gloss_info['word'][row]
    for row in range(len(gloss_info))
}
word_to_idx = {
    gloss_info['word'][row]: gloss_info['idx'][row]
    for row in range(len(gloss_info))
}

In [11]:
from model import SLR

# Small model. The architecture arguments have to match the checkpoint that
# is loaded below. `n_cls_dict` gives the number of output classes for each
# dataset-specific classification head.
model = SLR(
    n_embd=12*64,
    n_cls_dict={'asl_citizen':2305, 'lsfb': 4657, 'wlasl':2000, 'autsl':226, 'rsl':1001},
    n_head=12,
    n_layer=4,
    n_keypoints=63,
    dropout=0.2,
    max_len=64,
    bias=True
)

# NOTE(review): the weights are loaded into the *compiled* wrapper, so the
# checkpoint keys presumably carry torch.compile's `_orig_mod.` prefix. Keep
# this compile-then-load order unless the checkpoint is re-exported.
model = torch.compile(model)
# weights_only=True restricts unpickling to tensors and plain containers, so
# a tampered checkpoint file cannot execute arbitrary code on load.
model.load_state_dict(
    torch.load('./models/small_model.pth', map_location=torch.device('cpu'), weights_only=True)
)


# Run a bigger model. About 2.5x larger. Validation accuracy is about the same however.
# To use it, replace the construction/loading code above with:
#
# model = SLR(
#     n_embd=16*64,
#     n_cls_dict={'asl_citizen':2305, 'lsfb': 4657, 'wlasl':2000, 'autsl':226, 'rsl':1001},
#     n_head=16,
#     n_layer=6,
#     n_keypoints=63,
#     dropout=0.6,
#     max_len=64,
#     bias=True
# )
#
# model = torch.compile(model)
# model.load_state_dict(
#     torch.load('./models/big_model.pth', map_location=torch.device('cpu'), weights_only=True)
# )


# Inference only: switch to evaluation mode.
model.eval()
print(f'Trainable parameters: {model.num_params()}')

Trainable parameters: 36665856


In [12]:

# Load a video or record it:

video = record_video_tensor(fps=20, record_time=3)
#video = read_video('./test.mp4')

# record_video_tensor returns None when the user presses 'q' during the
# countdown; fail with a clear message instead of an AttributeError below.
if video is None:
    raise RuntimeError("Recording was cancelled; re-run this cell to record a video.")

# (frames, H, W, C) -> (frames, C, H, W), then divide the pixel values by 255.
video = video.permute(0, 3, 1, 2)/255


Preparing to record at 640x480, 20 FPS...
Recording started!
Captured video tensor with shape torch.Size([60, 480, 640, 3])


In [14]:
from VideoLoader import KeypointExtractor

# Remember the frame size first; the last two axes of `video` are (H, W).
height, width = video.shape[-2:]

# Run the MediaPipe model over the clip. This is perhaps the biggest
# bottleneck of the whole pipeline.
pose = KeypointExtractor().extract(video)

# The raw frames are not needed past this point.
del video


Downloading model to c:\Users\samty\AppData\Local\Programs\Python\Python312\Lib\site-packages\mediapipe/modules/pose_landmark/pose_landmark_lite.tflite


In [16]:
# Indices of the 63 keypoints fed to the model, in three groups:
#   * 0..41           - the first 42 entries, taken as-is
#   * 42 + k          - 11 entries addressed relative to offset 42
#   * 520 + k         - 10 entries addressed relative to offset 520
# NOTE(review): presumably these are 2 x 21 hand landmarks, then face-mesh
# landmarks, then pose landmarks - confirm against KeypointExtractor.
selected_keypoints = [
    *range(42),
    *(42 + k for k in (291, 267, 37, 61, 84, 314, 310, 13, 80, 14, 152)),
    *(520 + k for k in (2, 5, 7, 8, 11, 12, 13, 14, 15, 16)),
]

# Same selection with the two 21-entry halves of the first group swapped and
# the entries of the other two groups reordered pairwise.
# NOTE(review): looks like a left/right mirrored ordering - confirm.
flipped_selected_keypoints = [
    *range(21, 42),
    *range(21),
    *(42 + k for k in (61, 37, 267, 291, 314, 84, 80, 13, 310, 14, 152)),
    *(520 + k for k in (5, 2, 8, 7, 12, 11, 14, 13, 16, 15)),
]



In [22]:
from VideoDataset import process_keypoints
import torch._dynamo
torch._dynamo.config.suppress_errors = True
# Test-time augmentation: with augment=True each call to process_keypoints
# produces a different sample of the clip. The model is run on several such
# samples and the logits are summed; the ranking is the same as for the mean.
# Since torch.compile is used, the model is compiled the first time it is run.
# Running it afterwards will be faster.

sample_amount = 20 # Run the model 20 times

logits = 0
model.eval()
with torch.no_grad():
    for i in range(sample_amount):
        # 64 is presumably the sequence length and should match the model's
        # max_len - TODO confirm against process_keypoints.
        keypoints, valid_keypoints = process_keypoints(pose, 64, selected_keypoints, height=height, width=width, augment=True)
        # Add a batch dimension, run the model, then apply the ASL Citizen head.
        model_out = model(keypoints.unsqueeze(0), valid_keypoints.unsqueeze(0))
        logits = logits + model.heads['asl_citizen'](model_out)

# Class indices of the single batch item, best match first.
idx = torch.argsort(logits, descending=True)[0].tolist()
idx[:10]

  keypoints = torch.tensor(keypoints[indices])
W0416 00:44:59.730000 31640 site-packages\torch\_dynamo\convert_frame.py:1233] WON'T CONVERT forward c:\Users\samty\OneDrive\Desktop\Courses\ecs193\Sign-Language-App\SignLanguageDemo\model.py line 192 
W0416 00:44:59.730000 31640 site-packages\torch\_dynamo\convert_frame.py:1233] due to: 
W0416 00:44:59.730000 31640 site-packages\torch\_dynamo\convert_frame.py:1233] Traceback (most recent call last):
W0416 00:44:59.730000 31640 site-packages\torch\_dynamo\convert_frame.py:1233]   File "c:\Users\samty\AppData\Local\Programs\Python\Python312\Lib\site-packages\torch\_dynamo\convert_frame.py", line 1164, in __call__
W0416 00:44:59.730000 31640 site-packages\torch\_dynamo\convert_frame.py:1233]     result = self._inner_convert(
W0416 00:44:59.730000 31640 site-packages\torch\_dynamo\convert_frame.py:1233]              ^^^^^^^^^^^^^^^^^^^^
W0416 00:44:59.730000 31640 site-packages\torch\_dynamo\convert_frame.py:1233]   File "c:\Users\samty\AppDa

[6, 891, 276, 1203, 1116, 590, 366, 562, 1020, 307]

In [23]:
# Translate the five best-ranked class indices back into their gloss words.
print("Top 5 words")
print(', '.join(idx_to_word[idx[rank]] for rank in range(5)))

Top 5 words
hug, love, bat, drug, country


In [25]:
# Position of the given word in the ranked predictions `idx` (0 = top prediction).
idx.index(word_to_idx['hug']) # look the word up by its class index

0