In [1]:
import os, json, random, glob
from pathlib import Path
import cv2
import numpy as np
from PIL import Image
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision as tv
from torchvision import transforms as T

# BASE must be the project root (psl-tutor)
BASE = Path(__file__).resolve().parents[1] if "__file__" in globals() else Path.cwd().parents[0]

RAW = BASE / "data" / "raw"
CROPS = BASE / "data" / "crops"
SPLITS = BASE / "data" / "splits"
ARTF = BASE / "artifacts"

for p in [CROPS, SPLITS, ARTF]:
    p.mkdir(parents=True, exist_ok=True)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE


'cuda'

In [2]:
import torch
print(torch.__version__, torch.version.cuda)
print("is_available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))


2.5.1+cu121 12.1
is_available: True
NVIDIA GeForce RTX 3070 Ti Laptop GPU


In [3]:
from pathlib import Path
RAW = (Path.cwd().parents[0] / "data" / "raw").resolve()

classes = sorted([d.name for d in RAW.iterdir() if d.is_dir()])
print("Classes:", classes)
assert classes, "No class folders in data/raw"

# Count images per class
import glob
counts = {c: len(glob.glob(str(RAW/c/"*.jpg"))) + 
             len(glob.glob(str(RAW/c/"*.png"))) + 
             len(glob.glob(str(RAW/c/"*.jpeg"))) for c in classes}
counts


Classes: ['Ain', 'Aliph', 'Bari yeh', 'Bay', 'Chay', 'Chhoti yeh', 'Daal', 'Ddaal', 'Dhaal', 'Dhuaad', 'Djay', 'Fay', 'Gaaf', 'Ghain', 'Hamza', 'Hay', 'Jeem', 'Kaaf', 'Khay', 'Laam', 'Meem', 'Noon', 'Pay', 'Quaaf', 'Ray', 'Seen', 'Sheen', 'Suaad', 'Tay', 'Tey', 'Thay', 'Toay_n', 'Vao', 'Zay', 'Zoay_n', 'aRay', 'hey']


{'Ain': 195,
 'Aliph': 211,
 'Bari yeh': 194,
 'Bay': 203,
 'Chay': 208,
 'Chhoti yeh': 200,
 'Daal': 203,
 'Ddaal': 203,
 'Dhaal': 99,
 'Dhuaad': 208,
 'Djay': 208,
 'Fay': 205,
 'Gaaf': 209,
 'Ghain': 203,
 'Hamza': 206,
 'Hay': 303,
 'Jeem': 205,
 'Kaaf': 205,
 'Khay': 203,
 'Laam': 204,
 'Meem': 200,
 'Noon': 202,
 'Pay': 159,
 'Quaaf': 215,
 'Ray': 94,
 'Seen': 203,
 'Sheen': 192,
 'Suaad': 203,
 'Tay': 210,
 'Tey': 204,
 'Thay': 186,
 'Toay_n': 215,
 'Vao': 196,
 'Zay': 207,
 'Zoay_n': 216,
 'aRay': 182,
 'hey': 83}

In [4]:
import glob, cv2
import mediapipe as mp
from tqdm import tqdm
from pathlib import Path

BASE   = Path.cwd().parents[0]
RAW    = BASE / "data" / "raw"
CROPS  = BASE / "data" / "crops"
CROPS.mkdir(parents=True, exist_ok=True)

mp_hands = mp.solutions.hands.Hands(static_image_mode=True, max_num_hands=1)

def expand_box(xmin, ymin, xmax, ymax, w, h, pad=0.25):
    cx = (xmin + xmax) / 2.0
    cy = (ymin + ymax) / 2.0
    side = max(xmax - xmin, ymax - ymin)
    side = int(side * (1.0 + pad))
    x0 = max(0, int(cx - side // 2)); y0 = max(0, int(cy - side // 2))
    x1 = min(w, x0 + side);          y1 = min(h, y0 + side)
    return x0, y0, x1, y1

classes = sorted([d.name for d in RAW.iterdir() if d.is_dir()])
for cls in classes:
    in_dir  = RAW / cls
    out_dir = CROPS / cls
    out_dir.mkdir(parents=True, exist_ok=True)
    files = (glob.glob(str(in_dir/"*.jpg")) + 
             glob.glob(str(in_dir/"*.png")) + 
             glob.glob(str(in_dir/"*.jpeg")))
    for p in tqdm(files, desc=f"Cropping {cls}"):
        img = cv2.imread(p)
        if img is None: 
            continue
        h, w = img.shape[:2]
        res = mp_hands.process(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        if res.multi_hand_landmarks:
            lm = res.multi_hand_landmarks[0].landmark
            xs = [int(l.x * w) for l in lm]; ys = [int(l.y * h) for l in lm]
            x0, y0, x1, y1 = expand_box(min(xs), min(ys), max(xs), max(ys), w, h, pad=0.25)
            crop = img[y0:y1, x0:x1]
        else:
            # Fallback: center square crop
            side = min(w, h)
            x0 = (w - side)//2; y0 = (h - side)//2
            crop = img[y0:y0+side, x0:x0+side]
        crop = cv2.resize(crop, (224, 224), interpolation=cv2.INTER_LINEAR)
        cv2.imwrite(str(out_dir / Path(p).name), crop)

print("Done. Crops saved to:", CROPS)


Cropping Ain: 100%|██████████████████████████████████████████████████████████████████| 195/195 [00:17<00:00, 10.88it/s]
Cropping Aliph: 100%|████████████████████████████████████████████████████████████████| 211/211 [00:19<00:00, 10.69it/s]
Cropping Bari yeh: 100%|█████████████████████████████████████████████████████████████| 194/194 [00:18<00:00, 10.68it/s]
Cropping Bay: 100%|██████████████████████████████████████████████████████████████████| 203/203 [00:18<00:00, 10.78it/s]
Cropping Chay: 100%|█████████████████████████████████████████████████████████████████| 208/208 [00:19<00:00, 10.86it/s]
Cropping Chhoti yeh: 100%|███████████████████████████████████████████████████████████| 200/200 [00:18<00:00, 10.78it/s]
Cropping Daal: 100%|█████████████████████████████████████████████████████████████████| 203/203 [00:18<00:00, 11.03it/s]
Cropping Ddaal: 100%|████████████████████████████████████████████████████████████████| 203/203 [00:18<00:00, 10.77it/s]
Cropping Dhaal: 100%|███████████████████

Done. Crops saved to: C:\Users\syeds\Documents\psl-tutor\data\crops





In [5]:
import json, random, glob
from pathlib import Path

random.seed(42)
BASE   = Path.cwd().parents[0]
CROPS  = BASE / "data" / "crops"
SPLITS = BASE / "data" / "splits"
ARTF   = BASE / "artifacts"
SPLITS.mkdir(parents=True, exist_ok=True)
ARTF.mkdir(parents=True, exist_ok=True)

classes = sorted([d.name for d in CROPS.iterdir() if d.is_dir()])
class_to_idx = {c:i for i,c in enumerate(classes)}
json.dump(class_to_idx, open(ARTF/"classes.json","w"), indent=2)

def list_images(d):
    return (glob.glob(str(d/"*.jpg")) + 
            glob.glob(str(d/"*.png")) + 
            glob.glob(str(d/"*.jpeg")))

train, val, test = [], [], []
for c in classes:
    files = list_images(CROPS / c)
    random.shuffle(files)
    n = len(files)
    n_train = int(0.7*n); n_val = int(0.15*n)
    train += [(p, c) for p in files[:n_train]]
    val   += [(p, c) for p in files[n_train:n_train+n_val]]
    test  += [(p, c) for p in files[n_train+n_val:]]

def write_split(lst, name):
    with open(SPLITS / name, "w", encoding="utf-8") as f:
        for p, c in lst:
            f.write(f"{p}\t{c}\n")

write_split(train, "train.txt")
write_split(val,   "val.txt")
write_split(test,  "test.txt")

len(train), len(val), len(test), class_to_idx


(5056,
 1070,
 1116,
 {'Ain': 0,
  'Aliph': 1,
  'Bari yeh': 2,
  'Bay': 3,
  'Chay': 4,
  'Chhoti yeh': 5,
  'Daal': 6,
  'Ddaal': 7,
  'Dhaal': 8,
  'Dhuaad': 9,
  'Djay': 10,
  'Fay': 11,
  'Gaaf': 12,
  'Ghain': 13,
  'Hamza': 14,
  'Hay': 15,
  'Jeem': 16,
  'Kaaf': 17,
  'Khay': 18,
  'Laam': 19,
  'Meem': 20,
  'Noon': 21,
  'Pay': 22,
  'Quaaf': 23,
  'Ray': 24,
  'Seen': 25,
  'Sheen': 26,
  'Suaad': 27,
  'Tay': 28,
  'Tey': 29,
  'Thay': 30,
  'Toay_n': 31,
  'Vao': 32,
  'Zay': 33,
  'Zoay_n': 34,
  'aRay': 35,
  'hey': 36})

In [6]:
import json
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms as T
from PIL import Image

IMSIZE = 224
MEAN = [0.485, 0.456, 0.406]
STD  = [0.229, 0.224, 0.225]

class FileListDataset(Dataset):
    def __init__(self, list_path, class_map_path, train=True):
        self.items = []
        with open(list_path, "r", encoding="utf-8") as f:
            for line in f:
                p, lbl = line.strip().split("\t")
                self.items.append((p, lbl))
        with open(class_map_path, "r") as f:
            self.class_to_idx = json.load(f)

        if train:
            self.tf = T.Compose([
                T.Resize((IMSIZE, IMSIZE)),
                T.ColorJitter(0.2,0.2,0.2,0.1),
                T.RandomRotation(15),
                T.RandomPerspective(0.1, p=0.3),
                T.ToTensor(),
                T.Normalize(MEAN, STD),
            ])
        else:
            self.tf = T.Compose([
                T.Resize((IMSIZE, IMSIZE)),
                T.ToTensor(),
                T.Normalize(MEAN, STD),
            ])

    def __len__(self): return len(self.items)

    def __getitem__(self, i):
        p, lbl = self.items[i]
        x = Image.open(p).convert("RGB")
        x = self.tf(x)
        y = self.class_to_idx[lbl]
        return x, y


In [8]:
from torch.utils.data import DataLoader
from pathlib import Path

BASE   = Path.cwd().parents[0]
SPLITS = BASE / "data" / "splits"
ARTF   = BASE / "artifacts"

batch_size = 64

train_ds = FileListDataset(SPLITS/"train.txt", ARTF/"classes.json", train=True)
val_ds   = FileListDataset(SPLITS/"val.txt",   ARTF/"classes.json", train=False)

# Windows/Jupyter safe: no worker processes
train_dl = DataLoader(
    train_ds, batch_size=batch_size, shuffle=True,
    num_workers=0, pin_memory=True, persistent_workers=False
)
val_dl = DataLoader(
    val_ds, batch_size=batch_size, shuffle=False,
    num_workers=0, pin_memory=True, persistent_workers=False
)

len(train_ds), len(val_ds)


(5056, 1070)

In [9]:
# Must return quickly with shapes; if it hangs here, paths/images are the issue.
xb, yb = next(iter(train_dl))
xb.shape, yb.shape, yb[:10]


(torch.Size([64, 3, 224, 224]),
 torch.Size([64]),
 tensor([23, 17, 20, 15, 10, 25, 14, 34, 11, 28]))

In [10]:
import time, json, torch, torchvision as tv, torch.nn as nn, torch.optim as optim
torch.backends.cudnn.benchmark = True

num_classes = len(json.load(open(ARTF/"classes.json")))
model = tv.models.mobilenet_v3_small(weights="IMAGENET1K_V1")
model.classifier[3] = nn.Linear(model.classifier[3].in_features, num_classes)
model = model.cuda()

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-4)
scaler = torch.amp.GradScaler("cuda")

model.train()
t0 = time.time()
for bi, (xb, yb) in enumerate(train_dl, start=1):
    xb = xb.cuda(non_blocking=True); yb = yb.cuda(non_blocking=True)
    optimizer.zero_grad(set_to_none=True)
    with torch.autocast(device_type="cuda", dtype=torch.float16):
        logits = model(xb); loss = criterion(logits, yb)
    scaler.scale(loss).backward()
    scaler.step(optimizer); scaler.update()
    if bi % 5 == 0:
        print(f"batch {bi}, loss={loss.item():.4f}")
    if bi >= 20:  # run 20 batches to confirm it moves
        break
print("time for 20 batches:", round(time.time()-t0,2), "s")


Downloading: "https://download.pytorch.org/models/mobilenet_v3_small-047dcff4.pth" to C:\Users\syeds/.cache\torch\hub\checkpoints\mobilenet_v3_small-047dcff4.pth
100%|█████████████████████████████████████████████████████████████████████████████| 9.83M/9.83M [00:04<00:00, 2.40MB/s]


batch 5, loss=3.4415
batch 10, loss=3.0582
batch 15, loss=2.5521
batch 20, loss=2.0337
time for 20 batches: 13.79 s


In [11]:
import json, torch, torchvision as tv, torch.nn as nn, torch.optim as optim
from tqdm import tqdm

num_classes = len(json.load(open(ARTF/"classes.json")))
model = tv.models.mobilenet_v3_small(weights="IMAGENET1K_V1")
model.classifier[3] = nn.Linear(model.classifier[3].in_features, num_classes)
model = model.cuda()

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=25)
scaler = torch.amp.GradScaler("cuda")
torch.backends.cudnn.benchmark = True

best_acc, best_path = 0.0, ARTF / "psl_mnv3_best.pth"
epochs = 25

for ep in range(1, epochs+1):
    model.train()
    for xb, yb in tqdm(train_dl, desc=f"Epoch {ep}/{epochs}", leave=False):
        xb = xb.cuda(non_blocking=True); yb = yb.cuda(non_blocking=True)
        optimizer.zero_grad(set_to_none=True)
        with torch.autocast(device_type="cuda", dtype=torch.float16):
            logits = model(xb); loss = criterion(logits, yb)
        scaler.scale(loss).backward()
        scaler.step(optimizer); scaler.update()

    # validation
    model.eval(); correct = n = 0
    with torch.no_grad(), torch.autocast(device_type="cuda", dtype=torch.float16):
        for xb, yb in val_dl:
            xb = xb.cuda(non_blocking=True); yb = yb.cuda(non_blocking=True)
            pred = model(xb).argmax(1)
            correct += (pred==yb).sum().item(); n += yb.size(0)
    acc = correct / max(1, n)
    scheduler.step()
    print(f"Val acc: {acc:.4f}")

    if acc > best_acc:
        best_acc = acc
        torch.save({"state_dict": model.state_dict(),
                    "classes": json.load(open(ARTF/"classes.json"))},
                   best_path)

print("Best val acc:", best_acc, "Saved:", best_path)


                                                                                                                       

Val acc: 0.7963


                                                                                                                       

Val acc: 0.9972


                                                                                                                       

Val acc: 1.0000


                                                                                                                       

Val acc: 0.9991


                                                                                                                       

Val acc: 1.0000


                                                                                                                       

Val acc: 1.0000


                                                                                                                       

Val acc: 1.0000


                                                                                                                       

Val acc: 1.0000


                                                                                                                       

Val acc: 1.0000


                                                                                                                       

Val acc: 1.0000


                                                                                                                       

Val acc: 1.0000


                                                                                                                       

Val acc: 1.0000


                                                                                                                       

Val acc: 0.9991


                                                                                                                       

Val acc: 1.0000


                                                                                                                       

Val acc: 1.0000


                                                                                                                       

Val acc: 1.0000


                                                                                                                       

Val acc: 1.0000


                                                                                                                       

Val acc: 1.0000


                                                                                                                       

Val acc: 1.0000


                                                                                                                       

Val acc: 1.0000


                                                                                                                       

Val acc: 1.0000


                                                                                                                       

Val acc: 1.0000


                                                                                                                       

Val acc: 1.0000


                                                                                                                       

Val acc: 1.0000


                                                                                                                       

Val acc: 1.0000
Best val acc: 1.0 Saved: C:\Users\syeds\Documents\psl-tutor\artifacts\psl_mnv3_best.pth


In [12]:
# H) Evaluate best model on the test set
import json, torch, torchvision as tv, torch.nn as nn
from sklearn.metrics import classification_report, confusion_matrix
from pathlib import Path

BASE   = Path.cwd().parents[0]
SPLITS = BASE / "data" / "splits"
ARTF   = BASE / "artifacts"

# load classes and checkpoint
ckpt = torch.load(ARTF/"psl_mnv3_best.pth", map_location="cpu")
class_to_idx = ckpt["classes"]
idx_to_class = {v:k for k,v in class_to_idx.items()}

# dataset reuse from earlier cell
test_ds = FileListDataset(SPLITS/"test.txt", ARTF/"classes.json", train=False)
from torch.utils.data import DataLoader
test_dl = DataLoader(test_ds, batch_size=64, shuffle=False, num_workers=0, pin_memory=True)

# rebuild model exactly as in training
num_classes = len(idx_to_class)
model_eval = tv.models.mobilenet_v3_small()
model_eval.classifier[3] = nn.Linear(model_eval.classifier[3].in_features, num_classes)
model_eval.load_state_dict(ckpt["state_dict"]); model_eval.eval()

y_true, y_pred = [], []
with torch.no_grad():
    for xb, yb in test_dl:
        logits = model_eval(xb)
        y_true.extend(yb.numpy().tolist())
        y_pred.extend(logits.argmax(1).numpy().tolist())

names = [idx_to_class[i] for i in range(num_classes)]
print(classification_report(y_true, y_pred, target_names=names))
cm = confusion_matrix(y_true, y_pred)
cm


  ckpt = torch.load(ARTF/"psl_mnv3_best.pth", map_location="cpu")


              precision    recall  f1-score   support

         Ain       1.00      1.00      1.00        30
       Aliph       1.00      1.00      1.00        33
    Bari yeh       1.00      1.00      1.00        30
         Bay       1.00      1.00      1.00        31
        Chay       1.00      1.00      1.00        32
  Chhoti yeh       1.00      1.00      1.00        30
        Daal       1.00      1.00      1.00        31
       Ddaal       1.00      1.00      1.00        31
       Dhaal       1.00      1.00      1.00        16
      Dhuaad       1.00      1.00      1.00        32
        Djay       1.00      1.00      1.00        32
         Fay       1.00      1.00      1.00        32
        Gaaf       1.00      1.00      1.00        32
       Ghain       1.00      1.00      1.00        31
       Hamza       1.00      1.00      1.00        32
         Hay       1.00      1.00      1.00        46
        Jeem       1.00      1.00      1.00        32
        Kaaf       1.00    

array([[30,  0,  0, ...,  0,  0,  0],
       [ 0, 33,  0, ...,  0,  0,  0],
       [ 0,  0, 30, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ..., 33,  0,  0],
       [ 0,  0,  0, ...,  0, 28,  0],
       [ 0,  0,  0, ...,  0,  0, 13]])

In [13]:
# I) Export ONNX + save classes.json
import torch, torchvision as tv, torch.nn as nn
from pathlib import Path
import json

BASE = Path.cwd().parents[0]
ARTF = BASE / "artifacts"

ckpt = torch.load(ARTF/"psl_mnv3_best.pth", map_location="cpu")
class_to_idx = ckpt["classes"]
num_classes = len(class_to_idx)

model_export = tv.models.mobilenet_v3_small()
model_export.classifier[3] = nn.Linear(model_export.classifier[3].in_features, num_classes)
model_export.load_state_dict(ckpt["state_dict"])
model_export.eval()

onnx_path = ARTF / "psl_mnv3.onnx"
dummy = torch.randn(1,3,224,224)
torch.onnx.export(model_export, dummy, str(onnx_path),
                  input_names=["input"], output_names=["logits"],
                  dynamic_axes={"input":{0:"N"}, "logits":{0:"N"}},
                  opset_version=13)
print("ONNX:", onnx_path)

# ensure classes map is present alongside ONNX
with open(ARTF/"classes.json","w") as f:
    json.dump(class_to_idx, f, indent=2)
print("Classes:", ARTF/"classes.json")


  ckpt = torch.load(ARTF/"psl_mnv3_best.pth", map_location="cpu")


ONNX: C:\Users\syeds\Documents\psl-tutor\artifacts\psl_mnv3.onnx
Classes: C:\Users\syeds\Documents\psl-tutor\artifacts\classes.json


In [17]:
import cv2, json, torch, torchvision as tv, torch.nn as nn, numpy as np
from torchvision import transforms as T
from pathlib import Path
import mediapipe as mp

# -------------------------
# LOAD MODEL & LABELS
# -------------------------
BASE = Path.cwd().parents[0]
ARTF = BASE / "artifacts"

ckpt = torch.load(ARTF/"psl_mnv3_best.pth", map_location="cpu")
idx_to_class = {v:k for k,v in ckpt["classes"].items()}

model = tv.models.mobilenet_v3_small()
model.classifier[3] = nn.Linear(model.classifier[3].in_features, len(idx_to_class))
model.load_state_dict(ckpt["state_dict"])
model.eval()

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)


# -------------------------
# NORMALIZATION + PREPROCESS
# -------------------------
def preprocess(img):
    # gamma correction
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = img.astype(np.float32) / 255.0
    gamma = 1.4
    img = np.power(img, gamma)

    # back to uint8
    img = (img * 255).astype(np.uint8)

    # convert to PIL for torchvision
    pil = T.ToPILImage()(img)

    tf = T.Compose([
        T.Resize((224, 224)),
        T.ToTensor(),
        T.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225])
    ])
    return tf(pil)


# -------------------------
# MEDIAPIPE HAND DETECTOR
# -------------------------
mp_hands = mp.solutions.hands.Hands(
    static_image_mode=False,
    max_num_hands=1,
    min_detection_confidence=0.6,
    min_tracking_confidence=0.5
)

def expand_box(xmin, ymin, xmax, ymax, w, h, pad=0.35):
    cx = (xmin + xmax) / 2
    cy = (ymin + ymax) / 2
    side = int(max(xmax - xmin, ymax - ymin) * (1 + pad))
    x0 = max(0, int(cx - side // 2))
    y0 = max(0, int(cy - side // 2))
    x1 = min(w, x0 + side)
    y1 = min(h, y0 + side)
    return x0, y0, x1, y1


# -------------------------
# PREDICTION SMOOTHING
# -------------------------
ema_logits = None
alpha = 0.6   # smoothing factor
cooldown = 0
stable_pred = None


# -------------------------
# START CAMERA
# -------------------------
cap = cv2.VideoCapture(0)
assert cap.isOpened(), "Camera blocked. Enable it in Windows Privacy settings."

while True:
    ok, frame = cap.read()
    if not ok: break
    h, w = frame.shape[:2]

    # detect hand
    result = mp_hands.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    # default ROI center crop fallback
    roi = None

    if result.multi_hand_landmarks:
        lm = result.multi_hand_landmarks[0].landmark
        xs = [int(l.x * w) for l in lm]
        ys = [int(l.y * h) for l in lm]
        x0, y0, x1, y1 = expand_box(min(xs), min(ys), max(xs), max(ys), w, h, pad=0.4)
        roi = frame[y0:y1, x0:x1]
        cv2.rectangle(frame, (x0,y0), (x1,y1), (0,255,0), 2)
    else:
        side = min(h, w)
        y0 = (h - side) // 2
        x0 = (w - side) // 2
        roi = frame[y0:y0+side, x0:x0+side]

    # preprocess ROI
    x = preprocess(roi).unsqueeze(0).to(device)

    with torch.no_grad(), torch.autocast(device_type=device, dtype=torch.float16 if device=="cuda" else torch.float32):
        logits = model(x).float().cpu().numpy()[0]

    # EMA smoothing
    global_pred = logits
    if ema_logits is None:
        ema_logits = global_pred
    else:
        ema_logits = alpha * global_pred + (1 - alpha) * ema_logits

    # convert to probability
    ex = np.exp(ema_logits - np.max(ema_logits))
    probs = ex / ex.sum()

    pred_idx = int(probs.argmax())
    pred_label = idx_to_class[pred_idx]
    conf = float(probs[pred_idx])

    # Stabilization: only update final prediction if confidence is high enough
    if conf > 0.65:
        stable_pred = pred_label
        cooldown = 5
    elif cooldown > 0:
        cooldown -= 1
    else:
        stable_pred = "..."

    # draw prediction
    cv2.putText(frame, f"{stable_pred}  {conf:.2f}", (10, 40),
                cv2.FONT_HERSHEY_SIMPLEX, 1.1, (0,255,0), 2)

    cv2.imshow("PSL CNN + MediaPipe ROI (High Accuracy)", frame)
    if cv2.waitKey(1) == 27:
        break

cap.release()
cv2.destroyAllWindows()


  ckpt = torch.load(ARTF/"psl_mnv3_best.pth", map_location="cpu")


In [18]:
import cv2, json, torch, torchvision as tv, torch.nn as nn, numpy as np
from torchvision import transforms as T
from pathlib import Path
import mediapipe as mp

# -------------------------
# LOAD MODEL & LABELS
# -------------------------
BASE = Path.cwd().parents[0]
ARTF = BASE / "artifacts"

ckpt = torch.load(ARTF/"psl_mnv3_best.pth", map_location="cpu")
idx_to_class = {v:k for k,v in ckpt["classes"].items()}

model = tv.models.mobilenet_v3_small()
model.classifier[3] = nn.Linear(model.classifier[3].in_features, len(idx_to_class))
model.load_state_dict(ckpt["state_dict"])
model.eval()

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# -------------------------
# NORMALIZATION + PREPROCESS
# -------------------------
def preprocess(img):
    # gamma correction
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = img.astype(np.float32) / 255.0
    gamma = 1.4
    img = np.power(img, gamma)
    img = (img * 255).astype(np.uint8)

    pil = T.ToPILImage()(img)
    tf = T.Compose([
        T.Resize((224, 224)),
        T.ToTensor(),
        T.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225]),
    ])
    return tf(pil)

# -------------------------
# MEDIAPIPE HAND DETECTOR
# -------------------------
mp_hands = mp.solutions.hands.Hands(
    static_image_mode=False,
    max_num_hands=1,
    min_detection_confidence=0.6,
    min_tracking_confidence=0.5
)

def expand_box(xmin, ymin, xmax, ymax, w, h, pad=0.35):
    cx = (xmin + xmax) / 2
    cy = (ymin + ymax) / 2
    side = int(max(xmax - xmin, ymax - ymin) * (1 + pad))
    x0 = max(0, int(cx - side // 2))
    y0 = max(0, int(cy - side // 2))
    x1 = min(w, x0 + side)
    y1 = min(h, y0 + side)
    return x0, y0, x1, y1

# -------------------------
# PREDICTION SMOOTHING
# -------------------------
ema_logits = None
alpha = 0.6   # smoothing factor
cooldown = 0
stable_pred = None

# -------------------------
# START CAMERA
# -------------------------
cap = cv2.VideoCapture(0)
assert cap.isOpened(), "Camera blocked. Enable it in Windows Privacy settings."

while True:
    ok, frame = cap.read()
    if not ok: break
    h, w = frame.shape[:2]

    # detect hand
    result = mp_hands.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    # default ROI center crop fallback
    roi = None
    hand_label = None  # 'Left' or 'Right' if available

    if result.multi_hand_landmarks:
        lm = result.multi_hand_landmarks[0].landmark
        xs = [int(l.x * w) for l in lm]
        ys = [int(l.y * h) for l in lm]
        x0, y0, x1, y1 = expand_box(min(xs), min(ys), max(xs), max(ys), w, h, pad=0.4)
        roi = frame[y0:y1, x0:x1]
        cv2.rectangle(frame, (x0,y0), (x1,y1), (0,255,0), 2)

        # handedness
        if result.multi_handedness:
            hand_label = result.multi_handedness[0].classification[0].label  # 'Left' or 'Right'
            # If Left hand, mirror ROI so it matches right-hand training distribution
            if hand_label == "Left" and roi is not None and roi.size:
                roi = cv2.flip(roi, 1)
                # Optional: draw label
                cv2.putText(frame, "Left (mirrored)", (x0, max(0, y0-10)),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0,255,0), 2)
            elif hand_label == "Right":
                cv2.putText(frame, "Right", (x0, max(0, y0-10)),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0,255,0), 2)
    else:
        side = min(h, w)
        y0 = (h - side) // 2
        x0 = (w - side) // 2
        roi = frame[y0:y0+side, x0:x0+side]

    # guard against empty ROI
    if roi is None or roi.size == 0:
        cv2.imshow("PSL CNN + MediaPipe ROI (High Accuracy)", frame)
        if cv2.waitKey(1) == 27: break
        continue

    # preprocess ROI
    x = preprocess(roi).unsqueeze(0).to(device)

    with torch.no_grad(), torch.autocast(device_type=device, dtype=torch.float16 if device=="cuda" else torch.float32):
        logits = model(x).float().cpu().numpy()[0]

    # EMA smoothing
    if ema_logits is None:
        ema_logits = logits
    else:
        ema_logits = alpha * logits + (1 - alpha) * ema_logits

    # convert to probability
    ex = np.exp(ema_logits - np.max(ema_logits))
    probs = ex / ex.sum()

    pred_idx = int(probs.argmax())
    pred_label = idx_to_class[pred_idx]
    conf = float(probs[pred_idx])

    # Stabilization: only update final prediction if confidence is high enough
    if conf > 0.65:
        stable_pred = pred_label
        cooldown = 5
    elif cooldown > 0:
        cooldown -= 1
    else:
        stable_pred = "..."

    # draw prediction
    txt = f"{stable_pred}  {conf:.2f}"
    if hand_label: txt += f"  [{hand_label}]"
    cv2.putText(frame, txt, (10, 40),
                cv2.FONT_HERSHEY_SIMPLEX, 1.1, (0,255,0), 2)

    cv2.imshow("PSL CNN + MediaPipe ROI (High Accuracy)", frame)
    if cv2.waitKey(1) == 27:
        break

cap.release()
cv2.destroyAllWindows()


  ckpt = torch.load(ARTF/"psl_mnv3_best.pth", map_location="cpu")


In [None]:
import cv2, json, torch, torchvision as tv, torch.nn as nn, numpy as np
from torchvision import transforms as T
from pathlib import Path
import mediapipe as mp

# -------------------------
# LOAD MODEL & LABELS
# -------------------------
BASE = Path.cwd().parents[0]
ARTF = BASE / "artifacts"

ckpt = torch.load(ARTF/"psl_mnv3_best.pth", map_location="cpu")
idx_to_class = {v:k for k,v in ckpt["classes"].items()}

model = tv.models.mobilenet_v3_small()
model.classifier[3] = nn.Linear(model.classifier[3].in_features, len(idx_to_class))
model.load_state_dict(ckpt["state_dict"])
model.eval()

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# -------------------------
# PREPROCESS
# -------------------------
tf_cam = T.Compose([
    T.ToPILImage(),
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225]),
])

def preprocess_bgr(img_bgr):
    # optional mild gamma for low light; comment out if you prefer raw
    img = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
    img = img.astype(np.float32) / 255.0
    img = np.power(img, 1.2)
    img = (img * 255).astype(np.uint8)
    return tf_cam(img)

# -------------------------
# MEDIAPIPE HAND DETECTOR
# -------------------------
mp_hands = mp.solutions.hands.Hands(
    static_image_mode=False,
    max_num_hands=1,
    min_detection_confidence=0.6,
    min_tracking_confidence=0.5
)

def expand_box(xmin, ymin, xmax, ymax, w, h, pad=0.40):
    cx = (xmin + xmax) / 2
    cy = (ymin + ymax) / 2
    side = int(max(xmax - xmin, ymax - ymin) * (1 + pad))
    x0 = max(0, int(cx - side // 2))
    y0 = max(0, int(cy - side // 2))
    x1 = min(w, x0 + side)
    y1 = min(h, y0 + side)
    return x0, y0, x1, y1

# -------------------------
# PREDICTION HELPERS
# -------------------------
@torch.no_grad()
def logits_from_img(img_bgr):
    x = preprocess_bgr(img_bgr).unsqueeze(0).to(device, non_blocking=True)
    with torch.autocast(device_type=device, dtype=torch.float16 if device=="cuda" else torch.float32):
        out = model(x).float().cpu().numpy()[0]
    return out

def softmax_conf(logits):
    ex = np.exp(logits - logits.max())
    probs = ex / ex.sum()
    idx = int(probs.argmax())
    return probs, idx, float(probs[idx])

# EMA smoothing
ema_logits = None
alpha = 0.6
cooldown = 0
stable_pred = "..."

# -------------------------
# START CAMERA
# -------------------------
cap = cv2.VideoCapture(0)
assert cap.isOpened(), "Camera blocked. Enable it in Windows Privacy settings."

while True:
    ok, frame = cap.read()
    if not ok: break
    h, w = frame.shape[:2]

    # detect hand and crop ROI
    result = mp_hands.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    if result.multi_hand_landmarks:
        lm = result.multi_hand_landmarks[0].landmark
        xs = [int(l.x * w) for l in lm]
        ys = [int(l.y * h) for l in lm]
        x0,y0,x1,y1 = expand_box(min(xs),min(ys),max(xs),max(ys),w,h,0.40)
        roi = frame[y0:y1, x0:x1]
        cv2.rectangle(frame, (x0,y0), (x1,y1), (0,255,0), 2)
    else:
        # fallback: center square
        side = min(h, w); y0 = (h - side)//2; x0 = (w - side)//2
        roi = frame[y0:y0+side, x0:x0+side]

    if roi is None or roi.size == 0:
        cv2.imshow("PSL CNN + MediaPipe ROI", frame)
        if cv2.waitKey(1) == 27: break
        continue

    # dual-path inference: original and flipped
    logits_orig = logits_from_img(roi)
    logits_flip = logits_from_img(cv2.flip(roi, 1))

    # choose the path with higher confidence
    probs_o, idx_o, conf_o = softmax_conf(logits_orig)
    probs_f, idx_f, conf_f = softmax_conf(logits_flip)

    if conf_f > conf_o:
        chosen_logits = logits_flip
        pred_idx = idx_f
        conf = conf_f
        used = "flip"
    else:
        chosen_logits = logits_orig
        pred_idx = idx_o
        conf = conf_o
        used = "orig"

    # EMA smoothing over chosen logits
    if ema_logits is None:
        ema_logits = chosen_logits
    else:
        ema_logits = alpha * chosen_logits + (1 - alpha) * ema_logits

    # final probability after smoothing
    probs, idx_final, conf_final = softmax_conf(ema_logits)
    pred_label = idx_to_class[idx_final]

    # simple stability gate
    if conf_final > 0.65:
        stable_pred = pred_label
        cooldown = 5
    elif cooldown > 0:
        cooldown -= 1
    else:
        stable_pred = "..."

    # draw
    cv2.putText(frame, f"{stable_pred} {conf_final:.2f} [{used}]",
                (10, 40), cv2.FONT_HERSHEY_SIMPLEX, 1.1, (0,255,0), 2)

    cv2.imshow("PSL CNN + MediaPipe ROI", frame)
    if cv2.waitKey(1) == 27: break

cap.release()
cv2.destroyAllWindows()


  ckpt = torch.load(ARTF/"psl_mnv3_best.pth", map_location="cpu")
