# Preliminary

Import

In [3]:
import cv2
import torch
import numpy as np
import time
from torchvision import transforms
from torch import nn

# =========================
# 1. CNN Model Definition
# =========================

class CNN(nn.Module):
    """Three-stage convolutional classifier for single-channel square images.

    Each feature stage is Conv2d(3x3, padding=1) -> BatchNorm2d -> ReLU ->
    MaxPool2d(2) -> Dropout2d(0.25), producing 32, 64 and 128 channels.
    Every stage halves the spatial size, so an ``input_size`` x ``input_size``
    image leaves the feature extractor with shape
    ``(batch, 128, input_size // 8, input_size // 8)``. The classifier
    flattens that map and applies Linear -> ReLU -> Dropout(0.5) -> Linear.

    Args:
        num_classes: Number of output logits.
        input_size: Side length in pixels of the square input images. The
            default of 128 yields the ``128 * 16 * 16`` flattened size, so
            layer indices and parameter shapes are unchanged for checkpoints
            trained on 128x128 inputs.

    Raises:
        ValueError: If ``input_size`` is below 8, which would leave nothing
            after the three pooling layers.
    """

    def __init__(self, num_classes: int = 26, input_size: int = 128) -> None:
        super().__init__()

        # Three MaxPool2d(2) layers floor-divide the side length by 8 overall.
        pooled_size = input_size // 8
        if pooled_size < 1:
            raise ValueError(
                f"input_size must be at least 8, got {input_size}"
            )

        self.features = nn.Sequential(
            nn.Conv2d(1, 32, 3, padding=1), nn.BatchNorm2d(32), nn.ReLU(), nn.MaxPool2d(2), nn.Dropout2d(0.25),
            nn.Conv2d(32, 64, 3, padding=1), nn.BatchNorm2d(64), nn.ReLU(), nn.MaxPool2d(2), nn.Dropout2d(0.25),
            nn.Conv2d(64, 128, 3, padding=1), nn.BatchNorm2d(128), nn.ReLU(), nn.MaxPool2d(2), nn.Dropout2d(0.25)
        )

        self.classifier = nn.Sequential(
            nn.Flatten(),
            # The flattened size is derived from input_size instead of being
            # hard-coded, so it always matches the feature-map shape.
            nn.Linear(128 * pooled_size * pooled_size, 256), nn.ReLU(), nn.Dropout(0.5),
            nn.Linear(256, num_classes)
        )

    def forward(self, x):
        """Return class logits of shape (batch_size, num_classes) for ``x``.

        ``x`` is expected to be (batch_size, 1, input_size, input_size).
        """
        x = self.features(x)
        # Here x is (batch_size, 128, input_size // 8, input_size // 8);
        # the classifier's first layer flattens it.
        return self.classifier(x)

# =========================
# 2. Load Model
# =========================
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the 26-class network on `device`, restore the saved weights, and
# switch the module to evaluation mode.
# NOTE(review): torch.load unpickles the file - only load checkpoints that
# come from a trusted source.
model = CNN(num_classes=26)
model = model.to(device)
model.load_state_dict(
    torch.load("cnn_handsign_model_300_epoch.pth", map_location=device)
)
model.eval()

# Input image preprocessing: array -> PIL image -> single-channel grayscale
# -> 128x128 -> tensor.
transform = transforms.Compose(
    [
        transforms.ToPILImage(),
        transforms.Grayscale(1),
        transforms.Resize(size=(128, 128)),
        transforms.ToTensor(),
    ]
)

# Labels A-Z (code points 65..90), indexed by predicted class id.
LABELS = list(map(chr, range(ord("A"), ord("Z") + 1)))

# =========================
# 3. Frame Prediction
# =========================
def predict(frame):
    """Classify one image and return its predicted label.

    The image is passed through the module-level ``transform``, given a
    leading batch dimension, moved to ``device`` and fed to ``model`` with
    gradient tracking disabled. The index of the largest output along
    dimension 1 selects the entry of ``LABELS`` that is returned.

    NOTE(review): the caller passes OpenCV frames, which are presumably in
    BGR channel order, while the grayscale step likely assumes RGB - confirm
    this matches how the training images were preprocessed.
    """
    batch = transform(frame)
    batch = batch.unsqueeze(0)  # add the batch dimension expected by the model
    batch = batch.to(device)

    with torch.no_grad():
        logits = model(batch)
        best = torch.max(logits, dim=1).indices

    class_index = best.item()
    return LABELS[class_index]

# =========================
# 4. Run Camera
# =========================
# Reads frames from camera 0, classifies a centred region at most once every
# `capture_delay` seconds, appends the predicted letter to `sentence`, and
# shows the annotated frame until 'q' is pressed or no frame is returned.
cap = cv2.VideoCapture(0)
sentence = ""
last_capture_time = 0
capture_delay = 2  # seconds between two consecutive predictions
hand_detected = False  # not read anywhere in this section

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            # No frame was delivered; stop instead of processing nothing.
            break

        # Crop a box of up to 200x200 px around the frame centre (a proper
        # hand detector could replace this fixed region). Coordinates are
        # clamped to the frame so small frames never yield negative slice
        # indices, and shape[:2] also accepts frames without a channel axis.
        h, w = frame.shape[:2]
        x1, y1 = max(w // 2 - 100, 0), max(h // 2 - 100, 0)
        x2, y2 = min(w // 2 + 100, w), min(h // 2 + 100, h)
        roi = frame[y1:y2, x1:x2]

        current_time = time.time()

        # Predict BEFORE drawing anything: `roi` is a view into `frame`, so
        # overlays drawn first would become part of the model input.
        if current_time - last_capture_time > capture_delay:
            letter = predict(roi)
            sentence += letter
            last_capture_time = current_time

        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(frame, f"Huruf: {sentence[-1:]}", (10, 50),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
        cv2.putText(frame, f"Kalimat: {sentence}", (10, 100),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)

        cv2.imshow("Hand Sign Recognition", frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera and close the window, even if prediction or
    # display raised or the user interrupted the loop.
    cap.release()
    cv2.destroyAllWindows()
