In [1]:
import cv2
import torch
import torch.nn as nn
from torchvision import transforms, models
import numpy as np
from PIL import Image

# ASL class labels, indexed by the model's output position: the 26 letters
# followed by the three control signs.
# NOTE(review): presumably this order matches the label order used in training — confirm.
ASL_CLASSES = [chr(code) for code in range(ord('A'), ord('Z') + 1)] + ['del', 'nothing', 'space']

def create_resnet_model(num_classes):
    """Build a ResNet-18 whose final layer outputs `num_classes` scores.

    Args:
        num_classes: Output size of the replacement fully connected layer.

    Returns:
        The torchvision ResNet-18 module with its `fc` layer swapped out.
    """
    # NOTE(review): newer torchvision releases deprecate `pretrained=` in
    # favour of `weights=` — confirm against the installed version.
    backbone = models.resnet18(pretrained=False)
    feature_dim = backbone.fc.in_features
    backbone.fc = nn.Linear(feature_dim, num_classes)
    return backbone

def load_resnet_model(model_path, num_classes=29):
    """Restore a trained ResNet classifier from a state dict, ready for inference.

    Args:
        model_path: Path of the checkpoint handed to `torch.load`.
        num_classes: Size of the classification head; defaults to 29.

    Returns:
        The model with the saved weights loaded, switched to eval mode.
    """
    classifier = create_resnet_model(num_classes)
    # SECURITY NOTE: torch.load unpickles the file, so only load checkpoints
    # that come from a trusted source.
    state_dict = torch.load(model_path, map_location='cpu')
    classifier.load_state_dict(state_dict)
    classifier.eval()
    return classifier

def preprocess_frame(frame, image_size=224):
    """Convert a BGR OpenCV image into a normalised, batched input tensor.

    Args:
        frame: Image array in OpenCV's BGR channel order.
        image_size: Side length of the square the image is resized to.

    Returns:
        The transformed image with a leading batch axis added by `unsqueeze(0)`.
    """
    # OpenCV delivers BGR; the PIL-based transforms below work on RGB.
    as_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    pipeline = transforms.Compose([
        transforms.Resize((image_size, image_size)),
        transforms.ToTensor(),
        # NOTE(review): these look like the standard ImageNet mean/std —
        # confirm they match what the model was trained with.
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])

    batch = pipeline(as_pil).unsqueeze(0)
    return batch

def run_resnet_detection():
    """Run real-time ASL detection on the default webcam with the ResNet model.

    Loads the checkpoint at ``models/asl_resnet_model.pth``, classifies a
    centred square region of every frame and overlays a majority vote over the
    most recent predictions. Press 'q' in the preview window to quit.

    The camera and the preview window are released even when the loop exits
    through an exception (for example a kernel interrupt).
    """
    # Local import: this cell's import block does not bring in `collections`.
    from collections import Counter, deque

    # Load the model first so a missing checkpoint fails before the camera opens.
    model_path = "models/asl_resnet_model.pth"
    model = load_resnet_model(model_path)

    # Initialize camera
    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        print("Error: Could not open camera")
        cap.release()
        return
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)

    print("ResNet ASL Detection Started. Press 'q' to quit.")

    roi_size = 300
    confidence_threshold = 0.7
    history_size = 5
    # Bounded window of recent labels; the deque drops the oldest entry itself.
    prediction_history = deque(maxlen=history_size)

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Centre a square ROI. Shrink it if the camera delivers a frame
            # smaller than roi_size: negative offsets would otherwise slice
            # the wrong (or an empty) region.
            h, w = frame.shape[:2]
            side = min(roi_size, h, w)
            x1 = (w - side) // 2
            y1 = (h - side) // 2
            x2 = x1 + side
            y2 = y1 + side
            roi = frame[y1:y2, x1:x2]

            # Preprocess and predict
            input_tensor = preprocess_frame(roi)
            with torch.no_grad():
                outputs = model(input_tensor)
                probabilities = torch.softmax(outputs, dim=1)
                confidence, predicted = torch.max(probabilities, 1)

            predicted_class = ASL_CLASSES[predicted.item()]
            confidence_score = confidence.item()

            # Smooth predictions over the last `history_size` frames.
            prediction_history.append(predicted_class)

            if confidence_score > confidence_threshold:
                # Counter breaks ties by first occurrence, so the result is
                # deterministic (the oldest of the tied labels wins).
                most_common = Counter(prediction_history).most_common(1)[0][0]
                display_text = f"Prediction: {most_common} ({confidence_score:.2f})"
            else:
                display_text = "Low Confidence"

            # Draw ROI rectangle
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

            # Display prediction
            cv2.putText(frame, display_text, (10, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
            cv2.putText(frame, "ResNet Model", (10, 60),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
            cv2.putText(frame, "Place hand in green box", (10, h - 20),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)

            cv2.imshow('ASL Detection - ResNet', frame)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Always hand the camera back, even on an error or interrupt.
        cap.release()
        cv2.destroyAllWindows()

# Entry point: starts the webcam detection loop when this code runs as __main__.
if __name__ == "__main__":
    run_resnet_detection()



ResNet ASL Detection Started. Press 'q' to quit.


In [2]:
import cv2
import torch
import torch.nn as nn
from torchvision import transforms, models
import numpy as np
from PIL import Image
import time
from collections import deque, Counter
import json
import os

# ASL class labels by model output index: letters A-Z, then the control signs.
# NOTE(review): presumably this order matches the training label order — confirm.
ASL_CLASSES = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ") + ['del', 'nothing', 'space']

class FingerSpellingEngine:
    """Turn a stream of per-frame sign predictions into words and a sentence.

    Predictions are smoothed over a short buffer. A sign only takes effect once
    it has been the stable prediction for its hold time (letters, 'space' and
    'del' each have their own). Finished words are appended to ``sentence`` and
    mirrored in ``word_history`` so that the last word can be undone.
    """

    # Minimum number of agreeing high-confidence votes in the buffer before a
    # prediction is treated as stable.
    MIN_CONSISTENT_PREDICTIONS = 3

    def __init__(self):
        self.current_word = ""
        self.sentence = ""
        self.word_history = []
        self.last_prediction = ""
        self.last_prediction_time = 0
        self.prediction_hold_time = 1.5  # seconds to hold a letter
        self.delete_hold_time = 2.0      # seconds to hold delete
        self.space_hold_time = 1.0       # seconds to hold space

        # Vocabulary used for prefix suggestions.
        self.common_words = self.load_common_words()

        # Prediction smoothing
        self.prediction_buffer = deque(maxlen=10)
        self.confidence_threshold = 0.75

    def load_common_words(self):
        """Return the built-in set of common English words used for suggestions."""
        # You can replace this with a file containing common words
        common_words = [
            'hello', 'world', 'how', 'are', 'you', 'what', 'when', 'where',
            'why', 'who', 'good', 'bad', 'yes', 'no', 'please', 'thank',
            'sorry', 'help', 'love', 'like', 'want', 'need', 'time', 'day',
            'night', 'morning', 'afternoon', 'evening', 'food', 'water',
            'home', 'work', 'school', 'friend', 'family', 'happy', 'sad',
            'the', 'and', 'for', 'with', 'this', 'that', 'have', 'will',
            'can', 'could', 'should', 'would', 'make', 'take', 'give',
            'get', 'go', 'come', 'see', 'know', 'think', 'feel', 'look'
        ]
        return set(common_words)

    def smooth_prediction(self, predicted_class, confidence):
        """Add a raw prediction to the buffer and return the stable one, if any.

        Args:
            predicted_class: Label predicted for the current frame.
            confidence: Confidence of that prediction.

        Returns:
            ``(label, average_confidence)`` when one label has at least
            ``MIN_CONSISTENT_PREDICTIONS`` buffered votes above the confidence
            threshold, otherwise ``(None, 0)``.
        """
        self.prediction_buffer.append((predicted_class, confidence))

        # Only votes above the confidence threshold are counted.
        votes = Counter(
            pred for pred, conf in self.prediction_buffer
            if conf > self.confidence_threshold
        )
        if not votes:
            return None, 0

        # The winner itself must have enough votes. Counting all
        # high-confidence entries regardless of label would let three
        # different signs pass as one "consistent" prediction.
        most_common_pred, count = votes.most_common(1)[0]
        if count < self.MIN_CONSISTENT_PREDICTIONS:
            return None, 0

        # Average confidence over every buffered occurrence of the winner.
        avg_confidence = float(np.mean(
            [conf for pred, conf in self.prediction_buffer if pred == most_common_pred]
        ))
        return most_common_pred, avg_confidence

    def process_prediction(self, predicted_class, confidence):
        """Feed one raw prediction in and apply it once it has been held long enough."""
        current_time = time.time()

        smoothed_pred, _ = self.smooth_prediction(predicted_class, confidence)
        if smoothed_pred is None:
            return

        if smoothed_pred != self.last_prediction:
            # New stable sign: start timing how long it is held.
            self.last_prediction = smoothed_pred
            self.last_prediction_time = current_time
            return

        hold_time = current_time - self.last_prediction_time
        if smoothed_pred == 'del':
            triggered = hold_time > self.delete_hold_time
            if triggered:
                self.handle_delete()
        elif smoothed_pred == 'space':
            triggered = hold_time > self.space_hold_time
            if triggered:
                self.handle_space()
        elif smoothed_pred != 'nothing':
            triggered = hold_time > self.prediction_hold_time
            if triggered:
                self.add_letter(smoothed_pred)
        else:
            triggered = False

        if triggered:
            # Restart the timer so a sign that is still held repeats after
            # another full hold period.
            self.last_prediction_time = current_time

    def add_letter(self, letter):
        """Append `letter` (lower-cased) to the current word; control signs are ignored."""
        if letter not in ['del', 'space', 'nothing']:
            self.current_word += letter.lower()

    def handle_delete(self):
        """Delete one character from the current word, or else from the sentence.

        When the deletion reaches into the sentence, ``word_history`` is
        updated too, so that a later ``undo_last_word`` cannot resurrect
        deleted characters. A word that becomes empty is removed together with
        its separating space.
        """
        if self.current_word:
            self.current_word = self.current_word[:-1]
            return
        if not self.sentence:
            return

        if self.word_history and self.sentence == " ".join(self.word_history):
            shortened = self.word_history[-1][:-1]
            if shortened:
                self.word_history[-1] = shortened
            else:
                self.word_history.pop()
            self.sentence = " ".join(self.word_history)
        else:
            # Sentence and history disagree (e.g. an edited session file):
            # fall back to plain character deletion.
            self.sentence = self.sentence[:-1]

    def handle_space(self):
        """Finalize the current word: record it in the history and append it to the sentence."""
        if not self.current_word:
            return
        self.word_history.append(self.current_word)
        if self.sentence:
            self.sentence += " " + self.current_word
        else:
            self.sentence = self.current_word
        self.current_word = ""

    def get_word_suggestions(self):
        """Return up to three common words (sorted) that start with the current word."""
        if len(self.current_word) < 2:
            return []
        prefix = self.current_word.lower()
        matches = [word for word in self.common_words if word.startswith(prefix)]
        return sorted(matches)[:3]

    def get_status(self):
        """Return a snapshot dict of the engine state for display."""
        return {
            'current_word': self.current_word,
            'sentence': self.sentence,
            'suggestions': self.get_word_suggestions(),
            'last_prediction': self.last_prediction,
            'word_count': len(self.word_history)
        }

    def clear_all(self):
        """Clear the current word, the sentence and the word history."""
        self.current_word = ""
        self.sentence = ""
        self.word_history = []

    def undo_last_word(self):
        """Remove the most recently finalized word and rebuild the sentence."""
        if self.word_history:
            self.word_history.pop()
            self.sentence = " ".join(self.word_history)

    def save_session(self, filename="asl_session.json"):
        """Write the sentence, word history and a timestamp to `filename` as JSON."""
        session_data = {
            'sentence': self.sentence,
            'word_history': self.word_history,
            'timestamp': time.time()
        }

        with open(filename, 'w') as f:
            json.dump(session_data, f)

    def load_session(self, filename="asl_session.json"):
        """Restore sentence and word history from `filename` if it is usable.

        A missing, unreadable or malformed file leaves the engine unchanged
        instead of raising.

        Returns:
            True if a session was loaded, False otherwise.
        """
        if not os.path.exists(filename):
            return False
        try:
            with open(filename, 'r') as f:
                session_data = json.load(f)
        except (OSError, ValueError) as exc:  # JSONDecodeError is a ValueError
            print(f"Could not load session from {filename}: {exc}")
            return False
        if not isinstance(session_data, dict):
            print(f"Could not load session from {filename}: unexpected format")
            return False

        self.sentence = session_data.get('sentence', '')
        self.word_history = session_data.get('word_history', [])
        return True

def create_resnet_model(num_classes):
    """Return a ResNet-18 with its classifier head resized to `num_classes`.

    Args:
        num_classes: Number of output units for the new `fc` layer.

    Returns:
        The modified torchvision ResNet-18 module.
    """
    # NOTE(review): `pretrained=` is deprecated in newer torchvision in favour
    # of `weights=` — confirm against the installed version.
    net = models.resnet18(pretrained=False)
    head_inputs = net.fc.in_features
    net.fc = nn.Linear(head_inputs, num_classes)
    return net

def load_resnet_model(model_path, num_classes=29):
    """Build the ResNet classifier and load its trained weights for inference.

    Args:
        model_path: Checkpoint path passed to `torch.load`.
        num_classes: Size of the classification head; defaults to 29.

    Returns:
        The model in eval mode with the saved state dict applied.
    """
    net = create_resnet_model(num_classes)
    # SECURITY NOTE: torch.load unpickles the file — load trusted checkpoints only.
    weights = torch.load(model_path, map_location='cpu')
    net.load_state_dict(weights)
    net.eval()
    return net

def preprocess_frame(frame, image_size=224):
    """Prepare a BGR OpenCV image as a single-item batch for the ResNet.

    Args:
        frame: Image array in OpenCV's BGR channel order.
        image_size: Side length of the square the image is resized to.

    Returns:
        The resized, tensor-converted and normalised image with a batch axis
        prepended via `unsqueeze(0)`.
    """
    # Swap BGR -> RGB and hand the result to PIL for the torchvision transforms.
    rgb_pixels = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    image = Image.fromarray(rgb_pixels)

    steps = [
        transforms.Resize((image_size, image_size)),
        transforms.ToTensor(),
        # NOTE(review): looks like the usual ImageNet mean/std — confirm it
        # matches the training pipeline.
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ]
    to_model_input = transforms.Compose(steps)

    return to_model_input(image).unsqueeze(0)

def draw_ui(frame, spelling_engine, predicted_class, confidence, fps):
    """Overlay the finger-spelling HUD on `frame` in place.

    Draws two darkened panels (top and bottom), the current sign with its
    confidence, the FPS, the word in progress, suggestions, the sentence,
    usage instructions and a hold-progress bar.

    Args:
        frame: BGR image to draw on; it is modified in place.
        spelling_engine: Engine providing `get_status()` and the hold timings.
        predicted_class: Label shown as the current sign.
        confidence: Confidence shown next to the label.
        fps: Frames-per-second value to display.
    """
    font = cv2.FONT_HERSHEY_SIMPLEX
    white = (255, 255, 255)
    black = (0, 0, 0)

    def put(text, origin, scale, color, thickness):
        # Thin wrapper so every label shares the same font.
        cv2.putText(frame, text, origin, font, scale, color, thickness)

    h, w = frame.shape[:2]
    status = spelling_engine.get_status()

    # Darkened panels: paint them solid on a copy, then blend the copy back.
    overlay = frame.copy()
    cv2.rectangle(overlay, (0, 0), (w, 120), black, -1)          # top panel
    cv2.rectangle(overlay, (0, h-200), (w, h), black, -1)        # bottom panel
    alpha = 0.7
    cv2.addWeighted(overlay, alpha, frame, 1 - alpha, 0, frame)

    # Top panel: prediction, FPS, word in progress, suggestions.
    put(f"Sign: {predicted_class} ({confidence:.2f})", (10, 30), 0.8, (0, 255, 0), 2)
    put(f"FPS: {fps:.1f}", (w-100, 30), 0.6, white, 2)
    put(f"Current Word: {status['current_word']}_", (10, 60), 0.7, (255, 255, 0), 2)

    suggestions = status['suggestions']
    if suggestions:
        put(f"Suggestions: {', '.join(suggestions)}", (10, 90), 0.5, (200, 200, 200), 1)

    # Bottom panel: sentence, wrapped every 80 characters, last three lines only.
    sentence_y_start = h - 180
    put("Sentence:", (10, sentence_y_start), 0.6, white, 2)

    sentence = status['sentence']
    if len(sentence) > 80:
        chunks = [sentence[start:start+80] for start in range(0, len(sentence), 80)]
    else:
        chunks = [sentence]
    for row, chunk in enumerate(chunks[-3:]):
        put(chunk, (10, sentence_y_start + 30 + row*25), 0.6, white, 1)

    # Instructions
    start_y = sentence_y_start + 80
    instructions = (
        "Hold gesture for 1.5s to add letter",
        "Hold SPACE for 1s to add word",
        "Hold DEL for 2s to delete",
        "Keys: C-clear, U-undo, S-save, Q-quit",
    )
    for row, instruction in enumerate(instructions):
        put(instruction, (10, start_y + row*20), 0.4, (150, 150, 150), 1)

    # Hold-progress bar for every stable prediction except 'nothing'.
    last_prediction = status['last_prediction']
    if last_prediction == 'nothing':
        return

    hold_time = time.time() - spelling_engine.last_prediction_time
    timings = {
        'del': (spelling_engine.delete_hold_time, (0, 0, 255)),    # red
        'space': (spelling_engine.space_hold_time, (255, 0, 0)),   # blue
    }
    required_time, color = timings.get(
        last_prediction, (spelling_engine.prediction_hold_time, (0, 255, 0))  # green
    )

    if hold_time < required_time:
        bar_width = int(200 * (hold_time / required_time))
        cv2.rectangle(frame, (w-220, 50), (w-220 + bar_width, 70), color, -1)
        cv2.rectangle(frame, (w-220, 50), (w-20, 70), white, 2)

def run_enhanced_asl_detection():
    """Run webcam ASL detection with finger spelling until the user quits.

    Loads the ResNet checkpoint, restores the previous session if one exists,
    then classifies a square region of every mirrored frame and feeds the
    result to a `FingerSpellingEngine`.

    Keys: q quits, c clears, u undoes the last word, s saves and l reloads the
    session. The camera and window are released and the session is saved when
    the loop ends, including when it ends through an exception or interrupt.
    """
    # Load model
    model_path = "models/asl_resnet_model.pth"

    try:
        model = load_resnet_model(model_path)
        print("Model loaded successfully!")
    except Exception as e:
        print(f"Error loading model: {e}")
        # The f-prefix matters here: without it the placeholder is printed literally.
        print(f"Make sure the model file exists at: {model_path}")
        return

    # Initialize finger spelling engine
    spelling_engine = FingerSpellingEngine()

    # Try to load previous session; a corrupt file must not abort startup.
    try:
        spelling_engine.load_session()
    except (OSError, ValueError) as e:  # JSONDecodeError is a ValueError
        print(f"Could not load previous session: {e}")

    # Initialize camera
    cap = cv2.VideoCapture(0)
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)
    cap.set(cv2.CAP_PROP_FPS, 30)

    if not cap.isOpened():
        print("Error: Could not open camera")
        cap.release()
        return

    print("Enhanced ASL Detection with Finger Spelling Started!")
    print("Controls:")
    print("  Hold gestures to spell words")
    print("  C - Clear all")
    print("  U - Undo last word")
    print("  S - Save session")
    print("  Q - Quit")

    # FPS calculation
    fps_counter = 0
    fps_start_time = time.time()
    current_fps = 0

    roi_size = 300

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                print("Error: Could not read frame")
                break

            # Flip frame horizontally for mirror effect
            frame = cv2.flip(frame, 1)

            # Update FPS every 30 frames
            fps_counter += 1
            if fps_counter % 30 == 0:
                now = time.time()
                elapsed = now - fps_start_time
                if elapsed > 0:  # guard against a zero-length interval
                    current_fps = 30 / elapsed
                fps_start_time = now

            # Square ROI, centred horizontally and moved up a bit. The camera
            # may ignore the requested resolution, so shrink and clamp the box
            # to the real frame; negative offsets would slice the wrong region.
            h, w = frame.shape[:2]
            side = min(roi_size, h, w)
            x1 = (w - side) // 2
            y1 = max(0, (h - side) // 2 - 50)
            x2 = x1 + side
            y2 = y1 + side
            roi = frame[y1:y2, x1:x2]

            # Preprocess and predict
            try:
                input_tensor = preprocess_frame(roi)

                with torch.no_grad():
                    outputs = model(input_tensor)
                    probabilities = torch.softmax(outputs, dim=1)
                    confidence, predicted = torch.max(probabilities, 1)

                    predicted_class = ASL_CLASSES[predicted.item()]
                    confidence_score = confidence.item()

                # Process prediction with spelling engine
                spelling_engine.process_prediction(predicted_class, confidence_score)

            except Exception as e:
                print(f"Prediction error: {e}")
                predicted_class = "error"
                confidence_score = 0

            # Draw ROI rectangle
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

            # Draw enhanced UI
            draw_ui(frame, spelling_engine, predicted_class, confidence_score, current_fps)

            cv2.imshow('Enhanced ASL Detection - Finger Spelling', frame)

            # Handle keyboard input
            key = cv2.waitKey(1) & 0xFF

            if key == ord('q'):
                break
            elif key == ord('c'):
                spelling_engine.clear_all()
                print("Cleared all text")
            elif key == ord('u'):
                spelling_engine.undo_last_word()
                print("Undid last word")
            elif key == ord('s'):
                spelling_engine.save_session()
                print("Session saved")
            elif key == ord('l'):
                spelling_engine.load_session()
                print("Session loaded")
    finally:
        # Release the camera first so it is freed even if saving fails.
        cap.release()
        cv2.destroyAllWindows()

        # Save session before quitting
        spelling_engine.save_session()
        print(f"Final sentence: {spelling_engine.sentence}")

# Entry point: starts the finger-spelling webcam loop when this code runs as __main__.
if __name__ == "__main__":
    run_enhanced_asl_detection()

Model loaded successfully!
Enhanced ASL Detection with Finger Spelling Started!
Controls:
  Hold gestures to spell words
  C - Clear all
  U - Undo last word
  S - Save session
  Q - Quit
Final sentence: wbcciiiiiiii


In [11]:
# enhanced_asl_detection.py
# Requirements:
#   pip install torch torchvision opencv-python Pillow pyttsx3 SpeechRecognition
#   (optional) pip install pyaudio   -> required for live microphone STT

import cv2
import time
import json
import os
import threading
from collections import deque, Counter
from PIL import Image
import numpy as np

# Try optional imports
try:
    import torch
    import torch.nn as nn
    from torchvision import transforms, models
except Exception as e:
    torch = None
    print("Warning: torch not available. Model inference will fail. Install torch to enable prediction.")

try:
    import speech_recognition as sr
except Exception:
    sr = None

try:
    import pyttsx3
except Exception:
    pyttsx3 = None

# ASL classes (29): letters A-Z followed by the control signs, indexed by
# model output position.
# NOTE(review): presumably the same order as the training labels — confirm.
_LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
ASL_CLASSES = [*_LETTERS, 'del', 'nothing', 'space']

# ---------------------------
# Accuracy tracker
# ---------------------------
class AccuracyTracker:
    """Log raw predictions and user-entered ground truths, and score them.

    Each ground truth is compared with the latest prediction logged at or
    before the moment the ground truth was entered.
    """

    def __init__(self):
        self.predictions = []    # list of {'time', 'predicted', 'confidence'} dicts
        self.ground_truths = []  # list of {'time', 'ground_truth'} dicts

    def log_prediction(self, predicted, confidence):
        """Record one raw prediction with the current timestamp."""
        self.predictions.append({
            'time': time.time(),
            'predicted': predicted,
            'confidence': float(confidence)
        })

    def add_ground_truth(self, gt):
        """Record a ground-truth label with the current timestamp."""
        # The link to a prediction is made later, by time, in compute_accuracy.
        self.ground_truths.append({
            'time': time.time(),
            'ground_truth': gt
        })

    def _latest_prediction_at(self, timestamp):
        """Return the last prediction with time <= `timestamp`, or None.

        Binary search over `self.predictions`. Assumes entries are in
        non-decreasing time order, which holds when they are only added
        through `log_prediction`.
        """
        lo, hi = 0, len(self.predictions)
        while lo < hi:
            mid = (lo + hi) // 2
            if self.predictions[mid]['time'] <= timestamp:
                lo = mid + 1
            else:
                hi = mid
        return self.predictions[lo - 1] if lo else None

    def compute_accuracy(self):
        """Score ground truths against the prediction preceding each of them.

        Returns:
            ``(accuracy, correct, matched)``. ``matched`` counts ground truths
            that had at least one earlier prediction, and the comparison is
            case-insensitive. Returns ``(0.0, 0, 0)`` when either log is empty.
        """
        if not self.ground_truths or not self.predictions:
            return 0.0, 0, 0
        correct = 0
        matched = 0
        # This runs on every status refresh, so each lookup is O(log P)
        # instead of filtering the whole prediction log per ground truth.
        for gt in self.ground_truths:
            last_pred = self._latest_prediction_at(gt['time'])
            if last_pred is None:
                continue
            matched += 1
            if last_pred['predicted'].lower() == gt['ground_truth'].lower():
                correct += 1
        acc = (correct / matched) if matched else 0.0
        return acc, correct, matched

    def export(self, fname="accuracy_log.json"):
        """Write both logs and the computed accuracy to `fname` as JSON."""
        data = {
            'predictions': self.predictions,
            'ground_truths': self.ground_truths,
            'computed_accuracy': self.compute_accuracy()
        }
        with open(fname, 'w') as f:
            json.dump(data, f, indent=2)

# ---------------------------
# FingerSpellingEngine (cleaned)
# ---------------------------
class FingerSpellingEngine:
    """Turn per-frame sign predictions into words and a sentence.

    Predictions are smoothed over a short buffer. A sign takes effect once it
    has been the stable prediction for its hold time. Finished words are
    appended to ``sentence`` and mirrored in ``word_history`` for undo. Every
    raw prediction is also logged to an `AccuracyTracker`, and letters/words
    can optionally be spoken through pyttsx3.
    """

    # Minimum number of agreeing high-confidence votes in the buffer before a
    # prediction is treated as stable.
    MIN_CONSISTENT_PREDICTIONS = 3

    def __init__(self, auto_speak=False):
        """Create an empty engine.

        Args:
            auto_speak: When True, speak each added letter and finished word
                (only if a TTS engine could be initialised).
        """
        self.current_word = ""
        self.sentence = ""
        self.word_history = []
        self.last_prediction = ""
        self.last_prediction_time = 0.0
        self.prediction_hold_time = 1.5  # seconds
        self.delete_hold_time = 2.0
        self.space_hold_time = 1.0

        # smoothing
        self.prediction_buffer = deque(maxlen=10)
        self.confidence_threshold = 0.75

        # common words for suggestions
        self.common_words = self.load_common_words()

        # Audio: the TTS engine is optional and stays None if it cannot start.
        self.auto_speak = auto_speak
        self.tts_engine = None
        if pyttsx3 is not None:
            try:
                self.tts_engine = pyttsx3.init()
            except Exception as exc:
                print(f"TTS engine could not be initialised: {exc}")
                self.tts_engine = None
        # Accuracy tracker
        self.accuracy_tracker = AccuracyTracker()

    def load_common_words(self):
        """Return the built-in set of common words used for suggestions."""
        basic = [
            'hello','world','how','are','you','what','when','where','why','who',
            'good','bad','yes','no','please','thank','sorry','help','love','like',
            'want','need','time','day','night','morning','afternoon','evening',
            'food','water','home','work','school','friend','family','happy','sad',
            'the','and','for','with','this','that','have','will','can','could'
        ]
        return set(basic)

    def smooth_prediction(self, predicted_class, confidence):
        """Add a raw prediction to the buffer and return the stable one, if any.

        Returns:
            ``(label, average_confidence)`` when one label has at least
            ``MIN_CONSISTENT_PREDICTIONS`` buffered votes above the confidence
            threshold, otherwise ``(None, 0.0)``.
        """
        self.prediction_buffer.append((predicted_class, confidence))
        votes = Counter(p for p, c in self.prediction_buffer if c > self.confidence_threshold)
        if not votes:
            return None, 0.0
        # The winning label itself needs enough votes. Three different
        # high-confidence labels are noise, not a stable sign.
        pred, count = votes.most_common(1)[0]
        if count < self.MIN_CONSISTENT_PREDICTIONS:
            return None, 0.0
        avg_conf = np.mean([c for p, c in self.prediction_buffer if p == pred])
        return pred, float(avg_conf)

    def process_prediction(self, predicted_class, confidence):
        """Log one raw prediction and apply it once it has been held long enough."""
        now = time.time()
        # log every raw prediction for accuracy analysis
        self.accuracy_tracker.log_prediction(predicted_class, confidence)

        smoothed, _ = self.smooth_prediction(predicted_class, confidence)
        if smoothed is None:
            return

        if smoothed != self.last_prediction:
            # New stable sign: start timing how long it is held.
            self.last_prediction = smoothed
            self.last_prediction_time = now
            return

        hold_time = now - self.last_prediction_time
        if smoothed == 'del':
            triggered = hold_time > self.delete_hold_time
            if triggered:
                self.handle_delete()
        elif smoothed == 'space':
            triggered = hold_time > self.space_hold_time
            if triggered:
                self.handle_space()
        elif smoothed != 'nothing':
            triggered = hold_time > self.prediction_hold_time
            if triggered:
                self.add_letter(smoothed)
        else:
            triggered = False

        if triggered:
            # Restart the timer so a sign that is still held repeats after
            # another full hold period.
            self.last_prediction_time = now

    def _speak(self, text):
        """Speak `text` when auto-speak is on and a TTS engine is available."""
        if not (self.auto_speak and self.tts_engine):
            return
        try:
            self.tts_engine.say(text)
            # NOTE(review): runAndWait blocks the caller until speech ends,
            # which stalls the video loop — consider a worker thread.
            self.tts_engine.runAndWait()
        except Exception as exc:
            # Speech is best-effort: report the failure but keep spelling.
            print(f"TTS error: {exc}")

    def add_letter(self, letter):
        """Append `letter` (lower-cased) to the current word; control signs are ignored."""
        if letter not in ['del','space','nothing']:
            self.current_word += letter.lower()
            self._speak(letter)

    def handle_delete(self):
        """Delete one character from the current word, or else from the sentence.

        When the deletion reaches into the sentence, ``word_history`` is
        updated too, so that a later ``undo_last_word`` cannot resurrect
        deleted characters. A word that becomes empty is removed together with
        its separating space.
        """
        if self.current_word:
            self.current_word = self.current_word[:-1]
            return
        if not self.sentence:
            return

        if self.word_history and self.sentence == " ".join(self.word_history):
            shortened = self.word_history[-1][:-1]
            if shortened:
                self.word_history[-1] = shortened
            else:
                self.word_history.pop()
            self.sentence = " ".join(self.word_history)
        else:
            # Sentence and history disagree (e.g. text appended from outside
            # the engine): fall back to plain character deletion.
            self.sentence = self.sentence[:-1]

    def handle_space(self):
        """Finalize the current word: record it, append it to the sentence, speak it."""
        if not self.current_word:
            return
        self.word_history.append(self.current_word)
        if self.sentence:
            self.sentence += " " + self.current_word
        else:
            self.sentence = self.current_word
        self._speak(self.current_word)
        self.current_word = ""

    def get_word_suggestions(self):
        """Return up to three common words (sorted) that start with the current word."""
        if len(self.current_word) < 2:
            return []
        s = self.current_word.lower()
        suggestions = [w for w in self.common_words if w.startswith(s)]
        return sorted(suggestions)[:3]

    def get_status(self):
        """Return a snapshot dict of the engine state, including tracked accuracy."""
        acc, correct, matched = self.accuracy_tracker.compute_accuracy()
        return {
            'current_word': self.current_word,
            'sentence': self.sentence,
            'suggestions': self.get_word_suggestions(),
            'last_prediction': self.last_prediction,
            'word_count': len(self.word_history),
            'accuracy': acc,
            'accuracy_details': (correct, matched)
        }

    def clear_all(self):
        """Clear the current word, the sentence and the word history."""
        self.current_word = ""
        self.sentence = ""
        self.word_history = []

    def undo_last_word(self):
        """Remove the most recently finalized word and rebuild the sentence."""
        if self.word_history:
            self.word_history.pop()
            self.sentence = " ".join(self.word_history)

    def save_session(self, filename="asl_session.json"):
        """Write the sentence, word history and a timestamp to `filename` as JSON."""
        data = {
            'sentence': self.sentence,
            'word_history': self.word_history,
            'timestamp': time.time()
        }
        with open(filename, 'w') as f:
            json.dump(data, f)

    def load_session(self, filename="asl_session.json"):
        """Restore sentence and word history from `filename` if it is usable.

        A missing, unreadable or malformed file leaves the engine unchanged
        instead of raising.

        Returns:
            True if a session was loaded, False otherwise.
        """
        if not os.path.exists(filename):
            return False
        try:
            with open(filename, 'r') as f:
                data = json.load(f)
        except (OSError, ValueError) as exc:  # JSONDecodeError is a ValueError
            print(f"Could not load session from {filename}: {exc}")
            return False
        if not isinstance(data, dict):
            print(f"Could not load session from {filename}: unexpected format")
            return False
        self.sentence = data.get('sentence','')
        self.word_history = data.get('word_history',[])
        return True

# ---------------------------
# Model utilities
# ---------------------------
def create_resnet_model(num_classes):
    """Create a ResNet-18 and replace its final layer with a `num_classes`-way head.

    Args:
        num_classes: Number of output units for the new `fc` layer.

    Returns:
        The modified torchvision ResNet-18 module.
    """
    # NOTE(review): `pretrained=` is deprecated in newer torchvision in favour
    # of `weights=` — confirm against the installed version.
    resnet = models.resnet18(pretrained=False)
    in_dim = resnet.fc.in_features
    resnet.fc = nn.Linear(in_dim, num_classes)
    return resnet

def load_resnet_model(model_path, num_classes=29):
    """Load the trained ResNet classifier for inference.

    Args:
        model_path: Checkpoint path passed to `torch.load`.
        num_classes: Size of the classification head; defaults to 29.

    Returns:
        The model in eval mode with the saved state dict applied.

    Raises:
        RuntimeError: If the optional torch import failed (`torch is None`).
    """
    if torch is None:
        raise RuntimeError("torch not available")

    net = create_resnet_model(num_classes)
    # SECURITY NOTE: torch.load unpickles the file — load trusted checkpoints only.
    net.load_state_dict(torch.load(model_path, map_location='cpu'))
    net.eval()
    return net

def preprocess_frame(frame, image_size=224):
    """Convert a BGR image (the ROI) into a normalised single-item batch.

    Args:
        frame: Image array in OpenCV's BGR channel order.
        image_size: Side length of the square the image is resized to.

    Returns:
        The transformed image with a batch axis prepended via `unsqueeze(0)`.
    """
    # BGR -> RGB, then wrap as a PIL image for the torchvision transforms.
    pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    resize = transforms.Resize((image_size, image_size))
    to_tensor = transforms.ToTensor()
    # NOTE(review): looks like the usual ImageNet mean/std — confirm it
    # matches the training pipeline.
    normalize = transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])
    pipeline = transforms.Compose([resize, to_tensor, normalize])

    tensor = pipeline(pil_image)
    return tensor.unsqueeze(0)

# ---------------------------
# UI drawing
# ---------------------------
def draw_ui(frame, spelling_engine, predicted_class, confidence, fps):
    """Overlay the finger-spelling HUD on `frame` in place.

    Draws two darkened panels, the current sign with its confidence, the FPS,
    the tracked accuracy, the word in progress, suggestions, the sentence and
    the usage instructions.

    Args:
        frame: BGR image to draw on; it is modified in place.
        spelling_engine: Engine providing `get_status()`.
        predicted_class: Label shown as the current sign.
        confidence: Confidence shown next to the label.
        fps: Frames-per-second value to display.
    """
    font = cv2.FONT_HERSHEY_SIMPLEX
    h, w = frame.shape[:2]
    status = spelling_engine.get_status()

    # Darkened panels: paint them solid on a copy, then blend the copy back.
    overlay = frame.copy()
    cv2.rectangle(overlay, (0,0),(w,120), (0,0,0), -1)
    cv2.rectangle(overlay, (0,h-200),(w,h), (0,0,0), -1)
    alpha = 0.6
    cv2.addWeighted(overlay, alpha, frame, 1-alpha, 0, frame)

    pred_text = f"Sign: {predicted_class} ({confidence:.2f})"
    cv2.putText(frame, pred_text, (10, 26), font, 0.7, (0,255,0), 2)
    cv2.putText(frame, f"FPS: {fps:.1f}", (w-140, 26), font, 0.6, (255,255,255), 1)
    correct, matched = status['accuracy_details']
    acc_text = f"Accuracy: {status['accuracy']:.1%} ({correct}/{matched})"
    cv2.putText(frame, acc_text, (w-400, 52), font, 0.5, (255,255,255), 1)

    word_text = f"Current Word: {status['current_word']}_"
    cv2.putText(frame, word_text, (10, 56), font, 0.7, (255,255,0), 2)

    suggestions = status['suggestions']
    if suggestions:
        cv2.putText(frame, "Suggestions: " + ", ".join(suggestions), (10, 86), font, 0.5, (200,200,200), 1)

    # Sentence, wrapped every 80 characters; only the last three lines are shown.
    sentence_y = h - 170
    cv2.putText(frame, "Sentence:", (10, sentence_y), font, 0.6, (255,255,255), 1)
    sentence = status['sentence']
    if len(sentence) > 80:
        lines = [sentence[i:i+80] for i in range(0, len(sentence), 80)]
        for i, line in enumerate(lines[-3:]):
            cv2.putText(frame, line, (10, sentence_y + 25 + i*22), font, 0.55, (255,255,255), 1)
    else:
        cv2.putText(frame, sentence, (10, sentence_y + 25), font, 0.6, (255,255,255), 1)

    instructions = ["Hold gesture to add letter", "Hold SPACE to add word", "Hold DEL to delete",
                    "Keys: C-clear U-undo S-save G-enter-gt Q-quit A-toggle-tts V-toggle-stt E-export-accuracy"]
    # Stack the lines 18 px apart below the sentence area. The previous
    # offset of `i*0` drew all four lines on top of each other.
    instructions_y = h - 78
    line_spacing = 18
    for i, inst in enumerate(instructions):
        cv2.putText(frame, inst, (10, instructions_y + i*line_spacing), font, 0.4, (180,180,180), 1)

# ---------------------------
# Optional STT thread
# ---------------------------
class SpeechListener(threading.Thread):
    """Daemon thread that appends recognized speech to the spelling engine's sentence.

    The thread repeatedly records a phrase from the microphone, sends it to
    ``recognize_google`` and appends the returned text to ``engine.sentence``.
    If the speech-recognition module or the microphone is unavailable the
    thread starts and exits immediately.

    NOTE(review): ``engine.sentence`` is updated from this thread without any
    locking while the main loop also uses the engine; confirm that is acceptable.
    """

    # Seconds to wait for speech to begin before giving up on one listen() call.
    # Without a timeout listen() blocks until someone speaks, so a stop()
    # request would never be noticed in a quiet room.
    LISTEN_TIMEOUT = 2.0

    def __init__(self, spelling_engine):
        """Store the engine and try to create the recognizer and microphone.

        Args:
            spelling_engine: Object whose ``sentence``, ``auto_speak`` and
                ``tts_engine`` attributes are read/updated by ``run``.
        """
        super().__init__(daemon=True)
        self.engine = spelling_engine
        self.running = True
        self.recognizer = None
        self.microphone = None
        if sr is not None:
            try:
                self.recognizer = sr.Recognizer()
                self.microphone = sr.Microphone()
            except Exception:
                # Leave both unset so run() returns immediately.
                self.recognizer = None
                self.microphone = None
                print("SpeechRecognition or microphone not available. STT disabled.")
        else:
            print("SpeechRecognition not installed. STT disabled.")

    def run(self):
        """Listen/recognize loop; returns once ``stop()`` has been called."""
        if not self.recognizer or not self.microphone:
            return
        try:
            with self.microphone as source:
                self.recognizer.adjust_for_ambient_noise(source, duration=1)
        except Exception as exc:
            # Opening the input device can fail even though the Microphone
            # object was constructed; end the thread with a message instead of
            # an unhandled traceback.
            print("Microphone calibration failed. STT disabled:", exc)
            return
        while self.running:
            try:
                with self.microphone as source:
                    audio = self.recognizer.listen(
                        source, timeout=self.LISTEN_TIMEOUT, phrase_time_limit=4
                    )
                text = self.recognizer.recognize_google(audio)  # online; change if offline engine used
                # append recognized text
                if text:
                    if self.engine.sentence:
                        self.engine.sentence += " " + text
                    else:
                        self.engine.sentence = text
                    # optionally TTS
                    if self.engine.auto_speak and self.engine.tts_engine:
                        try:
                            self.engine.tts_engine.say(text)
                            self.engine.tts_engine.runAndWait()
                        except Exception as exc:
                            # Speaking is best-effort, but say why it failed.
                            print("[STT] TTS playback failed:", exc)
                    print("[STT] Recognized:", text)
            except sr.WaitTimeoutError:
                # Nobody spoke within LISTEN_TIMEOUT; loop to re-check self.running.
                continue
            except Exception:
                # avoid spamming errors
                time.sleep(0.5)

    def stop(self):
        """Ask the loop in ``run`` to exit after the current listen/recognize step."""
        self.running = False

# ---------------------------
# Main run loop
# ---------------------------
def run_enhanced_asl_detection(model_path="models/asl_resnet_model.pth", use_stt=False):
    """Run the webcam loop: classify the ROI, feed the spelling engine, draw the UI.

    Each frame is mirrored, a 300-pixel square region of interest is cut out
    around the centre (shifted 50 px up), classified, and the prediction is
    passed to ``FingerSpellingEngine.process_prediction``. Keyboard shortcuts
    handled in the loop: q quit, c clear, u undo, s save, l load,
    g enter ground truth (blocks on ``input``), a toggle auto-speak,
    v toggle speech-to-text, e export accuracy.

    Args:
        model_path: Path handed to ``load_resnet_model``. If loading fails the
            loop still runs and reports "nothing" with zero confidence.
        use_stt: When true, a ``SpeechListener`` thread is started up front.

    Returns:
        None. On exit the session is saved and the final sentence and
        accuracy are printed.
    """
    # Model load
    model = None
    try:
        model = load_resnet_model(model_path)
        print("Model loaded:", model_path)
    except Exception as e:
        print("Warning: could not load model:", e)
        print("Make sure model exists and torch is installed. Continuing in demo mode (no predictions).")

    # spelling engine
    spelling_engine = FingerSpellingEngine(auto_speak=False)
    spelling_engine.load_session()

    # STT thread
    stt_thread = None
    stt_on = False
    if use_stt:
        stt_thread = SpeechListener(spelling_engine)
        stt_thread.start()
        stt_on = True

    # video capture
    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        print("Error: Could not open camera")
        # The listener is a daemon thread; inside a notebook kernel it would
        # keep recording after this function returns unless stopped here.
        if stt_thread:
            stt_thread.stop()
        cap.release()
        return

    cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

    fps_count = 0
    fps_time = time.time()
    current_fps = 0.0

    predicted_class = "none"
    confidence_score = 0.0
    inference_error_reported = False  # report the first inference failure only

    print("Started. Controls: Q Quit | C Clear | U Undo | S Save | L Load | G Enter GT | A Toggle TTS | V Toggle STT | E Export Accuracy")
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.flip(frame, 1)

            # FPS is refreshed once every 30 frames.
            fps_count += 1
            if fps_count % 30 == 0:
                now = time.time()
                current_fps = 30.0 / (now - fps_time) if (now - fps_time) > 0 else 0.0
                fps_time = now

            h, w = frame.shape[:2]
            roi_size = 300
            # Clamp the ROI to the frame: the camera may ignore the requested
            # resolution, and a negative start index would wrap around when
            # slicing and select the wrong region.
            x1 = max((w - roi_size) // 2, 0)
            y1 = max((h - roi_size) // 2 - 50, 0)
            x2 = min(x1 + roi_size, w)
            y2 = min(y1 + roi_size, h)
            roi = frame[y1:y2, x1:x2]

            # Inference if model available
            if model is not None and torch is not None:
                try:
                    input_tensor = preprocess_frame(roi)
                    with torch.no_grad():
                        outputs = model(input_tensor)
                        probs = torch.softmax(outputs, dim=1)
                        conf, idx = torch.max(probs, 1)
                        predicted_class = ASL_CLASSES[int(idx.item())]
                        confidence_score = float(conf.item())
                except Exception as e:
                    if not inference_error_reported:
                        print("Warning: inference failed:", e)
                        inference_error_reported = True
                    predicted_class = "error"
                    confidence_score = 0.0
            else:
                # demo: show nothing
                predicted_class = "nothing"
                confidence_score = 0.0

            # process prediction
            spelling_engine.process_prediction(predicted_class, confidence_score)

            # draw
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            draw_ui(frame, spelling_engine, predicted_class, confidence_score, current_fps)
            cv2.imshow("Enhanced ASL Detection - Finger Spelling", frame)

            key = cv2.waitKey(1) & 0xFF
            if key == ord('q'):
                break
            elif key == ord('c'):
                spelling_engine.clear_all()
                print("Cleared all")
            elif key == ord('u'):
                spelling_engine.undo_last_word()
                print("Undid last word")
            elif key == ord('s'):
                spelling_engine.save_session()
                print("Session saved")
            elif key == ord('l'):
                spelling_engine.load_session()
                print("Session loaded")
            elif key == ord('g'):
                # enter ground truth for last prediction
                gt = input("Enter ground-truth letter/word for last prediction: ").strip()
                if gt:
                    spelling_engine.accuracy_tracker.add_ground_truth(gt)
                    print("Ground truth saved.")
            elif key == ord('a'):
                # toggle auto-speak
                spelling_engine.auto_speak = not spelling_engine.auto_speak
                print("Auto-speak:", spelling_engine.auto_speak)
            elif key == ord('v'):
                # toggle STT
                if stt_on:
                    if stt_thread:
                        stt_thread.stop()
                    stt_on = False
                    print("STT stopped")
                else:
                    if sr is None:
                        print("SpeechRecognition not available, cannot start STT.")
                    else:
                        # A stopped thread cannot be restarted, so make a new one.
                        stt_thread = SpeechListener(spelling_engine)
                        stt_thread.start()
                        stt_on = True
                        print("STT started")
            elif key == ord('e'):
                # export accuracy
                spelling_engine.accuracy_tracker.export()
                print("Exported accuracy_log.json")

    finally:
        # Release the listener, camera and windows first so that a failure
        # while saving or reporting cannot leave them held.
        if stt_thread:
            stt_thread.stop()
        cap.release()
        cv2.destroyAllWindows()
        spelling_engine.save_session()
        print("Final sentence:", spelling_engine.sentence)
        acc, c, m = spelling_engine.accuracy_tracker.compute_accuracy()
        print(f"Accuracy: {acc:.2%} ({c}/{m})")

if __name__ == "__main__":
    # Script entry point: launch the detector with speech-to-text off.
    # Edit the path below if the trained weights live somewhere else.
    run_enhanced_asl_detection("models/asl_resnet_model.pth", use_stt=False)


Model loaded: models/asl_resnet_model.pth
Started. Controls: Q Quit | C Clear | U Undo | S Save | L Load | G Enter GT | A Toggle TTS | V Toggle STT | E Export Accuracy
Final sentence: wbcciiiiiiii
Accuracy: 0.00% (0/0)


SyntaxError: incomplete input (2082685946.py, line 714)