In [6]:
import cv2
import numpy as np
from ultralytics import YOLO
import pyttsx3
import threading
import queue
import speech_recognition as sr

# Initialize YOLO model and video capture
model = YOLO('yolo11n-pose.pt')  # Make sure your pose model is available
cap = cv2.VideoCapture(0)

# Thresholds for detecting squat angles
up_thresh = 160  # Angle when standing (high position)
down_thresh = 90  # Angle when squatting (low position)
squat_counter = 0
squat_mode = None  # Default mode is None

# Initialize text-to-speech engine
engine = pyttsx3.init()
engine.setProperty('rate', 150)
speech_queue = queue.Queue()

# Function to listen for commands (voice-controlled mode switching)
def listen_commands():
    recognizer = sr.Recognizer()
    mic = sr.Microphone()
    while True:
        try:
            with mic as source:
                recognizer.adjust_for_ambient_noise(source)
                print("Listening...")
                audio = recognizer.listen(source)
                commands = recognizer.recognize_google(audio).lower()
                print(commands)
                if 'normal' in commands:
                    speak("Normal mode started")
                    set_mode('normal')
                elif 'stop' in commands:
                    speak("Take care and goodbye")
                    set_mode('stop')
                    break
        except sr.UnknownValueError:
            print("Could not understand audio")
        except sr.RequestError:
            print("Could not request results; check your network connection")

# Function to update mode
def set_mode(new_mode):
    global squat_mode
    squat_mode = new_mode

# Function to speak text (using the queue for thread safety)
def speak(text):
    speech_queue.put(text)

# Worker function to process speech synthesis in the background
def worker_speak():
    while True:
        text = speech_queue.get()
        if text is None:
            break
        engine.say(text)
        engine.runAndWait()

# Function to calculate the angle between three points (hip, knee, ankle)
def calculate_angle(a, b, c):
    a = np.array(a)  # Hip
    b = np.array(b)  # Knee
    c = np.array(c)  # Ankle

    radians = np.arctan2(c[1] - b[1], c[0] - b[0]) - np.arctan2(a[1] - b[1], a[0] - b[0])
    angle = np.abs(radians * 180.0 / np.pi)

    if angle > 180.0:
        angle = 360 - angle

    return angle

# Start threads for speech and command listening
thread_speak = threading.Thread(target=worker_speak, daemon=True)
thread_speak.start()
thread_listen = threading.Thread(target=listen_commands, daemon=True)
thread_listen.start()

# Main loop for video capture and processing
while True:
    ret, frame = cap.read()
    if not ret:
        break
    frame = cv2.resize(frame, (1020, 500))

    # Make predictions with YOLO model
    result = model.track(frame)

    if result[0].keypoints is not None:
        keypoints = result[0].keypoints.xy.cpu().numpy()

        for keypoint in keypoints:
            if len(keypoint) > 0:
                for i, point in enumerate(keypoint):
                    cx, cy = int(point[0]), int(point[1])
                    cv2.circle(frame, (cx, cy), 5, (255, 0, 0), -1)
                    cv2.putText(frame, f'{i}', (cx, cy), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                
                # Ensure enough keypoints are available for angle calculation
                if squat_mode and len(keypoint) > 10:
                    left_hip = (int(keypoint[11][0]), int(keypoint[11][1]))
                    left_knee = (int(keypoint[13][0]), int(keypoint[13][1]))
                    left_ankle = (int(keypoint[15][0]), int(keypoint[15][1]))
                    
                    right_hip = (int(keypoint[12][0]), int(keypoint[12][1]))
                    right_knee = (int(keypoint[14][0]), int(keypoint[14][1]))
                    right_ankle = (int(keypoint[16][0]), int(keypoint[16][1]))

                    left_leg_angle = calculate_angle(left_hip, left_knee, left_ankle)
                    right_leg_angle = calculate_angle(right_hip, right_knee, right_ankle)

                    # Display angles
                    cv2.putText(frame, f'Left Leg Angle: {int(left_leg_angle)}', (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)
                    cv2.putText(frame, f'Right Leg Angle: {int(right_leg_angle)}', (50, 80), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

                    # Normal mode: track squats by both legs
                    if squat_mode == "normal":
                        if left_leg_angle < down_thresh and right_leg_angle < down_thresh:
                            squat_counter += 1
                            speak(f'Squat {squat_counter}')

                # Display squat counter
                if squat_mode == 'normal':
                    cv2.putText(frame, f'Squat counter: {squat_counter}', (50, 110), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 2)

    # Display the frame
    cv2.imshow("Squat Detector", frame)

    # Exit on 'Esc' key press
    key = cv2.waitKey(1)
    if squat_mode == 'stop':
        break

# Release resources
cap.release()
cv2.destroyAllWindows()


Exception in thread Thread-21 (listen_commands):
Traceback (most recent call last):
  File "c:\Users\Lenovo\anaconda3\Lib\site-packages\speech_recognition\__init__.py", line 103, in get_pyaudio
    import pyaudio
ModuleNotFoundError: No module named 'pyaudio'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\Users\Lenovo\anaconda3\Lib\threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "c:\Users\Lenovo\anaconda3\Lib\threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\Lenovo\AppData\Local\Temp\ipykernel_27256\419130787.py", line 27, in listen_commands
  File "c:\Users\Lenovo\anaconda3\Lib\site-packages\speech_recognition\__init__.py", line 75, in __init__
    self.pyaudio_module = self.get_pyaudio()
                          ^^^^^^^^^^^^^^^^^^
  File "c:\Users\Lenovo\anaconda3\Lib\site-packages\speech_recognition\__init__.py", line 105, in get_pyaudio
    raise At


0: 320x640 1 person, 176.6ms
Speed: 4.5ms preprocess, 176.6ms inference, 1.0ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 1 person, 123.1ms
Speed: 1.0ms preprocess, 123.1ms inference, 1.0ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 1 person, 105.6ms
Speed: 3.0ms preprocess, 105.6ms inference, 1.0ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 1 person, 149.5ms
Speed: 2.0ms preprocess, 149.5ms inference, 1.0ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 1 person, 159.2ms
Speed: 3.9ms preprocess, 159.2ms inference, 2.1ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 1 person, 166.2ms
Speed: 1.0ms preprocess, 166.2ms inference, 1.0ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 1 person, 131.4ms
Speed: 2.0ms preprocess, 131.4ms inference, 1.0ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 1 person, 105.3ms
Speed: 3.0ms preprocess, 105.3ms inference, 1.0ms postprocess per image at

KeyboardInterrupt: 