In [None]:
# Install dependencies
!pip install mediapipe opencv-python tensorflow

In [2]:
# Import required libraries
import cv2  # OpenCV for image processing
import mediapipe as mp  # MediaPipe for human pose detection
import numpy as np  # NumPy for array manipulations
from base64 import b64decode, b64encode  # Base64 encoding/decoding for handling image data
from google.colab.output import eval_js  # JavaScript execution in Google Colab
from IPython.display import display, Javascript  # Display JavaScript in Colab environment
import tensorflow as tf  # TensorFlow for building and running the CNN model
from tensorflow.keras.models import Sequential  # Sequential model for CNN
from tensorflow.keras.layers import Dense  # Dense layers for the neural network

In [3]:
# Build the pose classification model (a fully connected network, despite the "CNN" name)
def build_cnn_model(input_dim=99, num_classes=6):
    """
    Build and compile the pose classification network.

    Despite the function's name, the network contains no convolutional layers:
    it is a small fully connected (dense) classifier that operates on a flat
    landmark vector.

    Args:
        input_dim: Length of the flattened feature vector. The default of 99
            corresponds to 33 pose landmarks with 3 coordinates (x, y, z) each.
        num_classes: Number of pose classes produced by the softmax output
            layer. Defaults to 6.

    Returns:
        A compiled Keras ``Sequential`` model with three hidden dense layers
        (128, 64 and 32 units, ReLU) followed by a softmax output layer. It is
        compiled with the Adam optimizer, categorical cross-entropy loss and an
        accuracy metric. The returned model is only built and compiled here; it
        is not trained by this function.
    """
    model = Sequential([
        # Declare the input explicitly instead of passing `input_shape` to the first Dense layer
        tf.keras.Input(shape=(input_dim,)),
        Dense(128, activation='relu'),  # First hidden layer
        Dense(64, activation='relu'),  # Second hidden layer
        Dense(32, activation='relu'),  # Third hidden layer
        Dense(num_classes, activation='softmax')  # One probability per pose class
    ])
    # Categorical cross-entropy expects one-hot encoded class labels during training
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [4]:
# Initialize MediaPipe Pose Detection
mp_drawing = mp.solutions.drawing_utils  # Helper module used to render landmarks onto images
mp_pose = mp.solutions.pose  # Pose solution module (detector class and connection table)
pose = mp_pose.Pose()  # Detector instance created with MediaPipe's default options

In [5]:
# JavaScript to capture image from webcam
def capture_image():
    """
    Capture a single frame from the user's webcam via browser JavaScript.

    A `capture()` helper is injected into the notebook output and then invoked
    exactly once through `eval_js`. The injected script only defines the helper;
    it does not call it itself, so the webcam is opened a single time per call.

    Returns:
        The captured 320x240 frame as a JPEG data URL string
        (JPEG quality 0.8), as produced by `canvas.toDataURL`.
    """
    js = Javascript('''
        async function capture() {
            // Create a video element to capture the webcam feed
            const video = document.createElement('video');
            // Access the webcam stream
            const stream = await navigator.mediaDevices.getUserMedia({video: true});
            try {
                document.body.appendChild(video);
                video.srcObject = stream;
                await video.play();

                // Set the dimensions of the video frame
                video.width = 320;
                video.height = 240;
                const canvas = document.createElement('canvas');
                canvas.width = video.width;
                canvas.height = video.height;
                const context = canvas.getContext('2d');
                // Draw the video frame onto the canvas
                context.drawImage(video, 0, 0, canvas.width, canvas.height);

                // Return the image data as a base64-encoded string
                return canvas.toDataURL('image/jpeg', 0.8);
            } finally {
                // Always release the camera and remove the preview element,
                // even when playback or drawing fails
                stream.getTracks().forEach(track => track.stop());
                video.remove();
            }
        }
    ''')
    display(js)  # Define capture() in the notebook's output frame
    data = eval_js('capture()')  # Run it once and wait for the resulting data URL
    return data

In [6]:
# Convert JavaScript image data to OpenCV image
def js_to_image(js_data):
    """
    Convert a base64 data URL captured in the browser into an OpenCV image.

    Args:
        js_data: Data-URL string of the form ``<header>,<base64 payload>``.

    Returns:
        The decoded image as a NumPy array, read with ``cv2.IMREAD_COLOR``
        (OpenCV's BGR channel order).

    Raises:
        ValueError: If `js_data` has no comma-separated payload, the payload is
            empty, or the decoded bytes are not a readable image.
    """
    # A data URL is "<header>,<payload>"; only the payload carries the image bytes
    parts = js_data.split(',')
    if len(parts) < 2 or not parts[1]:
        raise ValueError('Expected a data URL of the form "<header>,<base64 payload>"')

    img_bytes = b64decode(parts[1])  # Decode the base64 image string
    img_arr = np.frombuffer(img_bytes, dtype=np.uint8)  # View the raw bytes as a uint8 buffer
    img = cv2.imdecode(img_arr, cv2.IMREAD_COLOR)  # Decode the image to OpenCV format (BGR)
    if img is None:
        # imdecode signals an unreadable buffer by returning None instead of raising
        raise ValueError('Captured data could not be decoded as an image')
    return img

In [7]:
# Preprocess pose landmarks for CNN input
def preprocess_landmarks(landmarks):
    """
    Turn a sequence of pose landmarks into a flat feature vector.

    Each landmark contributes its x, y and z attributes, in that order, so the
    result is laid out as [x0, y0, z0, x1, y1, z1, ...].
    """
    coords = []
    for point in landmarks:
        # Append this landmark's three coordinates to the running flat list
        coords.extend((point.x, point.y, point.z))
    return np.asarray(coords)

In [8]:
# Process image for pose detection
def process_image_with_pose_detection(image):
    """
    Run MediaPipe pose detection on a BGR image.

    The image is converted to RGB for detection. When a pose is found, its
    landmarks are drawn onto `image` and the same image object is returned.

    Returns:
        A `(landmarks, image)` tuple. `landmarks` is the flattened array produced
        by `preprocess_landmarks`, or None when no pose landmarks were detected.
    """
    # MediaPipe is given RGB input, while the incoming image is BGR
    detection = pose.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    detected = detection.pose_landmarks

    # Nothing detected: hand the image back untouched
    if not detected:
        return None, image

    # Overlay the skeleton, then flatten the landmark coordinates for the classifier
    mp_drawing.draw_landmarks(image, detected, mp_pose.POSE_CONNECTIONS)
    return preprocess_landmarks(detected.landmark), image

In [None]:
# Build the pose classification model
# NOTE(review): no training or weight-loading step is visible in this notebook, so the
# model built here appears to run with freshly initialised weights — confirm that
# trained weights are supplied before relying on its predictions.
model = build_cnn_model()  # Build the CNN model for pose classification

In [None]:
# Capture and process images from webcam in a loop
# The loop runs until the cell is interrupted (e.g. "Interrupt execution" in Colab);
# the interrupt is treated as the normal way to stop rather than as an error.
try:
    while True:
        js_image = capture_image()  # Capture an image from the webcam using JavaScript
        image = js_to_image(js_image)  # Convert the base64 image to OpenCV format

        # Detect the pose; landmarks is None when no pose was found
        landmarks, processed_image = process_image_with_pose_detection(image)

        if landmarks is not None:
            # Add a leading batch dimension so the model receives a batch of size 1
            input_data = np.expand_dims(landmarks, axis=0)

            # verbose=0: predict() otherwise prints a progress bar for every frame
            prediction = model.predict(input_data, verbose=0)
            pose_class = int(np.argmax(prediction))  # Index of the most probable class

            # Display the predicted pose class on the image
            cv2.putText(processed_image, f'Pose Class: {pose_class}', (50, 50),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)

        # Encode the processed image as JPEG; skip this frame if encoding fails
        encoded_ok, im_arr = cv2.imencode('.jpg', processed_image)
        if not encoded_ok:
            continue
        im_b64 = b64encode(im_arr.tobytes()).decode('utf-8')

        # Use JavaScript to display the processed image with pose class prediction
        display(Javascript(f'''
        var img = new Image();
        img.src = "data:image/jpeg;base64,{im_b64}";
        document.body.appendChild(img);
    '''))
except KeyboardInterrupt:
    print('Webcam capture loop stopped.')