In [1]:
# Basic utilities
import numpy as np
import pandas as pd
import os
import json
from sklearn.model_selection import train_test_split

# Video processing and pose estimation
import cv2
import mediapipe as mp

# Deep learning models
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Flatten, Concatenate
from tensorflow.keras.optimizers import Adam

# NLP tools
from transformers import CLIPProcessor, CLIPModel
import torch

In [2]:
def extract_pose_features(video_path, fill_missing=False):
    """
    Extract the per-frame relative knee height from a video using MediaPipe Pose.

    The feature is ``LEFT_KNEE.y - RIGHT_KNEE.y`` computed from the landmarks
    MediaPipe reports for a frame.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        fill_missing: When False (default) frames without a detected pose are
            skipped, so element ``i`` of the result is the i-th frame *with a
            detection*, not necessarily video frame ``i``. When True,
            ``np.nan`` is stored for such frames so that array positions line
            up with video frame indices.

    Returns:
        1-D array of relative knee heights. It is empty when the capture
        never opens or no frame yields a value.
    """
    mp_pose = mp.solutions.pose
    left_knee = mp_pose.PoseLandmark.LEFT_KNEE
    right_knee = mp_pose.PoseLandmark.RIGHT_KNEE

    features = []
    cap = cv2.VideoCapture(video_path)
    try:
        # The context manager closes the pose estimator when the loop ends,
        # including when a frame raises.
        with mp_pose.Pose(static_image_mode=False,
                          min_detection_confidence=0.5,
                          min_tracking_confidence=0.5) as pose:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                # OpenCV delivers BGR; the frame is converted to RGB before
                # being handed to the pose estimator.
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results = pose.process(frame_rgb)

                if results.pose_landmarks:
                    landmarks = results.pose_landmarks.landmark
                    # Custom feature: left knee height relative to the right knee.
                    features.append(landmarks[left_knee].y - landmarks[right_knee].y)
                elif fill_missing:
                    # Placeholder keeps array index == video frame index.
                    features.append(np.nan)
    finally:
        # Always give the video handle back, even if processing failed.
        cap.release()

    return np.array(features)

In [3]:
# Initialize CLIP
# The model and its processor are loaded from the same checkpoint id, so the
# id is named once and reused for both.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
clip_model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
clip_processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

def interpret_language(video_path, text_instruction):
    """
    Score every frame of a video against a textual instruction with CLIP.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        text_instruction: Text passed to the CLIP processor as ``text``.

    Returns:
        NumPy array holding the model's ``logits_per_text`` (text-to-image
        scores); presumably shaped (number of texts, number of frames) --
        TODO confirm against the CLIP documentation.

    Raises:
        ValueError: If no frame could be read from ``video_path``.
    """
    # NOTE: every frame is kept in memory before CLIP runs, so long videos
    # can be expensive; see interpret_language_with_keyframes for a subset.
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    finally:
        # Release the capture even if decoding/conversion raised.
        cap.release()

    if not frames:
        # Fail with an explicit message instead of an obscure processor error.
        raise ValueError(f"No frames could be read from {video_path}.")

    # Use CLIP to process frames and text
    inputs = clip_processor(text=text_instruction, images=frames, return_tensors="pt", padding=True)
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = clip_model(**inputs)
    logits_per_text = outputs.logits_per_text  # Text-to-image scores
    return logits_per_text.detach().cpu().numpy()

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [4]:
def extract_video_features(video_path):
    """
    Stand-in for a real spatial/temporal video feature extractor.

    ``video_path`` is currently ignored: the function only draws uniform
    random numbers so downstream code has an example feature vector to work
    with. A pre-trained model (e.g. I3D or Timesformer) is meant to replace it.

    Returns: Array of shape (1, 1024) with values in [0, 1).
    """
    # Example feature vector: one row of 1024 random values.
    placeholder_shape = (1, 1024)
    return np.random.random_sample(placeholder_shape)


In [None]:
def decision_layer(pose_features, language_scores, video_features):
    """
    Fuse the agents' outputs into one feature row and build an MLP regressor.

    Args:
        pose_features: Per-frame pose features, 1-D (frames,) or 2-D
            (frames, features). They are averaged over the frame axis.
        language_scores: Language/CLIP scores; a scalar or any array, it is
            flattened into a single row.
        video_features: Video feature vector; flattened into a single row.

    Returns:
        Tuple ``(model, combined_features)`` where ``model`` is a freshly
        built, compiled and *untrained* Keras model and ``combined_features``
        has shape (1, total_features).

    Raises:
        ValueError: If ``pose_features`` is empty (its mean would be NaN).
    """
    pose = np.atleast_1d(np.asarray(pose_features, dtype=float))
    if pose.size == 0:
        raise ValueError("pose_features is empty; cannot build a feature vector.")
    # (frames,) -> (frames, 1); 2-D input keeps its shape. Without this a 1-D
    # input yields a 1-D mean that cannot be concatenated along axis=1.
    pose = pose.reshape(pose.shape[0], -1)
    pose_summary = pose.mean(axis=0, keepdims=True)

    # Coerce the other inputs to single rows so scalars and arrays both work.
    language_row = np.asarray(language_scores, dtype=float).reshape(1, -1)
    video_row = np.asarray(video_features, dtype=float).reshape(1, -1)

    # Example: Concatenate all feature vectors
    combined_features = np.concatenate([pose_summary, language_row, video_row], axis=1)

    # Simple MLP for prediction. The input width follows the combined row, so
    # it changes whenever the number of language scores changes.
    inputs = Input(shape=(combined_features.shape[1],))
    x = Dense(64, activation='relu')(inputs)
    x = Dense(32, activation='relu')(x)
    outputs = Dense(1, activation='linear')(x)

    model = Model(inputs, outputs)
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])

    return model, combined_features


In [5]:
def calculate_dynamic_changes(features):
    """
    Calculate velocity and acceleration from relative knee height features.

    Velocity is the first discrete difference of ``features`` and acceleration
    the second. Both are zero-padded at the end so they have the same length
    as ``features``, including for inputs of length 0 or 1.

    - features: 1-D array of relative knee height values over time.
    Returns: Tuple (velocity, acceleration).
    """
    features = np.asarray(features)
    n_frames = features.size

    velocities = np.diff(features)  # First derivative (velocity)
    accelerations = np.diff(velocities)  # Second derivative (acceleration)

    # Pad by however many samples the differencing removed. A fixed (0, 1) /
    # (0, 2) pad over-pads when the input has fewer than two samples.
    velocities = np.pad(velocities, (0, n_frames - velocities.size),
                        mode='constant', constant_values=0)
    accelerations = np.pad(accelerations, (0, n_frames - accelerations.size),
                           mode='constant', constant_values=0)

    return velocities, accelerations

def extract_pose_features_with_dynamics(video_path):
    """
    Build a feature array from knee height and its first two differences.

    Runs ``extract_pose_features`` on the video, derives velocity and
    acceleration with ``calculate_dynamic_changes`` and stacks the three
    series along a new last axis.

    Returns: Array whose last axis holds
        (relative knee height, velocity, acceleration).
    """
    knee_series = extract_pose_features(video_path)
    dynamics = calculate_dynamic_changes(knee_series)
    speed, accel = dynamics
    columns = (knee_series, speed, accel)
    return np.stack(columns, axis=-1)


In [6]:
import mediapipe as mp

def calculate_pose_ratios(landmarks):
    """
    Calculate a leg-segment ratio from pose keypoints.

    Only the ``x`` and ``y`` attributes of the left hip, knee and ankle
    landmarks are used.

    - landmarks: Sequence of pose landmarks from MediaPipe, indexable by
      ``PoseLandmark`` members.
    Returns: The ratio of hip-to-knee to knee-to-ankle distance, or 0.0 when
        the knee-to-ankle distance is zero.
    """
    # Resolve the landmark enum locally instead of relying on a module-level
    # `mp_pose` global, which would raise NameError when it is not defined.
    landmark_ids = mp.solutions.pose.PoseLandmark

    def _xy(landmark_id):
        # 2-D position of one landmark.
        point = landmarks[landmark_id]
        return np.array([point.x, point.y])

    left_hip = _xy(landmark_ids.LEFT_HIP)
    left_knee = _xy(landmark_ids.LEFT_KNEE)
    left_ankle = _xy(landmark_ids.LEFT_ANKLE)

    # Calculate distances
    hip_knee_dist = np.linalg.norm(left_hip - left_knee)
    knee_ankle_dist = np.linalg.norm(left_knee - left_ankle)

    # Avoid division by zero
    if knee_ankle_dist == 0:
        return 0.0

    return hip_knee_dist / knee_ankle_dist

def extract_pose_features_with_ratios(video_path, fill_missing=False):
    """
    Extract the per-frame pose ratio from a video.

    For each frame with a detected pose the value comes from
    ``calculate_pose_ratios``.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        fill_missing: When False (default) frames without a detected pose are
            skipped, so positions in the result do not necessarily equal video
            frame indices. When True, ``np.nan`` is stored for such frames so
            the two line up.

    Returns: Array of pose ratios over time.
    """
    mp_pose = mp.solutions.pose

    ratios = []
    cap = cv2.VideoCapture(video_path)
    try:
        # The context manager closes the pose estimator when the loop ends,
        # including when a frame raises.
        with mp_pose.Pose(static_image_mode=False,
                          min_detection_confidence=0.5,
                          min_tracking_confidence=0.5) as pose:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                # OpenCV delivers BGR; the frame is converted to RGB before
                # being handed to the pose estimator.
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results = pose.process(frame_rgb)

                if results.pose_landmarks:
                    ratios.append(calculate_pose_ratios(results.pose_landmarks.landmark))
                elif fill_missing:
                    # Placeholder keeps array index == video frame index.
                    ratios.append(np.nan)
    finally:
        # Always give the video handle back, even if processing failed.
        cap.release()

    return np.array(ratios)


In [7]:
def keyframe_selection(features, threshold=-0.1):
    """
    Pick the positions whose relative knee height is strictly below a threshold.

    - features: Iterable of relative knee height values.
    - threshold: Cut-off; only values strictly less than it are selected.
    Returns: List of positions (indices into ``features``) of the keyframes.
    """
    selected = []
    for index, knee_height in enumerate(features):
        if not knee_height < threshold:
            continue
        selected.append(index)
    return selected


In [8]:
def interpret_language_with_keyframes(video_path, text_instruction, keyframes):
    """
    Use CLIP to analyze selected keyframes based on textual instruction.

    - video_path: Path to the video.
    - text_instruction: Criterion description to check against the video.
    - keyframes: Indices of the video frames to analyze (0-based, counted
      over the frames read from the video).
    Returns: Mean score from CLIP for the selected frames, or 0.0 (after
        printing a notice) when none of the requested frames could be read.
    """
    # Set membership makes the per-frame lookup constant-time.
    wanted = set(keyframes)
    frames = []

    if wanted:
        last_wanted = max(wanted)
        cap = cv2.VideoCapture(video_path)
        try:
            frame_idx = 0
            # Stop decoding once the last requested frame has been passed.
            while frame_idx <= last_wanted and cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                if frame_idx in wanted:  # Select only relevant keyframes
                    frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

                frame_idx += 1
        finally:
            # Release the capture even if decoding/conversion raised.
            cap.release()

    if len(frames) == 0:
        print(f"No keyframes selected for {video_path}.")
        return 0.0

    # Use CLIP to process frames and text
    inputs = clip_processor(text=text_instruction, images=frames, return_tensors="pt", padding=True)
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = clip_model(**inputs)
    logits_per_text = outputs.logits_per_text  # Text-to-image scores
    return logits_per_text.mean().item()


In [9]:
# Example: Process a video with refined CLIP agent
# Set the SHOTPUT_VIDEO_PATH environment variable to run this on another
# machine; the original local path is kept as the fallback.
video_path = os.environ.get(
    "SHOTPUT_VIDEO_PATH",
    "/Users/cezar/Desktop/Team Project/AI/shotput/stage1/videos/1_user2.mp4",
)
# A missing file would otherwise look like "no keyframes" with a score of 0.0.
if not os.path.isfile(video_path):
    raise FileNotFoundError(f"Video not found: {video_path}")

relative_knee_heights = extract_pose_features(video_path)  # Extract relative knee heights
# NOTE: these indices refer to positions in `relative_knee_heights`; they match
# video frame numbers only if a pose was detected in every frame.
keyframes = keyframe_selection(relative_knee_heights)  # Select keyframes

text_instruction = "Does the athlete initiate the glide phase with their left knee bent and positioned lower than the right knee?"
clip_score = interpret_language_with_keyframes(video_path, text_instruction, keyframes)

print(f"CLIP Score for {video_path}: {clip_score}")


I0000 00:00:1736770515.072040 9632050 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M2 Pro
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1736770515.148373 9644138 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1736770515.162433 9644142 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1736770515.179216 9644136 landmark_projection_calculator.cc:186] Using NORM_RECT without IMAGE_DIMENSIONS is only supported for the square ROI. Provide IMAGE_DIMENSIONS or use PROJECTION_MATRIX.


No keyframes selected for /Users/cezar/Desktop/Team Project/AI/shotput/stage1/videos/1_user2.mp4.
CLIP Score for /Users/cezar/Desktop/Team Project/AI/shotput/stage1/videos/1_user2.mp4: 0.0
