In [25]:
import cv2
import os

# Paths
video_dir = "/Users/danyukezz/Desktop/2 year 1 semester/team project/danya_preprocessing_sports/height_jump/stages/stage5/videos"
frames_dir = "/Users/danyukezz/Desktop/2 year 1 semester/team project/danya_preprocessing_sports/height_jump/stages/stage5/frames"

# Ensure output directory exists
os.makedirs(frames_dir, exist_ok=True)

def extract_frames(video_path, output_dir, interval=1):
    """
    Extract frames from a video at a given interval (default: every 1 frame).
    """
    cap = cv2.VideoCapture(video_path)
    count = 0
    frame_count = 0
    video_name = os.path.basename(video_path).rsplit('.', 1)[0]  # Get the video name without extension

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Save every 'interval' frame
        if count % interval == 0:
            frame_path = os.path.join(output_dir, f"{video_name}_frame{frame_count}.jpg")
            cv2.imwrite(frame_path, frame)
            frame_count += 1

        count += 1

    cap.release()

# Process all videos
for video_file in os.listdir(video_dir):
    # Check if the filename ends with ".mp4" and starts with a valid number
    if video_file.endswith(".mp4"):
        try:
            # Split by '_' and attempt to convert the first part to float
            prefix = video_file.split('_')[0]
            float(prefix)  # This ensures it works for values like '0.5', '1', or '0'
            
            # Proceed with frame extraction
            video_path = os.path.join(video_dir, video_file)
            video_frames_dir = os.path.join(frames_dir, video_file.rsplit('.', 1)[0])  # Use full name
            os.makedirs(video_frames_dir, exist_ok=True)
            extract_frames(video_path, video_frames_dir)
            print(f"Frames extracted for {video_file}")
        except ValueError:
            print(f"Skipping file with invalid prefix: {video_file}")

print("Frame extraction completed!")

Frames extracted for 1_user2.mp4
Frames extracted for 1_user5.mp4
Frames extracted for 1_user10.mp4
Frames extracted for 1_user7.mp4
Frames extracted for 1_user13.mp4
Frames extracted for 0_user3.mp4
Frames extracted for 0_user22.mp4
Frames extracted for 0_user23.mp4
Frames extracted for 0_user6.mp4
Frames extracted for 0_user12.mp4
Frames extracted for 0_user8.mp4
Frames extracted for 0.5_user1.mp4
Frames extracted for 1_user21.mp4
Frames extracted for 1_user9.mp4
Frames extracted for 1_user20.mp4
Frame extraction completed!


In [26]:
import cv2
import os
import mediapipe as mp
import json
import math

# Paths
frames_dir = "/Users/danyukezz/Desktop/2 year 1 semester/team project/danya_preprocessing_sports/height_jump/stages/stage5/frames"
keypoints_dir = "/Users/danyukezz/Desktop/2 year 1 semester/team project/danya_preprocessing_sports/height_jump/stages/stage5/keypoints"

# Ensure output directory exists
os.makedirs(keypoints_dir, exist_ok=True)

# Initialize MediaPipe Pose
mp_pose = mp.solutions.pose
pose = mp_pose.Pose(static_image_mode=True, model_complexity=2, enable_segmentation=False)

def extract_keypoints_from_frame(frame_path):
    """
    Extract keypoints relevant to posture and alignment for the 'L' shape landing.
    """
    image = cv2.imread(frame_path)
    rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    result = pose.process(rgb_image)

    if result.pose_landmarks:
        # Extract keypoints
        keypoints = {
            "right_foot_index": {
                "x": result.pose_landmarks.landmark[mp_pose.PoseLandmark.RIGHT_FOOT_INDEX].x,
                "y": result.pose_landmarks.landmark[mp_pose.PoseLandmark.RIGHT_FOOT_INDEX].y,
                "z": result.pose_landmarks.landmark[mp_pose.PoseLandmark.RIGHT_FOOT_INDEX].z,
            },
            "right_heel": {
                "x": result.pose_landmarks.landmark[mp_pose.PoseLandmark.RIGHT_HEEL].x,
                "y": result.pose_landmarks.landmark[mp_pose.PoseLandmark.RIGHT_HEEL].y,
                "z": result.pose_landmarks.landmark[mp_pose.PoseLandmark.RIGHT_HEEL].z,
            },
            "left_foot_index": {
                "x": result.pose_landmarks.landmark[mp_pose.PoseLandmark.LEFT_FOOT_INDEX].x,
                "y": result.pose_landmarks.landmark[mp_pose.PoseLandmark.LEFT_FOOT_INDEX].y,
                "z": result.pose_landmarks.landmark[mp_pose.PoseLandmark.LEFT_FOOT_INDEX].z,
            },
            "left_heel": {
                "x": result.pose_landmarks.landmark[mp_pose.PoseLandmark.LEFT_HEEL].x,
                "y": result.pose_landmarks.landmark[mp_pose.PoseLandmark.LEFT_HEEL].y,
                "z": result.pose_landmarks.landmark[mp_pose.PoseLandmark.LEFT_HEEL].z,
            },
            "right_knee": {
                "x": result.pose_landmarks.landmark[mp_pose.PoseLandmark.RIGHT_KNEE].x,
                "y": result.pose_landmarks.landmark[mp_pose.PoseLandmark.RIGHT_KNEE].y,
                "z": result.pose_landmarks.landmark[mp_pose.PoseLandmark.RIGHT_KNEE].z,
            },
            "left_knee": {
                "x": result.pose_landmarks.landmark[mp_pose.PoseLandmark.LEFT_KNEE].x,
                "y": result.pose_landmarks.landmark[mp_pose.PoseLandmark.LEFT_KNEE].y,
                "z": result.pose_landmarks.landmark[mp_pose.PoseLandmark.LEFT_KNEE].z,
            },
            "right_shoulder": {
                "x": result.pose_landmarks.landmark[mp_pose.PoseLandmark.RIGHT_SHOULDER].x,
                "y": result.pose_landmarks.landmark[mp_pose.PoseLandmark.RIGHT_SHOULDER].y,
                "z": result.pose_landmarks.landmark[mp_pose.PoseLandmark.RIGHT_SHOULDER].z,
            },
            "left_shoulder": {
                "x": result.pose_landmarks.landmark[mp_pose.PoseLandmark.LEFT_SHOULDER].x,
                "y": result.pose_landmarks.landmark[mp_pose.PoseLandmark.LEFT_SHOULDER].y,
                "z": result.pose_landmarks.landmark[mp_pose.PoseLandmark.LEFT_SHOULDER].z,
            },
            "right_hip": {
                "x": result.pose_landmarks.landmark[mp_pose.PoseLandmark.RIGHT_HIP].x,
                "y": result.pose_landmarks.landmark[mp_pose.PoseLandmark.RIGHT_HIP].y,
                "z": result.pose_landmarks.landmark[mp_pose.PoseLandmark.RIGHT_HIP].z,
            },
            "left_hip": { 
                "x": result.pose_landmarks.landmark[mp_pose.PoseLandmark.LEFT_HIP].x,
                "y": result.pose_landmarks.landmark[mp_pose.PoseLandmark.LEFT_HIP].y,
                "z": result.pose_landmarks.landmark[mp_pose.PoseLandmark.LEFT_HIP].z,
            },
        }

        # Calculate key angles for L-shape landing
        torso_angle = calculate_torso_angle(keypoints)
        leg_angle = calculate_leg_angle(keypoints)
        
        # Store the angles
        keypoints["torso_angle"] = torso_angle
        keypoints["leg_angle"] = leg_angle
        
        return keypoints
    return None


def calculate_torso_angle(keypoints):
    """
    Calculate the angle between the shoulders and hips to determine torso alignment for 'L' shape landing.
    """
    # Get the coordinates of the shoulders and hips
    right_shoulder = keypoints["right_shoulder"]
    left_shoulder = keypoints["left_shoulder"]
    right_hip = keypoints["right_hip"]
    left_hip = keypoints["left_hip"]
    
    # Define vectors between the shoulders and hips
    shoulder_line = (right_shoulder["x"] - left_shoulder["x"], right_shoulder["y"] - left_shoulder["y"])
    hip_line = (right_hip["x"] - left_hip["x"], right_hip["y"] - left_hip["y"])
    
    # Calculate angle between the shoulder and hip lines
    dot_product = shoulder_line[0] * hip_line[0] + shoulder_line[1] * hip_line[1]
    shoulder_magnitude = math.sqrt(shoulder_line[0]**2 + shoulder_line[1]**2)
    hip_magnitude = math.sqrt(hip_line[0]**2 + hip_line[1]**2)
    
    cos_theta = dot_product / (shoulder_magnitude * hip_magnitude)
    angle = math.acos(cos_theta) * 180 / math.pi  # Convert from radians to degrees
    
    return angle


def calculate_leg_angle(keypoints):
    """
    Calculate the angle between the hips, knees, and ankles to determine leg extension.
    """
    # Get the coordinates of the hips, knees, and ankles
    right_hip = keypoints["right_hip"]
    right_knee = keypoints["right_knee"]
    right_heel = keypoints["right_heel"]

    # Calculate vectors from the hip to knee and knee to heel
    hip_to_knee = (right_knee["x"] - right_hip["x"], right_knee["y"] - right_hip["y"])
    knee_to_heel = (right_heel["x"] - right_knee["x"], right_heel["y"] - right_knee["y"])

    # Calculate the angle between the vectors using dot product
    dot_product = hip_to_knee[0] * knee_to_heel[0] + hip_to_knee[1] * knee_to_heel[1]
    hip_to_knee_magnitude = math.sqrt(hip_to_knee[0]**2 + hip_to_knee[1]**2)
    knee_to_heel_magnitude = math.sqrt(knee_to_heel[0]**2 + knee_to_heel[1]**2)
    
    cos_theta = dot_product / (hip_to_knee_magnitude * knee_to_heel_magnitude)
    angle = math.acos(cos_theta) * 180 / math.pi  # Convert from radians to degrees
    
    return angle


def process_frames(video_frames_dir, output_dir, user_id):
    """
    Process all frames for a video and save keypoints as JSON for a specific user.
    """
    keypoints_data = []
    for frame_file in sorted(os.listdir(video_frames_dir)):
        frame_path = os.path.join(video_frames_dir, frame_file)
        keypoints = extract_keypoints_from_frame(frame_path)
        if keypoints:
            keypoints_data.append(keypoints)

    # Save the keypoints to a JSON file for each user
    output_file = os.path.join(output_dir, f"{user_id}_keypoints.json")
    with open(output_file, "w") as json_file:
        json.dump(keypoints_data, json_file)

# Process frames for each user
for video_name in os.listdir(frames_dir):
    video_frames_dir = os.path.join(frames_dir, video_name)
    if os.path.isdir(video_frames_dir):
        # Assuming video_name corresponds to the user_id
        process_frames(video_frames_dir, keypoints_dir, video_name)  # Save keypoints for each user
        print(f"Keypoints extracted for {video_name}")

print("Keypoint extraction completed!")

I0000 00:00:1736972328.078203 4629893 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M4 Pro
W0000 00:00:1736972328.133190 4653542 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1736972328.164888 4653554 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Keypoints extracted for 0.5_user1
Keypoints extracted for 0_user3
Keypoints extracted for 1_user13
Keypoints extracted for 1_user5
Keypoints extracted for 1_user2
Keypoints extracted for 0_user6
Keypoints extracted for 0_user8
Keypoints extracted for 1_user10
Keypoints extracted for 0_user23
Keypoints extracted for 1_user21
Keypoints extracted for 0_user12
Keypoints extracted for 1_user7
Keypoints extracted for 1_user20
Keypoints extracted for 0_user22
Keypoints extracted for 1_user9
Keypoint extraction completed!


In [27]:
import os
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image

# Paths
frames_dir = "/Users/danyukezz/Desktop/2 year 1 semester/team project/danya_preprocessing_sports/height_jump/stages/stage5/frames"
clip_features_dir = "/Users/danyukezz/Desktop/2 year 1 semester/team project/danya_preprocessing_sports/height_jump/stages/stage5/clip_features"
os.makedirs(clip_features_dir, exist_ok=True)

# Load CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def extract_clip_features(video_frames_dir, output_path, batch_size=8):
    """
    Extract CLIP features for all frames in a video using batch processing.
    """
    frame_features = []
    frame_paths = sorted(os.listdir(video_frames_dir))
    images = []

    for i, frame_file in enumerate(frame_paths):
        try:
            frame_path = os.path.join(video_frames_dir, frame_file)
            image = Image.open(frame_path).convert("RGB")
            images.append(image)

            # Process batch
            if len(images) == batch_size or i == len(frame_paths) - 1:
                inputs = processor(images=images, return_tensors="pt", padding=True).to(device)
                with torch.no_grad():
                    image_features = model.get_image_features(**inputs).cpu().numpy()
                    frame_features.extend(image_features)
                images = []  # Clear batch to free memory
        except Exception as e:
            print(f"Error processing frame {frame_file}: {e}")
            continue

    # Save features
    torch.save(frame_features, output_path)

# Process all videos
for video_name in os.listdir(frames_dir):
    video_frames_dir = os.path.join(frames_dir, video_name)
    if os.path.isdir(video_frames_dir):
        output_path = os.path.join(clip_features_dir, f"{video_name}_clip.pt")
        extract_clip_features(video_frames_dir, output_path)
        print(f"CLIP features extracted for {video_name}")

print("CLIP feature extraction completed!")

CLIP features extracted for 0.5_user1
CLIP features extracted for 0_user3
CLIP features extracted for 1_user13
CLIP features extracted for 1_user5
CLIP features extracted for 1_user2
CLIP features extracted for 0_user6
CLIP features extracted for 0_user8
CLIP features extracted for 1_user10
CLIP features extracted for 0_user23
CLIP features extracted for 1_user21
CLIP features extracted for 0_user12
CLIP features extracted for 1_user7
CLIP features extracted for 1_user20
CLIP features extracted for 0_user22
CLIP features extracted for 1_user9
CLIP feature extraction completed!


In [28]:
import json
import os
import torch
import numpy as np
import math

# Paths
keypoints_dir = "/Users/danyukezz/Desktop/2 year 1 semester/team project/danya_preprocessing_sports/height_jump/stages/stage5/keypoints"
clip_features_dir = "/Users/danyukezz/Desktop/2 year 1 semester/team project/danya_preprocessing_sports/height_jump/stages/stage5/clip_features"
combined_features_dir = "/Users/danyukezz/Desktop/2 year 1 semester/team project/danya_preprocessing_sports/height_jump/stages/stage5/combined_features"

# Ensure output directory exists
os.makedirs(combined_features_dir, exist_ok=True)

def calculate_landing_position(keypoints):
    """
    Check if the athlete lands in an 'L' shape, perpendicular to the bar.
    """
    landing_frames = []
    for i in range(1, len(keypoints)):
        try:
            # Check if the keypoints contain the required points
            if len(keypoints[i]) < 27:
                print(f"Missing keypoints data at frame {i}, skipping.")
                continue
            
            # Extract keypoints for the athlete's body (e.g., hips, knees, spine)
            left_hip = keypoints[i][23]  # LEFT_HIP
            right_hip = keypoints[i][24]  # RIGHT_HIP
            left_knee = keypoints[i][25]  # LEFT_KNEE
            right_knee = keypoints[i][26]  # RIGHT_KNEE
            spine = keypoints[i][2]  # SPINE
        except IndexError as e:
            print(f"Error accessing keypoint data at frame {i}: {e}, skipping.")
            continue

        # Check the angle between the thigh and the torso to determine the body position
        left_leg_angle = calculate_angle(left_hip, left_knee, spine)
        right_leg_angle = calculate_angle(right_hip, right_knee, spine)

        # Threshold for perpendicular landing (L-shape)
        if 80 < left_leg_angle < 100 and 80 < right_leg_angle < 100:  # Angle values for 'L' shape
            landing_frames.append(i)

    return landing_frames


def calculate_angle(p1, p2, p3):
    """
    Calculate the angle between three points.
    """
    v1 = np.array([p1['x'] - p2['x'], p1['y'] - p2['y']])
    v2 = np.array([p3['x'] - p2['x'], p3['y'] - p2['y']])
    
    cos_angle = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
    angle = np.degrees(np.arccos(np.clip(cos_angle, -1.0, 1.0)))  # Ensures the value is within the valid range
    return angle

def combine_features(video_name, pose_path, clip_path, output_path):
    """
    Combine pose-based features and CLIP embeddings for a video.
    """
    # Load pose-based features
    with open(pose_path, 'r') as f:
        keypoints_data = json.load(f)

    # Detect landing in 'L' shape
    landing_frames = calculate_landing_position(keypoints_data)

    # Load CLIP embeddings
    clip_data = torch.load(clip_path)

    # Combine features for each frame
    combined_data = []
    for i, clip_frame in enumerate(clip_data):
        # Use pose features for the current frame
        pose_features = {
            "landed_in_L_shape": 1 if i in landing_frames else 0,  # Binary flag for landing in 'L' shape
        }
        pose_features_array = np.array(list(pose_features.values()))  # Convert pose features to numpy array
        clip_frame = clip_frame.numpy() if isinstance(clip_frame, torch.Tensor) else clip_frame  # Ensure clip_frame is a numpy array

        # Combine the clip frame and pose features
        combined_frame = np.concatenate([clip_frame, pose_features_array])  # Combine as NumPy array
        combined_data.append(combined_frame)

    # Save combined features
    combined_data_tensor = torch.tensor(combined_data)  # Convert the combined data into a tensor
    torch.save(combined_data_tensor, output_path)  # Save as tensor

# Process all videos
for video_name in os.listdir(keypoints_dir):
    if video_name.endswith("_keypoints.json"):
        video_name_base = video_name.replace("_keypoints.json", "")
        pose_path = os.path.join(keypoints_dir, video_name)
        clip_path = os.path.join(clip_features_dir, f"{video_name_base}_clip.pt")
        output_path = os.path.join(combined_features_dir, f"{video_name_base}_combined.pt")

        if os.path.exists(clip_path):
            combine_features(video_name_base, pose_path, clip_path, output_path)
            print(f"Combined features saved for {video_name_base}")

print("Feature combination completed!")


Missing keypoints data at frame 1, skipping.
Missing keypoints data at frame 2, skipping.
Missing keypoints data at frame 3, skipping.
Missing keypoints data at frame 4, skipping.
Missing keypoints data at frame 5, skipping.
Missing keypoints data at frame 6, skipping.
Missing keypoints data at frame 7, skipping.
Missing keypoints data at frame 8, skipping.
Combined features saved for 0_user3
Missing keypoints data at frame 1, skipping.
Missing keypoints data at frame 2, skipping.
Missing keypoints data at frame 3, skipping.
Combined features saved for 1_user7
Missing keypoints data at frame 1, skipping.
Missing keypoints data at frame 2, skipping.
Missing keypoints data at frame 3, skipping.
Missing keypoints data at frame 4, skipping.
Missing keypoints data at frame 5, skipping.
Missing keypoints data at frame 6, skipping.
Missing keypoints data at frame 7, skipping.
Combined features saved for 0_user12
Missing keypoints data at frame 1, skipping.
Missing keypoints data at frame 2, s

  clip_data = torch.load(clip_path)


In [29]:
import torch
from torch.utils.data import Dataset, DataLoader
import os

# Paths
combined_features_dir = "/Users/danyukezz/Desktop/2 year 1 semester/team project/danya_preprocessing_sports/height_jump/stages/stage5/combined_features"

# Hyperparameters
sequence_length = 20  # Adjust as needed (still remains as sequence length)
batch_size = 16       # Adjust as needed

# Ensure input size is always 515
input_size = 515

class AthleticsDataset(Dataset):
    def __init__(self, combined_features_dir, sequence_length, input_size):
        """
        Handles loading and processing of combined features for athletics data.
        Ensures input size is always 515.
        """
        self.data = []
        self.labels = []
        self.sequence_length = sequence_length
        self.input_size = input_size

        for file in os.listdir(combined_features_dir):
            if file.endswith("_combined.pt"):
                # Load combined features
                video_features = torch.tensor(torch.load(os.path.join(combined_features_dir, file)), dtype=torch.float32)

                # Truncate or pad sequences to the desired length
                if video_features.shape[0] >= self.sequence_length:
                    video_features = video_features[:self.sequence_length]
                else:
                    padding = torch.zeros((self.sequence_length - video_features.shape[0], video_features.shape[1]))
                    video_features = torch.cat((video_features, padding), dim=0)

                # Ensure features match the input size (515)
                if video_features.shape[1] != self.input_size:
                    padding = torch.zeros((video_features.shape[0], self.input_size - video_features.shape[1]))
                    video_features = torch.cat((video_features, padding), dim=1)
                self.data.append(video_features)

                # Extract label from filename
                label = float(file.split("_")[0])  # Extract label from file name
                self.labels.append(label)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

# Initialize dataset and DataLoader
def get_data_loaders(combined_features_dir, sequence_length, batch_size, input_size, train_split=0.8):
    dataset = AthleticsDataset(combined_features_dir, sequence_length, input_size)
    train_size = int(train_split * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, val_loader, input_size  # Return loaders and input size

# Create DataLoaders
train_loader, val_loader, input_size = get_data_loaders(combined_features_dir, sequence_length, batch_size, input_size)

print(f"DataLoaders created. Input size: {input_size}")


DataLoaders created. Input size: 515


  video_features = torch.tensor(torch.load(os.path.join(combined_features_dir, file)), dtype=torch.float32)
  video_features = torch.tensor(torch.load(os.path.join(combined_features_dir, file)), dtype=torch.float32)


In [30]:
import torch.nn as nn

class TemporalModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        """
        LSTM-based model for sequence prediction.
        """
        super(TemporalModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        _, (hidden, _) = self.lstm(x)  # Use the final hidden state
        output = self.fc(hidden[-1])  # Fully connected output
        return output

In [31]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import itertools
import os

# Paths
combined_features_dir = "/Users/danyukezz/Desktop/2 year 1 semester/team project/danya_preprocessing_sports/disc_throwing/stages/stage5/combined_features"

# Dataset and DataLoader
class AthleticsDataset(torch.utils.data.Dataset):
    def __init__(self, combined_features_dir, sequence_length):
        self.data = []
        self.labels = []
        self.sequence_length = sequence_length

        for file in os.listdir(combined_features_dir):
            if file.endswith("_combined.pt"):
                video_features = torch.tensor(torch.load(os.path.join(combined_features_dir, file)), dtype=torch.float32)

                if video_features.shape[0] >= self.sequence_length:
                    video_features = video_features[:self.sequence_length]
                else:
                    padding = torch.zeros((self.sequence_length - video_features.shape[0], video_features.shape[1]))
                    video_features = torch.cat((video_features, padding), dim=0)

                self.data.append(video_features)
                label = float(file.split("_")[0])  # Extract label from file name
                self.labels.append(label)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

def get_data_loaders(combined_features_dir, sequence_length, batch_size, train_split=0.8):
    dataset = AthleticsDataset(combined_features_dir, sequence_length)
    train_size = int(train_split * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, val_loader, dataset[0][0].shape[1]

# Define LSTM model
class TemporalModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout):
        super(TemporalModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        _, (hidden, _) = self.lstm(x)
        output = self.fc(hidden[-1])
        return output

# Training function with early stopping
def train_model(model, train_loader, val_loader, criterion, optimizer, device, num_epochs=500, patience=100):
    best_val_loss = float("inf")
    patience_counter = 0
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        for features, labels in train_loader:
            features, labels = features.to(device), torch.tensor(labels, dtype=torch.float32).to(device)

            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs.squeeze(), labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        # Validation phase
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for features, labels in val_loader:
                features, labels = features.to(device), torch.tensor(labels, dtype=torch.float32).to(device)
                outputs = model(features)
                loss = criterion(outputs.squeeze(), labels)
                val_loss += loss.item()

        train_loss /= len(train_loader)
        val_loss /= len(val_loader)
        print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

        # Early stopping logic with 100 epochs patience
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print(f"Early stopping triggered after {patience} epochs without improvement!")
            break

    return best_val_loss


# Hyperparameter grid search
# def hyperparameter_search():
    # Hyperparameter grid
    hidden_sizes = [32, 64, 128]
    num_layers = [1, 2, 3]
    learning_rates = [0.001, 0.005, 0.0001, 0.0005]
    dropouts = [0.0, 0.2, 0.3]

    # Initialize variables to track the best model
    best_val_loss = float("inf")
    best_params = None
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Prepare data
    train_loader, val_loader, input_size = get_data_loaders(combined_features_dir, sequence_length=20, batch_size=8)

    # Iterate over all combinations of hyperparameters
    for hidden_size, num_layer, learning_rate, dropout in itertools.product(hidden_sizes, num_layers, learning_rates, dropouts):
        print(f"Testing configuration: Hidden Size={hidden_size}, Num Layers={num_layer}, LR={learning_rate}, Dropout={dropout}")
        
        # Initialize model, criterion, and optimizer
        model = TemporalModel(input_size=input_size, hidden_size=hidden_size, num_layers=num_layer, output_size=1, dropout=dropout).to(device)
        criterion = nn.MSELoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

        # Train the model
        val_loss = train_model(model, train_loader, val_loader, criterion, optimizer, device)

        # Update best model if this configuration is better
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_params = {
                "hidden_size": hidden_size,
                "num_layers": num_layer,
                "learning_rate": learning_rate,
                "dropout": dropout,
            }

    print(f"Best Configuration: {best_params}, Validation Loss: {best_val_loss:.4f}")
    return best_params

# Run hyperparameter search
# best_params = hyperparameter_search()

In [46]:
# Train the final model with the best configuration
final_hidden_size = 128
final_num_layers = 1
final_learning_rate = 0.0008
final_dropout = 0.3

# Prepare DataLoaders (use full dataset for training)
train_loader, val_loader, input_size = get_data_loaders(combined_features_dir, sequence_length=20, batch_size=32)
print(f"Input size for the model: {input_size}")

# Initialize final model
device = "cuda" if torch.cuda.is_available() else "cpu"
final_model = TemporalModel(input_size=input_size, hidden_size=final_hidden_size, num_layers=final_num_layers, output_size=1, dropout=final_dropout).to(device)

# Loss and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(final_model.parameters(), lr=final_learning_rate)

# Train final model
final_val_loss = train_model(final_model, train_loader, val_loader, criterion, optimizer, device, num_epochs=10000, patience=100)
print(f"Final model trained. Validation Loss: {final_val_loss:.4f}")

  video_features = torch.tensor(torch.load(os.path.join(combined_features_dir, file)), dtype=torch.float32)
  features, labels = features.to(device), torch.tensor(labels, dtype=torch.float32).to(device)
  features, labels = features.to(device), torch.tensor(labels, dtype=torch.float32).to(device)


Input size for the model: 514
Epoch [1/10000], Train Loss: 0.7540, Val Loss: 0.8599
Epoch [2/10000], Train Loss: 0.7268, Val Loss: 0.7911
Epoch [3/10000], Train Loss: 0.6999, Val Loss: 0.7295
Epoch [4/10000], Train Loss: 0.6731, Val Loss: 0.6745
Epoch [5/10000], Train Loss: 0.6459, Val Loss: 0.6214
Epoch [6/10000], Train Loss: 0.6177, Val Loss: 0.5679
Epoch [7/10000], Train Loss: 0.5882, Val Loss: 0.5138
Epoch [8/10000], Train Loss: 0.5565, Val Loss: 0.4596
Epoch [9/10000], Train Loss: 0.5216, Val Loss: 0.4050
Epoch [10/10000], Train Loss: 0.4820, Val Loss: 0.3475
Epoch [11/10000], Train Loss: 0.4358, Val Loss: 0.2842
Epoch [12/10000], Train Loss: 0.3801, Val Loss: 0.2146
Epoch [13/10000], Train Loss: 0.3128, Val Loss: 0.1474
Epoch [14/10000], Train Loss: 0.2372, Val Loss: 0.1170
Epoch [15/10000], Train Loss: 0.1791, Val Loss: 0.1783
Epoch [16/10000], Train Loss: 0.2153, Val Loss: 0.1805
Epoch [17/10000], Train Loss: 0.2481, Val Loss: 0.1279
Epoch [18/10000], Train Loss: 0.2162, Val Lo

In [47]:
import matplotlib.pyplot as plt

def evaluate_and_collect_predictions(model, data_loader, device):
    """
    Evaluate the model on the provided DataLoader and collect true vs. predicted values.
    """
    model.eval()
    true_labels = []
    predicted_labels = []

    with torch.no_grad():
        for features, labels in data_loader:
            features = features.to(device)
            labels = labels.to(device)

            # Get model predictions
            outputs = model(features).squeeze()
            predicted_labels.extend(outputs.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    return true_labels, predicted_labels

# Use validation set for evaluation
true_labels, predicted_labels = evaluate_and_collect_predictions(final_model, val_loader, device)


In [48]:
import cv2
import os
import torch
from PIL import Image
import numpy as np
from transformers import CLIPProcessor, CLIPModel
import mediapipe as mp

# Paths
video_dir = "/Users/danyukezz/Desktop/2 year 1 semester/team project/danya_preprocessing_sports/height_jump/stages/stage5/videos"
sequence_length = 20  # Sequence length for LSTM

# Initialize CLIP and MediaPipe
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
mp_pose = mp.solutions.pose
pose = mp_pose.Pose(static_image_mode=True, model_complexity=2, enable_segmentation=False)

# Ensure input size is 514 (fix the mismatch)
input_size = 514

# Helper functions
def extract_frames(video_path, interval=5):
    frames = []
    cap = cv2.VideoCapture(video_path)
    count = 0

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        if count % interval == 0:
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(rgb_frame)

        count += 1

    cap.release()
    return frames

def extract_keypoints(frames):
    keypoints = []
    for frame in frames:
        result = pose.process(frame)
        if result.pose_landmarks:
            keypoints.append([
                {"x": lm.x, "y": lm.y, "z": lm.z, "visibility": lm.visibility}
                for lm in result.pose_landmarks.landmark
            ])
    return keypoints

def calculate_velocity_and_acceleration(keypoints):
    velocities, accelerations = [], []
    for i in range(1, len(keypoints)):
        spine_prev = keypoints[i - 1][11]  # Left shoulder (spine related)
        spine_curr = keypoints[i][11]  # Left shoulder (spine related)
        hip_prev = keypoints[i - 1][23]  # Left hip
        hip_curr = keypoints[i][23]  # Left hip

        velocity_spine = np.sqrt((spine_curr['x'] - spine_prev['x'])**2 + (spine_curr['y'] - spine_prev['y'])**2)
        velocity_hip = np.sqrt((hip_curr['x'] - hip_prev['x'])**2 + (hip_curr['y'] - hip_prev['y'])**2)
        velocities.append((velocity_spine + velocity_hip) / 2)

        if i > 1:
            accelerations.append(velocities[-1] - velocities[-2])
    return velocities, accelerations

def assess_back_and_spine_angle(keypoints):
    angles = []
    for i in range(1, len(keypoints)):
        shoulder = keypoints[i][11]  # Left shoulder
        hip = keypoints[i][23]  # Left hip
        spine_mid = keypoints[i][24]  # Mid spine

        back_vector = np.array([spine_mid['x'] - shoulder['x'], spine_mid['y'] - shoulder['y']])
        hip_vector = np.array([hip['x'] - shoulder['x'], hip['y'] - shoulder['y']])

        dot_product = np.dot(back_vector, hip_vector)
        magnitude_back = np.linalg.norm(back_vector)
        magnitude_hip = np.linalg.norm(hip_vector)

        angle = np.arccos(dot_product / (magnitude_back * magnitude_hip))
        angles.append(np.degrees(angle))  # Convert to degrees

    return angles

def extract_clip_features(frames):
    embeddings = []
    for frame in frames:
        image = Image.fromarray(frame)
        inputs = clip_processor(images=image, return_tensors="pt").to(device)
        with torch.no_grad():
            embedding = clip_model.get_image_features(**inputs).cpu().numpy().flatten()
            embeddings.append(embedding)
    return embeddings

def combine_features(clip_embeddings, velocities, accelerations, back_spine_angles):
    """
    Combine CLIP embeddings, velocities, accelerations, and back/spine angles into a single feature tensor.
    Ensure the final feature vector has 514 features (fixing the mismatch).
    """
    combined_features = []
    for i, clip_embedding in enumerate(clip_embeddings):
        # Combine features with pose features (velocity, acceleration, and back/spine angle)
        feature_vector = np.concatenate([clip_embedding, 
                                        [velocities[i] if i < len(velocities) else 0],
                                        [accelerations[i] if i < len(accelerations) else 0],
                                        [back_spine_angles[i] if i < len(back_spine_angles) else 0]])

        # Ensure the feature vector has 514 features
        feature_vector = np.resize(feature_vector, (input_size,))
        combined_features.append(feature_vector)
    
    return torch.tensor(combined_features, dtype=torch.float32)

def process_and_predict(video_path, model, sequence_length):
    print(f"Processing video: {os.path.basename(video_path)}")
    # Step 1: Extract frames
    frames = extract_frames(video_path)

    # Step 2: Extract keypoints
    keypoints = extract_keypoints(frames)

    # Step 3: Calculate pose-based features (velocity, acceleration)
    velocities, accelerations = calculate_velocity_and_acceleration(keypoints)
    back_spine_angles = assess_back_and_spine_angle(keypoints)  # Assess back/spine angle

    # Step 4: Extract CLIP features
    clip_embeddings = extract_clip_features(frames)

    # Step 5: Combine features
    combined_features = combine_features(clip_embeddings, velocities, accelerations, back_spine_angles)

    # Step 6: Truncate or pad sequences to fixed length (514 features)
    if combined_features.shape[0] >= sequence_length:
        combined_features = combined_features[:sequence_length]
    else:
        padding = torch.zeros((sequence_length - combined_features.shape[0], combined_features.shape[1]))
        combined_features = torch.cat((combined_features, padding), dim=0)

    # Step 7: Predict using the model
    model.eval()
    with torch.no_grad():
        combined_features = combined_features.unsqueeze(0).to(device)  # Add batch dimension
        prediction = model(combined_features).squeeze().cpu().item()

    print(f"Prediction: {prediction:.4f}")
    return prediction

# Process all videos in the directory and predict
results = {}
for video_file in os.listdir(video_dir):
    if video_file.endswith(".mp4"):
        video_path = os.path.join(video_dir, video_file)
        prediction = process_and_predict(video_path, final_model, sequence_length)
        results[video_file] = prediction

# Print results
print("\nPrediction Results:")
for video, prediction in results.items():
    print(f"{video}: {prediction:.4f}")

I0000 00:00:1736972468.323507 4629893 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M4 Pro
W0000 00:00:1736972468.377280 4658134 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1736972468.418138 4658134 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Processing video: 1_user2.mp4
Prediction: 0.9613
Processing video: 1_user5.mp4
Prediction: 0.6387
Processing video: 1_user10.mp4
Prediction: 0.7979
Processing video: 1_user7.mp4
Prediction: 0.9517
Processing video: 1_user13.mp4
Prediction: 0.7977
Processing video: 0_user3.mp4
Prediction: 0.9161
Processing video: 0_user22.mp4
Prediction: 0.8655
Processing video: 0_user23.mp4
Prediction: 0.1475
Processing video: 0_user6.mp4
Prediction: 0.9109
Processing video: 0_user12.mp4
Prediction: 0.9199
Processing video: 0_user8.mp4
Prediction: 0.6972
Processing video: 0.5_user1.mp4
Prediction: 0.9331
Processing video: 1_user21.mp4
Prediction: 0.5458
Processing video: 1_user9.mp4
Prediction: 0.6169
Processing video: 1_user20.mp4
Prediction: 0.8252

Prediction Results:
1_user2.mp4: 0.9613
1_user5.mp4: 0.6387
1_user10.mp4: 0.7979
1_user7.mp4: 0.9517
1_user13.mp4: 0.7977
0_user3.mp4: 0.9161
0_user22.mp4: 0.8655
0_user23.mp4: 0.1475
0_user6.mp4: 0.9109
0_user12.mp4: 0.9199
0_user8.mp4: 0.6972
0.5_user1.

In [49]:
torch.save(final_model.state_dict(), "/Users/danyukezz/Desktop/2 year 1 semester/team project/danya_preprocessing_sports/height_jump/stages/stage5/stage5_height.pth")

___

## feedback generation

In [36]:
def process_video_with_metadata(video_path, model, sequence_length):
    """
    Process a video and return structured metadata with predictions and pose metrics.
    """
    print(f"Processing video: {os.path.basename(video_path)}")
    
    # Step 1: Extract frames
    frames = extract_frames(video_path)
    
    # Step 2: Extract keypoints
    keypoints = extract_keypoints(frames)

    # Step 3: Calculate pose-based features
    velocities, accelerations = calculate_velocity_and_acceleration(keypoints)
    stride_lengths = calculate_stride_length(keypoints)

    # Pose metrics
    pose_metrics = {
        "average_velocity": np.mean(velocities) if velocities else 0,
        "max_stride_length": np.max(stride_lengths) if stride_lengths else 0,
        "average_acceleration": np.mean(accelerations) if accelerations else 0
    }

    # Step 4: Extract CLIP features
    clip_embeddings = extract_clip_features(frames)

    # CLIP insights
    clip_summary = "High contextual alignment with accelerating motion."  # Placeholder for now

    # Step 5: Combine features
    combined_features = combine_features(clip_embeddings, velocities, accelerations, stride_lengths)

    # Step 6: Truncate or pad sequences
    if combined_features.shape[0] >= sequence_length:
        combined_features = combined_features[:sequence_length]
    else:
        padding = torch.zeros((sequence_length - combined_features.shape[0], combined_features.shape[1]))
        combined_features = torch.cat((combined_features, padding), dim=0)

    # Step 7: Predict using the model
    model.eval()
    with torch.no_grad():
        combined_features = combined_features.unsqueeze(0).to(device)  # Add batch dimension
        prediction = model(combined_features).squeeze().cpu().item()

    print(f"Prediction: {prediction:.4f}")

    # Return structured metadata
    return {
        "video_name": os.path.basename(video_path),
        "prediction": prediction,
        "pose_metrics": pose_metrics,
        "clip_features": {"embedding_summary": clip_summary}
    }

In [37]:
# from transformers import pipeline

# generator = pipeline("text2text-generation", model="google/flan-t5-base")
# result = generator("Explain the performance of an athlete based on metrics.", max_length=50)
# print(result[0]['generated_text'])


# def validate_metadata(metadata):
#     required_keys = {
#         'video_name': str,
#         'prediction': float,
#         'pose_metrics': dict,
#         'clip_features': dict,
#     }
#     pose_metrics_keys = ['average_velocity', 'max_stride_length', 'average_acceleration']
#     clip_features_keys = ['embedding_summary']

#     for key, expected_type in required_keys.items():
#         if key not in metadata or not isinstance(metadata[key], expected_type):
#             raise ValueError(f"Invalid or missing key: {key}, expected type: {expected_type}")
    
#     for key in pose_metrics_keys:
#         if key not in metadata['pose_metrics']:
#             raise ValueError(f"Missing pose metric: {key}")
    
#     for key in clip_features_keys:
#         if key not in metadata['clip_features']:
#             raise ValueError(f"Missing clip feature: {key}")

# def generate_justification(metadata, max_length=150, num_return_sequences=1):
#     validate_metadata(metadata)
    
#     # Dynamic prompt construction
#     metrics_prompt = []
#     for metric, value in metadata['pose_metrics'].items():
#         metrics_prompt.append(f"- {metric.replace('_', ' ').title()}: {value:.2f}")
#     metrics_text = "\n".join(metrics_prompt)

#     prompt = f"""
#     Analyze the performance for {metadata['video_name']}.
#     The predicted score is {metadata['prediction']:.2f}.
#     Key metrics:
#     {metrics_text}
#     - CLIP embedding summary: {metadata['clip_features']['embedding_summary']}
    
#     Based on these metrics, explain why the score is appropriate and provide constructive feedback for improvement.
#     """
#     print("Generated prompt:", prompt)  # Debugging log
#     result = generator(prompt, max_length=max_length, num_return_sequences=num_return_sequences)
#     return result[0]['generated_text']


In [38]:
# # Directory containing videos
# video_dir = "/Users/cezar/Desktop/Team Project/AI/distance_jump/stage1/videos"

# # Dictionary to store results
# results = {}

# for video_file in os.listdir(video_dir):
#     if video_file.endswith(".mp4"):
#         video_path = os.path.join(video_dir, video_file)
        
#         # Step 1: Process video and generate metadata
#         metadata = process_video_with_metadata(video_path, final_model, sequence_length=20)
        
#         # Step 2: Generate justification using Hugging Face model
#         justification = generate_justification(metadata)
#         metadata["justification"] = justification
        
#         # Store results
#         results[video_file] = metadata

# # Print results
# for video, data in results.items():
#     print(f"Video: {video}")
#     print(f"Prediction: {data['prediction']:.4f}")
#     print(f"Justification: {data['justification']}")
#     print("-" * 50)

# # Optional: Save results to JSON
# # import json
# # with open("video_predictions_with_justifications.json", "w") as f:
# #     json.dump(results, f, indent=4)