In [29]:
import cv2
import os

# Paths
video_dir = "/Users/danyukezz/Desktop/2 year 1 semester/team project/danya_preprocessing_sports/height_jump/stages/stage2/videos"
frames_dir = "/Users/danyukezz/Desktop/2 year 1 semester/team project/danya_preprocessing_sports/height_jump/stages/stage2/frames"

# Ensure output directory exists
os.makedirs(frames_dir, exist_ok=True)

def extract_frames(video_path, output_dir, interval=1):
    """
    Extract frames from a video at a given interval (default: every 1 frame).
    """
    cap = cv2.VideoCapture(video_path)
    count = 0
    frame_count = 0
    video_name = os.path.basename(video_path).rsplit('.', 1)[0]  # Get the video name without extension

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Save every 'interval' frame
        if count % interval == 0:
            frame_path = os.path.join(output_dir, f"{video_name}_frame{frame_count}.jpg")
            cv2.imwrite(frame_path, frame)
            frame_count += 1

        count += 1

    cap.release()

# Process all videos
for video_file in os.listdir(video_dir):
    # Check if the filename ends with ".mp4" and starts with a valid number
    if video_file.endswith(".mp4"):
        try:
            # Split by '_' and attempt to convert the first part to float
            prefix = video_file.split('_')[0]
            float(prefix)  # This ensures it works for values like '0.5', '1', or '0'
            
            # Proceed with frame extraction
            video_path = os.path.join(video_dir, video_file)
            video_frames_dir = os.path.join(frames_dir, video_file.rsplit('.', 1)[0])  # Use full name
            os.makedirs(video_frames_dir, exist_ok=True)
            extract_frames(video_path, video_frames_dir)
            print(f"Frames extracted for {video_file}")
        except ValueError:
            print(f"Skipping file with invalid prefix: {video_file}")

print("Frame extraction completed!")

Frames extracted for 1_user2.mp4
Frames extracted for 1_user1.mp4
Frames extracted for 1_user5.mp4
Frames extracted for 1_user10.mp4
Frames extracted for 1_user6.mp4
Frames extracted for 0_user22.mp4
Frames extracted for 0_user7.mp4
Frames extracted for 0_user12.mp4
Frames extracted for 0_user13.mp4
Frames extracted for 1_user23.mp4
Frames extracted for 1_user21.mp4
Frames extracted for 1_user8.mp4
Frames extracted for 1_user9.mp4
Frames extracted for 1_user20.mp4
Frame extraction completed!


In [30]:
import cv2
import os
import mediapipe as mp
import json
import math

# Paths
frames_dir = "/Users/danyukezz/Desktop/2 year 1 semester/team project/danya_preprocessing_sports/height_jump/stages/stage2/frames"
keypoints_dir = "/Users/danyukezz/Desktop/2 year 1 semester/team project/danya_preprocessing_sports/height_jump/stages/stage2/keypoints"

# Ensure output directory exists
os.makedirs(keypoints_dir, exist_ok=True)

# Initialize MediaPipe Pose
mp_pose = mp.solutions.pose
pose = mp_pose.Pose(static_image_mode=True, model_complexity=2, enable_segmentation=False)

def extract_keypoints_from_frame(frame_path):
    """
    Extract keypoints relevant to posture and alignment, including the lean during the jump.
    """
    image = cv2.imread(frame_path)
    rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    result = pose.process(rgb_image)

    if result.pose_landmarks:
        # Extract keypoints
        keypoints = {
            "right_foot_index": {
                "x": result.pose_landmarks.landmark[mp_pose.PoseLandmark.RIGHT_FOOT_INDEX].x,
                "y": result.pose_landmarks.landmark[mp_pose.PoseLandmark.RIGHT_FOOT_INDEX].y,
                "z": result.pose_landmarks.landmark[mp_pose.PoseLandmark.RIGHT_FOOT_INDEX].z,
            },
            "right_heel": {
                "x": result.pose_landmarks.landmark[mp_pose.PoseLandmark.RIGHT_HEEL].x,
                "y": result.pose_landmarks.landmark[mp_pose.PoseLandmark.RIGHT_HEEL].y,
                "z": result.pose_landmarks.landmark[mp_pose.PoseLandmark.RIGHT_HEEL].z,
            },
            "left_foot_index": {
                "x": result.pose_landmarks.landmark[mp_pose.PoseLandmark.LEFT_FOOT_INDEX].x,
                "y": result.pose_landmarks.landmark[mp_pose.PoseLandmark.LEFT_FOOT_INDEX].y,
                "z": result.pose_landmarks.landmark[mp_pose.PoseLandmark.LEFT_FOOT_INDEX].z,
            },
            "left_heel": {
                "x": result.pose_landmarks.landmark[mp_pose.PoseLandmark.LEFT_HEEL].x,
                "y": result.pose_landmarks.landmark[mp_pose.PoseLandmark.LEFT_HEEL].y,
                "z": result.pose_landmarks.landmark[mp_pose.PoseLandmark.LEFT_HEEL].z,
            },
            "hip_center": {
                "x": (result.pose_landmarks.landmark[mp_pose.PoseLandmark.LEFT_HIP].x +
                      result.pose_landmarks.landmark[mp_pose.PoseLandmark.RIGHT_HIP].x) / 2,
                "y": (result.pose_landmarks.landmark[mp_pose.PoseLandmark.LEFT_HIP].y +
                      result.pose_landmarks.landmark[mp_pose.PoseLandmark.RIGHT_HIP].y) / 2,
                "z": (result.pose_landmarks.landmark[mp_pose.PoseLandmark.LEFT_HIP].z +
                      result.pose_landmarks.landmark[mp_pose.PoseLandmark.RIGHT_HIP].z) / 2,
            },
            "shoulder_center": {
                "x": (result.pose_landmarks.landmark[mp_pose.PoseLandmark.LEFT_SHOULDER].x +
                      result.pose_landmarks.landmark[mp_pose.PoseLandmark.RIGHT_SHOULDER].x) / 2,
                "y": (result.pose_landmarks.landmark[mp_pose.PoseLandmark.LEFT_SHOULDER].y +
                      result.pose_landmarks.landmark[mp_pose.PoseLandmark.RIGHT_SHOULDER].y) / 2,
                "z": (result.pose_landmarks.landmark[mp_pose.PoseLandmark.LEFT_SHOULDER].z +
                      result.pose_landmarks.landmark[mp_pose.PoseLandmark.RIGHT_SHOULDER].z) / 2,
            },
        }
        
        # Calculate lean angle
        lean_angle = calculate_lean_angle(keypoints)
        keypoints["lean_angle"] = lean_angle
        
        return keypoints
    return None

def calculate_lean_angle(keypoints):
    """
    Calculate the lean angle based on the difference in the y-coordinates of the hips and shoulders.
    If the body is leaned, the angle will differ from a vertical line (90 degrees).
    """
    # Get the hip center and shoulder center coordinates
    hip_center = keypoints["hip_center"]
    shoulder_center = keypoints["shoulder_center"]
    
    # Calculate the difference in the x and y coordinates
    delta_x = shoulder_center["x"] - hip_center["x"]
    delta_y = shoulder_center["y"] - hip_center["y"]
    
    # Calculate the angle between the line joining shoulder_center and hip_center, and the vertical axis (y-axis)
    angle = math.atan2(delta_y, delta_x) * 180 / math.pi  # Convert from radians to degrees
    
    return angle

def process_frames(video_frames_dir, output_path):
    """
    Process all frames for a video and save keypoints as JSON.
    """
    keypoints_data = []
    for frame_file in sorted(os.listdir(video_frames_dir)):
        frame_path = os.path.join(video_frames_dir, frame_file)
        keypoints = extract_keypoints_from_frame(frame_path)
        if keypoints:
            keypoints_data.append(keypoints)

    # Save to JSON
    with open(output_path, 'w') as f:
        json.dump(keypoints_data, f, indent=4)

# Process frames for each video
for video_name in os.listdir(frames_dir):
    video_frames_dir = os.path.join(frames_dir, video_name)
    if os.path.isdir(video_frames_dir):
        output_path = os.path.join(keypoints_dir, f"{video_name}_keypoints.json")
        process_frames(video_frames_dir, output_path)
        print(f"Keypoints extracted for {video_name}")

print("Keypoint extraction completed!")

I0000 00:00:1736959294.434592 4345238 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M4 Pro
W0000 00:00:1736959294.487277 4367238 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1736959294.518368 4367239 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Keypoints extracted for 0_user3
Keypoints extracted for 1_user5
Keypoints extracted for 1_user2
Keypoints extracted for 1_user23
Keypoints extracted for 0_user7
Keypoints extracted for 1_user10
Keypoints extracted for 1_user8
Keypoints extracted for 1_user21
Keypoints extracted for 1_user1
Keypoints extracted for 0_user12
Keypoints extracted for 1_user6
Keypoints extracted for 0_user13
Keypoints extracted for 1_user20
Keypoints extracted for 0_user22
Keypoints extracted for 1_user9
Keypoint extraction completed!


In [31]:
import os
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image

# Paths
frames_dir = "/Users/danyukezz/Desktop/2 year 1 semester/team project/danya_preprocessing_sports/height_jump/stages/stage2/frames"
clip_features_dir = "/Users/danyukezz/Desktop/2 year 1 semester/team project/danya_preprocessing_sports/height_jump/stages/stage2/clip_features"
os.makedirs(clip_features_dir, exist_ok=True)

# Load CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def extract_clip_features(video_frames_dir, output_path, batch_size=8):
    """
    Extract CLIP features for all frames in a video using batch processing.
    """
    frame_features = []
    frame_paths = sorted(os.listdir(video_frames_dir))
    images = []

    for i, frame_file in enumerate(frame_paths):
        try:
            frame_path = os.path.join(video_frames_dir, frame_file)
            image = Image.open(frame_path).convert("RGB")
            images.append(image)

            # Process batch
            if len(images) == batch_size or i == len(frame_paths) - 1:
                inputs = processor(images=images, return_tensors="pt", padding=True).to(device)
                with torch.no_grad():
                    image_features = model.get_image_features(**inputs).cpu().numpy()
                    frame_features.extend(image_features)
                images = []  # Clear batch to free memory
        except Exception as e:
            print(f"Error processing frame {frame_file}: {e}")
            continue

    # Save features
    torch.save(frame_features, output_path)

# Process all videos
for video_name in os.listdir(frames_dir):
    video_frames_dir = os.path.join(frames_dir, video_name)
    if os.path.isdir(video_frames_dir):
        output_path = os.path.join(clip_features_dir, f"{video_name}_clip.pt")
        extract_clip_features(video_frames_dir, output_path)
        print(f"CLIP features extracted for {video_name}")

print("CLIP feature extraction completed!")

CLIP features extracted for 1_user5
CLIP features extracted for 1_user2
CLIP features extracted for 1_user23
CLIP features extracted for 0_user7
CLIP features extracted for 1_user10
CLIP features extracted for 1_user8
CLIP features extracted for 1_user21
CLIP features extracted for 1_user1
CLIP features extracted for 0_user12
CLIP features extracted for 1_user6
CLIP features extracted for 0_user13
CLIP features extracted for 1_user20
CLIP features extracted for 0_user22
CLIP features extracted for 1_user9
CLIP feature extraction completed!


In [32]:
import json
import os
import torch
import numpy as np
import math

# Paths
keypoints_dir = "/Users/danyukezz/Desktop/2 year 1 semester/team project/danya_preprocessing_sports/height_jump/stages/stage2/keypoints"
clip_features_dir = "/Users/danyukezz/Desktop/2 year 1 semester/team project/danya_preprocessing_sports/height_jump/stages/stage2/clip_features"
combined_features_dir = "/Users/danyukezz/Desktop/2 year 1 semester/team project/danya_preprocessing_sports/height_jump/stages/stage2/combined_features"

# Ensure output directory exists
os.makedirs(combined_features_dir, exist_ok=True)

def calculate_upright_posture(keypoints):
    """
    Detect upright posture based on torso angle.
    """
    upright_frames = []
    for i in range(1, len(keypoints)):
        try:
            # Extract keypoints for left and right shoulders, and the hip
            left_shoulder = keypoints[i][11]  # Index 11 for LEFT_SHOULDER
            right_shoulder = keypoints[i][12]  # Index 12 for RIGHT_SHOULDER
            mid_hip = keypoints[i][24]  # Index 24 for MID_HIP (rough center between hips)
        except KeyError as e:
            print(f"Missing keypoint {e} at frame {i}, skipping.")
            continue

        # Calculate the angle between the shoulders and the hips
        shoulder_vector = np.array([right_shoulder['x'] - left_shoulder['x'], right_shoulder['y'] - left_shoulder['y']])
        hip_vector = np.array([mid_hip['x'] - left_shoulder['x'], mid_hip['y'] - left_shoulder['y']])
        
        # Compute angle between the vectors
        dot_product = np.dot(shoulder_vector, hip_vector)
        norm_shoulder = np.linalg.norm(shoulder_vector)
        norm_hip = np.linalg.norm(hip_vector)
        angle = np.arccos(dot_product / (norm_shoulder * norm_hip))

        # Define threshold for upright posture
        if angle < np.pi / 4:  # 45 degrees or less is considered upright
            upright_frames.append(i)

    return upright_frames

def calculate_acceleration(keypoints):
    """
    Detect acceleration based on foot movement (using right foot index).
    """
    accelerating_frames = []
    for i in range(1, len(keypoints)):
        try:
            # Extract keypoints for the right foot index (ball of foot)
            right_foot_index_prev = keypoints[i - 1][31]  # Index 31 corresponds to the right foot index
            right_foot_index_curr = keypoints[i][31]
        except KeyError as e:
            print(f"Missing keypoint {e} at frame {i}, skipping.")
            continue

        # Calculate the distance traveled by the foot
        foot_distance = np.sqrt(
            (right_foot_index_curr['x'] - right_foot_index_prev['x'])**2 +
            (right_foot_index_curr['y'] - right_foot_index_prev['y'])**2
        )
        
        # Use a threshold for acceleration detection
        if foot_distance > 0.02:  # Adjust this threshold as necessary
            accelerating_frames.append(i)

    return accelerating_frames

def detect_leaning_into_curve(keypoints):
    """
    Detect if the person is leaning into the curve while jumping.
    This can be based on the angle between the shoulders and hips and the lateral position of the torso.
    """
    leaning_frames = []
    for i in range(1, len(keypoints)):
        try:
            # Extract keypoints for left and right shoulders, and the hip
            left_shoulder = keypoints[i][11]  # Index 11 for LEFT_SHOULDER
            right_shoulder = keypoints[i][12]  # Index 12 for RIGHT_SHOULDER
            mid_hip = keypoints[i][24]  # Index 24 for MID_HIP (rough center between hips)
        except KeyError as e:
            print(f"Missing keypoint {e} at frame {i}, skipping.")
            continue

        # Calculate the angle between the shoulders and the hips
        shoulder_vector = np.array([right_shoulder['x'] - left_shoulder['x'], right_shoulder['y'] - left_shoulder['y']])
        hip_vector = np.array([mid_hip['x'] - left_shoulder['x'], mid_hip['y'] - left_shoulder['y']])
        
        # Compute angle between the vectors
        dot_product = np.dot(shoulder_vector, hip_vector)
        norm_shoulder = np.linalg.norm(shoulder_vector)
        norm_hip = np.linalg.norm(hip_vector)
        angle = np.arccos(dot_product / (norm_shoulder * norm_hip))

        # Define a threshold for leaning into the curve
        # For simplicity, assuming leaning is detected when the angle between shoulders and hips
        # exceeds a certain threshold.
        if angle > np.pi / 6:  # A higher angle suggests the person might be leaning
            leaning_frames.append(i)

    return leaning_frames

def combine_features(video_name, pose_path, clip_path, output_path):
    """
    Combine pose-based features and CLIP embeddings for a video.
    """
    # Load pose-based features
    with open(pose_path, 'r') as f:
        keypoints_data = json.load(f)

    # Detect upright posture, acceleration, and leaning into curve
    upright_frames = calculate_upright_posture(keypoints_data)
    accelerating_frames = calculate_acceleration(keypoints_data)
    leaning_frames = detect_leaning_into_curve(keypoints_data)

    # Load CLIP embeddings
    clip_data = torch.load(clip_path)

    # Combine features for each frame
    combined_data = []
    for i, clip_frame in enumerate(clip_data):
        # Use pose features for the current frame
        pose_features = {
            "upright_posture": 1 if i in upright_frames else 0,  # Binary flag for upright posture
            "accelerating": 1 if i in accelerating_frames else 0,  # Binary flag for acceleration
            "leaning_into_curve": 1 if i in leaning_frames else 0,  # Binary flag for leaning into curve
        }
        combined_frame = np.concatenate([clip_frame, list(pose_features.values())])
        combined_data.append(combined_frame)

    # Save combined features
    torch.save(combined_data, output_path)

# Process all videos
for video_name in os.listdir(keypoints_dir):
    if video_name.endswith("_keypoints.json"):
        video_name_base = video_name.replace("_keypoints.json", "")
        pose_path = os.path.join(keypoints_dir, video_name)
        clip_path = os.path.join(clip_features_dir, f"{video_name_base}_clip.pt")
        output_path = os.path.join(combined_features_dir, f"{video_name_base}_combined.pt")

        if os.path.exists(clip_path):
            combine_features(video_name_base, pose_path, clip_path, output_path)
            print(f"Combined features saved for {video_name_base}")

print("Feature combination completed!")

Missing keypoint 11 at frame 1, skipping.
Missing keypoint 31 at frame 1, skipping.
Missing keypoint 11 at frame 1, skipping.
Combined features saved for 1_user8
Missing keypoint 11 at frame 1, skipping.
Missing keypoint 11 at frame 2, skipping.
Missing keypoint 11 at frame 3, skipping.
Missing keypoint 11 at frame 4, skipping.
Missing keypoint 11 at frame 5, skipping.
Missing keypoint 11 at frame 6, skipping.
Missing keypoint 11 at frame 7, skipping.
Missing keypoint 11 at frame 8, skipping.
Missing keypoint 11 at frame 9, skipping.
Missing keypoint 11 at frame 10, skipping.
Missing keypoint 11 at frame 11, skipping.
Missing keypoint 11 at frame 12, skipping.
Missing keypoint 11 at frame 13, skipping.
Missing keypoint 11 at frame 14, skipping.
Missing keypoint 11 at frame 15, skipping.
Missing keypoint 11 at frame 16, skipping.
Missing keypoint 11 at frame 17, skipping.
Missing keypoint 11 at frame 18, skipping.
Missing keypoint 11 at frame 19, skipping.
Missing keypoint 11 at frame 2

  clip_data = torch.load(clip_path)


In [33]:
import torch
from torch.utils.data import Dataset, DataLoader
import os

# Paths
combined_features_dir = "/Users/danyukezz/Desktop/2 year 1 semester/team project/danya_preprocessing_sports/height_jump/stages/stage2/combined_features"

# Hyperparameters
sequence_length = 20  # Adjust as needed (still remains as sequence length)
batch_size = 16       # Adjust as needed

# Ensure input size is always 515
input_size = 515

class AthleticsDataset(Dataset):
    def __init__(self, combined_features_dir, sequence_length, input_size):
        """
        Handles loading and processing of combined features for athletics data.
        Ensures input size is always 515.
        """
        self.data = []
        self.labels = []
        self.sequence_length = sequence_length
        self.input_size = input_size

        for file in os.listdir(combined_features_dir):
            if file.endswith("_combined.pt"):
                # Load combined features
                video_features = torch.tensor(torch.load(os.path.join(combined_features_dir, file)), dtype=torch.float32)

                # Truncate or pad sequences to the desired length
                if video_features.shape[0] >= self.sequence_length:
                    video_features = video_features[:self.sequence_length]
                else:
                    padding = torch.zeros((self.sequence_length - video_features.shape[0], video_features.shape[1]))
                    video_features = torch.cat((video_features, padding), dim=0)

                # Ensure features match the input size (515)
                if video_features.shape[1] != self.input_size:
                    padding = torch.zeros((video_features.shape[0], self.input_size - video_features.shape[1]))
                    video_features = torch.cat((video_features, padding), dim=1)
                self.data.append(video_features)

                # Extract label from filename
                label = float(file.split("_")[0])  # Extract label from file name
                self.labels.append(label)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

# Initialize dataset and DataLoader
def get_data_loaders(combined_features_dir, sequence_length, batch_size, input_size, train_split=0.8):
    dataset = AthleticsDataset(combined_features_dir, sequence_length, input_size)
    train_size = int(train_split * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, val_loader, input_size  # Return loaders and input size

# Create DataLoaders
train_loader, val_loader, input_size = get_data_loaders(combined_features_dir, sequence_length, batch_size, input_size)

print(f"DataLoaders created. Input size: {input_size}")


DataLoaders created. Input size: 515


  video_features = torch.tensor(torch.load(os.path.join(combined_features_dir, file)), dtype=torch.float32)


In [34]:
import torch.nn as nn

class TemporalModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        """
        LSTM-based model for sequence prediction.
        """
        super(TemporalModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        _, (hidden, _) = self.lstm(x)  # Use the final hidden state
        output = self.fc(hidden[-1])  # Fully connected output
        return output

In [35]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import itertools
import os

# Paths
combined_features_dir = "/Users/danyukezz/Desktop/2 year 1 semester/team project/danya_preprocessing_sports/disc_throwing/stages/stage2/combined_features"

# Dataset and DataLoader
class AthleticsDataset(torch.utils.data.Dataset):
    def __init__(self, combined_features_dir, sequence_length):
        self.data = []
        self.labels = []
        self.sequence_length = sequence_length

        for file in os.listdir(combined_features_dir):
            if file.endswith("_combined.pt"):
                video_features = torch.tensor(torch.load(os.path.join(combined_features_dir, file)), dtype=torch.float32)

                if video_features.shape[0] >= self.sequence_length:
                    video_features = video_features[:self.sequence_length]
                else:
                    padding = torch.zeros((self.sequence_length - video_features.shape[0], video_features.shape[1]))
                    video_features = torch.cat((video_features, padding), dim=0)

                self.data.append(video_features)
                label = float(file.split("_")[0])  # Extract label from file name
                self.labels.append(label)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

def get_data_loaders(combined_features_dir, sequence_length, batch_size, train_split=0.8):
    dataset = AthleticsDataset(combined_features_dir, sequence_length)
    train_size = int(train_split * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, val_loader, dataset[0][0].shape[1]

# Define LSTM model
class TemporalModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout):
        super(TemporalModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        _, (hidden, _) = self.lstm(x)
        output = self.fc(hidden[-1])
        return output

# Training function with early stopping
def train_model(model, train_loader, val_loader, criterion, optimizer, device, num_epochs=500, patience=100):
    best_val_loss = float("inf")
    patience_counter = 0
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        for features, labels in train_loader:
            features, labels = features.to(device), torch.tensor(labels, dtype=torch.float32).to(device)

            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs.squeeze(), labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        # Validation phase
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for features, labels in val_loader:
                features, labels = features.to(device), torch.tensor(labels, dtype=torch.float32).to(device)
                outputs = model(features)
                loss = criterion(outputs.squeeze(), labels)
                val_loss += loss.item()

        train_loss /= len(train_loader)
        val_loss /= len(val_loader)
        print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

        # Early stopping logic with 100 epochs patience
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print(f"Early stopping triggered after {patience} epochs without improvement!")
            break

    return best_val_loss


# Hyperparameter grid search
# def hyperparameter_search():
    # Hyperparameter grid
    hidden_sizes = [32, 64, 128]
    num_layers = [1, 2, 3]
    learning_rates = [0.001, 0.005, 0.0001, 0.0005]
    dropouts = [0.0, 0.2, 0.3]

    # Initialize variables to track the best model
    best_val_loss = float("inf")
    best_params = None
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Prepare data
    train_loader, val_loader, input_size = get_data_loaders(combined_features_dir, sequence_length=20, batch_size=8)

    # Iterate over all combinations of hyperparameters
    for hidden_size, num_layer, learning_rate, dropout in itertools.product(hidden_sizes, num_layers, learning_rates, dropouts):
        print(f"Testing configuration: Hidden Size={hidden_size}, Num Layers={num_layer}, LR={learning_rate}, Dropout={dropout}")
        
        # Initialize model, criterion, and optimizer
        model = TemporalModel(input_size=input_size, hidden_size=hidden_size, num_layers=num_layer, output_size=1, dropout=dropout).to(device)
        criterion = nn.MSELoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

        # Train the model
        val_loss = train_model(model, train_loader, val_loader, criterion, optimizer, device)

        # Update best model if this configuration is better
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_params = {
                "hidden_size": hidden_size,
                "num_layers": num_layer,
                "learning_rate": learning_rate,
                "dropout": dropout,
            }

    print(f"Best Configuration: {best_params}, Validation Loss: {best_val_loss:.4f}")
    return best_params

# Run hyperparameter search
# best_params = hyperparameter_search()

In [36]:
# Train the final model with the best configuration
final_hidden_size = 128
final_num_layers = 1
final_learning_rate = 0.002
final_dropout = 0.5

# Prepare DataLoaders (use full dataset for training)
train_loader, val_loader, input_size = get_data_loaders(combined_features_dir, sequence_length=20, batch_size=16)
print(f"Input size for the model: {input_size}")

# Initialize final model
device = "cuda" if torch.cuda.is_available() else "cpu"
final_model = TemporalModel(input_size=input_size, hidden_size=final_hidden_size, num_layers=final_num_layers, output_size=1, dropout=final_dropout).to(device)

# Loss and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(final_model.parameters(), lr=final_learning_rate)

# Train final model
final_val_loss = train_model(final_model, train_loader, val_loader, criterion, optimizer, device, num_epochs=500, patience=100)
print(f"Final model trained. Validation Loss: {final_val_loss:.4f}")

Input size for the model: 513
Epoch [1/500], Train Loss: 0.6743, Val Loss: 0.5461
Epoch [2/500], Train Loss: 0.5238, Val Loss: 0.4715
Epoch [3/500], Train Loss: 0.4860, Val Loss: 0.4276
Epoch [4/500], Train Loss: 0.4218, Val Loss: 0.3886
Epoch [5/500], Train Loss: 0.3588, Val Loss: 0.3442
Epoch [6/500], Train Loss: 0.3025, Val Loss: 0.2881
Epoch [7/500], Train Loss: 0.2439, Val Loss: 0.2218
Epoch [8/500], Train Loss: 0.1807, Val Loss: 0.1592
Epoch [9/500], Train Loss: 0.1291, Val Loss: 0.1389
Epoch [10/500], Train Loss: 0.1255, Val Loss: 0.1787
Epoch [11/500], Train Loss: 0.1528, Val Loss: 0.1930
Epoch [12/500], Train Loss: 0.1399, Val Loss: 0.2030
Epoch [13/500], Train Loss: 0.1209, Val Loss: 0.2228
Epoch [14/500], Train Loss: 0.1158, Val Loss: 0.2310
Epoch [15/500], Train Loss: 0.1146, Val Loss: 0.2207
Epoch [16/500], Train Loss: 0.1097, Val Loss: 0.1997
Epoch [17/500], Train Loss: 0.1030, Val Loss: 0.1768
Epoch [18/500], Train Loss: 0.0981, Val Loss: 0.1579
Epoch [19/500], Train Los

  video_features = torch.tensor(torch.load(os.path.join(combined_features_dir, file)), dtype=torch.float32)
  features, labels = features.to(device), torch.tensor(labels, dtype=torch.float32).to(device)
  features, labels = features.to(device), torch.tensor(labels, dtype=torch.float32).to(device)


Epoch [29/500], Train Loss: 0.0829, Val Loss: 0.1746
Epoch [30/500], Train Loss: 0.0825, Val Loss: 0.1740
Epoch [31/500], Train Loss: 0.0808, Val Loss: 0.1704
Epoch [32/500], Train Loss: 0.0784, Val Loss: 0.1654
Epoch [33/500], Train Loss: 0.0759, Val Loss: 0.1606
Epoch [34/500], Train Loss: 0.0736, Val Loss: 0.1574
Epoch [35/500], Train Loss: 0.0717, Val Loss: 0.1567
Epoch [36/500], Train Loss: 0.0700, Val Loss: 0.1591
Epoch [37/500], Train Loss: 0.0679, Val Loss: 0.1646
Epoch [38/500], Train Loss: 0.0654, Val Loss: 0.1728
Epoch [39/500], Train Loss: 0.0622, Val Loss: 0.1828
Epoch [40/500], Train Loss: 0.0586, Val Loss: 0.1935
Epoch [41/500], Train Loss: 0.0550, Val Loss: 0.2039
Epoch [42/500], Train Loss: 0.0518, Val Loss: 0.2117
Epoch [43/500], Train Loss: 0.0487, Val Loss: 0.2143
Epoch [44/500], Train Loss: 0.0456, Val Loss: 0.2108
Epoch [45/500], Train Loss: 0.0421, Val Loss: 0.2087
Epoch [46/500], Train Loss: 0.0382, Val Loss: 0.2157
Epoch [47/500], Train Loss: 0.0341, Val Loss: 

In [37]:
import matplotlib.pyplot as plt

def evaluate_and_collect_predictions(model, data_loader, device):
    """
    Evaluate the model on the provided DataLoader and collect true vs. predicted values.
    """
    model.eval()
    true_labels = []
    predicted_labels = []

    with torch.no_grad():
        for features, labels in data_loader:
            features = features.to(device)
            labels = labels.to(device)

            # Get model predictions
            outputs = model(features).squeeze()
            predicted_labels.extend(outputs.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    return true_labels, predicted_labels

# Use validation set for evaluation
true_labels, predicted_labels = evaluate_and_collect_predictions(final_model, val_loader, device)


In [38]:
import cv2
import os
import torch
from PIL import Image
import numpy as np
from transformers import CLIPProcessor, CLIPModel
import mediapipe as mp

# Paths
video_dir = "/Users/danyukezz/Desktop/2 year 1 semester/team project/danya_preprocessing_sports/height_jump/stages/stage2/videos"
sequence_length = 20  # Sequence length for LSTM

# Initialize CLIP and MediaPipe
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
mp_pose = mp.solutions.pose
pose = mp_pose.Pose(static_image_mode=True, model_complexity=2, enable_segmentation=False)

# Ensure input size is always 515
input_size = 515

# Helper functions
def extract_frames(video_path, interval=5):
    """
    Extract frames from a video at a given interval.
    """
    frames = []
    cap = cv2.VideoCapture(video_path)
    count = 0

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        if count % interval == 0:
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(rgb_frame)

        count += 1

    cap.release()
    return frames

def extract_keypoints(frames):
    """
    Extract pose keypoints from frames using MediaPipe.
    """
    keypoints = []
    for frame in frames:
        result = pose.process(frame)
        if result.pose_landmarks:
            keypoints.append([
                {"x": lm.x, "y": lm.y, "z": lm.z, "visibility": lm.visibility}
                for lm in result.pose_landmarks.landmark
            ])
    return keypoints

def calculate_velocity_and_acceleration(keypoints):
    """
    Calculate velocity and acceleration using the torso and hip keypoints.
    Assumes that torso and hip points are relevant for assessing posture.
    """
    velocities, accelerations = [], []
    for i in range(1, len(keypoints)):
        # Calculate velocity using distance between torso and hips
        torso_prev = keypoints[i - 1][11]  # Left shoulder
        torso_curr = keypoints[i][11]  # Left shoulder

        # Calculate velocity as Euclidean distance between two points
        velocity = np.sqrt((torso_curr['x'] - torso_prev['x'])**2 + (torso_curr['y'] - torso_prev['y'])**2)
        velocities.append(velocity)

        if i > 1:
            accelerations.append(velocities[-1] - velocities[-2])
    return velocities, accelerations

def assess_leaning_into_curve(keypoints):
    """
    Assess whether the individual is leaning into a curve by calculating the angle
    between the torso and legs (using shoulder, hip, and knee keypoints).
    A smaller angle may indicate leaning into a curve.
    """
    angles = []
    for i in range(1, len(keypoints)):
        # Compute the angle between the torso and the legs (using shoulder, hip, and knee keypoints)
        shoulder = keypoints[i][11]  # Left shoulder
        hip = keypoints[i][23]  # Left hip
        knee = keypoints[i][25]  # Left knee

        # Calculate vectors for the torso and thigh
        torso_vector = np.array([shoulder['x'] - hip['x'], shoulder['y'] - hip['y']])
        thigh_vector = np.array([knee['x'] - hip['x'], knee['y'] - hip['y']])

        # Compute the angle between the torso and thigh using dot product
        dot_product = np.dot(torso_vector, thigh_vector)
        magnitude_torso = np.linalg.norm(torso_vector)
        magnitude_thigh = np.linalg.norm(thigh_vector)

        # Angle in radians
        angle = np.arccos(dot_product / (magnitude_torso * magnitude_thigh))
        angles.append(np.degrees(angle))  # Convert to degrees

    return angles

def extract_clip_features(frames):
    """
    Extract CLIP embeddings for each frame.
    """
    embeddings = []
    for frame in frames:
        image = Image.fromarray(frame)
        inputs = clip_processor(images=image, return_tensors="pt").to(device)
        with torch.no_grad():
            embedding = clip_model.get_image_features(**inputs).cpu().numpy().flatten()
            embeddings.append(embedding)
    return embeddings

def combine_features(clip_embeddings, velocities, accelerations, leaning_angles):
    """
    Combine CLIP embeddings, velocities, accelerations, and leaning angles into a single feature tensor.
    Ensure the final feature vector has 513 features.
    """
    combined_features = []
    for i, clip_embedding in enumerate(clip_embeddings):
        # Combine features with pose features (velocity, acceleration, and leaning angle)
        feature_vector = np.concatenate([clip_embedding, 
                                        [velocities[i] if i < len(velocities) else 0],
                                        [accelerations[i] if i < len(accelerations) else 0],
                                        [leaning_angles[i] if i < len(leaning_angles) else 0]])

        # Ensure the feature vector has 513 features
        if len(feature_vector) > 513:
            feature_vector = feature_vector[:513]  # Truncate if there are more than 513 features
        elif len(feature_vector) < 513:
            padding = np.zeros(513 - len(feature_vector))  # Pad with zeros if there are fewer than 513 features
            feature_vector = np.concatenate([feature_vector, padding])

        combined_features.append(feature_vector)
    
    return torch.tensor(combined_features, dtype=torch.float32)


def process_and_predict(video_path, model, sequence_length):
    """
    Process a single video and predict its score.
    """
    print(f"Processing video: {os.path.basename(video_path)}")
    # Step 1: Extract frames
    frames = extract_frames(video_path)

    # Step 2: Extract keypoints
    keypoints = extract_keypoints(frames)

    # Step 3: Calculate pose-based features (velocity, acceleration)
    velocities, accelerations = calculate_velocity_and_acceleration(keypoints)
    leaning_angles = assess_leaning_into_curve(keypoints)  # Assess leaning into a curve

    # Step 4: Extract CLIP features
    clip_embeddings = extract_clip_features(frames)

    # Step 5: Combine features
    combined_features = combine_features(clip_embeddings, velocities, accelerations, leaning_angles)

    # Step 6: Truncate or pad sequences to fixed length (515 features)
    if combined_features.shape[0] >= sequence_length:
        combined_features = combined_features[:sequence_length]
    else:
        padding = torch.zeros((sequence_length - combined_features.shape[0], combined_features.shape[1]))
        combined_features = torch.cat((combined_features, padding), dim=0)

    # Step 7: Predict using the model
    model.eval()
    with torch.no_grad():
        combined_features = combined_features.unsqueeze(0).to(device)  # Add batch dimension
        prediction = model(combined_features).squeeze().cpu().item()

    print(f"Prediction: {prediction:.4f}")
    return prediction

# Process all videos in the directory and predict
results = {}
for video_file in os.listdir(video_dir):
    if video_file.endswith(".mp4"):
        video_path = os.path.join(video_dir, video_file)
        prediction = process_and_predict(video_path, final_model, sequence_length)
        results[video_file] = prediction

# Print results
print("\nPrediction Results:")
for video, prediction in results.items():
    print(f"{video}: {prediction:.4f}")


I0000 00:00:1736959327.342315 4345238 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M4 Pro
W0000 00:00:1736959327.399122 4368215 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1736959327.429373 4368220 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Processing video: 1_user2.mp4
Prediction: 0.9992
Processing video: 1_user1.mp4
Prediction: 0.9743
Processing video: 1_user5.mp4
Prediction: 0.3303
Processing video: 1_user10.mp4
Prediction: 0.9742
Processing video: 1_user6.mp4
Prediction: 0.7224
Processing video: 0_user22.mp4
Prediction: 0.9808
Processing video: 0_user7.mp4
Prediction: 0.0590
Processing video: 0_user12.mp4
Prediction: 0.0039
Processing video: 0_user13.mp4
Prediction: 0.0060
Processing video: 1_user23.mp4
Prediction: 0.7090
Processing video: 1_user21.mp4
Prediction: 0.3852
Processing video: 1_user8.mp4
Prediction: 0.9747
Processing video: 1_user9.mp4
Prediction: 0.1936
Processing video: 1_user20.mp4
Prediction: 0.9865

Prediction Results:
1_user2.mp4: 0.9992
1_user1.mp4: 0.9743
1_user5.mp4: 0.3303
1_user10.mp4: 0.9742
1_user6.mp4: 0.7224
0_user22.mp4: 0.9808
0_user7.mp4: 0.0590
0_user12.mp4: 0.0039
0_user13.mp4: 0.0060
1_user23.mp4: 0.7090
1_user21.mp4: 0.3852
1_user8.mp4: 0.9747
1_user9.mp4: 0.1936
1_user20.mp4: 0.9865

In [39]:
torch.save(final_model.state_dict(), "/Users/danyukezz/Desktop/2 year 1 semester/team project/danya_preprocessing_sports/height_jump/stages/stage2/stage2_height.pth")

___

## feedback generation

In [40]:
def process_video_with_metadata(video_path, model, sequence_length):
    """
    Process a video and return structured metadata with predictions and pose metrics.
    """
    print(f"Processing video: {os.path.basename(video_path)}")
    
    # Step 1: Extract frames
    frames = extract_frames(video_path)
    
    # Step 2: Extract keypoints
    keypoints = extract_keypoints(frames)

    # Step 3: Calculate pose-based features
    velocities, accelerations = calculate_velocity_and_acceleration(keypoints)
    stride_lengths = calculate_stride_length(keypoints)

    # Pose metrics
    pose_metrics = {
        "average_velocity": np.mean(velocities) if velocities else 0,
        "max_stride_length": np.max(stride_lengths) if stride_lengths else 0,
        "average_acceleration": np.mean(accelerations) if accelerations else 0
    }

    # Step 4: Extract CLIP features
    clip_embeddings = extract_clip_features(frames)

    # CLIP insights
    clip_summary = "High contextual alignment with accelerating motion."  # Placeholder for now

    # Step 5: Combine features
    combined_features = combine_features(clip_embeddings, velocities, accelerations, stride_lengths)

    # Step 6: Truncate or pad sequences
    if combined_features.shape[0] >= sequence_length:
        combined_features = combined_features[:sequence_length]
    else:
        padding = torch.zeros((sequence_length - combined_features.shape[0], combined_features.shape[1]))
        combined_features = torch.cat((combined_features, padding), dim=0)

    # Step 7: Predict using the model
    model.eval()
    with torch.no_grad():
        combined_features = combined_features.unsqueeze(0).to(device)  # Add batch dimension
        prediction = model(combined_features).squeeze().cpu().item()

    print(f"Prediction: {prediction:.4f}")

    # Return structured metadata
    return {
        "video_name": os.path.basename(video_path),
        "prediction": prediction,
        "pose_metrics": pose_metrics,
        "clip_features": {"embedding_summary": clip_summary}
    }

In [41]:
# from transformers import pipeline

# generator = pipeline("text2text-generation", model="google/flan-t5-base")
# result = generator("Explain the performance of an athlete based on metrics.", max_length=50)
# print(result[0]['generated_text'])


# def validate_metadata(metadata):
#     required_keys = {
#         'video_name': str,
#         'prediction': float,
#         'pose_metrics': dict,
#         'clip_features': dict,
#     }
#     pose_metrics_keys = ['average_velocity', 'max_stride_length', 'average_acceleration']
#     clip_features_keys = ['embedding_summary']

#     for key, expected_type in required_keys.items():
#         if key not in metadata or not isinstance(metadata[key], expected_type):
#             raise ValueError(f"Invalid or missing key: {key}, expected type: {expected_type}")
    
#     for key in pose_metrics_keys:
#         if key not in metadata['pose_metrics']:
#             raise ValueError(f"Missing pose metric: {key}")
    
#     for key in clip_features_keys:
#         if key not in metadata['clip_features']:
#             raise ValueError(f"Missing clip feature: {key}")

# def generate_justification(metadata, max_length=150, num_return_sequences=1):
#     validate_metadata(metadata)
    
#     # Dynamic prompt construction
#     metrics_prompt = []
#     for metric, value in metadata['pose_metrics'].items():
#         metrics_prompt.append(f"- {metric.replace('_', ' ').title()}: {value:.2f}")
#     metrics_text = "\n".join(metrics_prompt)

#     prompt = f"""
#     Analyze the performance for {metadata['video_name']}.
#     The predicted score is {metadata['prediction']:.2f}.
#     Key metrics:
#     {metrics_text}
#     - CLIP embedding summary: {metadata['clip_features']['embedding_summary']}
    
#     Based on these metrics, explain why the score is appropriate and provide constructive feedback for improvement.
#     """
#     print("Generated prompt:", prompt)  # Debugging log
#     result = generator(prompt, max_length=max_length, num_return_sequences=num_return_sequences)
#     return result[0]['generated_text']


In [42]:
# # Directory containing videos
# video_dir = "/Users/cezar/Desktop/Team Project/AI/distance_jump/stage1/videos"

# # Dictionary to store results
# results = {}

# for video_file in os.listdir(video_dir):
#     if video_file.endswith(".mp4"):
#         video_path = os.path.join(video_dir, video_file)
        
#         # Step 1: Process video and generate metadata
#         metadata = process_video_with_metadata(video_path, final_model, sequence_length=20)
        
#         # Step 2: Generate justification using Hugging Face model
#         justification = generate_justification(metadata)
#         metadata["justification"] = justification
        
#         # Store results
#         results[video_file] = metadata

# # Print results
# for video, data in results.items():
#     print(f"Video: {video}")
#     print(f"Prediction: {data['prediction']:.4f}")
#     print(f"Justification: {data['justification']}")
#     print("-" * 50)

# # Optional: Save results to JSON
# # import json
# # with open("video_predictions_with_justifications.json", "w") as f:
# #     json.dump(results, f, indent=4)