#CPMC and SCGP algorithm implementation


In [None]:
# Mount Google Drive (Colab only) so the /content/drive/MyDrive/... paths used
# by the later cells resolve.
from google.colab import drive
drive.mount('/content/drive')

#CPMC ALGO

In [None]:
import cv2
import numpy as np
import concurrent.futures
from skimage import segmentation
import os

def generate_superpixels(img, num_segments=100, compactness=30):
    """
    Over-segment a BGR image into SLIC superpixels.

    The image is converted to LAB before being passed to scikit-image's
    ``slic``; labels in the returned segment map start at 1.
    """
    lab_image = cv2.cvtColor(img, cv2.COLOR_BGR2LAB)
    return segmentation.slic(
        lab_image,
        n_segments=num_segments,
        compactness=compactness,
        start_label=1,
    )

def compute_proposal_grabcut(img, segment_mask, global_foreground_mask, iter_count=2):
    """
    Run mask-initialised GrabCut seeded by one superpixel.

    Pixels of ``segment_mask`` are seeded as probable foreground, everything
    else (including pixels already claimed in ``global_foreground_mask``) as
    definite background.

    Arguments:
        img: Image handed to ``cv2.grabCut``.
        segment_mask: Array whose positive entries mark the seed superpixel.
        global_foreground_mask: Array of previously detected foreground. It is
            UPDATED IN PLACE with the pixels of the returned proposal. Callers
            that run this function from several threads share this array
            without synchronisation, so the accumulation order is not
            deterministic.
        iter_count: Number of GrabCut iterations.

    Returns:
        A uint8 mask (1 = foreground, 0 = background) with the image's
        height and width. An all-zero mask is returned when the seed leaves
        no foreground or no background pixels to learn from.
    """
    mask = np.full(img.shape[:2], cv2.GC_BGD, dtype=np.uint8)
    mask[segment_mask > 0] = cv2.GC_PR_FGD

    # Ensure previously detected foreground is marked as background
    mask[global_foreground_mask > 0] = cv2.GC_BGD

    # GrabCut with GC_INIT_WITH_MASK needs samples for BOTH colour models;
    # a seed fully swallowed by earlier proposals (or covering the whole
    # image) would make cv2.grabCut fail, so return an empty proposal instead.
    seeded_fg = mask == cv2.GC_PR_FGD
    if not seeded_fg.any() or seeded_fg.all():
        return np.zeros(img.shape[:2], dtype=np.uint8)

    bgdModel = np.zeros((1, 65), np.float64)
    fgdModel = np.zeros((1, 65), np.float64)

    cv2.grabCut(img, mask, None, bgdModel, fgdModel, iter_count, cv2.GC_INIT_WITH_MASK)
    proposal = np.where((mask == cv2.GC_FGD) | (mask == cv2.GC_PR_FGD), 1, 0).astype(np.uint8)

    # Update global foreground mask (accumulate detected foreground regions)
    global_foreground_mask[proposal > 0] = 1

    return proposal

def process_task(args):
    """Unpack one (img, segment, global_foreground_mask, iter_count) task tuple and run GrabCut on it."""
    frame, seed_mask, claimed_mask, n_iters = args
    return compute_proposal_grabcut(frame, seed_mask, claimed_mask, n_iters)

def cpmc_object_proposals(img, num_segments=100, num_iters=2):
    """
    Generate CPMC-style object proposals for an image.

    The image is downscaled to 75%, split into superpixels, and GrabCut is
    run once per (superpixel, iteration count) pair on a thread pool. The
    resulting masks are resized back to the input size, masks with 500 or
    fewer foreground pixels are dropped, and each surviving mask is applied
    to ``img`` so the proposal keeps its colours on a black background.

    NOTE: all tasks share one foreground-tracking mask that is mutated from
    the worker threads without locking.
    """
    scale_factor = 0.75  # Downscale factor
    small = cv2.resize(img, (0, 0), fx=scale_factor, fy=scale_factor)
    labels = generate_superpixels(small, num_segments=num_segments, compactness=30)

    # Tracks regions already claimed by earlier proposals.
    claimed = np.zeros(small.shape[:2], dtype=np.uint8)

    tasks = []
    for label in np.unique(labels):
        seed = (labels == label).astype(np.uint8)
        for n_iters in range(1, num_iters + 1):
            tasks.append((small, seed, claimed, n_iters))

    with concurrent.futures.ThreadPoolExecutor() as pool:
        small_masks = list(pool.map(process_task, tasks))

    full_height, full_width = img.shape[0], img.shape[1]
    min_size = 500  # Minimum number of pixels for a proposal to be considered

    colored_proposals = []
    for small_mask in small_masks:
        # Nearest-neighbour keeps the mask strictly binary after upscaling.
        full_mask = cv2.resize(small_mask, (full_width, full_height), interpolation=cv2.INTER_NEAREST)
        if np.sum(full_mask) <= min_size:
            continue
        three_channel = np.repeat(full_mask[:, :, np.newaxis], 3, axis=2)
        colored_proposals.append(img * three_channel)

    return colored_proposals

def process_video_frames(video_path, num_segments=100, num_iters=2, max_frames=None, target_fps=15, save_dir="output_proposals"):
    """
    Process a video file frame-by-frame and generate object proposals.

    Arguments:
        video_path: Path of the video to read.
        num_segments: Superpixel count forwarded to ``cpmc_object_proposals``.
        num_iters: GrabCut iteration setting forwarded to ``cpmc_object_proposals``.
        max_frames: Stop after this many processed frames (None = no limit).
        target_fps: Approximate rate at which frames are sampled.
        save_dir: Directory that receives one ``frame_<n>`` folder per
            processed frame, each holding ``proposal_<i>.png`` files.

    Returns:
        A dict mapping the 1-based frame number to its list of filtered
        colored proposals.

    Raises:
        ValueError: If the video cannot be opened or ``target_fps`` is not positive.
    """
    if target_fps <= 0:
        raise ValueError("target_fps must be positive")

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise ValueError("Error opening video file")

    frame_proposals = {}
    frame_count = 0
    processed_frames = 0

    try:
        orig_fps = cap.get(cv2.CAP_PROP_FPS)
        # Clamp the stride to at least 1: when the source is slower than the
        # target (or the FPS is reported as 0) the plain integer division
        # yields 0 and the modulo below would raise ZeroDivisionError.
        skip_frames = max(1, int(orig_fps / target_fps)) if orig_fps > 0 else 1
        print(f"Original FPS: {orig_fps}, processing every {skip_frames}th frame to get ~{target_fps} fps.")

        os.makedirs(save_dir, exist_ok=True)

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frame_count += 1

            if frame_count % skip_frames != 0:
                continue

            processed_frames += 1
            print(f"Processing frame {frame_count} (processed {processed_frames} frames)...")
            proposals = cpmc_object_proposals(frame, num_segments=num_segments, num_iters=num_iters)

            # Filter proposals based on area
            h, w = frame.shape[:2]
            filtered = [p for p in proposals if 100 < np.count_nonzero(p) < 0.8 * h * w]
            frame_proposals[frame_count] = filtered

            # Save proposals
            frame_dir = os.path.join(save_dir, f"frame_{frame_count}")
            os.makedirs(frame_dir, exist_ok=True)
            for i, prop in enumerate(filtered):
                proposal_path = os.path.join(frame_dir, f"proposal_{i}.png")
                cv2.imwrite(proposal_path, prop)

            if max_frames is not None and processed_frames >= max_frames:
                break
    finally:
        # Release the capture even if proposal generation or saving fails.
        cap.release()

    return frame_proposals

# Entry point of the CPMC cell: generates proposals for one video.
# `video_path` and `proposals_dict` are left as module-level names; the SCGP
# cells further down read `proposals_dict`.
if __name__ == '__main__':
    video_path = "/content/drive/MyDrive/data_composite_01/data/s0001_f_w000006.mp4"
    proposals_dict = process_video_frames(video_path, num_segments=10, num_iters=1, max_frames=None, target_fps=15)

    # Print number of proposals per frame
    for frame_idx, proposals in proposals_dict.items():
        print(f"Frame {frame_idx}: {len(proposals)} proposals")


#SCGP ALGO

In [None]:
import cv2
import numpy as np
import torch
import torch.nn as nn
from torchvision import models, transforms
from PIL import Image
import matplotlib.pyplot as plt
import os
import concurrent.futures

##############################################
# SCGP FUNCTIONS FOR VIDEO-WIDE VISUAL ATOM EXTRACTION
##############################################

# --- Step 1: Load Pretrained AlexNet Model for Feature Extraction ---
class AlexNetFc7(nn.Module):
    """
    ImageNet-pretrained AlexNet truncated after its second fully connected
    block, so ``forward`` returns the fc7 (post-ReLU) activations.
    """

    def __init__(self):
        super().__init__()
        backbone = models.alexnet(weights=models.AlexNet_Weights.IMAGENET1K_V1)
        self.features = backbone.features
        self.avgpool = backbone.avgpool
        # Only classifier entries 1, 2, 4 and 5 are reused (entries 0 and 3 are
        # skipped) -- presumably the dropout layers of torchvision's AlexNet;
        # verify against the installed torchvision version.
        self.fc6, self.relu6 = backbone.classifier[1], backbone.classifier[2]
        self.fc7, self.relu7 = backbone.classifier[4], backbone.classifier[5]

    def forward(self, x):
        pooled = self.avgpool(self.features(x))
        flat = torch.flatten(pooled, 1)
        hidden = self.relu6(self.fc6(flat))
        return self.relu7(self.fc7(hidden))

# Preprocessing applied to every cropped proposal before it is fed to the
# feature extractor: resize, convert to tensor, then per-channel normalisation.
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize to AlexNet's expected input size
    transforms.ToTensor(),
    # presumably the standard ImageNet channel mean/std matching the pretrained weights -- TODO confirm
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# --- Utility: Compute Bounding Box from a Colored Proposal ---
def colored_proposal_to_bbox(colored_prop, thresh=1):
    """
    Compute the tight bounding box around the non-black pixels of a proposal.

    Arguments:
        colored_prop: The colored proposal image (NumPy array, BGR order)
                      with a black background.
        thresh: Grayscale intensity above which a pixel counts as foreground.

    Returns:
        [x_min, y_min, x_max, y_max], where the max values are the indices of
        the last foreground column/row (inclusive), or None when no pixel
        exceeds the threshold.
    """
    gray = cv2.cvtColor(colored_prop, cv2.COLOR_BGR2GRAY)
    _, foreground = cv2.threshold(gray, thresh, 255, cv2.THRESH_BINARY)

    rows, cols = np.nonzero(foreground > 0)
    if rows.size == 0 or cols.size == 0:
        return None

    left, right = int(cols.min()), int(cols.max())
    top, bottom = int(rows.min()), int(rows.max())
    return [left, top, right, bottom]

# --- Step 2: Extract Features from Object Proposals Using Bounding Boxes ---
def extract_features_from_bounding_boxes(video_path, proposals_dict, device, model):
    """
    For each colored object proposal in proposals_dict, compute its bounding box
    (from non-black foreground), crop the corresponding region from the original frame,
    and extract a feature vector from the cropped region with ``model``.

    Arguments:
        video_path: Video the proposals were generated from.
        proposals_dict: Dict mapping a 1-based frame number to a list of
            colored proposals.
        device: Torch device the input tensors are moved to.
        model: Callable feature extractor applied to a (1, C, H, W) batch.

    Returns:
        all_features: A numpy array of shape (total_proposals, feature_dim),
            or an empty 1-D array when no proposal produced a feature.
        proposal_mapping: A list mapping each feature index to (frame_num, proposal_idx)
    """
    all_features = []
    proposal_mapping = []

    cap = cv2.VideoCapture(video_path)
    try:
        for frame_num, proposals in proposals_dict.items():
            # Frame numbers are 1-based, OpenCV positions are 0-based.
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num - 1)
            ret, frame = cap.read()
            if not ret:
                print(f"Could not read frame {frame_num}. Skipping...")
                continue

            for idx, colored_prop in enumerate(proposals):
                # Compute bounding box from the colored proposal.
                bbox = colored_proposal_to_bbox(colored_prop)
                if bbox is None:
                    continue
                x_min, y_min, x_max, y_max = bbox
                # The bbox maxima are inclusive pixel indices, so the slice end
                # needs +1; without it the last row/column is cut off and
                # one-pixel-wide proposals become empty crops.
                cropped_obj = frame[y_min:y_max + 1, x_min:x_max + 1]
                if cropped_obj.size == 0:
                    continue

                # Convert cropped region to PIL image (RGB) and apply transform.
                pil_crop = Image.fromarray(cv2.cvtColor(cropped_obj, cv2.COLOR_BGR2RGB))
                input_tensor = transform(pil_crop).unsqueeze(0).to(device)
                with torch.no_grad():
                    feat = model(input_tensor)
                all_features.append(feat.cpu().numpy().flatten())
                proposal_mapping.append((frame_num, idx))
    finally:
        # Release the capture even if feature extraction raises.
        cap.release()

    if all_features:
        all_features = np.vstack(all_features)
    else:
        all_features = np.array([])
    return all_features, proposal_mapping

# --- Step 3: Compute Cosine Similarity Matrix ---
# def cosine_similarity_matrix(features):
#     """
#     Compute the cosine similarity matrix between feature vectors.
#     """
#     norms = np.linalg.norm(features, axis=1, keepdims=True) + 1e-8
#     features_norm = features / norms
#     return features_norm @ features_norm.T
def cosine_similarity_matrix(features):
    """
    Return the pairwise cosine similarity between the rows of ``features``.

    Each row is scaled by its L2 norm (plus 1e-8 so all-zero rows do not
    divide by zero); the diagonal is then forced to exactly 1.0.
    """
    row_lengths = np.linalg.norm(features, axis=1, keepdims=True)
    unit_rows = features / (row_lengths + 1e-8)

    sims = unit_rows @ unit_rows.T
    # Self-similarity is 1 by definition; overwrite rounding noise.
    np.fill_diagonal(sims, 1.0)
    return sims


# --- Step 4: Find Optimal Threshold Incrementally ---
def find_optimal_threshold_rank1(u):
    """
    Rank-1 thresholding: pick the top-n entries of ``u`` that maximise
    (sum of the n largest min-max-normalised values)^2 / n.

    Arguments:
        u: Non-empty 1-D array of scores.

    Returns:
        (threshold, best_score, n) where ``threshold`` is expressed in the
        units of ``u`` itself, so ``u >= threshold`` selects the chosen
        top-n entries (plus any exact ties with the n-th value).
        ``best_score`` is computed on the normalised values. When all
        entries are (almost) equal, returns ``(u[0], 0.0, len(u))``.

    Raises:
        ValueError: If ``u`` is empty (raised by the min/max reduction).
    """
    u = np.asarray(u)
    u_min, u_max = u.min(), u.max()
    span = u_max - u_min
    if span < 1e-12:
        return u[0], 0.0, len(u)

    # Descending order of the raw scores; normalisation is monotonic so the
    # ranking is the same as for the normalised values.
    order = np.argsort(-u)
    u_sorted = (u[order] - u_min) / span

    # scores[k] = (sum of the k+1 largest normalised values)^2 / (k+1)
    prefix = np.cumsum(u_sorted)
    scores = prefix ** 2 / np.arange(1, u_sorted.size + 1)

    # argmax returns the first maximum, i.e. the smallest n on ties.
    best = int(np.argmax(scores))

    # Report the threshold on the caller's scale (not the normalised one) so
    # that comparing it against ``u`` really selects best_n elements.
    return u[order[best]], scores[best], best + 1

# --- Step 5: Perform SCGP Clustering ---
def extract_video_wide_clusters(features, min_cluster_size=3):
    """
    Iteratively extracts visual atoms (clusters) from feature vectors.

    Each round builds the cosine similarity matrix of the remaining
    features, takes its dominant eigenvector, thresholds it with
    ``find_optimal_threshold_rank1`` and removes the selected rows. The loop
    stops when nothing is left or the selected group is smaller than
    ``min_cluster_size``.

    Arguments:
        features: Array whose rows are feature vectors.
        min_cluster_size: Smallest group that is still accepted as a cluster.

    Returns:
        A list of clusters, where each cluster is an array of global indices
        (into ``features``).
    """
    if features.shape[0] == 0:
        print("No valid proposals for clustering.")
        return []

    clusters = []
    remaining_indices = np.arange(features.shape[0])
    features_remaining = features.copy()

    iteration = 0
    while features_remaining.shape[0] > 0:
        iteration += 1

        A = cosine_similarity_matrix(features_remaining)
        # A is symmetric, so use eigh: real-valued output and eigenvalues in
        # ascending order (the dominant eigenvector is the last column).
        _, eigenvectors = np.linalg.eigh(A)
        dominant_vec = eigenvectors[:, -1]

        # An eigenvector's sign is arbitrary; orient it so that larger entries
        # mean "more central", otherwise the ranking below can come out inverted.
        if dominant_vec.sum() < 0:
            dominant_vec = -dominant_vec

        # Exact min-max scaling (min -> 0.0, max -> 1.0) keeps u on the same
        # scale the threshold search works on; an additive epsilon in the
        # denominator would shift u below the returned threshold.
        span = dominant_vec.max() - dominant_vec.min()
        if span < 1e-12:
            # All remaining items are equally central: treat them as one group.
            u = np.zeros_like(dominant_vec)
        else:
            u = (dominant_vec - dominant_vec.min()) / span

        t_opt, best_r, best_n = find_optimal_threshold_rank1(u)

        cluster_indicator = u >= t_opt
        selected_idx = np.where(cluster_indicator)[0]
        cluster_indices = remaining_indices[selected_idx]

        if cluster_indices.size < min_cluster_size:
            print(f"Stopping at iteration {iteration}: Remaining proposals too few for meaningful clusters.")
            break

        print(f"Visual Atom {iteration}: Contains {len(cluster_indices)} proposals (threshold = {t_opt:.3f}, best_r = {best_r:.3f}).")
        clusters.append(cluster_indices)

        # Drop the rows that were just assigned to a cluster.
        mask = np.ones(features_remaining.shape[0], dtype=bool)
        mask[selected_idx] = False
        features_remaining = features_remaining[mask]
        remaining_indices = remaining_indices[mask]

    return clusters

##############################################
# MAIN SCGP CODE FOR VIDEO-WIDE CLUSTERING
##############################################

# Entry point of the SCGP cell: extracts features for every proposal in
# `proposals_dict`, clusters them into visual atoms, and caches the RGB frames
# that contain clustered proposals. `features`, `proposal_mapping`, `clusters`
# and `clustered_frames` stay available as module-level names for later cells.
if __name__ == '__main__':
    try:
        proposals_dict  # proposals_dict should be generated by your CPMC code.
    except NameError:
        print("Proposals dictionary not found. Please generate proposals using the CPMC code first.")
        # NOTE(review): exit() inside a notebook may stop the kernel rather than just this cell -- confirm intended.
        exit(1)

    video_path = "/content/drive/MyDrive/data_composite_01/data/s0001_f_w000006.mp4"

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = AlexNetFc7().to(device)
    # Inference mode for the feature extractor.
    model.eval()

    print("\nExtracting features for all proposals (using bounding boxes) across frames...")
    features, proposal_mapping = extract_features_from_bounding_boxes(video_path, proposals_dict, device, model)
    print("Total proposals extracted:", len(proposal_mapping))

    print("\nPerforming SCGP clustering across all frames...")
    clusters = extract_video_wide_clusters(features, min_cluster_size=10)
    print("Total visual atoms (clusters) formed:", len(clusters))

    # Optional: Visualize clustering results (e.g., show frames for each cluster)
    # Each frame is read from the video at most once and stored in RGB order.
    cap = cv2.VideoCapture(video_path)
    clustered_frames = {}
    for cluster in clusters:
        for idx in cluster:
            frame_num, proposal_idx = proposal_mapping[idx]
            if frame_num not in clustered_frames:
                # Frame numbers are 1-based; OpenCV frame positions are 0-based.
                cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num - 1)
                ret, frame = cap.read()
                if not ret:
                    continue
                clustered_frames[frame_num] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    cap.release()

    # (Visualization code can be extended as needed.)

    # (Visualization code can be extended as needed.)


#INFERENCE

In [None]:
# List, for the first ten visual atoms, which (frame_num, proposal_idx) pairs
# each one contains. Requires `clusters` and `proposal_mapping` from the SCGP cell.
print("Visual Atoms (Clusters) and their corresponding proposal indices:")

for atom_num, cluster_indices in enumerate(clusters[:10], start=1):  # Limit to first 10 visual atoms
    if isinstance(cluster_indices, np.ndarray):
        cluster_indices = cluster_indices.tolist()  # Convert NumPy array to list

    # Translate global feature indices back to (frame_num, proposal_idx).
    proposals_in_atom = [proposal_mapping[i] for i in cluster_indices]
    print(f"Visual Atom {atom_num}: Contains proposals with indices (frame_num, proposal_idx): {proposals_in_atom}")


In [None]:
import os
import numpy as np
import cv2

# ----- Configuration -----
# Parent directory to save visual atom proposals (relative to the current
# working directory).
parent_dir = "visual_atoms_masks"
os.makedirs(parent_dir, exist_ok=True)

# ----- Save Visual Atom Proposals -----
# Iterate over each visual atom (cluster); folders are numbered from 1.
for atom_num, cluster in enumerate(clusters, start=1):
    # Create a folder for the current visual atom.
    atom_folder = os.path.join(parent_dir, f"visual_atom_{atom_num}")
    os.makedirs(atom_folder, exist_ok=True)

    # Iterate over each proposal index in the cluster.
    for idx in cluster:
        # Retrieve the corresponding (frame_num, proposal_idx) from proposal_mapping.
        frame_num, proposal_idx = proposal_mapping[idx]

        # Retrieve the colored proposal image from proposals_dict.
        # proposals_dict maps a frame number to a list of colored proposals.
        # The proposals are built by masking frames read with OpenCV, so they
        # are in BGR channel order, which is what cv2.imwrite expects.
        proposal = proposals_dict[frame_num][proposal_idx]

        # If the proposal image is stored as a float image in [0, 1], convert it to 0-255.
        if proposal.dtype in [np.float32, np.float64]:
            proposal = np.clip(proposal * 255, 0, 255).astype(np.uint8)
        else:
            proposal = proposal.astype(np.uint8)



        # Create a filename and save the colored proposal.
        filename = f"frame_{frame_num}_proposal_{proposal_idx}.png"
        file_path = os.path.join(atom_folder, filename)
        cv2.imwrite(file_path, proposal)

print("Visual atom proposals have been saved in the folder 'visual_atoms_masks'.")


In [None]:
# Zip the folder
# NOTE(review): uses the absolute path /content/visual_atoms_masks, while the
# folder is created via the relative name "visual_atoms_masks" -- this only
# matches when the notebook's working directory is /content.
!zip -r /content/visual_atoms_masks.zip /content/visual_atoms_masks

# Download the zipped folder (Colab only)
from google.colab import files
files.download("/content/visual_atoms_masks.zip")


In [None]:
import cv2
import numpy as np
from google.colab.patches import cv2_imshow
# Preview one saved proposal: crop it to the bounding box of its non-black
# pixels and show it at the 224x224 size used for feature extraction.
img = cv2.imread('frame_28_proposal_9.png')
if img is None:
    # cv2.imread signals a missing/unreadable file by returning None.
    raise FileNotFoundError("Could not read 'frame_28_proposal_9.png'")

g = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
_, t = cv2.threshold(g, 1, 255, cv2.THRESH_BINARY)
c = cv2.findNonZero(t)
if c is None:
    # An all-black proposal has no foreground to crop.
    print("No foreground pixels found in 'frame_28_proposal_9.png'.")
else:
    x, y, w, h = cv2.boundingRect(c)
    crop = img[y:y+h, x:x+w]
    resized = cv2.resize(crop, (224, 224), interpolation=cv2.INTER_LINEAR)
    cv2_imshow(resized)


In [None]:
# Assume these variables are already available from your SCGP pipeline:
# - features: a NumPy array of shape (total_proposals, feature_dim)
# - clusters: a list of clusters where each cluster is an array of indices (into features)
# - cosine_similarity_matrix: your function that computes the cosine similarity matrix

# For Visual Atom 2 (second cluster, i.e. clusters[1]):
if len(clusters) == 0:
    print("No clusters were found.")
elif len(clusters) < 2:
    # clusters[1] would raise IndexError when only one atom was formed.
    print("Visual Atom 2 does not exist.")
else:
    cluster2_indices = clusters[1]  # Get indices for the second visual atom
    print("Visual Atom 2 contains proposals with indices:", cluster2_indices)

    # Extract features corresponding to these proposals
    features_cluster2 = features[cluster2_indices]

    # Compute the cosine similarity matrix for proposals in Visual Atom 2
    similarity_matrix = cosine_similarity_matrix(features_cluster2)

    # Print the similarity matrix
    print("Cosine similarity matrix for Visual Atom 2:")
    print(similarity_matrix)


In [None]:
# Look up the global feature index of one proposal.
# `target` is the tuple (frame_number, proposal_index) as stored in proposal_mapping.
target = (28, 7)

# Linear scan over proposal_mapping; the position of the first match is the
# global index (row in `features`).
global_index = None
for i, mapping in enumerate(proposal_mapping):
    if mapping == target:
        global_index = i
        break

if global_index is not None:
    print(f"Global index for {target} is {global_index}.")
else:
    print(f"{target} not found in the proposal mapping.")


In [None]:
# Print the pairwise cosine similarities inside Visual Atom 2 (clusters[1]),
# guarding against a missing second atom and against atoms with one member.
if len(clusters) < 2:
    print("Visual Atom 2 does not exist.")
else:
    cluster2_indices = clusters[1]  # Get indices for Visual Atom 2
    print("Visual Atom 2 contains proposals with indices:", cluster2_indices)

    if len(cluster2_indices) < 2:
        print("Not enough proposals in Visual Atom 2 to compute pairwise similarity.")
    else:
        # Extract features corresponding to these proposals
        features_cluster2 = features[cluster2_indices]

        # Compute the cosine similarity matrix for Visual Atom 2
        similarity_matrix = cosine_similarity_matrix(features_cluster2)

        # Print the similarity matrix
        print("Cosine similarity matrix for Visual Atom 2:")
        print(similarity_matrix)


In [None]:
import cv2
import numpy as np
import matplotlib.pyplot as plt

# ---- Step 1: Identify the target proposal's index within Visual Atom 2 ----
target = (28, 7)  # (frame_num, proposal_idx); proposal_idx is 0-based, so this is the 8th proposal of frame 28

if len(clusters) < 2:
    # clusters[1] would raise IndexError when fewer than two atoms exist.
    print("Visual Atom 2 does not exist.")
else:
    # Get global indices of proposals in Visual Atom 2
    cluster2_indices = np.array(clusters[1])  # Visual Atom 2
    cluster2_mapping = [proposal_mapping[i] for i in cluster2_indices]  # Map to (frame, proposal)

    if target not in cluster2_mapping:
        # Report and skip the rest instead of calling exit(), which would
        # terminate the whole notebook session rather than just this cell.
        print(f"Target proposal {target} not found in Visual Atom 2.")
    else:
        target_local_idx = cluster2_mapping.index(target)  # Local index within Visual Atom 2
        target_global_idx = cluster2_indices[target_local_idx]  # Global index in full dataset

        print(f"Target proposal {target} is at global index {target_global_idx} (local index {target_local_idx}) within Visual Atom 2.")

        # ---- Step 2: Extract similarity values **only for Visual Atom 2** ----
        # Full similarity matrix over all proposals, then the sub-matrix
        # restricted to the members of Visual Atom 2 (local indexing).
        cosine_sim_matrix = cosine_similarity_matrix(features)
        sim_matrix_atom2 = cosine_sim_matrix[np.ix_(cluster2_indices, cluster2_indices)]

        # Get the similarity row corresponding to the target proposal (within Visual Atom 2)
        sim_row_within_atom2 = sim_matrix_atom2[target_local_idx].copy()
        sim_row_within_atom2[target_local_idx] = -np.inf  # Exclude self-similarity

        # ---- Step 3: Find the top 5 highest similarity proposals **within Visual Atom 2** ----
        # Never ask for more neighbours than exist; otherwise the target itself
        # (marked -inf above) would show up in the list for small atoms.
        top_k = min(5, len(cluster2_indices) - 1)
        top5_local_indices = np.argsort(sim_row_within_atom2)[::-1][:top_k]  # Sorted descending

        print("\nTop 5 similar proposals (within Visual Atom 2, global indices):")
        top5_global_indices = cluster2_indices[top5_local_indices]  # Convert back to global indices

        for idx in top5_global_indices:
            frame_num, proposal_idx = proposal_mapping[idx]
            similarity_value = cosine_sim_matrix[target_global_idx, idx]
            print(f"Global index {idx}: Frame {frame_num}, Proposal {proposal_idx}, Similarity = {similarity_value:.4f}")

            # ---- Step 4: Display the proposals ----
            if frame_num in proposals_dict and proposal_idx < len(proposals_dict[frame_num]):
                prop_img = proposals_dict[frame_num][proposal_idx]
                plt.figure(figsize=(4,4))
                plt.imshow(cv2.cvtColor(prop_img, cv2.COLOR_BGR2RGB))  # Convert from BGR to RGB
                plt.title(f"Frame {frame_num} - Proposal {proposal_idx} (Similarity: {similarity_value:.4f})")
                plt.axis("off")
                plt.show()
            else:
                print(f"Proposal for frame {frame_num}, proposal index {proposal_idx} not found.")


#USING MEDIAPIPE, AUTOENCODER, K-MEANS CLUSTERING AND HMM

#MEDIAPIPE KEYPOINT EXTRACTION AND KEYPOINTS NORMALIZATION

In [None]:
!pip install mediapipe
!pip install opencv-python
!pip install numpy==1.26.4
!pip install --upgrade pandas mediapipe

In [None]:
import mediapipe as mp

In [None]:
import cv2
# import mediapipe as mp
import pandas as pd
import numpy as np
from itertools import chain
import os

In [None]:
import cv2
import pickle
import numpy as np
import mediapipe as mp

# Extract per-frame MediaPipe Holistic keypoints (both hands + pose) from one
# video and store them, together with the video size, in a pickle file.

# Input and Output Paths
input_video_path = '/content/drive/MyDrive/data_composite_01/data/s0001_f_w000067.mp4'
output_pickle_path = '/content/drive/MyDrive/s0001_f_w000067.pkl'

# Initialize MediaPipe Holistic
mp_holistic = mp.solutions.holistic
holistic = mp_holistic.Holistic(static_image_mode=False,
                                model_complexity=2,
                                enable_segmentation=False,
                                refine_face_landmarks=False)

# Open video
cap = cv2.VideoCapture(input_video_path)

# Storage for keypoints (one entry per frame)
left_hand_all = []
right_hand_all = []
pose_all = []

try:
    if not cap.isOpened():
        raise ValueError(f"Error opening video file: {input_video_path}")

    video_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    video_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))

    # Process each frame
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Convert the frame to RGB
        image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = holistic.process(image_rgb)

        # Get left hand keypoints (21 x 3); all zeros when the hand is not detected
        if results.left_hand_landmarks:
            left_hand = np.array([[lm.x, lm.y, lm.z] for lm in results.left_hand_landmarks.landmark])
        else:
            left_hand = np.zeros((21, 3))
        left_hand_all.append(left_hand)

        # Get right hand keypoints (21 x 3); all zeros when the hand is not detected
        if results.right_hand_landmarks:
            right_hand = np.array([[lm.x, lm.y, lm.z] for lm in results.right_hand_landmarks.landmark])
        else:
            right_hand = np.zeros((21, 3))
        right_hand_all.append(right_hand)

        # Get pose keypoints (33 x 4) – including visibility
        if results.pose_landmarks:
            pose = np.array([[lm.x, lm.y, lm.z, lm.visibility] for lm in results.pose_landmarks.landmark])
        else:
            pose = np.zeros((33, 4))
        pose_all.append(pose)
finally:
    # Release resources even if reading or landmark extraction fails
    cap.release()
    holistic.close()

if not pose_all:
    # np.stack cannot stack an empty list; fail with a clear message instead.
    raise ValueError(f"No frames could be read from: {input_video_path}")

# Convert lists to numpy arrays of shape (num_frames, num_points, num_values)
left_hand_all = np.stack(left_hand_all)
right_hand_all = np.stack(right_hand_all)
pose_all = np.stack(pose_all)

# Create dictionary
data_dict = {
    "Video height": video_height,
    "Video width": video_width,
    "left hand": left_hand_all,
    "right hand": right_hand_all,
    "pose": pose_all
}

# Save to pickle
with open(output_pickle_path, 'wb') as f:
    pickle.dump(data_dict, f)

print("Keypoints successfully saved to:", output_pickle_path)


In [None]:
import pickle
import numpy as np
import matplotlib.pyplot as plt

# Pose connection map: pairs of pose landmark indices joined by a line when
# drawing the skeleton. Side labels follow the convention used elsewhere in
# this file (index 15 = left wrist, 16 = right wrist), i.e. presumably
# MediaPipe's numbering where odd body indices are the subject's left side.
POSE_CONNECTIONS = [
    (0, 1), (1, 2), (2, 3), (3, 7),  # nose -> left eye -> left ear
    (0, 4), (4, 5), (5, 6), (6, 8),  # nose -> right eye -> right ear
    (9, 10),                         # mouth
    (11, 12), (11, 13), (13, 15),    # shoulders, left arm
    (12, 14), (14, 16),              # right arm
    (11, 23), (12, 24),              # torso
    (23, 24), (23, 25), (24, 26),    # hips and upper legs
    (25, 27), (27, 29), (29, 31),    # left leg
    (26, 28), (28, 30), (30, 32)     # right leg
]

# Hand connection map: each finger is a chain starting at landmark 0 (wrist).
HAND_CONNECTIONS = [
    (0, 1), (1, 2), (2, 3), (3, 4),      # thumb
    (0, 5), (5, 6), (6, 7), (7, 8),      # index
    (0, 9), (9, 10), (10, 11), (11, 12), # middle
    (0, 13), (13, 14), (14, 15), (15, 16), # ring
    (0, 17), (17, 18), (18, 19), (19, 20)  # pinky
]

class SkeletonVisualizer:
    """
    Plot the pose and hand skeletons stored in a keypoint pickle.

    The pickle is expected to hold the keys "Video height", "Video width",
    "left hand", "right hand" and "pose". Indexing an instance with a frame
    number draws that frame and prints its scaled keypoints. Drawing relies
    on the module-level POSE_CONNECTIONS and HAND_CONNECTIONS lists.
    """

    def __init__(self, pickle_path):
        # NOTE: pickle.load can execute arbitrary code; only open trusted files.
        with open(pickle_path, 'rb') as f:
            self.data = pickle.load(f)
        self.video_height = self.data["Video height"]
        self.video_width = self.data["Video width"]
        self.left_hand = self.data["left hand"]
        self.right_hand = self.data["right hand"]
        self.pose = self.data["pose"]

    def __getitem__(self, frame_idx):
        """Plot frame ``frame_idx`` and print its keypoints; returns None."""
        # Scale x/y by the video size; the third column is left unscaled.
        lh = self.left_hand[frame_idx] * [self.video_width, self.video_height, 1]
        rh = self.right_hand[frame_idx] * [self.video_width, self.video_height, 1]
        pose = self.pose[frame_idx][:, :3] * [self.video_width, self.video_height, 1]
        self.plot_skeleton(pose, lh[:, :3], rh[:, :3], frame_idx)
        print(f"\nFrame {frame_idx}")
        print("Left Hand:\n", lh)
        print("Right Hand:\n", rh)
        print("Pose:\n", pose)

    def plot_skeleton(self, pose, lh, rh, frame_idx):
        """Draw the pose skeleton with both hands attached at the pose wrists."""
        plt.figure(figsize=(10, 10))
        ax = plt.gca()
        ax.set_title(f"Skeleton Frame {frame_idx}")
        ax.set_xlim(0, self.video_width)
        ax.set_ylim(self.video_height, 0)  # Flip y-axis

        # Pose connections
        for i, j in POSE_CONNECTIONS:
            plt.plot([pose[i, 0], pose[j, 0]], [pose[i, 1], pose[j, 1]], 'g-', lw=2)
        plt.scatter(pose[:, 0], pose[:, 1], c='r', s=10, label='Pose')

        # Hands: anchor each one at the (x, y) of its pose wrist
        # (15 = left wrist, 16 = right wrist).
        self._plot_hand(lh, pose[15, :2], 'b-', 'blue', 'Left Hand')
        self._plot_hand(rh, pose[16, :2], 'm-', 'magenta', 'Right Hand')

        plt.legend()
        plt.grid(True)
        plt.show()

    @staticmethod
    def _plot_hand(hand, wrist_xy, line_style, point_color, label):
        """Draw one hand translated so that its landmark 0 sits on ``wrist_xy``."""
        if np.allclose(hand, 0):
            # All-zero keypoints mean the hand was not detected in this frame.
            return
        coords = hand[:, :2] - hand[0, :2] + wrist_xy
        for i, j in HAND_CONNECTIONS:
            plt.plot([coords[i, 0], coords[j, 0]], [coords[i, 1], coords[j, 1]], line_style, lw=1.5)
        plt.scatter(coords[:, 0], coords[:, 1], c=point_color, s=8, label=label)


In [None]:
# Load the saved keypoints and draw one frame.
vis = SkeletonVisualizer("/content/drive/MyDrive/s0001_f_w000067.pkl")
vis[75]  # Show frame 75


In [None]:
import pickle
import numpy as np
import matplotlib.pyplot as plt

class SkeletonVisualizer:
    """
    Viewer for a keypoint pickle that draws the pose and each hand in its
    own figure. ``len(vis)`` is the number of stored pose frames and
    ``vis[i]`` plots frame ``i``.
    """

    def __init__(self, pickle_path):
        with open(pickle_path, 'rb') as handle:
            stored = pickle.load(handle)
        self.video_height = stored['Video height']
        self.video_width = stored['Video width']
        self.left_hand = stored['left hand']
        self.right_hand = stored['right hand']
        self.pose = stored['pose']

    def __len__(self):
        return len(self.pose)

    def __getitem__(self, frame_idx):
        # x and y are scaled by the video size; the third column is kept as is.
        left = self.left_hand[frame_idx] * [self.video_width, self.video_height, 1]
        right = self.right_hand[frame_idx] * [self.video_width, self.video_height, 1]
        body = self.pose[frame_idx][:, :3] * [self.video_width, self.video_height, 1]

        self.plot_pose(body, frame_idx)
        self.plot_hand(left[:, :2], 'Left Hand', 'blue', frame_idx)
        self.plot_hand(right[:, :2], 'Right Hand', 'magenta', frame_idx)

    def plot_pose(self, pose_xyc, frame_idx):
        """Draw the full-body skeleton for one frame on a video-sized canvas."""
        bones = [
            (0, 1), (1, 2), (2, 3), (3, 7),
            (0, 4), (4, 5), (5, 6), (6, 8),
            (9, 10), (11, 12), (11, 13), (13, 15),
            (12, 14), (14, 16), (15, 17), (15, 19), (15, 21), (16, 18), (16, 20), (16, 22),
            (11, 23), (12, 24), (23, 24), (23, 25),
            (24, 26), (25, 27), (26, 28),
            (27, 29), (28, 30), (29, 31), (30, 32)
        ]

        plt.figure(figsize=(6, 8))
        plt.scatter(pose_xyc[:, 0], pose_xyc[:, 1], c='red', label='Pose')
        for start, end in bones:
            xs = [pose_xyc[start, 0], pose_xyc[end, 0]]
            ys = [pose_xyc[start, 1], pose_xyc[end, 1]]
            plt.plot(xs, ys, color='green')

        plt.title(f'Pose Skeleton Frame {frame_idx}')
        plt.xlim(0, self.video_width)
        plt.ylim(0, self.video_height)
        plt.gca().invert_yaxis()  # image coordinates: y grows downwards
        plt.legend()
        plt.grid(True)
        plt.show()

    def plot_hand(self, hand_coords, hand_name, color, frame_idx):
        """Draw one hand skeleton, zoomed to a 30-unit margin around its keypoints."""
        bones = [
            (0, 1), (0, 17), (1, 2), (2, 3), (3, 4),
            (0, 5), (5, 6), (6, 7), (7, 8),
            (5, 9), (9, 10), (10, 11), (11, 12),
            (9, 13), (13, 14), (14, 15), (15, 16),
            (13, 17), (17, 18), (18, 19), (19, 20)
        ]

        xs_all = hand_coords[:, 0]
        ys_all = hand_coords[:, 1]

        plt.figure(figsize=(5, 5))
        plt.scatter(xs_all, ys_all, c=color, label=hand_name)
        for start, end in bones:
            xs = [hand_coords[start, 0], hand_coords[end, 0]]
            ys = [hand_coords[start, 1], hand_coords[end, 1]]
            plt.plot(xs, ys, color=color)

        # Zoom around hand keypoints
        margin = 30
        plt.xlim(np.min(xs_all) - margin, np.max(xs_all) + margin)
        # Upper bound first so the y-axis is flipped to image orientation.
        plt.ylim(np.max(ys_all) + margin, np.min(ys_all) - margin)

        plt.title(f'{hand_name} Skeleton Frame {frame_idx}')
        plt.legend()
        plt.grid(True)
        plt.show()


In [None]:
# Load the saved keypoints and plot the pose and both hands of one frame.
vis = SkeletonVisualizer('/content/drive/MyDrive/s0001_f_w000067.pkl')
vis[75]  # Change the number to visualize other frames


In [None]:
class SkeletonVisualizer:
    """Overlay the pose and both hands of one frame in a single 2D plot.

    The pickle must provide the keys read in ``__init__``. Indexing an
    instance (``vis[i]``) draws frame ``i`` and returns ``None``. Drawing
    relies on the module-level ``POSE_CONNECTIONS`` and ``HAND_CONNECTIONS``
    tables being defined before a frame is plotted.
    """

    def __init__(self, pickle_path):
        """Load the recording stored at ``pickle_path``."""
        with open(pickle_path, 'rb') as handle:
            self.data = pickle.load(handle)
        record = self.data
        self.video_height = record["Video height"]
        self.video_width = record["Video width"]
        self.left_hand = record["left hand"]
        self.right_hand = record["right hand"]
        self.pose = record["pose"]

    def __getitem__(self, frame_idx):
        """Scale frame ``frame_idx`` to pixel units and plot it."""
        # x is multiplied by the width, y by the height, z is left as stored.
        to_pixels = [self.video_width, self.video_height, 1]
        left = self.left_hand[frame_idx] * to_pixels
        right = self.right_hand[frame_idx] * to_pixels
        body = self.pose[frame_idx][:, :3] * to_pixels

        self.plot_skeleton(body, left, right, frame_idx)

    def _draw_hand(self, hand, wrist_xy, line_style, marker_color, label):
        """Draw ``hand`` shifted so its keypoint 0 sits on ``wrist_xy``."""
        if np.allclose(hand, 0):
            # An all-zero hand is treated as "not available" and skipped.
            return
        anchored = hand[:, :2] - hand[0, :2] + wrist_xy
        for start, end in HAND_CONNECTIONS:
            plt.plot(
                [anchored[start, 0], anchored[end, 0]],
                [anchored[start, 1], anchored[end, 1]],
                line_style,
                lw=1.5,
            )
        plt.scatter(anchored[:, 0], anchored[:, 1], c=marker_color, s=8, label=label)

    def plot_skeleton(self, pose, lh, rh, frame_idx):
        """Render pose bones/joints and both hands for one frame.

        Args:
            pose: Pose keypoints in pixel units, indexed ``[keypoint, axis]``.
            lh: Left-hand keypoints in pixel units.
            rh: Right-hand keypoints in pixel units.
            frame_idx: Frame number shown in the title.
        """
        plt.figure(figsize=(10, 10))
        axes = plt.gca()
        axes.set_title(f"Skeleton Frame {frame_idx}")
        axes.set_xlim(0, self.video_width)
        # Limits given top-to-bottom so y grows downwards like image rows.
        axes.set_ylim(self.video_height, 0)

        # Body: bones first, then joints on top.
        for start, end in POSE_CONNECTIONS:
            plt.plot(
                [pose[start, 0], pose[end, 0]],
                [pose[start, 1], pose[end, 1]],
                'g-',
                lw=2,
            )
        plt.scatter(pose[:, 0], pose[:, 1], c='r', s=10, label='Pose')

        # Each hand is re-anchored on the pose wrist (keypoints 15 and 16).
        self._draw_hand(lh, pose[15, :2], 'b-', 'blue', 'Left Hand')
        self._draw_hand(rh, pose[16, :2], 'm-', 'magenta', 'Right Hand')

        plt.legend()
        plt.grid(True)
        plt.show()


In [None]:
# Initialize the visualizer with the path to your pickle file
visualizer = SkeletonVisualizer('/content/drive/MyDrive/s0001_f_w000067.pkl')

# Visualize the skeleton for frame 95 (change the index to view other frames)
visualizer[95]


In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

class SkeletonVisualizer3D:
    """3D matplotlib renderer for a pose and two optional hands.

    Connection tables are supplied by the caller as sequences whose items
    are indexed ``[0]`` (start keypoint) and ``[1]`` (end keypoint).
    """

    def __init__(self, pose_connections, left_hand_connections, right_hand_connections, video_height=1080):
        """Store the bone tables and the height used to flip the y axis."""
        self.pose_connections = pose_connections
        self.left_hand_connections = left_hand_connections
        self.right_hand_connections = right_hand_connections
        self.video_height = video_height

    def plot_3d_skeleton(self, pose, lh=None, rh=None, frame_num=0):
        """Plot every part that is not ``None`` into one 3D axes.

        Args:
            pose: Pose keypoints indexed ``[keypoint, axis]`` or ``None``.
            lh: Left-hand keypoints or ``None``.
            rh: Right-hand keypoints or ``None``.
            frame_num: Frame number shown in the title.
        """
        figure = plt.figure(figsize=(10, 10))
        ax = figure.add_subplot(111, projection='3d')
        ax.set_title(f"3D Skeleton Frame {frame_num}")

        # (keypoints, bones, joint colour, bone colour, legend label)
        parts = (
            (pose, self.pose_connections, 'red', 'green', 'Pose'),
            (lh, self.left_hand_connections, 'blue', 'blue', 'Left Hand'),
            (rh, self.right_hand_connections, 'magenta', 'magenta', 'Right Hand'),
        )
        for points, bones, joint_color, bone_color, label in parts:
            if points is None:
                continue
            xs = points[:, 0]
            # y is mirrored about the video height and z is negated before plotting.
            ys = self.video_height - points[:, 1]
            zs = -points[:, 2]
            ax.scatter(xs, ys, zs, c=joint_color, label=label)
            for bone in bones:
                start, end = bone[0], bone[1]
                ax.plot(
                    [xs[start], xs[end]],
                    [ys[start], ys[end]],
                    [zs[start], zs[end]],
                    color=bone_color
                )

        ax.set_xlabel('X (Width)')
        ax.set_ylabel('Y (Height)')
        ax.set_zlabel('Z (Depth)')
        ax.legend()
        ax.view_init(elev=45, azim=-70)
        plt.tight_layout()
        plt.show()


In [None]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

class SkeletonVisualizer:
    """3D viewer for the pose and both hands of a pickled recording.

    ``len(vis)`` is the number of pose frames; ``vis[i]`` renders frame ``i``
    with matplotlib and returns ``None``. In the rendered plot the axes are
    (x, depth, height).
    """

    # Bones as (start, end) keypoint index pairs.
    POSE_CONNECTIONS = [
        (0, 1), (1, 2), (2, 3), (3, 7),
        (0, 4), (4, 5), (5, 6), (6, 8),
        (9, 10), (11, 12), (11, 13), (13, 15),
        (12, 14), (14, 16), (15, 17), (15, 19), (15, 21),
        (16, 18), (16, 20), (16, 22),
        (11, 23), (12, 24), (23, 24), (23, 25),
        (24, 26), (25, 27), (26, 28),
        (27, 29), (28, 30), (29, 31), (30, 32),
    ]

    HAND_CONNECTIONS = [
        (0, 1), (0, 17), (1, 2), (2, 3), (3, 4),
        (0, 5), (5, 6), (6, 7), (7, 8),
        (5, 9), (9, 10), (10, 11), (11, 12),
        (9, 13), (13, 14), (14, 15), (15, 16),
        (13, 17), (17, 18), (18, 19), (19, 20),
    ]

    def __init__(self, pickle_path):
        """Load the recording stored at ``pickle_path``.

        NOTE: ``pickle.load`` can execute arbitrary code; only open files
        from a trusted source.
        """
        with open(pickle_path, 'rb') as f:
            data = pickle.load(f)
        self.video_height = data['Video height']
        self.video_width = data['Video width']
        self.left_hand = data['left hand']
        self.right_hand = data['right hand']
        self.pose = data['pose']

    def __len__(self):
        """Return the number of pose frames."""
        return len(self.pose)

    def __getitem__(self, frame_idx):
        """Convert frame ``frame_idx`` to plot coordinates and render it."""
        # x is multiplied by the width, y by the height, z is left as stored.
        to_pixels = [self.video_width, self.video_height, 1]
        lh = self.left_hand[frame_idx] * to_pixels
        rh = self.right_hand[frame_idx] * to_pixels
        pose = self.pose[frame_idx][:, :3] * to_pixels

        # Offset each hand's depth by the depth of the matching pose wrist
        # (pose keypoints 15 and 16).
        lh[:, 2] += pose[15, 2]
        rh[:, 2] += pose[16, 2]

        # Reorder columns to (x, depth, height) so image-y becomes the
        # vertical plot axis.
        lh = lh[:, [0, 2, 1]]
        rh = rh[:, [0, 2, 1]]
        pose = pose[:, [0, 2, 1]]

        # Image y grows downwards; mirror it so the figure is upright.
        lh[:, 2] = self.video_height - lh[:, 2]
        rh[:, 2] = self.video_height - rh[:, 2]
        pose[:, 2] = self.video_height - pose[:, 2]

        self.plot_skeleton_3d(pose, lh, rh, frame_idx)

    @staticmethod
    def _depth_upper_limit(*parts):
        """Return ``max(depth) + 10`` over all parts, ignoring NaN/inf.

        Depth is column 1 of each part. Returns ``None`` when no finite
        depth value exists, so the caller can leave the axis on autoscale
        instead of passing NaN to ``set_ylim`` (which raises).
        """
        depths = np.concatenate(
            [np.asarray(part, dtype=float)[:, 1] for part in parts]
        )
        finite = depths[np.isfinite(depths)]
        if finite.size == 0:
            return None
        return float(finite.max()) + 10

    @staticmethod
    def _draw_part(ax, points, connections, bone_color, joint_color, label):
        """Draw the bones and then the joints of one body part on ``ax``."""
        for start, end in connections:
            p1, p2 = points[start], points[end]
            ax.plot([p1[0], p2[0]], [p1[1], p2[1]], [p1[2], p2[2]], color=bone_color)
        ax.scatter(points[:, 0], points[:, 1], points[:, 2], color=joint_color, label=label)

    def plot_skeleton_3d(self, pose, left_hand, right_hand, frame_idx):
        """Render pose and hands given in (x, depth, height) coordinates.

        Args:
            pose: Pose keypoints indexed ``[keypoint, axis]``.
            left_hand: Left-hand keypoints.
            right_hand: Right-hand keypoints.
            frame_idx: Frame number shown in the title.
        """
        fig = plt.figure(figsize=(10, 10))
        ax = fig.add_subplot(111, projection='3d')
        ax.set_title(f'3D Skeleton Frame {frame_idx}')

        self._draw_part(ax, pose, self.POSE_CONNECTIONS, 'green', 'red', 'Pose')
        self._draw_part(ax, left_hand, self.HAND_CONNECTIONS, 'blue', 'blue', 'Left Hand')
        self._draw_part(ax, right_hand, self.HAND_CONNECTIONS, 'magenta', 'magenta', 'Right Hand')

        ax.set_xlim(0, self.video_width)
        depth_max = self._depth_upper_limit(pose, left_hand, right_hand)
        if depth_max is not None:
            ax.set_ylim(0, depth_max)  # Depth
        ax.set_zlim(0, self.video_height)  # Height
        ax.set_xlabel('X')
        ax.set_ylabel('Depth')
        ax.set_zlabel('Height')

        ax.view_init(elev=15, azim=-70)
        ax.legend()
        plt.show()


In [None]:
# Initialize the visualizer with the path to your pickle file
visualizer = SkeletonVisualizer('/content/drive/MyDrive/s0001_f_w000067.pkl')

# Visualize the skeleton for frame 10
visualizer[108]


In [None]:
import pickle
import numpy as np
from tqdm import tqdm

def normalize_and_align_skeleton(pose, left_hand, right_hand):
    """
    Center, scale and rotate the keypoints of one frame.

    Each part is processed independently:
    - translated so that keypoint 0 is the origin,
    - divided by the mean distance of the other keypoints from keypoint 0,
    - rotated so that a reference direction points along +Y
      (keypoint 0 -> 12 for 21-point hands, keypoint 0 -> 11 for 33-point
      poses). Parts with any other keypoint count are only centered and
      scaled.

    Args:
        pose: Array indexed ``[keypoint, axis]`` with at least 3 columns
            (only the first three are used), or ``None``.
        left_hand: Array of 3D keypoints, or ``None``.
        right_hand: Array of 3D keypoints, or ``None``.

    Returns:
        Tuple ``(pose_new, left_new, right_new)``. A ``None`` input yields
        ``None``; an input containing NaN is returned unchanged.
    """
    up = np.array([0.0, 1.0, 0.0])

    def rotation_onto_up(direction):
        # Rotation matrix taking `direction` onto +Y (Rodrigues' formula).
        direction = direction / (np.linalg.norm(direction) + 1e-6)
        v = np.cross(direction, up)
        s = np.linalg.norm(v)
        c = np.dot(direction, up)
        if s < 1e-8:
            # Degenerate cross product: the direction is zero or already
            # along +Y (identity), or it points along -Y. In the latter
            # case a half-turn is required; the turn axis is arbitrary and
            # the x axis is used here.
            return np.diag([1.0, -1.0, -1.0]) if c < 0 else np.eye(3)
        vx = np.array([[0, -v[2], v[1]],
                       [v[2], 0, -v[0]],
                       [-v[1], v[0], 0]])
        return np.eye(3) + vx + (vx @ vx) * ((1 - c) / (s ** 2))

    def normalize_kpts(kpts):
        if np.isnan(kpts).any():
            return kpts  # skip if any NaNs

        # Keypoint 0 becomes the origin.
        kpts_centered = kpts - kpts[0]

        # Scale by the mean distance from the origin. The points are already
        # centered, so the center must not be subtracted a second time.
        mean_dist = np.mean(np.linalg.norm(kpts_centered[1:], axis=1))
        scale = mean_dist if mean_dist > 0 else 1.0
        kpts_scaled = kpts_centered / scale

        if len(kpts) == 21:
            # Hand: keypoint 0 -> keypoint 12 (wrist to middle fingertip).
            direction = kpts_scaled[12] - kpts_scaled[0]
        elif len(kpts) == 33:
            # Pose: keypoint 0 -> keypoint 11.
            # NOTE(review): if these are MediaPipe pose landmarks, 0 is the
            # nose and 11 the left shoulder, not a spine axis - confirm.
            direction = kpts_scaled[11] - kpts_scaled[0]
        else:
            return kpts_scaled

        return kpts_scaled @ rotation_onto_up(direction).T

    pose_new = normalize_kpts(pose[:, :3]) if pose is not None else None
    left_new = normalize_kpts(left_hand) if left_hand is not None else None
    right_new = normalize_kpts(right_hand) if right_hand is not None else None
    return pose_new, left_new, right_new


# Load original file
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
input_path = '/content/drive/MyDrive/s0001_f_w000067.pkl'
with open(input_path, 'rb') as f:
    data = pickle.load(f)

# Per-recording metadata and the per-frame keypoint sequences.
video_height = data['Video height']
video_width = data['Video width']
pose_data = data['pose']
left_data = data['left hand']
right_data = data['right hand']

# Initialize new arrays
transformed_pose = []
transformed_left = []
transformed_right = []

# Normalize every frame independently; pose, left and right hand are
# indexed with the same frame number.
for i in tqdm(range(len(pose_data))):
    pose = pose_data[i]
    left = left_data[i]
    right = right_data[i]
    p_t, l_t, r_t = normalize_and_align_skeleton(pose, left, right)
    # Add back confidence score to pose
    # (column 3 of the original pose is appended as a fourth column).
    if p_t is not None:
        conf = pose[:, 3:4]
        p_t = np.concatenate([p_t, conf], axis=-1)
    transformed_pose.append(p_t)
    transformed_left.append(l_t)
    transformed_right.append(r_t)

# Same keys as the input file so the result can be read by the same code.
transformed_data = {
    'Video height': video_height,
    'Video width': video_width,
    'pose': np.array(transformed_pose),
    'left hand': np.array(transformed_left),
    'right hand': np.array(transformed_right),
}

# Save to a new pickle
output_path = '/content/drive/MyDrive/s0001_f_w000067_transformed.pkl'
with open(output_path, 'wb') as f:
    pickle.dump(transformed_data, f)

print("✅ Transformed keypoints saved to:", output_path)


In [None]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Define connections for drawing skeletons.
# Each entry is a (start, end) pair of row indices into a keypoint array;
# one line segment is drawn per pair.
# NOTE(review): index meaning presumably follows MediaPipe's pose landmark
# numbering - confirm against the data source.
POSE_CONNECTIONS = [
    (11, 13), (13, 15),(15,17),(15,19),(15,21),(16,20),(16,22),(16,18),(12, 14), (14, 16),
    (11, 12), (23, 24), (11, 23), (12, 24),
    (23, 25), (25, 27), (24, 26), (26, 28),
    (27, 31), (28, 32)
]

# Hand bones; indices run from 0 to 20.
HAND_CONNECTIONS = [
    (0, 1),(0,17), (1, 2), (2, 3), (3, 4),
    (0, 5), (5, 6), (6, 7), (7, 8),
    (5, 9), (9, 10), (10, 11), (11, 12),
    (9, 13), (13, 14), (14, 15), (15, 16),
    (13, 17), (17, 18), (18, 19), (19, 20)
]

def plot_skeleton(ax, keypoints, connections, label, color):
    """Draw a 3D skeleton (bones, then joints) on an existing axes.

    Args:
        ax: Axes object providing ``plot``, ``scatter``, ``set_title``,
            ``set_xlabel``/``set_ylabel``/``set_zlabel`` and ``view_init``.
        keypoints: Array indexed ``[keypoint, axis]`` with x, y, z columns.
        connections: Iterable of ``(start, end)`` keypoint index pairs;
            pairs referring to an index ``>= len(keypoints)`` are skipped.
        label: Legend label for the joints, also used as the axes title.
        color: Colour applied to bones and joints.
    """
    total = len(keypoints)
    for start, end in connections:
        if start >= total or end >= total:
            continue
        head, tail = keypoints[start], keypoints[end]
        ax.plot(
            [head[0], tail[0]],
            [head[1], tail[1]],
            [head[2], tail[2]],
            color=color, lw=2,
        )

    ax.scatter(
        keypoints[:, 0], keypoints[:, 1], keypoints[:, 2],
        label=label, s=10, color=color,
    )
    ax.set_title(label)
    for set_axis_label, text in (
        (ax.set_xlabel, "X"),
        (ax.set_ylabel, "Y"),
        (ax.set_zlabel, "Z"),
    ):
        set_axis_label(text)
    ax.view_init(elev=20, azim=120)

# Load transformed file
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
pickle_path = '/content/drive/MyDrive/s0001_f_w000067_transformed.pkl'
with open(pickle_path, 'rb') as f:
    data = pickle.load(f)

# Choose the frame number
frame_num = 123  # <- Change this to visualize different frames

# Only the first three pose columns (x, y, z) are plotted.
pose_kpts = data['pose'][frame_num][:, :3]
left_kpts = data['left hand'][frame_num]
right_kpts = data['right hand'][frame_num]

# Plot each skeleton separately
# (one row of three 3D subplots: pose, left hand, right hand).
fig = plt.figure(figsize=(18, 5))

# Pose
ax1 = fig.add_subplot(131, projection='3d')
plot_skeleton(ax1, pose_kpts, POSE_CONNECTIONS, label='Pose', color='blue')

# Left hand
ax2 = fig.add_subplot(132, projection='3d')
plot_skeleton(ax2, left_kpts, HAND_CONNECTIONS, label='Left Hand', color='green')

# Right hand
ax3 = fig.add_subplot(133, projection='3d')
plot_skeleton(ax3, right_kpts, HAND_CONNECTIONS, label='Right Hand', color='red')

plt.tight_layout()
plt.show()


In [None]:
!pip install plotly


In [None]:

import pickle
import numpy as np
import plotly.graph_objects as go

# Define skeleton connections.
# Each entry is a (start, end) pair of row indices into a keypoint array;
# one line trace is created per pair.
POSE_CONNECTIONS = [
    (11, 13), (13, 15), (15, 17), (15, 19), (15, 21), (16, 20), (16, 22), (16, 18),
    (12, 14), (14, 16), (11, 12), (23, 24), (11, 23), (12, 24),
    (23, 25), (25, 27), (24, 26), (26, 28), (27, 31), (28, 32)
]

# Hand bones; indices run from 0 to 20.
HAND_CONNECTIONS = [
    (0, 1), (0, 17), (1, 2), (2, 3), (3, 4),
    (0, 5), (5, 6), (6, 7), (7, 8),
    (5, 9), (9, 10), (10, 11), (11, 12),
    (9, 13), (13, 14), (14, 15), (15, 16),
    (13, 17), (17, 18), (18, 19), (19, 20)
]

def create_skeleton_trace(kpts, connections, color, name):
    """Build the plotly traces for one skeleton.

    Args:
        kpts: Array indexed ``[keypoint, axis]`` with x, y, z columns.
        connections: Iterable of ``(start, end)`` keypoint index pairs;
            pairs referring to an index ``>= len(kpts)`` are skipped.
        color: Colour used for both bones and joints.
        name: Prefix of the joint trace name (``"<name> joints"``).

    Returns:
        List with one ``go.Scatter3d`` line trace per kept connection,
        followed by a single marker trace holding all joints.
    """
    n_points = len(kpts)
    traces = [
        go.Scatter3d(
            x=[kpts[start, 0], kpts[end, 0]],
            y=[kpts[start, 1], kpts[end, 1]],
            z=[kpts[start, 2], kpts[end, 2]],
            mode='lines',
            line=dict(color=color, width=4),
            showlegend=False
        )
        for start, end in connections
        if start < n_points and end < n_points
    ]
    traces.append(
        go.Scatter3d(
            x=kpts[:, 0],
            y=kpts[:, 1],
            z=kpts[:, 2],
            mode='markers',
            marker=dict(size=4, color=color),
            name=f"{name} joints",
            showlegend=False
        )
    )
    return traces

# Load transformed data
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
with open('/content/drive/MyDrive/s0001_f_w000067_transformed.pkl', 'rb') as f:
    data = pickle.load(f)

frame_num = 105  # ← Change frame number as needed
# Only the first three pose columns (x, y, z) are plotted.
pose_kpts = data['pose'][frame_num][:, :3]
left_kpts = data['left hand'][frame_num]
right_kpts = data['right hand'][frame_num]

# Plot Pose
# aspectmode='data' keeps the axis proportions equal to the data ranges.
pose_fig = go.Figure(data=create_skeleton_trace(pose_kpts, POSE_CONNECTIONS, 'blue', 'Pose'))
pose_fig.update_layout(
    title=f"Pose Skeleton - Frame {frame_num}",
    scene=dict(xaxis_title='X', yaxis_title='Y', zaxis_title='Z', aspectmode='data'),
    margin=dict(l=0, r=0, t=30, b=0)
)
pose_fig.show()

# Plot Left Hand
left_fig = go.Figure(data=create_skeleton_trace(left_kpts, HAND_CONNECTIONS, 'green', 'Left Hand'))
left_fig.update_layout(
    title=f"Left Hand Skeleton - Frame {frame_num}",
    scene=dict(xaxis_title='X', yaxis_title='Y', zaxis_title='Z', aspectmode='data'),
    margin=dict(l=0, r=0, t=30, b=0)
)
left_fig.show()

# Plot Right Hand
right_fig = go.Figure(data=create_skeleton_trace(right_kpts, HAND_CONNECTIONS, 'red', 'Right Hand'))
right_fig.update_layout(
    title=f"Right Hand Skeleton - Frame {frame_num}",
    scene=dict(xaxis_title='X', yaxis_title='Y', zaxis_title='Z', aspectmode='data'),
    margin=dict(l=0, r=0, t=30, b=0)
)
right_fig.show()


In [None]:
import pickle
import numpy as np
from tqdm import tqdm

def normalize_and_align_skeleton(pose, left_hand, right_hand, video_width, video_height):
    """
    Normalize and rotate-align 3D skeletons (pose, left hand, right hand).

    Pose is centered at the midpoint of keypoints 11 and 12 (the shoulders)
    and divided by 6x the distance between them. Hands are centered at
    keypoint 0 (the wrist) and divided by the length of the chain
    0 -> 9 -> 10 -> 11 -> 12. Each part is then rotated so that its
    reference direction (center -> keypoint 0 for the pose, keypoint
    0 -> 12 for a hand) points along +Y.

    Args:
        pose: Array of 3D pose keypoints indexed ``[keypoint, axis]``, or
            ``None``.
        left_hand: Array of 3D hand keypoints, or ``None``.
        right_hand: Array of 3D hand keypoints, or ``None``.
        video_width: Unused; kept for call compatibility.
        video_height: Unused; kept for call compatibility.

    Returns:
        Tuple ``(pose_new, left_new, right_new)``. A ``None`` input yields
        ``None``; an input containing NaN is returned unchanged.
    """
    up = np.array([0.0, 1.0, 0.0])

    def rotation_onto_up(direction):
        # Rotation matrix taking `direction` onto +Y (Rodrigues' formula).
        direction = direction / (np.linalg.norm(direction) + 1e-6)
        v = np.cross(direction, up)
        s = np.linalg.norm(v)
        c = np.dot(direction, up)
        if s < 1e-8:
            # Degenerate cross product: the direction is zero or already
            # along +Y (identity), or it points along -Y. In the latter
            # case a half-turn is required; the turn axis is arbitrary and
            # the x axis is used here.
            return np.diag([1.0, -1.0, -1.0]) if c < 0 else np.eye(3)
        vx = np.array([[0, -v[2], v[1]],
                       [v[2], 0, -v[0]],
                       [-v[1], v[0], 0]])
        return np.eye(3) + vx + (vx @ vx) * ((1 - c) / (s ** 2))

    def normalize_kpts(kpts, part):
        if np.isnan(kpts).any():
            return kpts  # Skip if any NaNs

        if part == 'pose':
            # Center at midpoint of shoulders (keypoints 11 and 12)
            shoulder_l, shoulder_r = kpts[11], kpts[12]
            center = (shoulder_l + shoulder_r) / 2.0
            # Scale = 6 * shoulder distance
            scale = np.linalg.norm(shoulder_l - shoulder_r) * 6.0
            # Direction = keypoint 0 relative to the shoulder midpoint.
            # NOTE(review): presumably the nose in MediaPipe's layout, used
            # as an approximate "up" for the torso - confirm.
            direction = kpts[0] - center
        else:
            # Hands: centered on keypoint 0.
            center = kpts[0]
            # Scale = summed segment lengths along 0 -> 9 -> 10 -> 11 -> 12
            chain = [0, 9, 10, 11, 12]
            scale = sum(
                np.linalg.norm(kpts[nxt] - kpts[cur])
                for cur, nxt in zip(chain, chain[1:])
            )
            direction = kpts[12] - kpts[0]  # Middle finger tip direction

        scale = scale if scale > 0 else 1.0
        kpts_scaled = (kpts - center) / scale
        return kpts_scaled @ rotation_onto_up(direction).T

    pose_new = normalize_kpts(pose, 'pose') if pose is not None else None
    left_new = normalize_kpts(left_hand, 'hand') if left_hand is not None else None
    right_new = normalize_kpts(right_hand, 'hand') if right_hand is not None else None
    return pose_new, left_new, right_new


# Load original file
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
input_path = '/content/drive/MyDrive/s0001_f_w000067.pkl'
with open(input_path, 'rb') as f:
    data = pickle.load(f)

# Per-recording metadata and the per-frame keypoint sequences.
video_height = data['Video height']
video_width = data['Video width']
pose_data = data['pose']
left_data = data['left hand']
right_data = data['right hand']

transformed_pose = []
transformed_left = []
transformed_right = []

for i in tqdm(range(len(pose_data))):
    # Scale to pixel space
    # (x and z are multiplied by the width, y by the height).
    pose = pose_data[i][:, :3] * [video_width, video_height, video_width]
    left = left_data[i] * [video_width, video_height, video_width]
    right = right_data[i] * [video_width, video_height, video_width]

    # Normalize and rotate
    p_t, l_t, r_t = normalize_and_align_skeleton(pose, left, right, video_width, video_height)

    # Reattach confidence for pose
    # (column 3 of the unscaled pose is appended as a fourth column).
    if p_t is not None:
        conf = pose_data[i][:, 3:4]  # Keep original confidence
        p_t = np.concatenate([p_t, conf], axis=-1)

    transformed_pose.append(p_t)
    transformed_left.append(l_t)
    transformed_right.append(r_t)

# Save transformed data
# Same keys as the input file so the result can be read by the same code.
transformed_data = {
    'Video height': video_height,
    'Video width': video_width,
    'pose': np.array(transformed_pose),
    'left hand': np.array(transformed_left),
    'right hand': np.array(transformed_right),
}

output_path = '/content/drive/MyDrive/s0001_f_w000067_transformed_c.pkl'
with open(output_path, 'wb') as f:
    pickle.dump(transformed_data, f)

print("✅ Transformed keypoints saved to:", output_path)


In [None]:
import pickle
import numpy as np
import plotly.graph_objects as go

# Define skeleton connections.
# Each entry is a (start, end) pair of row indices into a keypoint array;
# one line trace is created per pair.
POSE_CONNECTIONS = [
    (11, 13), (13, 15), (15, 17), (15, 19), (15, 21), (16, 20), (16, 22), (16, 18),
    (12, 14), (14, 16), (11, 12), (23, 24), (11, 23), (12, 24),
    (23, 25), (25, 27), (24, 26), (26, 28), (27, 31), (28, 32)
]

# Hand bones; indices run from 0 to 20.
HAND_CONNECTIONS = [
    (0, 1), (0, 17), (1, 2), (2, 3), (3, 4),
    (0, 5), (5, 6), (6, 7), (7, 8),
    (5, 9), (9, 10), (10, 11), (11, 12),
    (9, 13), (13, 14), (14, 15), (15, 16),
    (13, 17), (17, 18), (18, 19), (19, 20)
]

def create_skeleton_trace(kpts, connections, color, name):
    """Build the plotly traces for one skeleton, leaving out NaN keypoints.

    Args:
        kpts: Array indexed ``[keypoint, axis]`` with x, y, z columns.
        connections: Iterable of ``(start, end)`` keypoint index pairs. A
            pair is skipped when either index is ``>= len(kpts)`` or either
            endpoint contains NaN.
        color: Colour used for both bones and joints.
        name: Prefix of the joint trace name (``"<name> joints"``).

    Returns:
        List with one ``go.Scatter3d`` line trace per kept connection,
        followed by a single marker trace holding the NaN-free joints.
    """
    n_points = len(kpts)

    def drawable(start, end):
        # Bounds are checked first so that out-of-range indices are never
        # used to index into kpts.
        if start >= n_points or end >= n_points:
            return False
        return not (np.isnan(kpts[start]).any() or np.isnan(kpts[end]).any())

    traces = [
        go.Scatter3d(
            x=[kpts[start, 0], kpts[end, 0]],
            y=[kpts[start, 1], kpts[end, 1]],
            z=[kpts[start, 2], kpts[end, 2]],
            mode='lines',
            line=dict(color=color, width=4),
            showlegend=False
        )
        for start, end in connections
        if drawable(start, end)
    ]

    # Joints: keep only rows without any NaN coordinate.
    keep = ~np.isnan(kpts).any(axis=1)
    traces.append(
        go.Scatter3d(
            x=kpts[keep, 0],
            y=kpts[keep, 1],
            z=kpts[keep, 2],
            mode='markers',
            marker=dict(size=4, color=color),
            name=f"{name} joints",
            showlegend=False
        )
    )
    return traces

# Load transformed data
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
with open('/content/drive/MyDrive/s0001_f_w000067_transformed_c.pkl', 'rb') as f:
    data = pickle.load(f)

frame_num = 105  # Change as needed
# Only the first three pose columns (x, y, z) are plotted.
pose_kpts = data['pose'][frame_num][:, :3]
left_kpts = data['left hand'][frame_num]
right_kpts = data['right hand'][frame_num]

# Visualize pose
# aspectmode='data' keeps the axis proportions equal to the data ranges.
pose_fig = go.Figure(data=create_skeleton_trace(pose_kpts, POSE_CONNECTIONS, 'blue', 'Pose'))
pose_fig.update_layout(
    title=f"Normalized Pose Skeleton - Frame {frame_num}",
    scene=dict(
        xaxis_title='X', yaxis_title='Y', zaxis_title='Z',
        aspectmode='data'
    ),
    margin=dict(l=0, r=0, t=30, b=0)
)
pose_fig.show()

# Visualize left hand
left_fig = go.Figure(data=create_skeleton_trace(left_kpts, HAND_CONNECTIONS, 'green', 'Left Hand'))
left_fig.update_layout(
    title=f"Normalized Left Hand Skeleton - Frame {frame_num}",
    scene=dict(
        xaxis_title='X', yaxis_title='Y', zaxis_title='Z',
        aspectmode='data'
    ),
    margin=dict(l=0, r=0, t=30, b=0)
)
left_fig.show()

# Visualize right hand
right_fig = go.Figure(data=create_skeleton_trace(right_kpts, HAND_CONNECTIONS, 'red', 'Right Hand'))
right_fig.update_layout(
    title=f"Normalized Right Hand Skeleton - Frame {frame_num}",
    scene=dict(
        xaxis_title='X', yaxis_title='Y', zaxis_title='Z',
        aspectmode='data'
    ),
    margin=dict(l=0, r=0, t=30, b=0)
)
right_fig.show()


In [None]:
import os
import cv2
import pickle
import numpy as np
from tqdm import tqdm
import mediapipe as mp

# Initialize MediaPipe Holistic
# One shared instance is created here and reused for every frame below.
# NOTE(review): static_image_mode=False presumably enables frame-to-frame
# tracking, which would carry over between videos - confirm in the
# MediaPipe docs.
mp_holistic = mp.solutions.holistic
holistic = mp_holistic.Holistic(static_image_mode=False)

# Path to videos directory
video_dir = "/content/drive/MyDrive/data_composite_01/data/"
# Destination of the single pickle holding the keypoints of all videos.
output_pkl = "/content/drive/MyDrive/all_transformed_keypoints.pkl"

# Dictionary to hold all keypoints
# (filled with one entry per accepted frame, keyed "<video_id>_<frame_id>").
all_keypoints = {}

# Normalize and rotate-align skeletons
def normalize_and_align_skeleton(pose, left_hand, right_hand, video_width, video_height):
    """Center, scale and rotate-align pose and hand keypoints of one frame.

    Pose is centered at the midpoint of keypoints 11 and 12 (the shoulders)
    and divided by 6x the distance between them. Hands are centered at
    keypoint 0 and divided by the length of the chain
    0 -> 9 -> 10 -> 11 -> 12. Each part is then rotated so that its
    reference direction (center -> keypoint 0 for the pose, keypoint
    0 -> 12 for a hand) points along +Y.

    Args:
        pose: Array of 3D pose keypoints indexed ``[keypoint, axis]``, or
            ``None``.
        left_hand: Array of 3D hand keypoints, or ``None``.
        right_hand: Array of 3D hand keypoints, or ``None``.
        video_width: Unused; kept for call compatibility.
        video_height: Unused; kept for call compatibility.

    Returns:
        Tuple ``(pose_new, left_new, right_new)``; an entry is ``None`` when
        the corresponding input is ``None`` or contains NaN.
    """
    up = np.array([0.0, 1.0, 0.0])

    def rotation_onto_up(direction):
        # Rotation matrix taking `direction` onto +Y (Rodrigues' formula).
        direction = direction / (np.linalg.norm(direction) + 1e-6)
        v = np.cross(direction, up)
        s = np.linalg.norm(v)
        c = np.dot(direction, up)
        if s < 1e-8:
            # Degenerate cross product: the direction is zero or already
            # along +Y (identity), or it points along -Y. In the latter
            # case a half-turn is required; the turn axis is arbitrary and
            # the x axis is used here.
            return np.diag([1.0, -1.0, -1.0]) if c < 0 else np.eye(3)
        vx = np.array([
            [0, -v[2], v[1]],
            [v[2], 0, -v[0]],
            [-v[1], v[0], 0]
        ])
        return np.eye(3) + vx + (vx @ vx) * ((1 - c) / (s ** 2))

    def normalize_kpts(kpts, part):
        if np.isnan(kpts).any():
            return None

        if part == 'pose':
            shoulder_l, shoulder_r = kpts[11], kpts[12]
            center = (shoulder_l + shoulder_r) / 2.0
            scale = np.linalg.norm(shoulder_l - shoulder_r) * 6.0
            direction = kpts[0] - center
        else:
            center = kpts[0]
            chain = [0, 9, 10, 11, 12]
            scale = sum(
                np.linalg.norm(kpts[nxt] - kpts[cur])
                for cur, nxt in zip(chain, chain[1:])
            )
            direction = kpts[12] - kpts[0]

        # A zero scale (coincident keypoints) leaves the part unscaled.
        scale = scale if scale > 0 else 1.0
        kpts_scaled = (kpts - center) / scale
        return kpts_scaled @ rotation_onto_up(direction).T

    pose_new = normalize_kpts(pose, 'pose') if pose is not None else None
    left_new = normalize_kpts(left_hand, 'hand') if left_hand is not None else None
    right_new = normalize_kpts(right_hand, 'hand') if right_hand is not None else None
    return pose_new, left_new, right_new

def _extract_frame_keypoints(frame, video_width, video_height):
    """Run Holistic on one BGR frame and return its normalized keypoints.

    Uses the module-level ``holistic`` instance and
    ``normalize_and_align_skeleton``.

    Args:
        frame: BGR image as returned by ``cv2.VideoCapture.read``.
        video_width: Frame width in pixels.
        video_height: Frame height in pixels.

    Returns:
        Dict with keys ``"Video height"``, ``"Video width"``, ``"pose"``,
        ``"left hand"`` and ``"right hand"``, or ``None`` when the pose or
        either hand was not detected, or when normalization failed or
        produced NaN.
    """
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    result = holistic.process(rgb)

    # All three parts are required; skip the frame otherwise.
    if not (result.pose_landmarks and result.left_hand_landmarks and result.right_hand_landmarks):
        return None

    pose = np.array([[lm.x, lm.y, lm.z, lm.visibility] for lm in result.pose_landmarks.landmark])
    left = np.array([[lm.x, lm.y, lm.z] for lm in result.left_hand_landmarks.landmark])
    right = np.array([[lm.x, lm.y, lm.z] for lm in result.right_hand_landmarks.landmark])

    # Scale to pixel space (x and z by the width, y by the height).
    to_pixels = [video_width, video_height, video_width]
    p_norm, l_norm, r_norm = normalize_and_align_skeleton(
        pose[:, :3] * to_pixels, left * to_pixels, right * to_pixels, video_width, video_height
    )

    for part in (p_norm, l_norm, r_norm):
        if part is None or np.isnan(part).any():
            return None

    return {
        "Video height": video_height,
        "Video width": video_width,
        # Reattach the visibility column dropped before normalization.
        "pose": np.concatenate([p_norm, pose[:, 3:4]], axis=-1),
        "left hand": l_norm,
        "right hand": r_norm
    }


# Process each video
for video_name in tqdm(os.listdir(video_dir)):
    if not video_name.endswith((".mp4", ".avi", ".mov")):
        continue

    video_path = os.path.join(video_dir, video_name)
    video_id = os.path.splitext(video_name)[0]
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print("⚠️ Could not open video, skipping:", video_path)
            continue

        video_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        video_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # frame_id counts every decoded frame, including skipped ones, so
        # the key always matches the frame's position in the video.
        frame_id = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            sample = _extract_frame_keypoints(frame, video_width, video_height)
            if sample is not None:
                all_keypoints[f"{video_id}_{frame_id}"] = sample

            frame_id += 1
    finally:
        # Release the capture even if processing a frame raises.
        cap.release()

# Save all keypoints to a single .pkl file
with open(output_pkl, 'wb') as f:
    pickle.dump(all_keypoints, f)

print("✅ Saved all transformed and normalized keypoints to:", output_pkl)


#AUTOENCODER TRAINING

In [None]:
# 📦 Imports
import pickle
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
import os
# 📍 Path to your saved all_transformed_keypoints.pkl
pkl_path = '/content/drive/MyDrive/all_transformed_keypoints.pkl'

# 📖 Load keypoints dictionary
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
with open(pkl_path, 'rb') as f:
    data_dict = pickle.load(f)

# One dictionary entry per stored sample.
print(f"✅ Loaded {len(data_dict)} keypoint samples.")

In [None]:
# These are the keypoint indices for upper body
# (row indices into a pose keypoint array):
upper_body_idx = [0, 2, 5 ,11, 12, 13, 14, 15, 16]

In [None]:

# -------------------- 2. Custom Dataset --------------------
class KeypointDataset(Dataset):
    """Dataset yielding one dict of float32 tensors per stored sample.

    ``data_dict`` maps a sample key to a record holding ``'pose'``,
    ``'left hand'`` and ``'right hand'`` arrays. Items are returned in the
    key order of ``data_dict``.
    """

    # (output key, key inside the stored record)
    _FIELDS = (('pose', 'pose'), ('left', 'left hand'), ('right', 'right hand'))

    def __init__(self, data_dict):
        self.data = data_dict
        self.keys = [*data_dict.keys()]

    def __len__(self):
        """Return the number of samples."""
        return len(self.keys)

    def __getitem__(self, idx):
        """Return ``{'pose', 'left', 'right'}`` tensors for sample ``idx``."""
        record = self.data[self.keys[idx]]
        return {
            out_key: torch.tensor(record[src_key], dtype=torch.float32)
            for out_key, src_key in self._FIELDS
        }

# Wrap the loaded samples and serve them in shuffled batches of 512.
dataset = KeypointDataset(data_dict)
dataloader = DataLoader(dataset, batch_size=512, shuffle=True)


In [None]:
import torch
import torch.nn as nn

class Autoencoder(nn.Module):
    """Fully connected autoencoder with ReLU activations.

    Encoder: ``input_dim -> 128 -> 64 -> latent_dim``.
    Decoder: ``latent_dim -> 64 -> 128 -> input_dim``.
    Neither final layer is followed by an activation.
    """

    @staticmethod
    def _mlp(sizes):
        """Return Linear layers for consecutive ``sizes`` with ReLU between them."""
        layers = []
        last = len(sizes) - 2
        for pos, (n_in, n_out) in enumerate(zip(sizes, sizes[1:])):
            layers.append(nn.Linear(n_in, n_out))
            if pos != last:
                layers.append(nn.ReLU())
        return layers

    def __init__(self, input_dim, latent_dim=16):
        super().__init__()
        widths = [input_dim, 128, 64, latent_dim]
        self.encoder = nn.Sequential(*self._mlp(widths))
        # The decoder mirrors the encoder widths.
        self.decoder = nn.Sequential(*self._mlp(widths[::-1]))

    def forward(self, x):
        """Encode ``x`` to the latent vector and return its reconstruction."""
        return self.decoder(self.encoder(x))


In [None]:
import torch
import torch.nn as nn

class Autoencoderpose(nn.Module):
    """Fully connected autoencoder with Sigmoid activations.

    Encoder: ``input_dim -> 64 -> 32 -> latent_dim``.
    Decoder: ``latent_dim -> 32 -> 64 -> input_dim``.
    Neither final layer is followed by an activation.
    """

    @staticmethod
    def _mlp(sizes):
        """Return Linear layers for consecutive ``sizes`` with Sigmoid between them."""
        layers = []
        last = len(sizes) - 2
        for pos, (n_in, n_out) in enumerate(zip(sizes, sizes[1:])):
            layers.append(nn.Linear(n_in, n_out))
            if pos != last:
                layers.append(nn.Sigmoid())
        return layers

    def __init__(self, input_dim, latent_dim=16):
        super().__init__()
        widths = [input_dim, 64, 32, latent_dim]
        self.encoder = nn.Sequential(*self._mlp(widths))
        # The decoder mirrors the encoder widths.
        self.decoder = nn.Sequential(*self._mlp(widths[::-1]))

    def forward(self, x):
        """Encode ``x`` to the latent vector and return its reconstruction."""
        return self.decoder(self.encoder(x))


In [None]:
import torch
from torch.utils.data import Dataset

class KeypointDataset(Dataset):
    """Dataset yielding flattened ``(pose, left, right)`` float32 tensors.

    ``data_dict`` maps a sample key to a record holding ``'pose'``,
    ``'left hand'`` and ``'right hand'`` arrays indexed
    ``[keypoint, axis]``. Only the first three columns of each array are
    used.

    Args:
        data_dict: Mapping from sample key to keypoint record.
        upper_body_idx: Optional iterable of pose keypoint rows to keep.
            Defaults to ``DEFAULT_UPPER_BODY_IDX`` (9 keypoints, giving a
            27-value pose vector). Pass e.g. ``range(33)`` to keep a full
            33-keypoint pose (99 values).
    """

    DEFAULT_UPPER_BODY_IDX = (0, 2, 5, 11, 12, 13, 14, 15, 16)

    def __init__(self, data_dict, upper_body_idx=None):
        self.data = data_dict
        self.keys = list(data_dict.keys())
        if upper_body_idx is None:
            upper_body_idx = self.DEFAULT_UPPER_BODY_IDX
        # Stored as a list so it is used for row-wise (fancy) indexing.
        self.upper_body_idx = list(upper_body_idx)

    def __len__(self):
        """Return the number of samples."""
        return len(self.keys)

    def __getitem__(self, idx):
        """Return the flattened pose, left-hand and right-hand tensors."""
        key = self.keys[idx]
        sample = self.data[key]

        # Select the configured pose rows, keep x/y/z, flatten to 1D.
        pose = sample['pose'][self.upper_body_idx, :3].reshape(-1)
        left = sample['left hand'][:, :3].reshape(-1)
        right = sample['right hand'][:, :3].reshape(-1)

        return (
            torch.from_numpy(pose).float(),
            torch.from_numpy(left).float(),
            torch.from_numpy(right).float()
        )


In [None]:
from torch.utils.data import DataLoader
import pickle
# Load the keypoint dictionary from Google Drive.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open('/content/drive/MyDrive/all_transformed_keypoints.pkl', 'rb') as f:
    all_data = pickle.load(f)

# Dataset and Dataloader: batches of 1024 samples, reshuffled every epoch
dataset = KeypointDataset(all_data)
dataloader = DataLoader(dataset, batch_size=1024, shuffle=True)


In [None]:
# Sanity check: shapes and types of the three tensors of the first sample
pose, left, right = dataset[0]
print(pose.shape, left.shape, right.shape)
print(type(pose), type(left), type(right))


In [None]:
# Read each autoencoder's input size off the first sample's flattened tensors.
pose, left, right = dataset[0]  # unpack the tuple

pose_dim = pose.shape[0]
left_dim = left.shape[0]
right_dim = right.shape[0]

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [None]:
# One autoencoder per body part, all with a 16-dimensional latent space.
# Input sizes: 27 for the pose model, 63 for each hand model.
pose_ae = Autoencoderpose(input_dim=27, latent_dim=16).to(device)
left_ae = Autoencoder(input_dim=63, latent_dim=16).to(device)
right_ae = Autoencoder(input_dim=63, latent_dim=16).to(device)


In [None]:
import torch.nn as nn

# Reconstruction loss shared by all three autoencoders
criterion = nn.MSELoss()

# Independent Adam optimizers so each model is trained separately
optimizer_pose = torch.optim.Adam(pose_ae.parameters(), lr=1e-3)
optimizer_left = torch.optim.Adam(left_ae.parameters(), lr=1e-3)
optimizer_right = torch.optim.Adam(right_ae.parameters(), lr=1e-3)


In [None]:
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Scheduler setup (after optimizer_pose): halve the learning rate once the
# monitored loss has stopped improving (by more than `threshold`) for
# `patience` epochs.
# `verbose` is deliberately not passed: the argument is deprecated since
# PyTorch 2.2, and the pose training loop prints the current LR every epoch.
scheduler_pose = ReduceLROnPlateau(
    optimizer_pose,
    mode='min',
    factor=0.5,
    patience=50,
    threshold=1e-5,
)


In [None]:
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Scheduler setup (after optimizer_left): halve the learning rate once the
# monitored loss has stopped improving (by more than `threshold`) for
# `patience` epochs.
# `verbose` is deliberately not passed: the argument is deprecated since
# PyTorch 2.2, and the left-hand training loop prints the current LR every epoch.
scheduler_left = ReduceLROnPlateau(
    optimizer_left,
    mode='min',
    factor=0.5,
    patience=50,
    threshold=1e-5,
)


In [None]:
# Sanity check: the dataset should hand back torch tensors
pose, left, right = dataset[0]
print(type(pose), type(left), type(right))  # All should be <class 'torch.Tensor'>


In [None]:
# Print the flattened pose vector of the first sample for a visual check
pose, _, _ = dataset[0]
print(pose)


In [None]:
import os
import torch

# ---- Pose autoencoder: train with early stopping, then export the best weights ----
epochs = 4000
patience = 50  # epochs without improvement tolerated before stopping
best_loss = float('inf')
epochs_no_improve = 0
early_stop = False

for epoch in range(epochs):
    total_loss_pose = 0

    for pose, left, right in dataloader:
        pose = pose.to(device)

        optimizer_pose.zero_grad()
        recon_pose = pose_ae(pose)
        loss_pose = criterion(recon_pose, pose)
        loss_pose.backward()
        optimizer_pose.step()

        total_loss_pose += loss_pose.item()

    # The mean per-batch loss drives both the LR scheduler and early stopping.
    avg_loss_pose = total_loss_pose / len(dataloader)
    scheduler_pose.step(avg_loss_pose)

    current_lr = optimizer_pose.param_groups[0]['lr']
    print(f"Epoch {epoch+1}/{epochs} | Total Pose Loss: {total_loss_pose:.6f} | LR: {current_lr:.6f}")

    # Early Stopping Check
    if avg_loss_pose < best_loss - 1e-10:  # small threshold to count only meaningful improvements
        best_loss = avg_loss_pose
        epochs_no_improve = 0
        torch.save(pose_ae.state_dict(), 'best_pose_model.pth')  # checkpoint of the best epoch so far
    else:
        epochs_no_improve += 1

    if epochs_no_improve >= patience:
        print(f"⏹️ Early stopping triggered after {epoch+1} epochs! Best loss: {best_loss:.6f}")
        early_stop = True
        break

if not early_stop:
    print("✅ Training completed without early stopping.")

# Restore the best checkpoint in every case, not only after an early stop:
# otherwise a run that uses all epochs would export the last-epoch weights
# even when an earlier epoch was better. `best_loss` is still infinite when
# no checkpoint was written in this run (e.g. the loss was never finite).
if best_loss != float('inf'):
    pose_ae.load_state_dict(torch.load('best_pose_model.pth'))
    print("📦 Loaded best model for further use.")

# 📂 Save directory (in your GDrive)
save_dir = '/content/drive/MyDrive/autoencoder_models/'
os.makedirs(save_dir, exist_ok=True)

# ✅ Save the pose model (this cell trains and saves only the pose autoencoder)
torch.save(pose_ae.state_dict(), os.path.join(save_dir, 'pose_aese.pth'))
print("✅ Pose autoencoder saved to your Google Drive successfully!")

In [None]:
import os
import torch

# ---- Left-hand autoencoder: train with early stopping, then export the best weights ----
epochs = 4000
patience = 500  # epochs without improvement tolerated before stopping
# Minimum decrease in average loss that counts as an improvement. The previous
# literal (1e-100) is far below float resolution at these loss magnitudes, so
# it behaved exactly like 0: any strict decrease counted. Raise this value to
# ignore negligible improvements.
min_delta = 0.0
best_loss = float('inf')
epochs_no_improve = 0
early_stop = False

for epoch in range(epochs):
    total_loss_left = 0

    for pose, left, right in dataloader:
        left = left.to(device)

        optimizer_left.zero_grad()
        recon_left = left_ae(left)
        loss_left = criterion(recon_left, left)
        loss_left.backward()
        optimizer_left.step()

        total_loss_left += loss_left.item()

    # The mean per-batch loss drives both the LR scheduler and early stopping.
    avg_loss_left = total_loss_left / len(dataloader)
    scheduler_left.step(avg_loss_left)

    current_lr = optimizer_left.param_groups[0]['lr']
    print(f"Epoch {epoch+1}/{epochs} | Total left Loss: {total_loss_left:.10f} | LR: {current_lr:.10f}")

    # Early Stopping Check
    if avg_loss_left < best_loss - min_delta:
        best_loss = avg_loss_left
        epochs_no_improve = 0
        torch.save(left_ae.state_dict(), 'best_left_model.pth')  # checkpoint of the best epoch so far
    else:
        epochs_no_improve += 1

    if epochs_no_improve >= patience:
        print(f"⏹️ Early stopping triggered after {epoch+1} epochs! Best loss: {best_loss:.6f}")
        early_stop = True
        break

if not early_stop:
    print("✅ Training completed without early stopping.")

# Restore the best checkpoint in every case, not only after an early stop:
# otherwise a run that uses all epochs would export the last-epoch weights
# even when an earlier epoch was better. `best_loss` is still infinite when
# no checkpoint was written in this run (e.g. the loss was never finite).
if best_loss != float('inf'):
    left_ae.load_state_dict(torch.load('best_left_model.pth'))
    print("📦 Loaded best model for further use.")

# 📂 Save directory (in your GDrive)
save_dir = '/content/drive/MyDrive/autoencoder_models/'
os.makedirs(save_dir, exist_ok=True)

# ✅ Save the left-hand model (this cell trains and saves only that autoencoder)
torch.save(left_ae.state_dict(), os.path.join(save_dir, 'left_aenew.pth'))
print("✅ Left-hand autoencoder saved to your Google Drive successfully!")

In [None]:
# ---- Right-hand autoencoder: fixed-length training run (no early stopping) ----
epochs = 300

for epoch in range(epochs):
    total_loss_right = 0

    for pose, left, right in dataloader:
        # Only the right-hand batch is used here, so only it is moved to the device.
        right = right.to(device)

        # Right Hand AE
        optimizer_right.zero_grad()
        recon_right = right_ae(right)
        loss_right = criterion(recon_right, right)
        loss_right.backward()
        optimizer_right.step()

        total_loss_right += loss_right.item()

    # Reported value is the sum of the per-batch losses over the epoch.
    print(f"Epoch {epoch+1}/{epochs} | Right Loss: {total_loss_right:.5f}")


In [None]:
import os
import torch

# 📂 Save directory (in your GDrive); created on first use
save_dir = '/content/drive/MyDrive/autoencoder_models/'
os.makedirs(save_dir, exist_ok=True)

# ✅ Save each model's weights (state_dict only, not the module objects)
torch.save(pose_ae.state_dict(), os.path.join(save_dir, 'pose_aese.pth'))
torch.save(left_ae.state_dict(), os.path.join(save_dir, 'left_hand_ae4.pth'))
torch.save(right_ae.state_dict(), os.path.join(save_dir, 'right_hand_ae4.pth'))

print("✅ All 3 Autoencoder models saved to your Google Drive successfully!")


In [None]:
# =================== Imports ===================
import pickle
import torch
import torch.nn as nn
import numpy as np
import plotly.graph_objects as go
import os

# =================== Define Skeleton Connections ===================
# Index pairs refer to positions inside the 9-row upper-body selection (not to
# rows of the full pose array): 0 is joined to 1, 2, 3 and 4, then two chains
# run 3 -> 5 -> 7 and 4 -> 6 -> 8.
# NOTE(review): presumably position 0 is the nose, 1-2 the eyes, 3-4 the
# shoulders, 5-6 the elbows and 7-8 the wrists — confirm against the landmark
# order used to build the selection.
UPPER_BODY_CONNECTIONS = [
    (0, 1), (0, 2), (0, 3), (0, 4),
    (3,5), (4,6),
    (5,7), (6,8)
]
# Index pairs over the 21 keypoints of one hand.
HAND_CONNECTIONS = [
    (0, 1), (0, 17), (1, 2), (2, 3), (3, 4),
    (0, 5), (5, 6), (6, 7), (7, 8),
    (5, 9), (9, 10), (10, 11), (11, 12),
    (9, 13), (13, 14), (14, 15), (15, 16),
    (13, 17), (17, 18), (18, 19), (19, 20)
]

def create_skeleton_trace(kpts, connections, color, name):
    """Build the Plotly 3D traces (bones plus joints) for one skeleton.

    Args:
        kpts: array indexed as ``kpts[i, axis]`` with x, y, z in columns 0-2.
        connections: iterable of ``(i, j)`` keypoint index pairs to join.
        color: colour used for both the lines and the markers.
        name: label used in the joint trace's name.

    Returns:
        A list of ``go.Scatter3d`` traces: one line trace per drawable
        connection, followed by one marker trace for all NaN-free keypoints.
    """
    n_points = len(kpts)

    bone_traces = []
    for a, b in connections:
        # Skip bones whose endpoints are out of range ...
        if not (a < n_points and b < n_points):
            continue
        # ... or contain a NaN coordinate.
        if np.isnan(kpts[a]).any() or np.isnan(kpts[b]).any():
            continue
        xs, ys, zs = ([kpts[a, axis], kpts[b, axis]] for axis in range(3))
        bone_traces.append(
            go.Scatter3d(
                x=xs,
                y=ys,
                z=zs,
                mode='lines',
                line=dict(color=color, width=4),
                showlegend=False
            )
        )

    # Markers only for keypoints that have no NaN in any column.
    keep = ~np.isnan(kpts).any(axis=1)
    joint_trace = go.Scatter3d(
        x=kpts[keep, 0],
        y=kpts[keep, 1],
        z=kpts[keep, 2],
        mode='markers',
        marker=dict(size=4, color=color),
        name=f"{name} joints",
        showlegend=False
    )
    return bone_traces + [joint_trace]

# =================== Autoencoder Definition ===================
class Autoencoder(nn.Module):
    """Fully connected autoencoder with ReLU activations.

    The encoder maps ``input_dim -> 128 -> 64 -> latent_dim`` and the decoder
    mirrors it. Neither the latent layer nor the output layer is followed by
    an activation.
    """

    def __init__(self, input_dim, latent_dim=16):
        super().__init__()
        widths = (input_dim, 128, 64, latent_dim)
        self.encoder = self._stack(widths)
        self.decoder = self._stack(widths[::-1])

    @staticmethod
    def _stack(widths):
        """Chain Linear layers over consecutive widths with ReLU in between."""
        layers = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(n_in, n_out))
            layers.append(nn.ReLU())
        # Drop the trailing ReLU so the last layer stays linear.
        return nn.Sequential(*layers[:-1])

    def forward(self, x):
        """Return the reconstruction of ``x`` after a round trip through the latent space."""
        return self.decoder(self.encoder(x))
import torch
import torch.nn as nn

class Autoencoderpose(nn.Module):
    """Fully connected autoencoder with sigmoid activations.

    The encoder maps ``input_dim -> 64 -> 32 -> latent_dim`` and the decoder
    mirrors it (``latent_dim -> 32 -> 64 -> input_dim``). Neither the latent
    layer nor the output layer is followed by an activation.
    """

    def __init__(self, input_dim, latent_dim=16):
        super().__init__()
        outer_width, inner_width = 64, 32

        # Encoder: input -> 64 -> 32 -> latent
        encoder_layers = [
            nn.Linear(input_dim, outer_width),
            nn.Sigmoid(),
            nn.Linear(outer_width, inner_width),
            nn.Sigmoid(),
            nn.Linear(inner_width, latent_dim),  # latent vector
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        # Decoder: latent -> 32 -> 64 -> output
        decoder_layers = [
            nn.Linear(latent_dim, inner_width),
            nn.Sigmoid(),
            nn.Linear(inner_width, outer_width),
            nn.Sigmoid(),
            nn.Linear(outer_width, input_dim),  # reconstruct original input
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return the reconstruction of ``x`` after a round trip through the latent space."""
        return self.decoder(self.encoder(x))

# =================== Load Data ===================
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open('/content/drive/MyDrive/all_transformed_keypoints.pkl', 'rb') as f:
    all_data = pickle.load(f)

print(f"✅ Loaded {len(all_data)} frames from all_transformed_keypoints.pkl.")

# =================== Load Models ===================
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

pose_ae = Autoencoderpose(input_dim=27, latent_dim=16).to(device)      # 9 upper-body keypoints × 3D = 27
left_ae = Autoencoder(input_dim=63, latent_dim=16).to(device)      # 21 hand kpts × 3D
right_ae = Autoencoder(input_dim=63, latent_dim=16).to(device)

# Load trained weights; map_location lets GPU-trained weights load on a CPU-only runtime.
# NOTE(review): the training cells in this notebook write the pose weights to
# 'pose_aese.pth', while this cell reads 'pose_aesest.pth' — confirm which
# file is the intended one.
pose_ae.load_state_dict(torch.load('/content/drive/MyDrive/autoencoder_models/pose_aesest.pth', map_location=device))
left_ae.load_state_dict(torch.load('/content/drive/MyDrive/autoencoder_models/left_hand_ae4.pth', map_location=device))
right_ae.load_state_dict(torch.load('/content/drive/MyDrive/autoencoder_models/right_hand_ae4.pth', map_location=device))

# Inference only from here on
pose_ae.eval()
left_ae.eval()
right_ae.eval()

print("✅ Loaded all 3 Autoencoder models successfully!")

# =================== Choose a Frame to Visualize ===================
video_id = 's0001_f_w000067'
frame_number = 105   # 🔥 Change this to visualize a different frame
frame_key = f"{video_id}_{frame_number}"

if frame_key not in all_data:
    raise ValueError(f"Frame key '{frame_key}' not found in data!")

# =================== Select Keypoints ===================
# Rows of the pose array fed to the pose autoencoder. This must be defined
# here (it was previously only present as a comment, so the lookup below
# raised NameError) and must use the same order as
# KeypointDataset.upper_body_idx, which the pose autoencoder was trained on.
# UPPER_BODY_CONNECTIONS indexes into this 9-row selection.
upper_body_idx = [0, 2, 5, 11, 12, 13, 14, 15, 16]
pose_kpts = all_data[frame_key]['pose'][upper_body_idx, :3]
left_hand_kpts = all_data[frame_key]['left hand'][:, :3]
right_hand_kpts = all_data[frame_key]['right hand'][:, :3]

# =================== Run Autoencoders ===================
# Flatten each keypoint array into a single-row batch on the model's device.
pose_input = torch.tensor(pose_kpts.flatten(), dtype=torch.float32).unsqueeze(0).to(device)
left_input = torch.tensor(left_hand_kpts.flatten(), dtype=torch.float32).unsqueeze(0).to(device)
right_input = torch.tensor(right_hand_kpts.flatten(), dtype=torch.float32).unsqueeze(0).to(device)

# Reconstruct, drop the batch axis and reshape back to one (x, y, z) row per keypoint.
pose_recon = pose_ae(pose_input).squeeze(0).cpu().detach().numpy().reshape(-1, 3)
left_recon = left_ae(left_input).squeeze(0).cpu().detach().numpy().reshape(-1, 3)
right_recon = right_ae(right_input).squeeze(0).cpu().detach().numpy().reshape(-1, 3)

# =================== Plot Reconstructed Skeletons ===================
# One interactive 3D figure per body part; aspectmode='data' keeps the axes
# at the same scale as the data.
pose_fig = go.Figure(data=create_skeleton_trace(pose_recon, UPPER_BODY_CONNECTIONS, 'blue', 'Pose'))
pose_fig.update_layout(
    title=f"Reconstructed Pose Skeleton - {frame_key}",
    scene=dict(xaxis_title='X', yaxis_title='Y', zaxis_title='Z', aspectmode='data'),
    margin=dict(l=0, r=0, t=30, b=0)
)
pose_fig.show()

left_fig = go.Figure(data=create_skeleton_trace(left_recon, HAND_CONNECTIONS, 'green', 'Left Hand'))
left_fig.update_layout(
    title=f"Reconstructed Left Hand Skeleton - {frame_key}",
    scene=dict(xaxis_title='X', yaxis_title='Y', zaxis_title='Z', aspectmode='data'),
    margin=dict(l=0, r=0, t=30, b=0)
)
left_fig.show()

right_fig = go.Figure(data=create_skeleton_trace(right_recon, HAND_CONNECTIONS, 'red', 'Right Hand'))
right_fig.update_layout(
    title=f"Reconstructed Right Hand Skeleton - {frame_key}",
    scene=dict(xaxis_title='X', yaxis_title='Y', zaxis_title='Z', aspectmode='data'),
    margin=dict(l=0, r=0, t=30, b=0)
)
right_fig.show()

print("✅ Finished plotting reconstructed skeletons!")


#KMEANS CLUSTERING

In [None]:
import torch
import torch.nn as nn

# The Autoencoder and Autoencoderpose classes must already be defined by an
# earlier cell before this one runs.

# Load models
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pose_ae = Autoencoderpose(27, 16).to(device)
left_ae = Autoencoder(63, 16).to(device)
right_ae = Autoencoder(63, 16).to(device)

# NOTE(review): the training cells in this notebook write the pose weights to
# 'pose_aese.pth', while this cell reads 'pose_aesest.pth' — confirm which
# file is the intended one.
pose_ae.load_state_dict(torch.load("/content/drive/MyDrive/autoencoder_models/pose_aesest.pth",map_location=device))
left_ae.load_state_dict(torch.load("/content/drive/MyDrive/autoencoder_models/left_hand_ae4.pth",map_location=device))
right_ae.load_state_dict(torch.load("/content/drive/MyDrive/autoencoder_models/right_hand_ae4.pth",map_location=device))

# Inference only from here on
pose_ae.eval()
left_ae.eval()
right_ae.eval()


In [None]:
import pickle

# Reload the keypoint dictionary (one entry per frame key).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open('/content/drive/MyDrive/all_transformed_keypoints.pkl', 'rb') as f:
    all_data = pickle.load(f)

print(f"Total frames: {len(all_data)}")


In [None]:
import numpy as np

# Per-frame latent vectors, collected in the order of `frame_keys` below.
pose_latents = []
left_latents = []
right_latents = []

# Same pose rows, in the same order, as KeypointDataset.upper_body_idx used for training.
upper_body_idx = [0, 2, 5, 11, 12, 13, 14, 15, 16]

# Sorted key order defines the row order of the latent matrices (and therefore
# of every label array derived from them).
frame_keys = sorted(all_data.keys())  # Define this once, outside the loop

for frame_key in frame_keys:  # Loop over each frame key
    pose_kpts = all_data[frame_key]['pose'][upper_body_idx, :3]
    left_kpts = all_data[frame_key]['left hand'][:, :3]
    right_kpts = all_data[frame_key]['right hand'][:, :3]

    # Flatten and convert to a single-row batch on the model's device
    pose_tensor = torch.tensor(pose_kpts.flatten(), dtype=torch.float32).unsqueeze(0).to(device)
    left_tensor = torch.tensor(left_kpts.flatten(), dtype=torch.float32).unsqueeze(0).to(device)
    right_tensor = torch.tensor(right_kpts.flatten(), dtype=torch.float32).unsqueeze(0).to(device)

    # Only the encoder half is needed; no gradients are tracked.
    with torch.no_grad():
        pose_z = pose_ae.encoder(pose_tensor).cpu().numpy().squeeze()
        left_z = left_ae.encoder(left_tensor).cpu().numpy().squeeze()
        right_z = right_ae.encoder(right_tensor).cpu().numpy().squeeze()

    pose_latents.append(pose_z)
    left_latents.append(left_z)
    right_latents.append(right_z)

# Stack the per-frame vectors into one matrix per body part (one row per frame).
pose_latents = np.vstack(pose_latents)
left_latents = np.vstack(left_latents)
right_latents = np.vstack(right_latents)
print("✅ Latent vectors extracted.")


In [None]:
from sklearn.cluster import KMeans

# Cluster the pose latents; random_state is fixed so reruns are reproducible.
n_clusters = 50  # You can change this

pose_kmeans = KMeans(n_clusters=n_clusters, random_state=42)


# One cluster label per row of pose_latents
pose_labels = pose_kmeans.fit_predict(pose_latents)


print("✅ Clustering done.")


In [None]:
from sklearn.cluster import KMeans

# Cluster the left-hand latents; random_state is fixed so reruns are reproducible.
n_clusters = 100  # You can change this


left_kmeans = KMeans(n_clusters=n_clusters, random_state=42)


# One cluster label per row of left_latents
left_labels = left_kmeans.fit_predict(left_latents)


print("✅ Clustering done.")


In [None]:
from sklearn.cluster import KMeans

# Cluster the right-hand latents; random_state is fixed so reruns are reproducible.
n_clusters = 100  # You can change this


right_kmeans = KMeans(n_clusters=n_clusters, random_state=42)


# One cluster label per row of right_latents
right_labels = right_kmeans.fit_predict(right_latents)

print("✅ Clustering done.")


In [None]:
import pickle

# Bundle labels, fitted KMeans model and latents for each body part.
pose_data = {
    'labels': pose_labels,                # np.ndarray or list
    'kmeans': pose_kmeans,                # KMeans model
    'latents': pose_latents               # optionally include
}

left_hand_data = {
    'labels': left_labels,
    'kmeans': left_kmeans,
    'latents': left_latents
}

right_hand_data = {
    'labels': right_labels,
    'kmeans': right_kmeans,
    'latents': right_latents
}

# Save to files under /content (the runtime's local disk, not Google Drive)
with open("/content/pose_clustersnew.pkl", "wb") as f:
    pickle.dump(pose_data, f)

with open("/content/left_hand_clustersnew.pkl", "wb") as f:
    pickle.dump(left_hand_data, f)

with open("/content/right_hand_clustersnew.pkl", "wb") as f:
    pickle.dump(right_hand_data, f)


In [None]:
import pandas as pd

# The label arrays follow the row order of the latent matrices, which were
# built by iterating `sorted(all_data.keys())`. The frame column must use the
# same ordering; plain dict order is only correct if the dict happens to be
# sorted, and would otherwise attach labels to the wrong frames.
frame_keys = sorted(all_data.keys())

df_clusters = pd.DataFrame({
    "frame": frame_keys,
    "pose_cluster": pose_labels,
    "left_cluster": left_labels,
    "right_cluster": right_labels
})

df_clusters.to_csv("/content/drive/MyDrive/keypoint_cluster_labelsnew.csv", index=False)
print("✅ Saved clustering results.")


In [None]:
from collections import Counter

# Number of frames assigned to each cluster, per body part
print("Pose Cluster Distribution:", Counter(pose_labels))
print("Left Hand Cluster Distribution:", Counter(left_labels))
print("Right Hand Cluster Distribution:", Counter(right_labels))


In [None]:
import numpy as np

# Cluster centers (one row per cluster, in latent space)
print("Pose Cluster Centers:\n", pose_kmeans.cluster_centers_)
print("\nLeft Hand Cluster Centers:\n", left_kmeans.cluster_centers_)
print("\nRight Hand Cluster Centers:\n", right_kmeans.cluster_centers_)

# Frame counts per cluster; position k of each array is the count for label k
print("\nPose Cluster Distribution:", np.bincount(pose_labels))
print("Left Hand Cluster Distribution:", np.bincount(left_labels))
print("Right Hand Cluster Distribution:", np.bincount(right_labels))


In [None]:
# Sanity check: each latent matrix should have one row per label
print("Latent shape (pose):", pose_latents.shape)
print("Latent shape (left):", left_latents.shape)
print("Latent shape (right):", right_latents.shape)

print("Total pose frames:", len(pose_labels))
print("Total left hand frames:", len(left_labels))
print("Total right hand frames:", len(right_labels))


In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from tqdm import tqdm

def plot_elbow(latents, max_k=100, title=''):
    """Fit KMeans for K = 1..max_k and plot the inertia against K.

    Args:
        latents: data matrix passed to ``KMeans.fit``.
        max_k: largest number of clusters to try (inclusive).
        title: text appended to the plot title.
    """
    ks = range(1, max_k + 1)

    print(f"Computing KMeans for K=1 to K={max_k}...")
    # One independent fit per K, each with the same fixed seed.
    inertia_per_k = [
        KMeans(n_clusters=k, random_state=42).fit(latents).inertia_
        for k in tqdm(ks, desc="Fitting KMeans")
    ]

    # Print the raw numbers before drawing the curve
    for k, value in zip(ks, inertia_per_k):
        print(f'K={k}: Inertia={value}')

    # Elbow curve
    plt.figure(figsize=(6, 4))
    plt.plot(ks, inertia_per_k, marker='o')
    plt.title(f'Elbow Method for {title}')
    plt.xlabel('Number of clusters (K)')
    plt.ylabel('Inertia')
    plt.grid(True)
    plt.show()


In [None]:
# Elbow curve for the pose latents, K = 1..50
plot_elbow(pose_latents,max_k=50, title='Pose Latents')

In [None]:
# Elbow curve for the left-hand latents, K = 1..100
plot_elbow(left_latents,max_k=100, title='Left Hand Latents')

In [None]:
# Elbow curve for the right-hand latents, K = 1..100
plot_elbow(right_latents, max_k=100,title='Right Hand Latents')

In [None]:
import numpy as np
import plotly.graph_objects as go
import torch

# === Skeleton Connections ===
# Index pairs refer to positions inside the 9-row upper-body selection (not to
# rows of the full pose array): 0 is joined to 1, 2, 3 and 4, then two chains
# run 3 -> 5 -> 7 and 4 -> 6 -> 8.
UPPER_BODY_CONNECTIONS = [
    (0, 1), (0, 2), (0, 3), (0, 4),
    (3,5), (4,6),
    (5,7), (6,8)
]
# Index pairs over the 21 keypoints of one hand.
HAND_CONNECTIONS = [
    (0, 1), (0, 17), (1, 2), (2, 3), (3, 4),
    (0, 5), (5, 6), (6, 7), (7, 8),
    (5, 9), (9, 10), (10, 11), (11, 12),
    (9, 13), (13, 14), (14, 15), (15, 16),
    (13, 17), (17, 18), (18, 19), (19, 20)
]

# === Skeleton Trace Generator ===
def create_skeleton_trace(kpts, connections, color, name):
    """Return Plotly 3D traces for one skeleton: its bones, then its joints.

    Args:
        kpts: array indexed as ``kpts[i, axis]`` with x, y, z in columns 0-2.
        connections: iterable of ``(i, j)`` keypoint index pairs to join.
        color: colour used for both the lines and the markers.
        name: label used in the joint trace's name.

    Returns:
        A list of ``go.Scatter3d`` traces: one line trace per drawable
        connection, followed by one marker trace for all NaN-free keypoints.
    """
    total = len(kpts)

    def drawable(a, b):
        # Both endpoints must exist and be free of NaN coordinates.
        if a >= total or b >= total:
            return False
        return not (np.isnan(kpts[a]).any() or np.isnan(kpts[b]).any())

    segments = [
        go.Scatter3d(
            x=[kpts[a, 0], kpts[b, 0]],
            y=[kpts[a, 1], kpts[b, 1]],
            z=[kpts[a, 2], kpts[b, 2]],
            mode='lines',
            line=dict(color=color, width=4),
            showlegend=False
        )
        for a, b in connections
        if drawable(a, b)
    ]

    # Markers only for keypoints that have no NaN in any column.
    finite_rows = ~np.isnan(kpts).any(axis=1)
    markers = go.Scatter3d(
        x=kpts[finite_rows, 0],
        y=kpts[finite_rows, 1],
        z=kpts[finite_rows, 2],
        mode='markers',
        marker=dict(size=4, color=color),
        name=f"{name} joints",
        showlegend=False
    )
    return segments + [markers]

# === One-Plot-Per-Cluster ===
def plot_cluster_centers_separately(cluster_centers, decoder, n_joints, title, connections):
    """Decode every cluster centre and show each one as its own 3D figure.

    Args:
        cluster_centers: iterable of latent vectors (one per cluster).
        decoder: module mapping a latent batch to ``n_joints * 3`` values.
        n_joints: number of keypoints per decoded skeleton.
        title: text prefix for trace names and figure titles.
        connections: ``(i, j)`` keypoint index pairs to join with a line.
    """
    # Run on the decoder's own device rather than on a notebook-global
    # `device`, so the function works wherever the decoder lives.
    params = decoder.parameters() if hasattr(decoder, "parameters") else iter(())
    first_param = next(params, None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    for i, center in enumerate(cluster_centers):
        latent_tensor = torch.tensor(center, dtype=torch.float32).unsqueeze(0).to(target_device)
        with torch.no_grad():
            recon = decoder(latent_tensor).cpu().numpy().squeeze()

        # One (x, y, z) row per keypoint
        kpts = recon.reshape((n_joints, 3))
        traces = create_skeleton_trace(kpts, connections, color="blue", name=f"{title} {i}")

        fig = go.Figure(data=traces)
        fig.update_layout(
            title=f"{title} Skeleton from Cluster Center {i}",
            scene=dict(
                xaxis_title='X', yaxis_title='Y', zaxis_title='Z',
                aspectmode='data'
            ),
            height=500,
            width=600,
            showlegend=False
        )
        fig.show()


In [None]:
# Show the first `num_clusters` cluster centres, one figure per centre
num_clusters = 20

plot_cluster_centers_separately(pose_kmeans.cluster_centers_[:num_clusters], pose_ae.decoder, 9, "Pose", UPPER_BODY_CONNECTIONS)


In [None]:
# Left-hand cluster centres (first `num_clusters`), decoded to 21 keypoints each
plot_cluster_centers_separately(left_kmeans.cluster_centers_[:num_clusters], left_ae.decoder, 21, "Left Hand", HAND_CONNECTIONS)

In [None]:
# Right-hand cluster centres (first `num_clusters`), decoded to 21 keypoints each
plot_cluster_centers_separately(right_kmeans.cluster_centers_[:num_clusters], right_ae.decoder, 21, "Right Hand", HAND_CONNECTIONS)


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import torch

def plot_cluster_skeletons_2d(
    cluster_id,
    latents,
    labels,
    decoder,
    n_joints,
    connections,
    max_members=200000,
    title="Skeleton Cluster"
):
    """Overlay the decoded 2D skeletons of one cluster and draw their mean in bold.

    Args:
        cluster_id: label value selecting the cluster to draw.
        latents: per-sample latent vectors, one row per sample.
        labels: per-sample cluster labels aligned with ``latents``.
        decoder: module mapping a latent batch to flattened keypoints
            (``n_joints * 3`` or ``n_joints * 2`` values per sample).
        n_joints: number of keypoints per skeleton.
        connections: ``(i, j)`` keypoint index pairs to join with a line.
        max_members: upper bound on the number of members decoded and drawn.
        title: text prefix of the figure title.

    Raises:
        ValueError: if the decoder output size matches neither layout.
    """
    # Select members of the cluster
    indices = np.where(np.asarray(labels) == cluster_id)[0][:max_members]
    if indices.size == 0:
        # Nothing to decode or average; avoid NaN means and indexing errors.
        print(f"No members found for cluster {cluster_id}; nothing to plot.")
        return

    # Assign consistent color per connection (tab20 has 20 colors)
    connection_colors = {conn: plt.cm.tab20(idx % 20) for idx, conn in enumerate(connections)}

    # Decode all members in one batch, on whichever device the decoder lives
    # (feeding CPU latents to a CUDA decoder raises a device-mismatch error).
    params = decoder.parameters() if hasattr(decoder, "parameters") else iter(())
    first_param = next(params, None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")
    batch = torch.as_tensor(np.asarray(latents)[indices], dtype=torch.float32, device=target_device)
    with torch.no_grad():
        decoded = decoder(batch).cpu().numpy().reshape(len(indices), -1)

    if decoded.shape[1] == n_joints * 3:
        all_kpts = decoded.reshape(-1, n_joints, 3)[:, :, :2]  # Only X, Y
    elif decoded.shape[1] == n_joints * 2:
        all_kpts = decoded.reshape(-1, n_joints, 2)
    else:
        raise ValueError(f"Unexpected shape: {decoded.shape[1:]}")

    # Mean skeleton over the decoded members, shape (n_joints, 2)
    cluster_center = np.mean(all_kpts, axis=0)

    # Plotting
    plt.figure(figsize=(6, 6))
    ax = plt.gca()
    ax.set_title(f"{title} (Cluster {cluster_id}, {len(indices)} members)", fontsize=12)
    ax.set_aspect('equal')

    # Plot each skeleton faintly
    for kpts in all_kpts:
        for i, j in connections:
            if i < n_joints and j < n_joints:
                x = [kpts[i, 0], kpts[j, 0]]
                y = [kpts[i, 1], kpts[j, 1]]
                ax.plot(x, y, color=connection_colors[(i, j)], alpha=0.1, linewidth=1)
        ax.scatter(kpts[:, 0], kpts[:, 1], color="black", s=2, alpha=0.1)  # draw keypoints

    # Plot cluster center (bold)
    for i, j in connections:
        if i < n_joints and j < n_joints:
            x = [cluster_center[i, 0], cluster_center[j, 0]]
            y = [cluster_center[i, 1], cluster_center[j, 1]]
            ax.plot(x, y, color=connection_colors[(i, j)], linewidth=3)
    ax.scatter(cluster_center[:, 0], cluster_center[:, 1], color="red", s=20, zorder=10)  # bold keypoints

    plt.xlabel("X")
    plt.ylabel("Y")
    plt.grid(True)
    plt.show()


In [None]:
def plot_cluster_skeletons_2d_save(
    cluster_id,
    latents,
    labels,
    decoder,
    n_joints,
    connections,
    max_members=200000,
    title="Skeleton Cluster",
    save_path=None
):
    """Draw one cluster's decoded 2D skeletons and save or show the figure.

    Args:
        cluster_id: label value selecting the cluster to draw.
        latents: per-sample latent vectors, one row per sample.
        labels: per-sample cluster labels aligned with ``latents``.
        decoder: module mapping a latent batch to flattened keypoints
            (``n_joints * 3`` or ``n_joints * 2`` values per sample).
        n_joints: number of keypoints per skeleton.
        connections: ``(i, j)`` keypoint index pairs to join with a line.
        max_members: upper bound on the number of members decoded and drawn.
        title: text prefix of the figure title.
        save_path: when truthy, the figure is written there and closed;
            otherwise it is shown.

    Raises:
        ValueError: if the decoder output size matches neither layout.
    """
    import numpy as np

    indices = np.where(np.asarray(labels) == cluster_id)[0][:max_members]
    if indices.size == 0:
        # Nothing to decode or average; avoid NaN means and indexing errors.
        print(f"No members found for cluster {cluster_id}; nothing to plot.")
        return

    # Plotting and tensor libraries are only needed once there is data to draw.
    import matplotlib.pyplot as plt
    import torch

    connection_colors = {conn: plt.cm.tab20(idx % 20) for idx, conn in enumerate(connections)}

    # Decode all members in one batch, on whichever device the decoder lives
    # (feeding CPU latents to a CUDA decoder raises a device-mismatch error).
    params = decoder.parameters() if hasattr(decoder, "parameters") else iter(())
    first_param = next(params, None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")
    batch = torch.as_tensor(np.asarray(latents)[indices], dtype=torch.float32, device=target_device)
    with torch.no_grad():
        decoded = decoder(batch).cpu().numpy().reshape(len(indices), -1)

    if decoded.shape[1] == n_joints * 3:
        all_kpts = decoded.reshape(-1, n_joints, 3)[:, :, :2]  # keep X, Y only
    elif decoded.shape[1] == n_joints * 2:
        all_kpts = decoded.reshape(-1, n_joints, 2)
    else:
        raise ValueError(f"Unexpected shape: {decoded.shape[1:]}")

    # Mean skeleton over the decoded members, shape (n_joints, 2)
    cluster_center = np.mean(all_kpts, axis=0)

    plt.figure(figsize=(6, 6))
    ax = plt.gca()
    ax.set_title(f"{title} (Cluster {cluster_id}, {len(indices)} members)", fontsize=12)
    ax.set_aspect('equal')

    # Every member, drawn faintly
    for kpts in all_kpts:
        for i, j in connections:
            if i < n_joints and j < n_joints:
                x = [kpts[i, 0], kpts[j, 0]]
                y = [kpts[i, 1], kpts[j, 1]]
                ax.plot(x, y, color=connection_colors[(i, j)], alpha=0.1, linewidth=1)
        ax.scatter(kpts[:, 0], kpts[:, 1], color="black", s=2, alpha=0.1)

    # Mean skeleton, drawn bold on top
    for i, j in connections:
        if i < n_joints and j < n_joints:
            x = [cluster_center[i, 0], cluster_center[j, 0]]
            y = [cluster_center[i, 1], cluster_center[j, 1]]
            ax.plot(x, y, color=connection_colors[(i, j)], linewidth=3)

    ax.scatter(cluster_center[:, 0], cluster_center[:, 1], color="red", s=20, zorder=10)

    plt.xlabel("X")
    plt.ylabel("Y")
    plt.grid(True)

    if save_path:
        plt.savefig(save_path)
        print(f"✅ Saved: {save_path}")
        plt.close()  # free the figure when saving many plots in a loop
    else:
        plt.show()


In [None]:
from collections import Counter

def get_smallest_cluster_ids(labels, num_clusters=5):
    """Return the ids of the ``num_clusters`` least populated clusters.

    Ids are ordered by ascending member count; clusters with equal counts
    keep the order in which they first appear in ``labels``. The selected
    ``(id, size)`` pairs are also printed.
    """
    sizes = Counter(labels)
    # Sorting the ids by their count is stable, so ties keep first-seen order.
    ranked_ids = sorted(sizes, key=sizes.get)[:num_clusters]
    smallest_clusters = [(cid, sizes[cid]) for cid in ranked_ids]
    print(" Smallest clusters (ID, size):", smallest_clusters)
    return ranked_ids

# Pose cluster ids ordered from smallest to largest (up to 50 ids)
smallest_cluster_ids = get_smallest_cluster_ids(pose_labels, num_clusters=50)


In [None]:
# Left-hand cluster ids ordered from smallest to largest (up to 100 ids)
smallest_cluster_ids_left = get_smallest_cluster_ids(left_labels, num_clusters=100)

In [None]:
# Right-hand cluster ids ordered from smallest to largest (up to 100 ids)
smallest_cluster_ids_right = get_smallest_cluster_ids(right_labels, num_clusters=100)

In [None]:
import os

# Directory in Google Drive; created on first use
save_dir = "/content/drive/MyDrive/pose_cluster_plots"
os.makedirs(save_dir, exist_ok=True)

# `smallest_cluster_ids` must already hold the pose cluster ids to plot.

# Plot and save one PNG per cluster id (id zero-padded to two digits)
for cid in smallest_cluster_ids:
    filename = f"pose_cluster_{cid:02d}.png"
    save_path = os.path.join(save_dir, filename)

    plot_cluster_skeletons_2d_save(
        cluster_id=cid,
        latents=pose_latents,
        labels=pose_labels,
        decoder=pose_ae.decoder,
        n_joints=9,
        connections=UPPER_BODY_CONNECTIONS,
        title=f"Pose Cluster {cid}",
        save_path=save_path
    )


In [None]:
# Inspect a single pose cluster interactively (change cluster_id to view another)
plot_cluster_skeletons_2d(
    cluster_id= 30,
    latents=pose_latents,
    labels=pose_labels,
    decoder=pose_ae.decoder,
    n_joints=9,
    connections=UPPER_BODY_CONNECTIONS,
    title="Pose Cluster"
)


In [None]:
def plot_cluster_skeletons_hand_2d(
    cluster_id,
    latents,
    labels,
    decoder,
    n_joints,
    connections,
    max_members=200000,
    title="Skeleton Cluster",
    projection="frontal"  # options: 'frontal', 'side', 'top'
):
    """Plot one cluster's decoded skeletons as a 2D projection of their 3D keypoints.

    Args:
        cluster_id: label value selecting the cluster to draw.
        latents: per-sample latent vectors, one row per sample.
        labels: per-sample cluster labels aligned with ``latents``.
        decoder: module mapping a latent batch to flattened keypoints
            (``n_joints * 3`` values per sample, or ``n_joints * 2`` in which
            case z is padded with zeros).
        n_joints: number of keypoints per skeleton.
        connections: ``(i, j)`` keypoint index pairs to join with a line.
        max_members: upper bound on the number of members decoded and drawn.
        title: text prefix of the figure title.
        projection: ``'frontal'`` plots (x, -y), ``'side'`` plots (z, -y) and
            ``'top'`` plots (x, z).

    Raises:
        ValueError: if the decoder output size matches neither layout, or if
            ``projection`` is not one of the three supported names.
    """
    import numpy as np

    indices = np.where(np.asarray(labels) == cluster_id)[0][:max_members]
    if indices.size == 0:
        # Nothing to decode or average; avoid NaN means and indexing errors.
        print(f"No members found for cluster {cluster_id}; nothing to plot.")
        return

    # Plotting and tensor libraries are only needed once there is data to draw.
    import matplotlib.pyplot as plt
    import torch

    connection_colors = {conn: plt.cm.tab20(idx % 20) for idx, conn in enumerate(connections)}

    # Decode all members in one batch, on whichever device the decoder lives
    # (feeding CPU latents to a CUDA decoder raises a device-mismatch error).
    params = decoder.parameters() if hasattr(decoder, "parameters") else iter(())
    first_param = next(params, None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")
    batch = torch.as_tensor(np.asarray(latents)[indices], dtype=torch.float32, device=target_device)
    with torch.no_grad():
        decoded = decoder(batch).cpu().numpy().reshape(len(indices), -1)

    if decoded.shape[1] == n_joints * 3:
        all_kpts = decoded.reshape(-1, n_joints, 3)
    elif decoded.shape[1] == n_joints * 2:
        flat_2d = decoded.reshape(-1, n_joints, 2)
        z_pad = np.zeros((len(indices), n_joints, 1))
        all_kpts = np.concatenate([flat_2d, z_pad], axis=2)  # pad z=0
    else:
        raise ValueError(f"Unexpected shape: {decoded.shape[1:]}")

    # Mean skeleton over the decoded members, shape (n_joints, 3)
    cluster_center = np.mean(all_kpts, axis=0)

    # Projection logic: extract 2D coords from 3D
    def project(kpts3d):
        if projection == "frontal":
            # X horizontally, Y vertically; Y is negated so it points up
            return np.stack([kpts3d[:, 0], -kpts3d[:, 1]], axis=1)
        elif projection == "side":
            return np.stack([kpts3d[:, 2], -kpts3d[:, 1]], axis=1)
        elif projection == "top":
            return np.stack([kpts3d[:, 0], kpts3d[:, 2]], axis=1)
        else:
            raise ValueError("Invalid projection")

    projected_kpts = [project(k) for k in all_kpts]
    projected_center = project(cluster_center)

    # Plotting
    plt.figure(figsize=(8, 11))
    ax = plt.gca()
    ax.set_title(f"{title} (Cluster {cluster_id}, {len(indices)} members)", fontsize=12)
    ax.set_aspect('equal')

    # Plot each skeleton faintly
    for kpts in projected_kpts:
        for i, j in connections:
            if i < n_joints and j < n_joints:
                x = [kpts[i, 0], kpts[j, 0]]
                y = [kpts[i, 1], kpts[j, 1]]
                ax.plot(x, y, color=connection_colors[(i, j)], alpha=0.1, linewidth=1)
        ax.scatter(kpts[:, 0], kpts[:, 1], color="black", s=2, alpha=0.1)

    # Cluster center, drawn bold on top
    for i, j in connections:
        if i < n_joints and j < n_joints:
            x = [projected_center[i, 0], projected_center[j, 0]]
            y = [projected_center[i, 1], projected_center[j, 1]]
            ax.plot(x, y, color=connection_colors[(i, j)], linewidth=3)
    ax.scatter(projected_center[:, 0], projected_center[:, 1], color="red", s=20, zorder=10)

    plt.xlabel("X")
    plt.ylabel("Y")
    plt.grid(True)
    plt.show()


In [None]:
# Frontal projection of one left-hand cluster (change cluster_id to view another)
plot_cluster_skeletons_hand_2d(
    cluster_id=52,
    latents=left_latents,
    labels=left_labels,
    decoder=left_ae.decoder,
    n_joints=21,
    connections=HAND_CONNECTIONS,
    title="Left Hand Cluster",
    projection="frontal"
)


In [None]:
# Frontal projection of right-hand cluster 65
plot_cluster_skeletons_hand_2d(
    cluster_id=65,
    latents=right_latents,
    labels=right_labels,
    decoder=right_ae.decoder,
    n_joints=21,
    connections=HAND_CONNECTIONS,
    title="Right Hand Cluster",
    projection="frontal"
)


In [None]:
# Frontal projection of right-hand cluster 50
plot_cluster_skeletons_hand_2d(
    cluster_id=50,
    latents=right_latents,
    labels=right_labels,
    decoder=right_ae.decoder,
    n_joints=21,
    connections=HAND_CONNECTIONS,
    title="Right Hand Cluster",
    projection="frontal"
)


In [None]:
import os
import pickle

# Destination folder (on the mounted Drive) for the clustering bundles.
output_dir = "/content/drive/MyDrive/GestureClusters/"

# exist_ok=True makes this a no-op when the folder is already there.
os.makedirs(output_dir, exist_ok=True)

# Pickle each body part's clustering result into its own file.
for _bundle_name, _bundle in (
    ("pose_clustersnew.pkl", pose_data),
    ("left_hand_clustersnew.pkl", left_hand_data),
    ("right_hand_clustersnew.pkl", right_hand_data),
):
    with open(os.path.join(output_dir, _bundle_name), "wb") as f:
        pickle.dump(_bundle, f)

In [None]:
# Read the pose clustering bundle back from Drive and unpack the three
# entries used by later cells.
with open("/content/drive/MyDrive/GestureClusters/pose_clustersnew.pkl", "rb") as f:
    pose_data = pickle.load(f)

pose_labels, pose_kmeans, pose_latents = (
    pose_data['labels'],
    pose_data['kmeans'],
    pose_data['latents'],
)


In [None]:
import cv2
import mediapipe as mp
import numpy as np
from tqdm import tqdm
import pickle
import os

# Load from Drive
video_path = "/content/drive/MyDrive/data_composite_01/data/s0001_f_w000128.mp4"
video_id = os.path.basename(video_path)

# Init mediapipe modules
mp_pose = mp.solutions.pose.Pose(static_image_mode=False)

# A single detector tracking up to two hands. Running two independent
# single-hand detectors on the same frame (as this cell used to) makes both
# report the same hand, so "left" and "right" keypoints ended up identical.
mp_hands = mp.solutions.hands.Hands(static_image_mode=False, max_num_hands=2)
# Aliases kept so that any cell still referring to the old names keeps working.
mp_hands_left = mp_hands
mp_hands_right = mp_hands

# MediaPipe's handedness labels assume a mirrored (selfie-style) image and must
# be swapped for ordinary footage.
# NOTE(review): the dataset videos are assumed NOT to be mirrored - confirm and
# flip this flag if they are.
INPUT_IS_MIRRORED = False
LEFT_HAND_LABEL = "Left" if INPUT_IS_MIRRORED else "Right"

cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    # Fail loudly instead of silently producing an empty keypoint_dict.
    cap.release()
    raise IOError(f"Could not open video: {video_path}")

video_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
video_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

keypoint_dict = {}
frame_id = 0


def _fill_pixel_keypoints(landmarks, out):
    """Write landmarks into ``out`` as (x, y, z) scaled by the frame size.

    x and z are scaled by the frame width, y by the frame height.
    """
    for i, lm in enumerate(landmarks):
        out[i] = [lm.x * video_width, lm.y * video_height, lm.z * video_width]


try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        pose_result = mp_pose.process(image)
        hands_result = mp_hands.process(image)

        # Parts that are not detected in this frame stay NaN.
        pose_kpts = np.full((25, 3), np.nan)
        left_kpts = np.full((21, 3), np.nan)
        right_kpts = np.full((21, 3), np.nan)

        if pose_result.pose_landmarks:
            # Only the first 25 pose landmarks are kept.
            _fill_pixel_keypoints(pose_result.pose_landmarks.landmark[:25], pose_kpts)

        if hands_result.multi_hand_landmarks:
            left_seen = right_seen = False
            for hand_landmarks, handedness in zip(
                hands_result.multi_hand_landmarks, hands_result.multi_handedness
            ):
                is_left = handedness.classification[0].label == LEFT_HAND_LABEL
                # If two detections carry the same label, the first one wins.
                if is_left and not left_seen:
                    _fill_pixel_keypoints(hand_landmarks.landmark, left_kpts)
                    left_seen = True
                elif not is_left and not right_seen:
                    _fill_pixel_keypoints(hand_landmarks.landmark, right_kpts)
                    right_seen = True

        key = f"{video_id}_{frame_id}"
        keypoint_dict[key] = {
            # Row layout: 0-24 pose, 25-45 left hand, 46-66 right hand.
            "keypoints": np.vstack([pose_kpts, left_kpts, right_kpts]),
            "video_width": video_width,
            "video_height": video_height
        }
        frame_id += 1
finally:
    # Release the capture even if processing a frame raises.
    cap.release()


In [None]:
def normalize_and_align_skeleton(pose, left_hand, right_hand, video_width, video_height):
    """Centre, scale and rotate pose / hand keypoints into a canonical frame.

    Each part is processed independently:

    * pose: centred on the midpoint of rows 11 and 12, divided by six times
      the distance between those rows, then rotated so that the vector from
      that midpoint to row 0 points along +Y.
    * hands: centred on row 0, divided by the summed segment lengths of the
      chain 0-9-10-11-12, then rotated so that row 12 minus row 0 points
      along +Y.

    Args:
        pose, left_hand, right_hand: ``(N, 3)`` float arrays, or ``None``.
        video_width, video_height: accepted for interface compatibility; they
            are not used by the computation.

    Returns:
        ``(pose_new, left_new, right_new)``. A ``None`` input yields ``None``.
        An input containing any NaN is returned unchanged (not normalised).
        Inputs are never modified in place.
    """
    target = np.array([0.0, 1.0, 0.0])

    def rotation_onto_target(direction):
        # Rodrigues-style rotation taking `direction` onto `target`.
        v = np.cross(direction, target)
        s = np.linalg.norm(v)
        c = np.dot(direction, target)

        if s > 1e-12:
            vx = np.array([[0, -v[2], v[1]], [v[2], 0, -v[0]], [-v[1], v[0], 0]])
            return np.eye(3) + vx + (vx @ vx) * ((1 - c) / (s ** 2))

        if c < 0:
            # Direction is anti-parallel to the target: the cross product
            # vanishes, but a half-turn is required (identity would leave the
            # skeleton upside-down). Rotate by pi about an axis perpendicular
            # to `direction`.
            helper = np.zeros(3)
            helper[np.argmin(np.abs(direction))] = 1.0
            axis = np.cross(direction, helper)
            axis = axis / np.linalg.norm(axis)
            return 2.0 * np.outer(axis, axis) - np.eye(3)

        # Already aligned, or a zero-length direction: nothing to rotate.
        return np.eye(3)

    def normalize_kpts(kpts, part):
        if np.isnan(kpts).any():
            return kpts

        if part == 'pose':
            shoulder_l, shoulder_r = kpts[11], kpts[12]
            center = (shoulder_l + shoulder_r) / 2.0
            scale = np.linalg.norm(shoulder_l - shoulder_r) * 6.0
            direction = kpts[0] - center
        else:
            center = kpts[0]
            chain = [0, 9, 10, 11, 12]
            scale = sum(np.linalg.norm(kpts[b] - kpts[a]) for a, b in zip(chain, chain[1:]))
            direction = kpts[12] - kpts[0]

        # Guard against a degenerate (zero-size) skeleton.
        scale = scale if scale > 0 else 1.0
        kpts_scaled = (kpts - center) / scale

        # The small epsilon avoids a division by zero for a zero-length direction.
        direction = direction / (np.linalg.norm(direction) + 1e-6)
        R = rotation_onto_target(direction)

        # Row-vector form of applying R to every keypoint.
        return kpts_scaled @ R.T

    pose_new = normalize_kpts(pose, 'pose') if pose is not None else None
    left_new = normalize_kpts(left_hand, 'hand') if left_hand is not None else None
    right_new = normalize_kpts(right_hand, 'hand') if right_hand is not None else None
    return pose_new, left_new, right_new


In [None]:
# Normalise every frame in place. The stacked (67, 3) array is split back into
# pose (rows 0-24), left hand (rows 25-45) and right hand (rows 46-66),
# normalised part by part, and re-stacked in the same order.
for frame_key in tqdm(keypoint_dict):
    record = keypoint_dict[frame_key]
    stacked = record["keypoints"]

    normalized_parts = normalize_and_align_skeleton(
        stacked[:25],
        stacked[25:46],
        stacked[46:],
        record["video_width"],
        record["video_height"],
    )
    record["keypoints"] = np.vstack(list(normalized_parts))


In [None]:
import os

# Persist the normalised keypoints. The target folder is created first so a
# fresh Drive does not make open() fail with FileNotFoundError.
normalized_keypoints_path = "/content/drive/MyDrive/processed_data/keypoint_dict_normalizednew.pkl"
os.makedirs(os.path.dirname(normalized_keypoints_path), exist_ok=True)

with open(normalized_keypoints_path, "wb") as f:
    pickle.dump(keypoint_dict, f)

print("✅ Saved normalized keypoints to Drive.")


In [None]:
# Autoencoderpose and Autoencoder must already be defined (and torch imported)
# before this cell runs; both are constructed as `Cls(input_size)`.

def _restore_weights(autoencoder, weights_path):
    """Load saved weights onto the CPU and switch the model to eval mode."""
    autoencoder.load_state_dict(torch.load(weights_path, map_location="cpu"))
    autoencoder.eval()
    return autoencoder


# 9 keypoints × 3 coords
ae_pose = _restore_weights(
    Autoencoderpose(27),
    "/content/drive/MyDrive/autoencoder_models/pose_aesest.pth",
)

# 21 × 3
ae_left = _restore_weights(
    Autoencoder(63),
    "/content/drive/MyDrive/autoencoder_models/left_hand_ae4.pth",
)

# 21 × 3
ae_right = _restore_weights(
    Autoencoder(63),
    "/content/drive/MyDrive/autoencoder_models/right_hand_ae4.pth",
)


In [None]:
import torch
import numpy as np
import pickle
from tqdm import tqdm

# Read the normalised keypoints back from Drive.
with open("/content/drive/MyDrive/processed_data/keypoint_dict_normalizednew.pkl", "rb") as f:
    keypoint_dict = pickle.load(f)

# Rows of the stacked keypoint array that make up the 9-joint upper body
# fed to the pose autoencoder.
upper_body_idx = [0, 2, 5, 11, 12, 13, 14, 15, 16]

pose_latents, left_latents, right_latents = {}, {}, {}


def _encode_part(autoencoder, rows):
    """Flatten ``rows``, run them through the encoder, return a 1-D latent."""
    batch = torch.tensor(rows.flatten(), dtype=torch.float32).unsqueeze(0)
    return autoencoder.encoder(batch).numpy().flatten()


# Inference only, so gradient tracking is switched off for the whole loop.
with torch.no_grad():
    for k in tqdm(keypoint_dict):
        kpts = keypoint_dict[k]["keypoints"]  # stacked pose + left + right rows

        pose_latents[k] = _encode_part(ae_pose, kpts[upper_body_idx])   # 9×3 = 27
        left_latents[k] = _encode_part(ae_left, kpts[25:46])            # 21×3 = 63
        right_latents[k] = _encode_part(ae_right, kpts[46:67])          # 21×3 = 63


In [None]:
# Write the three latent dictionaries to Drive, one pickle per body part.
for _latent_path, _latent_table in (
    ("/content/drive/MyDrive/processed_data/pose_latentsnew.pkl", pose_latents),
    ("/content/drive/MyDrive/processed_data/left_latentsnew.pkl", left_latents),
    ("/content/drive/MyDrive/processed_data/right_latentsnew.pkl", right_latents),
):
    with open(_latent_path, "wb") as f:
        pickle.dump(_latent_table, f)

print("✅ Latent vectors saved.")


In [None]:
import pickle
import os

def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, "rb") as f:
        return pickle.load(f)


# Per-frame latent vectors for each body part.
pose_latents = _read_pickle("/content/drive/MyDrive/processed_data/pose_latentsnew.pkl")
left_latents = _read_pickle("/content/drive/MyDrive/processed_data/left_latentsnew.pkl")
right_latents = _read_pickle("/content/drive/MyDrive/processed_data/right_latentsnew.pkl")

# Fitted clustering models: only the "kmeans" entry of each bundle is needed.
pose_kmeans = _read_pickle("/content/drive/MyDrive/GestureClusters/pose_clustersnew.pkl")["kmeans"]
left_kmeans = _read_pickle("/content/drive/MyDrive/GestureClusters/left_hand_clustersnew.pkl")["kmeans"]
right_kmeans = _read_pickle("/content/drive/MyDrive/GestureClusters/right_hand_clustersnew.pkl")["kmeans"]


In [None]:
# Frame key -> predicted cluster id, one table per body part. A frame is left
# out of all three tables when any of its latents contains NaN.
pose_cluster_map, left_cluster_map, right_cluster_map = {}, {}, {}

for key, pose_latent in pose_latents.items():
    # predict() expects a 2-D (1, n_features) input.
    row_vectors = (
        pose_latent.reshape(1, -1),
        left_latents[key].reshape(1, -1),
        right_latents[key].reshape(1, -1),
    )

    if any(np.isnan(vec).any() for vec in row_vectors):
        print(f"⚠️ Skipping {key} due to NaNs")
        continue

    pose_row, left_row, right_row = row_vectors
    pose_cluster_map[key] = pose_kmeans.predict(pose_row)[0]
    left_cluster_map[key] = left_kmeans.predict(left_row)[0]
    right_cluster_map[key] = right_kmeans.predict(right_row)[0]


In [None]:
# Store the frame -> cluster id tables, one pickle per body part.
for _ids_path, _ids_table in (
    ("/content/drive/MyDrive/processed_data/cluster_ids_pose.pkl", pose_cluster_map),
    ("/content/drive/MyDrive/processed_data/cluster_ids_left.pkl", left_cluster_map),
    ("/content/drive/MyDrive/processed_data/cluster_ids_right.pkl", right_cluster_map),
):
    with open(_ids_path, "wb") as f:
        pickle.dump(_ids_table, f)

print("✅ Saved cluster assignments.")


In [None]:
import numpy as np
import torch
import matplotlib.pyplot as plt
import os

# Bone list for the reduced 9-joint upper-body pose. Each pair holds row
# indices into a (9, 3) keypoint array; the plotting helpers draw one line
# segment per pair.
# NOTE(review): the per-row joint names below follow the original comments and
# presumably match the order of the 9 selected pose landmarks - confirm.
POSE_CONNECTIONS = [
    (0, 1), (0, 2),         # Nose to eyes
    (0, 3), (0, 4),         # Nose to shoulders (drawn straight from the nose)
    (3, 5), (5, 7),         # Left: shoulder → elbow → wrist
    (4, 6), (6, 8)          # Right: shoulder → elbow → wrist
]

# Bone list for a 21-joint hand: five chains of four segments, all starting
# at row 0 (described in the original comment as the standard MediaPipe hand
# connections).
HAND_CONNECTIONS = [
    (0, 1), (1, 2), (2, 3), (3, 4),
    (0, 5), (5, 6), (6, 7), (7, 8),
    (0, 9), (9,10), (10,11), (11,12),
    (0,13), (13,14), (14,15), (15,16),
    (0,17), (17,18), (18,19), (19,20)
]

def decode_latents(pose_cluster, left_cluster, right_cluster):
    """Decode the centres of the given clusters back into keypoints.

    Each cluster id selects a row of the matching k-means model's
    ``cluster_centers_``; that latent vector is passed through the matching
    decoder. Relies on the module-level ``*_kmeans`` and ``*_decoder`` objects.

    Returns:
        ``(pose, left, right)`` numpy arrays of shape (9, 3), (21, 3), (21, 3).
    """
    def _decode_centre(kmeans, decoder, cluster, n_joints):
        # A batch of one latent vector, as float32.
        latent = torch.tensor(kmeans.cluster_centers_[cluster]).float().unsqueeze(0)
        return decoder(latent).detach().numpy().reshape(n_joints, 3)

    return (
        _decode_centre(pose_kmeans, pose_decoder, pose_cluster, 9),
        _decode_centre(left_kmeans, left_decoder, left_cluster, 21),
        _decode_centre(right_kmeans, right_decoder, right_cluster, 21),
    )


In [None]:
# def plot_combined_clusters(pose_kp, left_kp, right_kp, frame_id, save_path=None):
#     # Place hands at wrist joints from pose
#     left_wrist = pose_kp[7]   # left wrist in 9-joint set
#     right_wrist = pose_kp[8]  # right wrist in 9-joint set

#     # Align and scale hands
#     hand_scale = 0.2
#     left_aligned = left_kp * hand_scale + left_wrist
#     right_aligned = right_kp * hand_scale + right_wrist

#     # Begin plotting
#     fig, ax = plt.subplots(figsize=(6, 6))

#     # Pose connections
#     for i, j in POSE_CONNECTIONS:
#         ax.plot([pose_kp[i, 0], pose_kp[j, 0]],
#                 [pose_kp[i, 1], pose_kp[j, 1]], 'b-')

#     # Left hand
#     for i, j in HAND_CONNECTIONS:
#         ax.plot([left_aligned[i, 0], left_aligned[j, 0]],
#                 [left_aligned[i, 1], left_aligned[j, 1]], 'g-')

#     # Right hand
#     for i, j in HAND_CONNECTIONS:
#         ax.plot([right_aligned[i, 0], right_aligned[j, 0]],
#                 [right_aligned[i, 1], right_aligned[j, 1]], 'r-')

#     ax.set_title(f"Cluster Skeletons for Frame {frame_id}")
#     ax.set_aspect('equal')
#     ax.invert_yaxis()
#     ax.grid(True)
#     ax.legend(['Pose', 'Left Hand', 'Right Hand'])

#     if save_path:
#         plt.savefig(save_path)
#         print(f"✅ Saved: {save_path}")
#     plt.close()
def plot_skeleton(pose_kps, left_kps, right_kps, frame_id, save_path=None):
    """Draw the upper-body pose with both hands attached at the wrists.

    Args:
        pose_kps: (9, 3) pose keypoints; rows 7 and 8 are used as the left and
            right wrist anchors. The array is NOT modified.
        left_kps, right_kps: (21, 3) hand keypoints; they are scaled by 0.2
            and translated to the corresponding wrist.
        frame_id: value shown in the plot title.
        save_path: if given, the figure is also written to this path.
    """
    # Work on a copy: the Y flip below must not leak into the caller's array
    # (a second call with the same array would otherwise flip it back).
    pose_kps = np.array(pose_kps)

    # Flip Y of pose (not hands) to fix upside-down skeleton
    pose_kps[:, 1] *= -1

    # Align hands to wrists
    left_wrist = pose_kps[7]
    right_wrist = pose_kps[8]

    hand_scale = 0.2
    left_hand_shifted = left_kps * hand_scale + left_wrist
    right_hand_shifted = right_kps * hand_scale + right_wrist

    fig, ax = plt.subplots(figsize=(6, 6))

    def draw_bones(points, connections, fmt, label):
        # Only the first segment of a group carries the legend label, so the
        # legend shows exactly one correctly coloured entry per body part.
        for n, (i, j) in enumerate(connections):
            ax.plot(
                [points[i, 0], points[j, 0]],
                [points[i, 1], points[j, 1]],
                fmt,
                label=label if n == 0 else None,
            )

    draw_bones(pose_kps, POSE_CONNECTIONS, 'b-', 'Pose')
    draw_bones(left_hand_shifted, HAND_CONNECTIONS, 'g-', 'Left Hand')
    draw_bones(right_hand_shifted, HAND_CONNECTIONS, 'r-', 'Right Hand')

    ax.set_title(f"Frame {frame_id}")
    ax.set_aspect('equal')
    ax.invert_yaxis()  # Optional: Keep if image coordinates preferred
    ax.grid(True)
    ax.legend()

    if save_path:
        plt.savefig(save_path)
        print(f"✅ Saved: {save_path}")

    plt.show()
    # Free the figure; this function is called once per frame in a loop.
    plt.close(fig)


In [None]:
# Expose the decoder halves of the three autoencoders under the module-level
# names that decode_latents() reads, and put them in eval mode.
pose_decoder = ae_pose.decoder
left_decoder = ae_left.decoder
right_decoder = ae_right.decoder

pose_decoder.eval()
left_decoder.eval()
right_decoder.eval()


In [None]:
# Render one skeleton image per frame of a single video, using the decoded
# cluster centres rather than the raw keypoints.
save_directory = "/content/drive/MyDrive/skeleton_plotsc"
os.makedirs(save_directory, exist_ok=True)

video_id = "s0001_f_w000128.mp4"


def _frame_number(latent_key):
    """Frame index stored after the last underscore of a latent key."""
    return int(latent_key.split("_")[-1])


# Keys of this video only, in numeric frame order.
frame_keys = sorted(
    (k for k in pose_latents if k.startswith(video_id)),
    key=_frame_number,
)

for full_key in frame_keys:
    frame_id = _frame_number(full_key)
    pose_vector, left_vector, right_vector = (
        table[full_key].reshape(1, -1)
        for table in (pose_latents, left_latents, right_latents)
    )

    # Frames with a missing body part carry NaN latents and cannot be clustered.
    has_nan = (
        np.isnan(pose_vector).any()
        or np.isnan(left_vector).any()
        or np.isnan(right_vector).any()
    )
    if has_nan:
        print(f"⚠️ Skipping {full_key} due to NaNs")
        continue

    pose_kps, left_kps, right_kps = decode_latents(
        pose_kmeans.predict(pose_vector)[0],
        left_kmeans.predict(left_vector)[0],
        right_kmeans.predict(right_vector)[0],
    )

    save_path = f"{save_directory}/{video_id}_frame_{frame_id:04d}.png"
    plot_skeleton(pose_kps, left_kps, right_kps, frame_id, save_path)


In [None]:
def plot_skeleton_new(pose_kps, left_kps, right_kps, frame_id, save_path=None):
    """Draw the upper-body pose with both hands rotated upright at the wrists.

    Like ``plot_skeleton``, but each hand is first rotated so that the vector
    from row 0 to row 12 points along +Y, then scaled by 0.2 and translated to
    the matching wrist (pose rows 7 and 8).

    Args:
        pose_kps: (9, 3) pose keypoints. The array is NOT modified.
        left_kps, right_kps: (21, 3) hand keypoints.
        frame_id: value shown in the plot title.
        save_path: if given, the figure is also written to this path.
    """
    # Copy so the Y flip below does not alter the caller's array.
    pose_kps = np.array(pose_kps)

    # Flip Y of pose (not hands) to fix upside-down skeleton
    pose_kps[:, 1] *= -1

    # Align hands to wrists
    left_wrist = pose_kps[7]
    right_wrist = pose_kps[8]

    def align_hand_to_wrist(hand_kps, wrist_pos):
        # Vector from wrist to middle fingertip (approx hand direction)
        direction = hand_kps[12] - hand_kps[0]
        direction = direction / (np.linalg.norm(direction) + 1e-6)

        # Target direction (Y-up)
        target = np.array([0, 1, 0])

        # Rotation matrix to align direction to target
        v = np.cross(direction, target)
        s = np.linalg.norm(v)
        c = np.dot(direction, target)

        if s >= 1e-6:
            vx = np.array([
                [0, -v[2], v[1]],
                [v[2], 0, -v[0]],
                [-v[1], v[0], 0]
            ])
            R = np.eye(3) + vx + (vx @ vx) * ((1 - c) / (s ** 2))
        elif c < 0:
            # Hand points (almost) exactly away from the target: a half-turn
            # about an axis perpendicular to the hand is needed, not identity.
            helper = np.zeros(3)
            helper[np.argmin(np.abs(direction))] = 1.0
            axis = np.cross(direction, helper)
            axis = axis / np.linalg.norm(axis)
            R = 2.0 * np.outer(axis, axis) - np.eye(3)
        else:
            # Already aligned (or zero-length direction).
            R = np.eye(3)

        # Scale, rotate and shift
        return (hand_kps * 0.2) @ R.T + wrist_pos

    left_hand_shifted = align_hand_to_wrist(left_kps, left_wrist)
    right_hand_shifted = align_hand_to_wrist(right_kps, right_wrist)

    fig, ax = plt.subplots(figsize=(6, 6))

    def draw_bones(points, connections, fmt, label):
        # Label only the first segment of each group so the legend gets one
        # correctly coloured entry per body part.
        for n, (i, j) in enumerate(connections):
            ax.plot(
                [points[i, 0], points[j, 0]],
                [points[i, 1], points[j, 1]],
                fmt,
                label=label if n == 0 else None,
            )

    draw_bones(pose_kps, POSE_CONNECTIONS, 'b-', 'Pose')
    draw_bones(left_hand_shifted, HAND_CONNECTIONS, 'g-', 'Left Hand')
    draw_bones(right_hand_shifted, HAND_CONNECTIONS, 'r-', 'Right Hand')

    ax.set_title(f"Frame {frame_id}")
    ax.set_aspect('equal')
    ax.invert_yaxis()  # Optional: Keep if image coordinates preferred
    ax.grid(True)
    ax.legend()

    if save_path:
        plt.savefig(save_path)
        print(f"✅ Saved: {save_path}")

    plt.show()
    # Free the figure; this function is called once per frame in a loop.
    plt.close(fig)


In [None]:
# Same per-frame rendering as the earlier loop, but drawn with
# plot_skeleton_new and written to a separate folder.
save_directory = "/content/drive/MyDrive/skeleton_plotsn"
os.makedirs(save_directory, exist_ok=True)

video_id = "s0001_f_w000128.mp4"


def _frame_index(latent_key):
    """Integer that follows the last underscore of a latent key."""
    return int(latent_key.split("_")[-1])


matching_keys = [k for k in pose_latents if k.startswith(video_id)]
frame_keys = sorted(matching_keys, key=_frame_index)

for full_key in frame_keys:
    frame_id = _frame_index(full_key)
    pose_vector = pose_latents[full_key].reshape(1, -1)
    left_vector = left_latents[full_key].reshape(1, -1)
    right_vector = right_latents[full_key].reshape(1, -1)

    # NaN latents cannot be assigned to a cluster.
    if any(np.isnan(vec).any() for vec in (pose_vector, left_vector, right_vector)):
        print(f"⚠️ Skipping {full_key} due to NaNs")
        continue

    cluster_ids = (
        pose_kmeans.predict(pose_vector)[0],
        left_kmeans.predict(left_vector)[0],
        right_kmeans.predict(right_vector)[0],
    )
    pose_kps, left_kps, right_kps = decode_latents(*cluster_ids)

    save_path = f"{save_directory}/{video_id}_frame_{frame_id:04d}.png"
    plot_skeleton_new(pose_kps, left_kps, right_kps, frame_id, save_path)


In [None]:
import torch
import pickle
import numpy as np
from tqdm import tqdm
import os

# Keypoints for the whole dataset, keyed by frame.
with open("/content/drive/MyDrive/all_transformed_keypoints.pkl", "rb") as f:
    all_keypoints = pickle.load(f)


def _restore_autoencoder(autoencoder, weights_path):
    """Load weights onto the CPU and put the model in eval mode."""
    autoencoder.load_state_dict(torch.load(weights_path, map_location='cpu'))
    autoencoder.eval()
    return autoencoder


def _restore_kmeans(bundle_path):
    """Return the "kmeans" entry of a pickled clustering bundle."""
    with open(bundle_path, "rb") as f:
        return pickle.load(f)["kmeans"]


# Trained encoders
ae_pose = _restore_autoencoder(
    Autoencoderpose(27), "/content/drive/MyDrive/autoencoder_models/pose_aesest.pth")
ae_left = _restore_autoencoder(
    Autoencoder(63), "/content/drive/MyDrive/autoencoder_models/left_hand_ae4.pth")
ae_right = _restore_autoencoder(
    Autoencoder(63), "/content/drive/MyDrive/autoencoder_models/right_hand_ae4.pth")

# Cluster models
pose_kmeans = _restore_kmeans("/content/drive/MyDrive/GestureClusters/pose_clustersnew.pkl")
left_kmeans = _restore_kmeans("/content/drive/MyDrive/GestureClusters/left_hand_clustersnew.pkl")
right_kmeans = _restore_kmeans("/content/drive/MyDrive/GestureClusters/right_hand_clustersnew.pkl")

# The 9 pose rows fed to Autoencoderpose
UPPER_BODY_IDX = [0, 2, 5, 11, 12, 13, 14, 15, 16]

# frame key -> {"pose_cluster", "left_cluster", "right_cluster"}
framewise_cluster_ids = {}


def _cluster_of(autoencoder, kmeans, flat_keypoints):
    """Encode one flattened keypoint vector and return its cluster id."""
    batch = torch.tensor(flat_keypoints, dtype=torch.float32).unsqueeze(0)
    with torch.no_grad():
        latent = autoencoder.encoder(batch).numpy()
    return kmeans.predict(latent)[0]


for key in tqdm(sorted(all_keypoints)):
    entry = all_keypoints[key]

    # Any failure for a frame is reported and that frame is left out.
    try:
        pose = entry["pose"][UPPER_BODY_IDX, :3].flatten()      # (9 x 3) = 27
        left = entry["left hand"].flatten()                     # (21 x 3) = 63
        right = entry["right hand"].flatten()                   # (21 x 3) = 63

        framewise_cluster_ids[key] = {
            "pose_cluster": _cluster_of(ae_pose, pose_kmeans, pose),
            "left_cluster": _cluster_of(ae_left, left_kmeans, left),
            "right_cluster": _cluster_of(ae_right, right_kmeans, right)
        }
    except Exception as e:
        print(f"❌ Error in {key}: {e}")
        continue

# Save result
with open("/content/drive/MyDrive/framewise_cluster_ids.pkl", "wb") as f:
    pickle.dump(framewise_cluster_ids, f)

print("✅ Saved: framewise_cluster_ids.pkl")


In [None]:
import pickle
from collections import defaultdict

# Load your framewise cluster triplets
with open("/content/drive/MyDrive/framewise_cluster_ids.pkl", "rb") as f:
    framewise_clusters = pickle.load(f)


def _frame_sort_key(frame_key):
    """Sort key ordering frames by video id, then by numeric frame index.

    Keys have the form "<video_id>_<frame>". A plain string sort would put
    frame "10" before frame "2" whenever indices are not zero-padded, which
    scrambles the temporal order the HMM depends on. Non-numeric suffixes
    sort after numeric ones, by their text.
    """
    video_id, _, frame_part = frame_key.rpartition("_")
    try:
        return video_id, (0, int(frame_part), frame_part)
    except ValueError:
        return video_id, (1, 0, frame_part)


# Step 1: Group by video and create observation sequences (frames in
# temporal order within each video).
video_obs = defaultdict(list)

for frame_key in sorted(framewise_clusters, key=_frame_sort_key):
    video_id = "_".join(frame_key.split("_")[:-1])
    clusters = framewise_clusters[frame_key]
    triplet = (
        clusters["pose_cluster"],
        clusters["left_cluster"],
        clusters["right_cluster"]
    )
    video_obs[video_id].append(triplet)

# Step 2: Map each unique triplet to an integer (ids are assigned in order of
# first appearance).
triplet_to_id = {}
id_to_triplet = []
obs_sequences = []
video_ids = sorted(video_obs.keys())

for vid in video_ids:
    sequence = []
    for triplet in video_obs[vid]:
        if triplet not in triplet_to_id:
            triplet_to_id[triplet] = len(id_to_triplet)
            id_to_triplet.append(triplet)
        sequence.append(triplet_to_id[triplet])
    obs_sequences.append(sequence)

# Save the results. "video_ids" records which video each entry of
# "obs_sequences" belongs to, so later cells do not have to assume an order.
with open("/content/drive/MyDrive/bphmm_obs_sequences.pkl", "wb") as f:
    pickle.dump({
        "obs_sequences": obs_sequences,
        "triplet_to_id": triplet_to_id,
        "id_to_triplet": id_to_triplet,
        "video_ids": video_ids
    }, f)

print("✅ Saved: bphmm_obs_sequences.pkl")
print(f"Total unique observation symbols: {len(id_to_triplet)}")
print(f"Total videos processed: {len(obs_sequences)}")


#Hidden Markov Model

In [None]:
!pip install hmmlearn


In [None]:
import pickle
import numpy as np
from hmmlearn import hmm
from tqdm import tqdm


In [None]:
# Load the integer observation sequences (one list of symbol ids per video).
# NOTE(review): this reads "bphmm_obs_sequences_int.pkl", whereas the cell that
# builds the sequences writes "bphmm_obs_sequences.pkl" - confirm which file is
# the intended input.
with open("/content/drive/MyDrive/bphmm_obs_sequences_int.pkl", "rb") as f:
    data = pickle.load(f)

obs_sequences = data["obs_sequences"]  # list of [int, int, int, ...] per video
# All videos concatenated into a single column; the video boundaries are lost
# here unless per-video lengths are tracked separately.
all_obs = np.concatenate(obs_sequences).reshape(-1, 1)  # hmmlearn expects shape (n_samples, 1)

print(f"✅ Loaded {len(obs_sequences)} sequences")
print(f"🧩 Total observations: {len(all_obs)}")


In [None]:
# Sanity check: shape of the flattened observations and the first ten symbols.
print(all_obs.shape)
print(all_obs[:10])


In [None]:
# Largest symbol id present in the flattened observations.
print("Max obs:", np.max(all_obs))


In [None]:
# Number of distinct (pose, left, right) triplets in the loaded bundle.
print(len(data["id_to_triplet"]))  # should print something like 9870


In [None]:
# Requires `model` to exist already: it is created in the HMM training cell
# further down, so run that cell first.
print(f"Model states: {model.n_components}")
print(f"Model features: {model.n_features}")


In [None]:
# Also depends on `model` from the HMM training cell further down.
print(f"📊 HMM configured with {model.n_features} observation symbols.")


In [None]:
from sklearn.cluster import KMeans
import numpy as np

# Shrink the observation alphabet: every unique triplet is treated as a 3-D
# point and grouped with k-means; the k-means label becomes the new symbol.
# Needs `id_to_triplet` and `obs_sequences` from the loading cell.
triplet_vectors = np.array(id_to_triplet)  # shape (9870, 3)
n_obs_clusters = 512                       # Try 64, 128, or 256

kmeans = KMeans(n_clusters=n_obs_clusters, random_state=42)
kmeans.fit(triplet_vectors)

# Old triplet id (row index) -> reduced symbol id
triplet_id_to_new = {}
for old_id, label in enumerate(kmeans.labels_):
    triplet_id_to_new[old_id] = int(label)

# Translate every sequence symbol by symbol.
obs_sequences_reclustered = []
for seq in obs_sequences:
    obs_sequences_reclustered.append([triplet_id_to_new[x] for x in seq])

print(f"✅ Reclustered triplets → {n_obs_clusters} observation symbols.")


In [None]:
# Concatenate the reclustered sequences of all videos into one (N, 1) column,
# the 2-D layout hmmlearn works with.
all_obs = np.concatenate(obs_sequences_reclustered).reshape(-1, 1)
print(f"📐 HMM input shape: {all_obs.shape}")  # Should be (N, 1)


In [None]:
# Reload the integer observation bundle produced earlier in the pipeline and
# pull out the two entries the reclustering step needs.
with open("/content/drive/MyDrive/bphmm_obs_sequences_int.pkl", "rb") as f:
    data = pickle.load(f)

obs_sequences = data["obs_sequences"]         # List of videos (each: list of int IDs)
id_to_triplet = data["id_to_triplet"]         # 9870 unique (pose, left, right) triplets


In [None]:
from sklearn.cluster import KMeans
import numpy as np

n_obs_clusters = 512  # or 256, 128, etc.

# One row per unique triplet: (pose cluster, left-hand cluster, right-hand cluster)
triplet_vectors = np.array(id_to_triplet)  # shape (9870, 3)

# Group the triplets; fit() returns the fitted estimator itself.
kmeans = KMeans(n_clusters=n_obs_clusters, random_state=42).fit(triplet_vectors)

# Row index (original triplet id) -> k-means label (reduced symbol id)
triplet_id_to_new = dict(enumerate(int(label) for label in kmeans.labels_))

# Rewrite each video's sequence in the reduced alphabet.
obs_sequences_reclustered = [
    [triplet_id_to_new[symbol] for symbol in video_sequence]
    for video_sequence in obs_sequences
]

print(f"✅ Reclustered: {len(obs_sequences)} videos → {n_obs_clusters} obs symbols")


In [None]:
# Bundle the reduced-alphabet sequences with the alphabet size and the
# old-id -> new-id table, then write the bundle to Drive.
reclustered_data = dict(
    obs_sequences=obs_sequences_reclustered,
    n_obs_clusters=n_obs_clusters,
    triplet_id_to_new=triplet_id_to_new,
)

reclustered_path = "/content/drive/MyDrive/bphmm_obs_sequences_int_reclustered.pkl"
with open(reclustered_path, "wb") as f:
    pickle.dump(reclustered_data, f)

print("💾 Reclustered observation sequences saved.")


In [None]:
import pickle

# Read the reclustered bundle back: the sequences plus the size of the
# reduced observation alphabet (V).
with open("/content/drive/MyDrive/bphmm_obs_sequences_int_reclustered.pkl", "rb") as f:
    data = pickle.load(f)

obs_sequences_reclustered = data["obs_sequences"]
V = data["n_obs_clusters"]

print(f"✅ Loaded {len(obs_sequences_reclustered)} sequences with {V} observation symbols.")


In [None]:
# Plain-Python copy of the sequences: every symbol is cast to a built-in int.
# The original note says pomegranate requires this.
# NOTE(review): the training cells below use hmmlearn - check whether
# `sequences` is still needed.
sequences = [list(map(int, seq)) for seq in obs_sequences_reclustered]


In [None]:
!pip install hmmlearn


In [None]:
from hmmlearn import hmm
import numpy as np
import os
import joblib
import pickle
from tqdm import trange

# === Load reclustered observations ===
# `obs_sequences` is rebound here to the reduced-alphabet sequences.
with open("/content/drive/MyDrive/bphmm_obs_sequences_int_reclustered.pkl", "rb") as f:
    data = pickle.load(f)

obs_sequences = data["obs_sequences"]  # List of list of obs IDs
V = data["n_obs_clusters"]             # Should be 512

# Flatten for hmmlearn: all videos back to back in one (N, 1) column.
all_obs = np.concatenate(obs_sequences).reshape(-1, 1)
print(f"📐 all_obs shape: {all_obs.shape}")


In [None]:
from hmmlearn.hmm import CategoricalHMM
from hmmlearn.hmm import CategoricalHMM
from tqdm import trange
import joblib
import os
import numpy as np
from hmmlearn import hmm
import pickle

# === HMM Setup ===
n_states = 270
n_epochs = 60

model = CategoricalHMM(
    n_components=n_states,
    n_iter=1,             # One EM iteration per .fit()
    verbose=False,
    tol=1e-2,
    init_params="e"       # Used for the first fit() only, see the loop below
)
model.n_features = V     # Must set before training!

# Length of each video's sequence. Passing these to fit()/score() keeps the
# videos separate; without them the concatenated array is treated as one long
# sequence and transitions are learned across video boundaries.
seq_lengths = [len(seq) for seq in obs_sequences]
if sum(seq_lengths) != len(all_obs):
    raise ValueError(
        "all_obs does not match obs_sequences; re-run the cell that builds all_obs"
    )

# === Directory to Save Models ===
save_dir = "/content/drive/MyDrive/hmm_models_categorical"
os.makedirs(save_dir, exist_ok=True)

log_likelihoods = []

print(f"🚀 Training HMM with {n_states} states and {V} observation symbols...")

for epoch in trange(n_epochs, desc="Training HMM Epochs"):
    model.fit(all_obs, lengths=seq_lengths)  # Run one EM step
    # fit() re-initialises every parameter listed in init_params on each call.
    # Clearing it after the first call makes the following epochs continue
    # from the current parameters instead of restarting from new emissions.
    model.init_params = ""
    log_prob = model.score(all_obs, lengths=seq_lengths)
    log_likelihoods.append(log_prob)

    print(f"📈 Epoch {epoch+1}/{n_epochs} — Log-likelihood: {log_prob:.4f}")

    # Save model
    model_path = os.path.join(save_dir, f"hmm_epoch_{epoch+1}.pkl")
    joblib.dump(model, model_path)
    print(f"💾 Saved model to: {model_path}")

print("✅ All epochs complete and saved.")
import pickle
import joblib
import numpy as np
from tqdm import tqdm
from hmmlearn import hmm

# === Load trained model ===
model_path = "/content/drive/MyDrive/hmm_models_categorical/hmm_epoch_60.pkl"
model = joblib.load(model_path)

# === Load clustered observation sequences ===
with open("/content/drive/MyDrive/bphmm_obs_sequences_int_reclustered.pkl", "rb") as f:
    data = pickle.load(f)

obs_sequences = data["obs_sequences"]

# === Decode using Viterbi ===
# One state sequence per video, in the same order as obs_sequences.
latent_state_sequences = []

print("🔍 Running Viterbi decoding for all videos...")
for seq in tqdm(obs_sequences, desc="Decoding"):
    X = np.array(seq).reshape(-1, 1)
    logprob, state_seq = model.decode(X, algorithm="viterbi")
    latent_state_sequences.append(state_seq)

print("✅ Predicted latent state sequences for all videos.")

# === Save latent states ===
save_path = "/content/drive/MyDrive/hmm_viterbi_latent_states.pkl"
with open(save_path, "wb") as f:
    pickle.dump(latent_state_sequences, f)

print(f"💾 Saved Viterbi latent state sequences to: {save_path}")


In [None]:
import pickle
import matplotlib.pyplot as plt

# === Load predicted latent states ===
with open("/content/drive/MyDrive/hmm_viterbi_latent_states.pkl", "rb") as f:
    latent_state_sequences = pickle.load(f)


def plot_latent_states(z_sequences, num_videos=5):
    """Show latent-state sequences as one-row colour strips.

    One figure is drawn for each of the first ``num_videos`` sequences (or
    for all of them when there are fewer), titled by position in the list.
    """
    n_shown = min(num_videos, len(z_sequences))
    for video_index in range(n_shown):
        states = z_sequences[video_index]
        plt.figure(figsize=(10, 1.2))
        plt.title(f"Video {video_index} — Latent States Over Time")
        # The sequence is wrapped in a list so it renders as a 1-row image.
        plt.imshow([states], aspect='auto', cmap='tab20')
        plt.xlabel("Time (frames)")
        plt.yticks([])
        plt.colorbar(label="Latent State ID")
        plt.tight_layout()
        plt.show()


# === Call the visualization ===
plot_latent_states(latent_state_sequences, num_videos=352)


In [None]:
from itertools import groupby

# Run-length encode each video's state sequence: consecutive repeats of a
# state collapse into a single (state_id, duration) pair.
video_segments = [
    [(state, len(list(run))) for state, run in groupby(state_sequence)]
    for state_sequence in latent_state_sequences
]

# Save if needed
with open("/content/drive/MyDrive/hmm_video_segments.pkl", "wb") as f:
    pickle.dump(video_segments, f)

print("✅ Saved video-wise segmented latent state durations.")


In [None]:
import pickle
import matplotlib.pyplot as plt

# === Load latent state sequences ===
with open("/content/drive/MyDrive/hmm_viterbi_latent_states.pkl", "rb") as f:
    latent_state_sequences = pickle.load(f)

# === Load original video IDs ===
# NOTE(review): the key order of this file is assumed to match the order of
# latent_state_sequences; only the counts are checked below - confirm.
with open("/content/drive/MyDrive/bphmm_input_observation_sequences.pkl", "rb") as f:
    video_observation_sequences = pickle.load(f)

video_ids = list(video_observation_sequences.keys())  # 352 video filenames

# === Sanity check ===
assert len(video_ids) == len(latent_state_sequences), "Mismatch in video count!"


# === Visualization function ===
def plot_latent_states(z_sequences, video_ids, num_videos=5):
    """Draw one colour strip per sequence, titled with the matching video id.

    Replaces the earlier two-argument version of the same name.
    """
    n_shown = min(num_videos, len(z_sequences))
    for position in range(n_shown):
        plt.figure(figsize=(10, 1.2))
        plt.title(f"{video_ids[position]} — Latent States Over Time")
        # Wrapping the sequence in a list renders it as a 1-row image.
        plt.imshow([z_sequences[position]], aspect='auto', cmap='tab20')
        plt.xlabel("Time (frames)")
        plt.yticks([])
        plt.colorbar(label="Latent State ID")
        plt.tight_layout()
        plt.show()


# === Call the plot ===
plot_latent_states(latent_state_sequences, video_ids, num_videos=352)  # or use 352 to plot all


In [None]:
import pickle
from itertools import groupby
import os

# === Load Viterbi latent states ===
with open("/content/drive/MyDrive/hmm_viterbi_latent_states.pkl", "rb") as f:
    latent_state_sequences = pickle.load(f)

# === Load original video IDs ===
with open("/content/drive/MyDrive/bphmm_input_observation_sequences.pkl", "rb") as f:
    video_observation_sequences = pickle.load(f)

video_ids = list(video_observation_sequences.keys())
assert len(video_ids) == len(latent_state_sequences), "Mismatch in video and sequence count."

# === Compute top-3 longest runs of states per video ===
# A "run" is a maximal stretch of consecutive identical states. The same state
# can appear more than once in a video's result if it has several long runs.
video_top3_long_runs = {}

for vid, z_seq in zip(video_ids, latent_state_sequences):
    runs = [(state, sum(1 for _ in members)) for state, members in groupby(z_seq)]

    # Longest first; the sort is stable, so equally long runs keep their
    # order of occurrence.
    runs.sort(key=lambda run: run[1], reverse=True)
    video_top3_long_runs[vid] = [state for state, _ in runs[:3]]

# === Save result ===
save_path = "/content/drive/MyDrive/hmm_video_top3_longest_run_states.pkl"
with open(save_path, "wb") as f:
    pickle.dump(video_top3_long_runs, f)

print("✅ Extracted top-3 continuous latent states per video (longest runs).")
print(f"💾 Saved to: {save_path}")


In [None]:
import pickle

# Reload the per-video "top-3 longest run" states written by the previous cell
# (a dict: video id -> list of up to three state ids).
with open("/content/drive/MyDrive/hmm_video_top3_longest_run_states.pkl", "rb") as f:
    video_top3_long_runs = pickle.load(f)

# See how many videos were processed
print(f"✅ Loaded dominant state info for {len(video_top3_long_runs)} videos.")


In [None]:
# Preview results: print each video's top-3 states.
for i, (vid, top_states) in enumerate(video_top3_long_runs.items()):
    print(f"{vid}: Top-3 dominant states = {top_states}")
    if i == 352:
        break  # stops after index 352, i.e. at most 353 entries (not just the first 10)


In [None]:
# Look up the dominant states of a single video by its id.
video_id = "s0001_f_w000148"  # replace with actual ID from your dataset
try:
    dominant_states = video_top3_long_runs[video_id]
except KeyError:
    print("⚠️ Video ID not found.")
else:
    print(f"{video_id} → Top-3 dominant latent states: {dominant_states}")


In [None]:
from collections import Counter
import matplotlib.pyplot as plt

# Count how often each state appears in any video's top-3.
# A state listed twice for the same video is counted twice.
state_counts = Counter()

for top3 in video_top3_long_runs.values():
    state_counts.update(top3)

# Top 20 most frequent dominant states, split into parallel tuples for plotting.
# The unpacking below fails with ValueError if state_counts is empty.
top_states = state_counts.most_common(20)
states, counts = zip(*top_states)

# Bars are positioned at the numeric state ids, so the x axis is not evenly
# spaced; a tick is placed at each plotted state.
plt.figure(figsize=(12, 5))
plt.bar(states, counts, color="skyblue")
plt.xlabel("Latent State ID")
plt.ylabel("Number of Videos where it was in Top-3 Longest Run")
plt.title("🔢 Most Common Dominant States (Top-3 Longest Runs)")
plt.xticks(states)
plt.grid(True, axis='y')
plt.show()
