In [None]:
import os
import cv2
import glob
import numpy as np
from tqdm import tqdm
from pathlib import Path
from natsort import natsorted
import mediapipe as mp
import tensorflow as tf
import pandas as pd
import re

# Make only the first physical GPU visible to TensorFlow (no-op when the
# machine reports no GPU at all).
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.set_visible_devices(gpus[0], 'GPU')
        logical_gpus = tf.config.list_logical_devices('GPU')
    except RuntimeError as err:
        # Report the failure and carry on instead of aborting the cell.
        print(err)
    else:
        # Both calls succeeded: summarize physical vs. logical device counts.
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")

In [None]:
# Extract MediaPipe pose landmarks for every .jpg frame of every video folder
# under input_dir and save them as one compressed .npz file per frame under
# the mirrored folder in output_dir.
mp_pose = mp.solutions.pose
basepath = os.getcwd()
input_dir = os.path.join(basepath, 'visual-features-2', 'train-talker')
output_dir = os.path.join(basepath, 'visual-features-pose', 'train-talker')

# Get list of VID folders (each sub-directory holds the frames of one video)
vid_folders = [f for f in os.listdir(input_dir) if os.path.isdir(os.path.join(input_dir, f))]
vid_folders = natsorted(vid_folders)

# The context manager closes the MediaPipe estimator even if a frame raises,
# so its resources are released on both the success and the failure path.
with mp_pose.Pose(static_image_mode=True, min_detection_confidence=0.5) as pose:
    for vid_folder in tqdm(vid_folders, desc='Processing VID folders'):
        vid_path = os.path.join(input_dir, vid_folder)
        out_vid_path = os.path.join(output_dir, vid_folder)

        # exist_ok avoids the check-then-create race and makes re-runs safe.
        os.makedirs(out_vid_path, exist_ok=True)

        frame_paths = glob.glob(os.path.join(vid_path, '*.jpg'))
        frame_paths = natsorted(frame_paths)

        for fp in frame_paths:
            base_frame = Path(fp).stem
            image = cv2.imread(fp)

            # cv2.imread signals an unreadable/corrupt file by returning None
            # rather than raising; skip it so one bad frame cannot abort the run.
            if image is None:
                print(f"Could not read image {fp}")
                continue

            # OpenCV loads images as BGR; the frame is converted to RGB before
            # being handed to the pose estimator.
            image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

            # Pose estimation
            results = pose.process(image_rgb)

            if results.pose_landmarks:
                # Extract 3D coordinates: one [x, y, z] row per landmark
                pose_3d = np.array([[lmk.x, lmk.y, lmk.z] for lmk in results.pose_landmarks.landmark])

                # Keep only the first 25 landmarks (intended as face and upper body)
                pose_features = pose_3d[:25]

                # Add a leading axis of length 1 -> (1, num_landmarks, 3)
                pose_features = np.expand_dims(pose_features, axis=0)

                # Save features under the key 'pose'; numpy appends the .npz suffix
                np.savez_compressed(os.path.join(out_vid_path, f'{base_frame}'), pose=pose_features)
            else:
                # No file is written for this frame.
                print(f"No pose detected in {fp}")

print("Feature extraction completed.")

In [None]:
def load_video_features(video_folder, num_frames=3000, feature_dim=75):
    """Assemble the per-frame pose features of one video into a fixed-size array.

    Every ``.npz`` file in ``video_folder`` whose name contains
    ``frame_<number>`` is read; its ``'pose'`` array is flattened and the first
    ``feature_dim`` values are written to row ``<number>`` of the result. Rows
    for which no usable file exists stay all-zero.

    Args:
        video_folder: Directory holding the per-frame ``.npz`` files.
        num_frames: Number of rows in the output. Files whose frame number is
            ``>= num_frames`` are ignored.
        feature_dim: Number of values kept per frame (default 75).

    Returns:
        ``np.ndarray`` of shape ``(num_frames, feature_dim)``, dtype float32.

    Notes:
        Unreadable or malformed files are reported with ``print`` and skipped
        (best effort); they do not raise.
    """
    frame_pattern = re.compile(r'frame_(\d+)')

    # Collect (frame_number, file_name) pairs, parsing each name exactly once.
    indexed_files = []
    for file_name in os.listdir(video_folder):
        if not file_name.endswith('.npz'):
            continue
        match = frame_pattern.search(file_name)
        if match is None:
            # A stray .npz without a frame number would otherwise crash the
            # sort with AttributeError on ``None.group``.
            print(f"Warning: Skipping file without frame number: {os.path.join(video_folder, file_name)}")
            continue
        indexed_files.append((int(match.group(1)), file_name))

    # Sort by frame number only (stable), so the early ``break`` below is valid.
    indexed_files.sort(key=lambda item: item[0])

    video_features = np.zeros((num_frames, feature_dim), dtype=np.float32)

    for frame_num, file_name in indexed_files:
        if frame_num >= num_frames:
            break  # files are in ascending frame order; the rest are out of range too

        file_path = os.path.join(video_folder, file_name)
        try:
            # allow_pickle=False: the features are plain numeric arrays, so
            # unpickling (which can execute arbitrary code) is never needed.
            # The ``with`` block closes the archive's file handle promptly.
            with np.load(file_path, allow_pickle=False) as data:
                pose = data['pose'] if 'pose' in data.files else None

            if pose is not None and pose.size >= feature_dim:
                # Store each frame in the correct position, keeping only the
                # first ``feature_dim`` flattened values.
                video_features[frame_num, :] = pose.reshape(-1)[:feature_dim]
            else:
                print(f"Warning: Empty or invalid data in {file_path}")
        except Exception as e:
            # Deliberate best effort: report the bad file and leave its row zero.
            print(f"Error loading {file_path}: {e}")

    # Check the number of empty frames (rows that are still all zeros)
    empty_frames = np.sum(np.all(video_features == 0, axis=1))
    if empty_frames > 0:
        print(f"Warning: {empty_frames} empty frames in {os.path.basename(video_folder)}")

    return video_features

def load_all_videos(base_folder, video_order):
    """Load the feature array of each listed video and combine them.

    Args:
        base_folder: Directory that contains one sub-folder per video.
        video_order: Iterable of video (sub-folder) names, in the order in
            which they should be loaded.

    Returns:
        ``np.ndarray`` built from the per-video arrays returned by
        ``load_video_features``, in the order they were loaded.

    Notes:
        A name with no matching sub-folder is reported and skipped, so the
        result can hold fewer entries than ``video_order`` and entry ``i`` is
        not guaranteed to belong to ``video_order[i]``.
    """
    collected = []

    for video_name in tqdm(video_order, desc="Loading videos"):
        candidate_dir = os.path.join(base_folder, video_name)

        # Guard clause: nothing to load when the folder is absent.
        if not os.path.isdir(candidate_dir):
            print(f"Warning: Folder not found for video {video_name}")
            continue

        collected.append(load_video_features(candidate_dir))
        print(f"Complete: {video_name}")

    return np.array(collected)

# Locations of the extracted pose features and of the annotation table
base_folder = './visual-features-pose/train-talker'
csv_path = './extracted-features/train-data-annotation-v1.csv'

# The VID_NAME column of the CSV fixes the order in which videos are loaded
df = pd.read_csv(csv_path)
video_order = df['VID_NAME'].tolist()

# Load every listed video and report the shape of the combined array
train_features = load_all_videos(base_folder, video_order)
print(train_features.shape)  # Expected output: (number_of_videos, 3000, 75)