In [None]:
#import related libraries
import numpy as np
import random as rn
import pickle
from tqdm import tqdm 
import os
import json
import csv

In [None]:
# step 1: define dataset path
dataset_dir = 'dataset dir here'
original_video_dir = dataset_dir + 'openpose/'
original_annocation_dir = dataset_dir + 'annotations/'


In [None]:
# step 2: load 'alldata.csv' into a dict
# `data` maps examid -> {'patient': ..., 'gmfcs': ..., 'sides': <rows seen for that exam>}
data = {}
alldata_file = original_annocation_dir + 'alldata.csv'
with open(alldata_file, encoding='utf-8') as csvf:
    for row in csv.DictReader(csvf):
        exam_id = row['examid']
        entry = data.get(exam_id)
        if entry is None:
            # first row for this exam: create its record
            data[exam_id] = {
                'patient': row['Patient_ID'],
                'gmfcs': row['gmfcs'],
                'sides': 1,
            }
            continue
        # further rows for the same exam only bump the counter; patient and
        # gmfcs must agree with what was recorded for the first row
        entry['sides'] += 1
        assert entry['patient'] == row['Patient_ID']
        assert entry['gmfcs'] == row['gmfcs']
print(len(data))

In [None]:
# step 3: load info of video files, and merge into data dict
video_file = original_annocation_dir + 'video_list.csv'
with open(video_file, encoding='utf-8') as csvf:
    for row in csv.DictReader(csvf):
        video_name = row['Video_File']
        if video_name == '':
            # some exam ids have no associated video
            continue
        exam_id = row['Exam_ID']
        if exam_id not in data:
            # videos without exam information are ignored
            continue
        if 'video' in data[exam_id]:
            # a second video for the same exam is unexpected: report it and keep the first
            print(exam_id, data[exam_id], row)
            continue
        data[exam_id]['video'] = video_name

# drop the exams that ended up without an associated video
for exam_id in [k for k, v in data.items() if 'video' not in v]:
    del data[exam_id]

print(len(data))

In [None]:
# step 4: process raw video
# For every exam in `data`, read the per-frame OpenPose JSON files of its video,
# keep one person per frame, trim blank frames from both ends and save the
# result as <parsed_videos>/<video_id>.npy.

MAX_NUM_OF_FRAMES = 1500   # rows allocated per video
COORDINATE_COUNT = 75      # 25 x 3 for openpose
LEFT_HALF_MAX_X = 320      # a person is kept only if every x coordinate is below this


def _first_left_half_person(people):
    """Return the keypoints of the first person lying entirely in the left half.

    Args:
        people: the 'people' list of one OpenPose frame; each item holds a flat
            'pose_keypoints_2d' list laid out as x, y, confidence triples.

    Returns:
        The flat keypoint list of the first matching person, or None if no
        person in the frame qualifies.
    """
    # note: in the baseline released code, the authors kept the last one instead of the first one.
    for person in people:
        keypoints = person['pose_keypoints_2d']
        if np.max(keypoints[0::3]) < LEFT_HALF_MAX_X:  # every third value is an x coordinate
            return keypoints
    return None


def _trim_blank_frames(frames):
    """Drop the all-NaN rows at the start and at the end of `frames`.

    Blank rows between two filled rows are kept. Returns a copy; if no row
    contains data the result has zero rows.
    """
    filled = np.flatnonzero(~np.isnan(frames).all(axis=1))
    if filled.size == 0:
        return frames[:0].copy()
    return frames[filled[0]:filled[-1] + 1].copy()


valid_entries = {}

parsed_video_dir = dataset_dir + '/parsed_videos/'
os.makedirs(parsed_video_dir, exist_ok=True)

for key, value in tqdm(list(data.items())):
    video_id = value['video']
    videoid_str = '%s-processed' % (video_id)
    dir_path = original_video_dir + '/%s/' % (videoid_str)
    if not os.path.isdir(dir_path):  # there are some videos that do not appear in the video data, skip
        continue
    file_string = parsed_video_dir + '/%s.npy' % (video_id)
    resL = np.full((MAX_NUM_OF_FRAMES, COORDINATE_COUNT), np.nan)
    # Frame files are looked up from index 1, so at most MAX_NUM_OF_FRAMES - 1 frames are read.
    # NOTE(review): OpenPose numbers its JSON output from 0 by default - confirm that
    # skipping index 0 is intended for this dataset.
    for frame in range(1, MAX_NUM_OF_FRAMES):
        test_image_json = original_video_dir + '/%s/%s_%s_keypoints.json' %\
            (videoid_str, videoid_str, str(frame).zfill(12))
        if not os.path.isfile(test_image_json):  # some videos are shorter
            break
        with open(test_image_json) as data_file:  # load the frame data
            frame_data = json.load(data_file)

        # we only keep the left half of the video, due to the dataset format
        keypoints = _first_left_half_person(frame_data['people'])
        if keypoints is not None:
            resL[frame-1, :] = keypoints

    # we remove redundant blanks from both side of the video, instead of only from the end
    res = _trim_blank_frames(resL)
    np.save(file_string, res)  # save the result (possibly zero rows; step 5 skips empty files)
    valid_entries[key] = value  # save the item in another dict

In [None]:
# number of exams whose video folder was found and whose keypoints were saved in step 4
len(valid_entries)

In [None]:
# define a function to convert openpose 25 keypoints to coco 17 keypoints, because we only have pre-trained model on coco
# OPENPOSE_INDEX_FOR_COCO[c] is the OpenPose keypoint index that supplies COCO keypoint c
OPENPOSE_INDEX_FOR_COCO = (
    0,   # 0: nose
    16,  # 1: left eye
    15,  # 2: right eye
    18,  # 3: left ear
    17,  # 4: right ear
    5,   # 5: left shoulder
    2,   # 6: right shoulder
    6,   # 7: left elbow
    3,   # 8: right elbow
    7,   # 9: left wrist
    4,   # 10: right wrist
    12,  # 11: left hip
    9,   # 12: right hip
    13,  # 13: left knee
    10,  # 14: right knee
    14,  # 15: left ankle
    11,  # 16: right ankle
)


def openpose_to_coco(openpose_keypoints):
    """
    Convert OpenPose 25 keypoints to COCO 17 keypoints.

    Args:
        openpose_keypoints (array-like): 25 OpenPose keypoints, one row per keypoint in
                                         [x, y, c] format, where x and y are the keypoint
                                         coordinates and c is the confidence.

    Returns:
        numpy.ndarray of shape (17, 3): the COCO keypoints in [x, y, c] format. The
        confidence is copied through unchanged (it is NOT binarised into a 0/1
        visibility flag). The result is a new array, not a view of the input.
    """
    keypoints = np.asarray(openpose_keypoints)
    # one advanced-indexing step selects and reorders the 17 rows; only the first three columns are kept
    return keypoints[list(OPENPOSE_INDEX_FOR_COCO), :3]

In [None]:
# find valid videos, skip those that are in the skip list

input_folder = dataset_dir + '/parsed_videos/'
# Video ids to exclude, one per line in skip.txt (read relative to the working directory).
# np.atleast_1d: genfromtxt returns a 0-d array when the file holds a single entry.
skip_list = np.atleast_1d(np.genfromtxt('skip.txt', dtype=str))
skipped_ids = set(skip_list.tolist())  # set membership instead of scanning the array per file
# Get a list of all npy files in the input folder (sorted, since os.listdir order is arbitrary)
npy_files = sorted(f for f in os.listdir(input_folder) if f.endswith('.npy'))
valid_video_list = [f for f in npy_files if os.path.splitext(f)[0] not in skipped_ids]
    

In [None]:
# number of parsed videos left after removing the ones named in skip.txt
len(valid_video_list)

In [None]:
# step 5: convert openpose to coco
# Define input and output folders
input_folder = dataset_dir + '/parsed_videos/'
output_folder = dataset_dir + '/videos_coco/'
os.makedirs(output_folder, exist_ok=True)

count = 0
# Loop through each parsed npy file
for npy_file in tqdm(valid_video_list):
    # A dedicated name is used here on purpose: rebinding the global `data`
    # (the exam dict built in steps 2-4) would make those cells impossible to re-run.
    openpose_frames = np.load(os.path.join(input_folder, npy_file))
    if len(openpose_frames) == 0:  # skip empty file
        print('empty file: ' + str(npy_file))
        continue
    # each saved row is 75 values -> (frames, 25 keypoints, [x, y, confidence])
    openpose_frames = openpose_frames.reshape((-1, 25, 3))
    coco_data = np.array([openpose_to_coco(frame) for frame in openpose_frames])
    assert coco_data.shape[1] == 17
    assert coco_data.shape[2] == 3
    # Save the numpy array under the same file name in the output folder
    np.save(os.path.join(output_folder, os.path.splitext(npy_file)[0] + '.npy'), coco_data)
    count += 1
print(count)

In [None]:
WINDOW_SIZE = 124     # frames per clip used by process_file
STEP_FRACTION = 0.1   # stride as a fraction of the window, i.e. 90% overlap


def _window_step(window_size):
    """Stride (in frames) between consecutive windows; never less than one frame."""
    return max(1, int(window_size * STEP_FRACTION))


def evaluate_data(data):
    """Return the fraction of keypoints in `data` whose confidence is >= 0.2.

    Args:
        data: array indexed as [frame, keypoint, channel]; channel 2 is read
            as the confidence.
    """
    confidence = data[:,:,2]
    return np.sum(confidence >= 0.2) / (data.shape[0]*data.shape[1])


def sliding_window_sampling(data, window_size, source_file=None):
    """Enumerate overlapping windows over `data` and score each of them.

    Windows start every `int(window_size * 0.1)` frames (at least 1) and only
    full windows are produced, so an input shorter than `window_size` yields
    no samples.

    Args:
        data: array indexed as [frame, keypoint, channel].
        window_size: number of frames per window.
        source_file: path or file name the samples come from; its base name
            without extension prefixes each sample name. Previously this was
            read from an undeclared global named `file`; when omitted the
            prefix is 'sample'.

    Returns:
        (samples, scores): `samples` is a list of (idx, score, sample_name)
        tuples and `scores` maps idx -> score, where score is the value of
        evaluate_data() on that window. Window idx starts at frame
        idx * step size.
    """
    step_size = _window_step(window_size)
    if source_file is None:
        prefix = 'sample'
    else:
        prefix = os.path.splitext(os.path.basename(source_file))[0]
    samples = []
    scores = {}
    window_starts = range(0, data.shape[0]-window_size+1, step_size)
    for idx, start in enumerate(window_starts):
        score = evaluate_data(data[start:start+window_size,:,:])
        scores[idx] = score
        samples.append((idx, score, f"{prefix}_{idx}.npy"))
    return samples, scores


def process_file(file):
    """Cut one COCO keypoint file into clips and store them with their scores.

    Writes every clip to <dataset_dir>/samples/<name>/<name>_<idx>.npy and a
    pickle {'count': ..., 'scores': {idx: score}} to
    <dataset_dir>/samples/<name>.pkl, where <name> is the base name of `file`
    without extension. Reads the notebook-level `dataset_dir`.

    Args:
        file: path of the .npy file to process.

    Returns:
        (count, scores): the number of clips and the idx -> score mapping.
    """
    data = np.load(file)
    video_name = os.path.splitext(os.path.basename(file))[0]
    samples, scores = sliding_window_sampling(data, WINDOW_SIZE, source_file=file)
    sample_dir = os.path.join(dataset_dir + "/samples/", video_name)
    os.makedirs(sample_dir, exist_ok=True)
    # same stride as used inside sliding_window_sampling, so idx maps back to the right frames
    step_size = _window_step(WINDOW_SIZE)
    for idx, score, sample_name in samples:
        # save each sample
        start = idx * step_size
        np.save(os.path.join(sample_dir, sample_name), data[start:start+WINDOW_SIZE,:,:])
    count = len(scores)
    # save the information of the confidence scores
    save_dict = {'count': count, 'scores': scores}
    save_path = os.path.join(dataset_dir + "/samples/", video_name + ".pkl")
    with open(save_path, 'wb') as f:
        pickle.dump(save_dict, f)
    return count, scores

In [None]:
#step 6: sample the data with sliding window, 90% overlap

import math
file_dir = dataset_dir + "/videos_coco/"
output_dir = dataset_dir + "/samples/"
os.makedirs(output_dir, exist_ok=True)

total = 0
# The loop variable keeps the name `file` deliberately: helper code in this
# notebook has read a global of that name, so renaming it could break sampling.
for file in tqdm(valid_video_list):
    if not file.endswith(".npy"):
        continue
    try:
        count, scores = process_file(os.path.join(file_dir, file))
    except Exception as exc:
        # Expected for videos that produced no COCO file in step 5 (empty input).
        # The reason is printed so real errors are not silently hidden;
        # KeyboardInterrupt is no longer swallowed.
        print('skipping: ', file, '-', repr(exc))
        continue
    total += 1
print(total)

In [None]:
# step 7: re-organize the data according to gmfcs score

# gmfcs_dict: gmfcs -> {patient -> {video -> [clip1, clip2, ...]}}
gmfcs_dict = {}
# clip_dict: video -> [clip1, clip2, ...]
clip_dict = {}
#set a keypoint availability threshold of 0.8
threshold = 0.8
count = 0
for key, item in tqdm(valid_entries.items()):
    # every gmfcs level seen gets an entry, even if none of its clips pass the threshold
    patients_of_level = gmfcs_dict.setdefault(item['gmfcs'], {})
    score_path = dataset_dir + '/samples/' + str(item['video']) + '.pkl'
    try:
        # NOTE: pickle.load can execute arbitrary code; only load score files
        # written by this notebook (step 6).
        with open(score_path, 'rb') as f:
            scores = pickle.load(f)
    except (OSError, EOFError, pickle.UnpicklingError):
        # no (readable) score file, e.g. the video was skipped in an earlier step
        print('skipped: ', key)
        continue
    count += 1
    # we add the clips with score higher than threshold
    kept_clips = [i for i in range(scores['count']) if scores['scores'][i] >= threshold]
    if not kept_clips:
        # patients / videos without any usable clip are not registered at all
        continue
    patient_videos = patients_of_level.setdefault(item['patient'], {})
    patient_videos.setdefault(item['video'], []).extend(kept_clips)
    clip_dict.setdefault(item['video'], []).extend(kept_clips)

print(count)


In [None]:
# inspect the nested mapping gmfcs -> patient -> video -> [clip indices]
# (displays the whole dict, so the output can be large)
gmfcs_dict

In [None]:
# inspect the mapping video -> [indices of the clips that passed the threshold]
# (displays the whole dict, so the output can be large)
clip_dict

In [None]:
# step 8: randomly split the dataset using stratified sampling according to patients
# and make sure that the training, validation and testing video count is approximately 7:1:2

import random

# define the percentage of each set
split_ratio = {'train': 0.7, 'val': 0.1, 'test': 0.2}
SPLIT_NAMES = ('train', 'val', 'test')
RATIO_TOLERANCE = 0.01  # allowed deviation of each video share from split_ratio
SPLIT_SEED = 0          # fixed so a re-run reproduces the same split; set to None for a fresh random split

random.seed(SPLIT_SEED)


def _partition_patients(patient_ids, patient_videos, n_train, n_val):
    """Cut the (already shuffled) patient list into train/val/test.

    Args:
        patient_ids: list of patient ids in the order to cut.
        patient_videos: mapping patient -> {video -> clips} for one gmfcs level.
        n_train, n_val: number of patients for train and val; the rest is test.

    Returns:
        (patients, videos): two dicts keyed by split name holding the patient
        ids and the video ids of those patients.
    """
    patients = {
        'train': patient_ids[:n_train],
        'val': patient_ids[n_train:n_train + n_val],
        'test': patient_ids[n_train + n_val:],
    }
    videos = {
        name: [video for patient in members for video in patient_videos[patient]]
        for name, members in patients.items()
    }
    return patients, videos


def _split_is_acceptable(video_counts):
    """Decide whether a candidate split is kept.

    A split is kept if every video share is within RATIO_TOLERANCE of
    split_ratio, or - since the exact ratio cannot always be reached - if val
    and test differ by at most one video and test has at least five.
    """
    total = sum(video_counts.values())
    within_tolerance = all(
        abs(video_counts[name] / total - split_ratio[name]) <= RATIO_TOLERANCE
        for name in SPLIT_NAMES
    )
    if within_tolerance:
        return True
    return abs(video_counts['val'] - video_counts['test']) <= 1 and video_counts['test'] >= 5


# stratified sampling for each gmfcs score
sampled_dict = {}
for key, values in gmfcs_dict.items():
    # gmfcs label '6' is excluded from the split.
    # NOTE(review): the original remark here read "we don't really have gmfcs == 5
    # in the dataset, so we ignore all gmfcs == 6" - confirm which labels are valid.
    if key == '6':
        continue
    # get all patient ids
    patient_ids = list(values.keys())
    print('key: ', key)
    print(len(patient_ids))
    if not patient_ids:
        # a level whose clips all fell below the threshold has nothing to split
        # (this used to end in a ZeroDivisionError)
        print('no patients with usable clips, skipping key: ', key)
        continue

    # calculate how many patients each set (test takes the remainder)
    n_samples = len(patient_ids)
    n_train = int(n_samples * split_ratio['train'])
    n_val = int(n_samples * split_ratio['val'])

    # Since we sample patient ids, we may need to keep sampling till we get the desired split of videos.
    # NOTE: as before, this loops until a split is accepted.
    while True:
        random.shuffle(patient_ids)
        patients, videos = _partition_patients(patient_ids, values, n_train, n_val)
        video_counts = {name: len(videos[name]) for name in SPLIT_NAMES}
        clip_counts = {
            name: sum(len(clip_dict[video]) for video in videos[name])
            for name in SPLIT_NAMES
        }
        print('video: ', *video_counts.values(), sum(video_counts.values()))
        print('clip: ', *clip_counts.values(), sum(clip_counts.values()))
        if _split_is_acceptable(video_counts):
            break

    # we organize the results into a dict
    sampled_dict[key] = {
        name: {
            'patients': patients[name],
            'videos': videos[name],
            'mapping': {p: gmfcs_dict[key][p] for p in patients[name]},
        }
        for name in SPLIT_NAMES
    }

# Get the number of sampled videos and patients for each GMFCS score
for key, values in sampled_dict.items():
    train_video_count = len(values['train']['videos'])
    val_video_count = len(values['val']['videos'])
    test_video_count = len(values['test']['videos'])
    total_video_count = train_video_count + val_video_count + test_video_count

    train_patient_count = len(values['train']['patients'])
    val_patient_count = len(values['val']['patients'])
    test_patient_count = len(values['test']['patients'])
    total_patient_count = train_patient_count + val_patient_count + test_patient_count

    print(f"Key '{key}':")
    print(f"Train set video count: {train_video_count}")
    print(f"Validation set video count: {val_video_count}")
    print(f"Test set video count: {test_video_count}")
    print(f"Total video count: {total_video_count}")
    print(f"Train set patient count: {train_patient_count}")
    print(f"Validation set patient count: {val_patient_count}")
    print(f"Test set patient count: {test_patient_count}")
    print(f"Total patient count: {total_patient_count}")

In [None]:
# number of training videos for gmfcs level '0' (raises KeyError if sampled_dict has no '0' entry)
len(sampled_dict['0']['train']['videos'])

In [None]:
# step 9: organize the sampling result

def _labelled_rows(split_name, verify_clips=False):
    """Flatten one split of `sampled_dict` into (gmfcs key, row) pairs.

    Each row is [label, patient, video, clip] converted to int. With
    `verify_clips` every clip is first asserted to be listed in `clip_dict`
    for its video.
    """
    pairs = []
    for label, splits in sampled_dict.items():
        for patient, videos in splits[split_name]['mapping'].items():
            for video_id, clips in videos.items():
                for clip in clips:
                    if verify_clips:
                        assert clip in clip_dict[video_id]
                    pairs.append((label, [int(label), int(patient), int(video_id), int(clip)]))
    return pairs


def _rows_where(pairs, keep):
    """Fresh copies of the rows whose gmfcs key satisfies `keep`."""
    return [list(row) for label, row in pairs if keep(label)]


# label, patient, video, clip
_train_pairs = _labelled_rows('train', verify_clips=True)
_val_pairs = _labelled_rows('val')
_test_pairs = _labelled_rows('test')

train_dataset = _rows_where(_train_pairs, lambda label: True)
val_dataset = _rows_where(_val_pairs, lambda label: True)
test_dataset = _rows_where(_test_pairs, lambda label: True)

# the "14" variants leave out gmfcs level '0'
train_dataset14 = _rows_where(_train_pairs, lambda label: label != '0')
val_dataset14 = _rows_where(_val_pairs, lambda label: label != '0')
test_dataset14 = _rows_where(_test_pairs, lambda label: label != '0')

# we also save the different gmfcs level into separate files, in case we need such info for training
train_0_dataset = _rows_where(_train_pairs, lambda label: label == '0')
train_1_dataset = _rows_where(_train_pairs, lambda label: label == '1')
train_2_dataset = _rows_where(_train_pairs, lambda label: label == '2')
train_3_dataset = _rows_where(_train_pairs, lambda label: label == '3')
train_4_dataset = _rows_where(_train_pairs, lambda label: label == '4')
            

In [None]:
# step 10: save the sampling result in npy file

def _as_row_array(rows):
    """Turn a list of [label, patient, video, clip] rows into an (N, 4) integer array.

    An empty list becomes an array of shape (0, 4) rather than a float array
    of shape (0,), so every saved file has the same layout.
    """
    return np.array(rows, dtype=int).reshape(-1, 4)


train_dataset = _as_row_array(train_dataset)
train_dataset14 = _as_row_array(train_dataset14)
train_dataset_0 = _as_row_array(train_0_dataset)
train_dataset_1 = _as_row_array(train_1_dataset)
train_dataset_2 = _as_row_array(train_2_dataset)
train_dataset_3 = _as_row_array(train_3_dataset)
train_dataset_4 = _as_row_array(train_4_dataset)
val_dataset = _as_row_array(val_dataset)
test_dataset = _as_row_array(test_dataset)
val_dataset14 = _as_row_array(val_dataset14)
test_dataset14 = _as_row_array(test_dataset14)

# file stem -> array; the converted arrays are what gets written
# (previously the per-level arrays were built but the raw lists were saved)
arrays_to_save = {
    'train_dataset': train_dataset,
    'train_dataset14': train_dataset14,
    'train_dataset_0': train_dataset_0,
    'train_dataset_1': train_dataset_1,
    'train_dataset_2': train_dataset_2,
    'train_dataset_3': train_dataset_3,
    'train_dataset_4': train_dataset_4,
    'val_dataset': val_dataset,
    'test_dataset': test_dataset,
    'val_dataset14': val_dataset14,
    'test_dataset14': test_dataset14,
}
for stem, rows in arrays_to_save.items():
    np.save(dataset_dir + '/' + stem + '.npy', rows)