# Clean Errors in dataset
Unfortunately, the data seems to be missing some frames in some sequences.  
In this script we check that the input and label lengths are consistent for all data points.
We can then manually or automatically correct/delete the erroneous data points.

In [None]:
import os
from glob import glob
from os import path

import numpy as np
from tqdm.notebook import tqdm

In [86]:
data_folder = "/data/7nouri/final-dataset/"

In [87]:
def get_configurations(data_folder):
    """
    List the configuration folders of the dataset.

    A configuration is any direct subdirectory of ``data_folder`` whose
    name does not start with a dot. The full joined paths are returned in
    the order produced by ``os.listdir`` (no sorting is applied here).
    """
    return [
        path.join(data_folder, entry)
        for entry in os.listdir(data_folder)
        if not entry.startswith(".")
        and path.isdir(path.join(data_folder, entry))
    ]

def get_variations(conf):
    """Return the full paths of all direct subdirectories of a configuration folder."""
    # Build every candidate path first, then keep only the directories.
    candidates = (path.join(conf, name) for name in os.listdir(conf))
    return [candidate for candidate in candidates if os.path.isdir(candidate)]

def get_sequences(variation):
    """
    Collect the sequence entries of a variation folder.

    Every entry of ``variation`` whose name contains the substring
    "sequence" is returned as a full joined path, in ``os.listdir`` order.
    """
    return [
        path.join(variation, entry)
        for entry in os.listdir(variation)
        if "sequence" in entry
    ]

In [118]:
def verify_sequence(seq):
    """
    Check that a sequence folder holds at least as many files as action.info states.

    The expected count is parsed from the first line of ``<seq>/action.info``:
    the text between the first ``[`` and the following ``,`` is read as an
    integer. That number is compared against the number of ``*.png``,
    ``frame*.txt`` and ``joints*.npy`` files found directly inside ``seq``.

    Args:
        seq: Path of the sequence folder.

    Returns:
        None when every check passes.

    Raises:
        AssertionError: If fewer files of any of the three kinds are found
            than action.info states. The message names the folder and counts.
        OSError: If ``action.info`` cannot be opened.
        IndexError, ValueError: If the first line of ``action.info`` does not
            contain an integer between ``[`` and ``,``.
    """
    # Read only the first line and close the file right away.
    with open(path.join(seq, "action.info")) as info_file:
        first_line = info_file.readline()
    action_info = int(first_line.split("[")[1].split(",")[0])

    n_frames = len(glob(path.join(seq, "*.png")))
    n_frame_desc = len(glob(path.join(seq, "frame*.txt")))
    n_joints = len(glob(path.join(seq, "joints*.npy")))
    # NOTE: label.npy is not inspected by this check.

    # Raise explicitly instead of using `assert`, so the checks still run
    # when Python is started with -O. The exception type stays AssertionError.
    if action_info > n_frames:
        raise AssertionError(
            F"Error at {seq}: action_info states {action_info}, but {n_frames} frames found"
        )
    if action_info > n_frame_desc:
        raise AssertionError(
            F"Error at {seq}: action_info states {action_info}, but {n_frame_desc} frames descirptions"
        )
    if action_info > n_joints:
        raise AssertionError(
            F"Error at {seq}: action_info states {action_info}, but {n_joints} joints found"
        )

In [119]:
# Quick spot check: verify only the sequences of the first listed variation
# of the first listed configuration. The lists are not sorted here, so
# "first" is whatever order os.listdir returned.
configurations = get_configurations(data_folder)
variations = get_variations(configurations[0])
sequences = get_sequences(variations[0])
for seq in sequences:
    # No try/except here: the first failing sequence raises and stops the loop.
    verify_sequence(seq)

In [138]:
def verify_all():
    """
    Run verify_sequence on every sequence below the module-level data_folder.

    Configurations, variations and sequences are each visited in sorted
    order, with one progress bar per level. Any exception raised while
    verifying a sequence is printed and the walk continues with the next
    sequence. Nothing is returned.
    """

    def progress(items, label, bar_colour):
        # All three levels share the same bar settings apart from label and colour.
        return tqdm(items, desc=label, colour=bar_colour, leave=False, position=0)

    for config in progress(sorted(get_configurations(data_folder)), "configurations", 'green'):
        for var in progress(sorted(get_variations(config)), "variations", 'red'):
            for seq in progress(sorted(get_sequences(var)), "sequences", 'blue'):
                try:
                    verify_sequence(seq)
                except Exception as err:
                    # Report the problem and keep going so one bad sequence
                    # does not abort the whole scan.
                    print(err)
In [139]:
verify_all()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for config in tqdm(configurations, desc="configurations", colour='green', leave=False, position=0):


configurations:   0%|          | 0/14 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for var in tqdm(variations, desc="variations", colour='red', leave=False,position=0):


variations:   0%|          | 0/2 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for seq in tqdm(sequences, desc="sequences", colour='blue', leave=False,position=0):


sequences:   0%|          | 0/5000 [00:00<?, ?it/s]

sequences:   0%|          | 0/2500 [00:00<?, ?it/s]

variations:   0%|          | 0/2 [00:00<?, ?it/s]

sequences:   0%|          | 0/5000 [00:00<?, ?it/s]

sequences:   0%|          | 0/2500 [00:00<?, ?it/s]

variations:   0%|          | 0/2 [00:00<?, ?it/s]

sequences:   0%|          | 0/5000 [00:00<?, ?it/s]

sequences:   0%|          | 0/2500 [00:00<?, ?it/s]

variations:   0%|          | 0/2 [00:00<?, ?it/s]

sequences:   0%|          | 0/5000 [00:00<?, ?it/s]

sequences:   0%|          | 0/2500 [00:00<?, ?it/s]

variations:   0%|          | 0/2 [00:00<?, ?it/s]

sequences:   0%|          | 0/5000 [00:00<?, ?it/s]

sequences:   0%|          | 0/2500 [00:00<?, ?it/s]

variations:   0%|          | 0/2 [00:00<?, ?it/s]

sequences:   0%|          | 0/5000 [00:00<?, ?it/s]

sequences:   0%|          | 0/2500 [00:00<?, ?it/s]

variations:   0%|          | 0/2 [00:00<?, ?it/s]

sequences:   0%|          | 0/5000 [00:00<?, ?it/s]

sequences:   0%|          | 0/2500 [00:00<?, ?it/s]

variations:   0%|          | 0/2 [00:00<?, ?it/s]

sequences:   0%|          | 0/5000 [00:00<?, ?it/s]

sequences:   0%|          | 0/2500 [00:00<?, ?it/s]

variations:   0%|          | 0/2 [00:00<?, ?it/s]

sequences:   0%|          | 0/5000 [00:00<?, ?it/s]

sequences:   0%|          | 0/2500 [00:00<?, ?it/s]

variations:   0%|          | 0/2 [00:00<?, ?it/s]

sequences:   0%|          | 0/5000 [00:00<?, ?it/s]

sequences:   0%|          | 0/2500 [00:00<?, ?it/s]

variations:   0%|          | 0/2 [00:00<?, ?it/s]

sequences:   0%|          | 0/5000 [00:00<?, ?it/s]

sequences:   0%|          | 0/2500 [00:00<?, ?it/s]

variations:   0%|          | 0/2 [00:00<?, ?it/s]

sequences:   0%|          | 0/5000 [00:00<?, ?it/s]

sequences:   0%|          | 0/2500 [00:00<?, ?it/s]

variations:   0%|          | 0/6 [00:00<?, ?it/s]

sequences:   0%|          | 0/2000 [00:00<?, ?it/s]

sequences:   0%|          | 0/2000 [00:00<?, ?it/s]

sequences:   0%|          | 0/2000 [00:00<?, ?it/s]

sequences:   0%|          | 0/2000 [00:00<?, ?it/s]

sequences:   0%|          | 0/2000 [00:00<?, ?it/s]

sequences:   0%|          | 0/2000 [00:00<?, ?it/s]

variations:   0%|          | 0/6 [00:00<?, ?it/s]

sequences:   0%|          | 0/2000 [00:00<?, ?it/s]

Error at /data/7nouri/final-dataset/generalization-test/V1-generalization-test/sequence_0000: action_info states 22, but 6 frames found
Error at /data/7nouri/final-dataset/generalization-test/V1-generalization-test/sequence_0003: action_info states 27, but 22 frames found


sequences:   0%|          | 0/2000 [00:00<?, ?it/s]

sequences:   0%|          | 0/2000 [00:00<?, ?it/s]

sequences:   0%|          | 0/2000 [00:00<?, ?it/s]

sequences:   0%|          | 0/2000 [00:00<?, ?it/s]

sequences:   0%|          | 0/2000 [00:00<?, ?it/s]