In [1]:
# This notebook splits the data stored in a directory into train, validation, and test sets

In [151]:
import os
from pathlib import Path
import pathlib
import cv2
import pandas as pd
import numpy as np
import shutil

In [157]:
# Root folder that receives the train/validation/test copies.
TARGET_DIR = Path("D:/__School/__Masters/____2021fALL/5280_aiwearables/final_gesture_frame_splits")

# Root folder of the source data: one sub-folder per gesture, each holding one folder per video.
MAIN_DATA_DIR = Path("D:/__School/__Masters/____2021fALL/5280_aiwearables/combined_1st_2nd_3rd_iter_frames")

# Names of the split folders, stored as relative paths.
DIRS = [Path(split_name) for split_name in ("train", "validation", "test")]
GESTURES = ["CLOCKWISE", "COUNTERCLOCKWISE", "DOWN", "UP", "LEFT", "RIGHT"]

# Integer label of each gesture: its position in GESTURES.
LABEL_DICT = {gesture_name: label_id for label_id, gesture_name in enumerate(GESTURES)}

In [40]:
def make_train_val_test_folders(target_dir, dirs):
    """Create one sub-directory of ``target_dir`` for every entry in ``dirs``.

    Args:
        target_dir: Root directory, given as ``str`` or ``pathlib.Path``.
            Missing parents are created.
        dirs: Iterable of folder names (``str`` or ``Path``), e.g. train,
            validation, test.

    Folders that already exist are left as they are. Returns ``None``.
    """
    # Coerce once so a plain-string root works as well as a Path object.
    root = Path(target_dir)
    for dir_name in dirs:
        root.joinpath(dir_name).mkdir(parents=True, exist_ok=True)

In [42]:
# make_train_val_test_folders(TARGET_DIR, DIRS)

In [53]:
def make_gesture_subfolders(target_directory, gestures, dirs):
    """Create ``target_directory/<dir>/<gesture>`` for every dir/gesture pair.

    Args:
        target_directory: Root directory, given as ``str`` or ``pathlib.Path``.
        gestures: Iterable of gesture folder names.
        dirs: Iterable of split folder names (``str`` or ``Path``).

    Missing parents are created and existing folders are left as they are.
    Returns ``None``.
    """
    # Coerce once so a plain-string root works as well as a Path object.
    root = Path(target_directory)
    for dir_name in dirs:
        split_dir_path = root.joinpath(dir_name)
        for gesture_name in gestures:
            split_dir_path.joinpath(gesture_name).mkdir(parents=True, exist_ok=True)

In [52]:
# make_gesture_subfolders(TARGET_DIR, GESTURES, DIRS)

In [56]:
def get_video_directories(main_directory):
    """Return the paths of all directories nested below the class folders.

    ``main_directory`` is expected to contain one folder per class, each of
    which contains one folder per video. The class folders themselves are not
    returned; every directory found beneath them (at any depth) is.

    Args:
        main_directory: Root directory to scan (``str`` or ``pathlib.Path``).

    Returns:
        list[str]: Directory paths in ``os.walk`` order. Empty when
        ``main_directory`` does not exist or has no nested directories.
    """
    walker = os.walk(main_directory)
    # The first walk entry lists the direct children of main_directory, i.e.
    # the class folders. Skipping that entry (rather than slicing off a fixed
    # number of results) keeps this correct for any number of classes; the
    # walk still descends into those folders afterwards.
    next(walker, None)

    video_fnames = []
    for path, subdirs, _files in walker:
        video_fnames.extend(str(Path(path, name)) for name in subdirs)
    return video_fnames

In [57]:
# Collect the video directory paths found under MAIN_DATA_DIR and display the list.
all_videos_dir_names = get_video_directories(MAIN_DATA_DIR)
all_videos_dir_names

['D:\\__School\\__Masters\\____2021fALL\\5280_aiwearables\\combined_1st_2nd_3rd_iter_frames\\CLOCKWISE\\AW_CLOCKWISE_00001',
 'D:\\__School\\__Masters\\____2021fALL\\5280_aiwearables\\combined_1st_2nd_3rd_iter_frames\\CLOCKWISE\\AW_CLOCKWISE_00002',
 'D:\\__School\\__Masters\\____2021fALL\\5280_aiwearables\\combined_1st_2nd_3rd_iter_frames\\CLOCKWISE\\AW_CLOCKWISE_00003',
 'D:\\__School\\__Masters\\____2021fALL\\5280_aiwearables\\combined_1st_2nd_3rd_iter_frames\\CLOCKWISE\\AW_CLOCKWISE_00004',
 'D:\\__School\\__Masters\\____2021fALL\\5280_aiwearables\\combined_1st_2nd_3rd_iter_frames\\CLOCKWISE\\AW_CLOCKWISE_00005',
 'D:\\__School\\__Masters\\____2021fALL\\5280_aiwearables\\combined_1st_2nd_3rd_iter_frames\\CLOCKWISE\\AW_CLOCKWISE_00006',
 'D:\\__School\\__Masters\\____2021fALL\\5280_aiwearables\\combined_1st_2nd_3rd_iter_frames\\CLOCKWISE\\AW_CLOCKWISE_00007',
 'D:\\__School\\__Masters\\____2021fALL\\5280_aiwearables\\combined_1st_2nd_3rd_iter_frames\\CLOCKWISE\\AW_CLOCKWISE_00008',


In [58]:
def get_video_metadata_df(list_of_video_directories):
    """Build a metadata table with one row per video directory.

    Each path is expected to end in ``<gesture>/<video_dir_name>``, for
    example ``.../CLOCKWISE/AW_CLOCKWISE_00002``.

    Args:
        list_of_video_directories: Iterable of directory paths (``str`` or
            ``pathlib.Path``). Every directory must exist, because its
            entries are listed to obtain the frame count.

    Returns:
        pandas.DataFrame with the columns
            full_path: the directory as a ``pathlib.Path``
            gesture: name of the parent folder
            participant_id: leading part of the directory name (see below)
            num_frames: number of entries directly inside the directory
    """
    columns = ["full_path", "gesture", "participant_id", "num_frames"]
    records = []

    for vid_name in list_of_video_directories:
        # Path components work for both "\\" and "/" separated paths, unlike
        # splitting the string on a literal backslash.
        video_path = Path(vid_name)
        gesture_class = video_path.parent.name

        # Directory names with exactly four "_"-separated fields carry a
        # two-part participant id (e.g. Yen_P_UP_00001 -> Yen_P); every other
        # name uses only its first field (e.g. AW_CLOCKWISE_00002 -> AW).
        name_fields = video_path.name.split("_")
        if len(name_fields) == 4:
            participant_id = "_".join(name_fields[:2])
        else:
            participant_id = name_fields[0]

        records.append({
            "full_path": video_path,
            "gesture": gesture_class,
            "participant_id": participant_id,
            "num_frames": len(os.listdir(video_path)),
        })

    # Passing the columns explicitly keeps them present for an empty input.
    return pd.DataFrame(records, columns=columns)

In [59]:
# Build the per-video metadata table (path, gesture, participant, frame count) and display it.
df = get_video_metadata_df(all_videos_dir_names)
df

Unnamed: 0,full_path,gesture,participant_id,num_frames
0,D:\__School\__Masters\____2021fALL\5280_aiwear...,CLOCKWISE,AW,46
1,D:\__School\__Masters\____2021fALL\5280_aiwear...,CLOCKWISE,AW,44
2,D:\__School\__Masters\____2021fALL\5280_aiwear...,CLOCKWISE,AW,38
3,D:\__School\__Masters\____2021fALL\5280_aiwear...,CLOCKWISE,AW,38
4,D:\__School\__Masters\____2021fALL\5280_aiwear...,CLOCKWISE,AW,39
...,...,...,...,...
910,D:\__School\__Masters\____2021fALL\5280_aiwear...,UP,Yen_P,47
911,D:\__School\__Masters\____2021fALL\5280_aiwear...,UP,Yen_P,57
912,D:\__School\__Masters\____2021fALL\5280_aiwear...,UP,Yen_P,52
913,D:\__School\__Masters\____2021fALL\5280_aiwear...,UP,Yen_P,47


In [61]:
df.participant_id.value_counts()

MCM           72
AW            60
SP            60
JBG           60
KP            60
SNB           51
cooper_s      50
Will_B        50
Miguel_Q      50
Madeline_U    50
Lori_L        50
Ian_z         50
emmanuel_z    50
Daniel_M      50
Yen_P         50
IA            42
RAM           30
IM            12
RCS           12
KH             6
Name: participant_id, dtype: int64

In [62]:
df.gesture.value_counts()

RIGHT               169
UP                  169
DOWN                168
LEFT                168
COUNTERCLOCKWISE    123
CLOCKWISE           118
Name: gesture, dtype: int64

In [88]:
from sklearn.model_selection import train_test_split

# Exploratory gesture-stratified split: 15% of all videos are held out as the
# test set, then 15% of the remaining videos as the validation set. The fixed
# random_state makes the split, and the value counts inspected in the next
# cells, reproducible after a kernel restart.
full_train, test = train_test_split(df, stratify=df.gesture, test_size=0.15, random_state=1)
train, validation = train_test_split(full_train, stratify=full_train.gesture, test_size=0.15, random_state=1)

In [89]:
train.participant_id.value_counts()

MCM           59
KP            47
SP            44
JBG           42
cooper_s      39
Will_B        39
Ian_z         38
Miguel_Q      38
Madeline_U    37
emmanuel_z    36
Yen_P         36
Lori_L        35
Daniel_M      35
AW            35
SNB           34
IA            29
RAM           18
RCS            7
IM             7
KH             5
Name: participant_id, dtype: int64

In [91]:
train.gesture.value_counts()

LEFT                122
RIGHT               122
UP                  122
DOWN                121
COUNTERCLOCKWISE     88
CLOCKWISE            85
Name: gesture, dtype: int64

In [92]:
test.gesture.value_counts()

UP                  26
RIGHT               25
DOWN                25
LEFT                25
COUNTERCLOCKWISE    19
CLOCKWISE           18
Name: gesture, dtype: int64

In [122]:
from sklearn.model_selection import train_test_split

def append_train_validation_test_split(df, train, validation, test, stratify_subject=False, stratify_gesture=False, random_state=1):
    """Split ``df`` into train/validation/test and flag every row accordingly.

    The split is done in two steps: ``test`` is the fraction of all rows held
    out as the test set, then ``validation`` is the fraction of the REMAINING
    rows held out as the validation set.

    NOTE(review): because ``validation`` applies to the remainder, calling
    with (.7, .15, .15) does not yield a 70/15/15 split of the whole frame.
    Confirm whether that is intended.

    Args:
        df: Frame with ``full_path``, ``gesture`` and ``participant_id``
            columns. It is not modified.
        train: Unused; the training set is whatever remains after the other
            two splits. Kept for interface compatibility.
        validation: Fraction of the non-test rows used for validation.
        test: Fraction of all rows used for testing.
        stratify_subject: Stratify both steps by ``participant_id``. Takes
            precedence over ``stratify_gesture``.
        stratify_gesture: Stratify both steps by ``gesture``. This is also
            what happens when neither flag is set.
        random_state: Seed forwarded to both ``train_test_split`` calls.

    Returns:
        tuple: ``(dff, train_df, validation_df, test_df)`` where ``dff`` is a
        copy of ``df`` with boolean ``train``/``validation``/``test`` columns
        and the other three are the row subsets of ``df``.
    """
    # Gesture stratification is both the explicit option and the fallback.
    stratify_column = "participant_id" if stratify_subject else "gesture"

    full_train_df, test_df = train_test_split(
        df,
        stratify=df[stratify_column],
        test_size=test,
        random_state=random_state,
    )
    train_df, validation_df = train_test_split(
        full_train_df,
        stratify=full_train_df[stratify_column],
        test_size=validation,
        random_state=random_state,
    )

    # Flag the rows on the copy that is returned, not on the caller's frame.
    dff = df.copy()
    dff["train"] = dff.full_path.isin(train_df.full_path)
    dff["validation"] = dff.full_path.isin(validation_df.full_path)
    dff["test"] = dff.full_path.isin(test_df.full_path)

    return dff, train_df, validation_df, test_df

In [123]:
# Gesture-stratified split: test_size=.15 of all rows, then .15 of the remaining rows for validation.
dff, df_train, df_val, df_test = append_train_validation_test_split(df, .7, .15, .15, stratify_subject=False, stratify_gesture=True, random_state = 1)

In [124]:
dff

Unnamed: 0,full_path,gesture,participant_id,num_frames,train,validation,test
0,D:\__School\__Masters\____2021fALL\5280_aiwear...,CLOCKWISE,AW,46,False,False,True
1,D:\__School\__Masters\____2021fALL\5280_aiwear...,CLOCKWISE,AW,44,True,False,False
2,D:\__School\__Masters\____2021fALL\5280_aiwear...,CLOCKWISE,AW,38,True,False,False
3,D:\__School\__Masters\____2021fALL\5280_aiwear...,CLOCKWISE,AW,38,True,False,False
4,D:\__School\__Masters\____2021fALL\5280_aiwear...,CLOCKWISE,AW,39,True,False,False
...,...,...,...,...,...,...,...
910,D:\__School\__Masters\____2021fALL\5280_aiwear...,UP,Yen_P,47,True,False,False
911,D:\__School\__Masters\____2021fALL\5280_aiwear...,UP,Yen_P,57,True,False,False
912,D:\__School\__Masters\____2021fALL\5280_aiwear...,UP,Yen_P,52,True,False,False
913,D:\__School\__Masters\____2021fALL\5280_aiwear...,UP,Yen_P,47,True,False,False


In [125]:
def get_stats(df):
    """Print the per-participant and per-gesture row counts of ``df``, then a blank line."""
    for column_name in ("participant_id", "gesture"):
        print(df[column_name].value_counts())
    print()

In [113]:
# Print participant and gesture counts for the full frame and for each split.
for frame in (df, df_train, df_val, df_test):
    get_stats(frame)

MCM           72
AW            60
SP            60
JBG           60
KP            60
SNB           51
cooper_s      50
Will_B        50
Miguel_Q      50
Madeline_U    50
Lori_L        50
Ian_z         50
emmanuel_z    50
Daniel_M      50
Yen_P         50
IA            42
RAM           30
IM            12
RCS           12
KH             6
Name: participant_id, dtype: int64
RIGHT               169
UP                  169
DOWN                168
LEFT                168
COUNTERCLOCKWISE    123
CLOCKWISE           118
Name: gesture, dtype: int64

MCM           52
KP            47
JBG           44
Yen_P         41
SP            41
emmanuel_z    39
Ian_z         38
Miguel_Q      37
Will_B        37
AW            37
SNB           37
Madeline_U    35
cooper_s      35
Daniel_M      33
Lori_L        33
IA            27
RAM           26
IM            11
RCS            5
KH             5
Name: participant_id, dtype: int64
LEFT                122
UP                  122
RIGHT               122
DOWN 

In [114]:
df_train

Unnamed: 0,full_path,gesture,participant_id,num_frames
11,D:\__School\__Masters\____2021fALL\5280_aiwear...,CLOCKWISE,cooper_s,33
573,D:\__School\__Masters\____2021fALL\5280_aiwear...,LEFT,Yen_P,69
795,D:\__School\__Masters\____2021fALL\5280_aiwear...,UP,Ian_z,94
59,D:\__School\__Masters\____2021fALL\5280_aiwear...,CLOCKWISE,KP,51
276,D:\__School\__Masters\____2021fALL\5280_aiwear...,DOWN,emmanuel_z,75
...,...,...,...,...
760,D:\__School\__Masters\____2021fALL\5280_aiwear...,UP,cooper_s,27
537,D:\__School\__Masters\____2021fALL\5280_aiwear...,LEFT,SNB,81
364,D:\__School\__Masters\____2021fALL\5280_aiwear...,DOWN,RAM,89
422,D:\__School\__Masters\____2021fALL\5280_aiwear...,LEFT,cooper_s,34


In [100]:
len(df)

915

In [99]:
len(df_train)

660

In [101]:
len(df_val)

117

In [102]:
len(df_test)

138

In [128]:
dff.full_path[0]

WindowsPath('D:/__School/__Masters/____2021fALL/5280_aiwearables/combined_1st_2nd_3rd_iter_frames/CLOCKWISE/AW_CLOCKWISE_00001')

In [155]:
def copy_train_validation_test(df, main_dir, target_dir):
    """Copy every video directory into its train, validation or test folder.

    A row whose ``full_path`` ends in ``<gesture>/<video_dir>`` is copied to
    ``target_dir/<split>/<gesture>/<video_dir>``, where ``<split>`` is the
    first of ``train``, ``validation``, ``test`` that is truthy for the row.

    Args:
        df: Frame with a ``full_path`` column plus boolean ``train``,
            ``validation`` and ``test`` columns.
        main_dir: Unused; the gesture and video names are taken from the last
            two components of ``full_path``. Kept for interface compatibility.
        target_dir: Root of the split folders (``str`` or ``pathlib.Path``).

    Raises:
        ValueError: If a row is not assigned to any split.
        FileExistsError: If a destination directory already exists
            (``dirs_exist_ok=False``).

    Returns ``None``.
    """
    target_root = Path(target_dir)

    for row in df.itertuples(index=False):
        if row.train:
            split_name = "train"
        elif row.validation:
            split_name = "validation"
        elif row.test:
            split_name = "test"
        else:
            # Without a split there is no destination; previously such a row
            # was copied onto its own source path.
            raise ValueError(f"{row.full_path} is not assigned to train, validation or test")

        # Path components keep this independent of the path separator.
        source_dir = Path(row.full_path)
        destination_dir = target_root / split_name / source_dir.parent.name / source_dir.name

        # shutil.copy is referenced through the module: a bare `copy` is not
        # defined by the notebook's imports.
        shutil.copytree(
            source_dir,
            destination_dir,
            symlinks=False,
            ignore=None,
            copy_function=shutil.copy,
            ignore_dangling_symlinks=False,
            dirs_exist_ok=False,
        )

In [156]:
# Copy every video folder listed in dff into its split folder on disk.
copy_train_validation_test(dff, MAIN_DATA_DIR, TARGET_DIR)

In [None]:
def generate_annotation_file(input_dir, label_dict, output_dir):
    """Append one annotation line per video directory found under ``input_dir``.

    ``input_dir`` is expected to contain one folder per class, each holding
    one folder of frames per video. Every line written has the form
    ``<class_name><os.sep><video_dir_name> <start_frame> <end_frame> <label_id>``
    where ``start_frame`` is always 1 and ``end_frame`` is the number of files
    found in the video directory.

    Args:
        input_dir: Root directory to scan (``str`` or ``pathlib.Path``).
        label_dict: Mapping of class folder name to label id.
        output_dir: Path of the annotation FILE (despite the name). It is
            opened in append mode, so existing content is kept.

    Raises:
        ValueError: If no component of a video path is a key of
            ``label_dict``. Nothing is written in that case.

    Returns ``None``.
    """

    def get_video_dir_names(root_dir):
        """Return every directory nested below the class folders of ``root_dir``."""
        walker = os.walk(root_dir)
        # The first walk entry lists the class folders themselves; skip it
        # instead of slicing off a fixed number of results.
        next(walker, None)
        return [
            str(Path(path, subdir))
            for path, subdirs, _files in walker
            for subdir in subdirs
        ]

    def get_start_and_end_frame_numbers(video_dir):
        """Return ``(1, number of files under video_dir)``."""
        start = 1
        end = sum(len(files) for _path, _subdirs, files in os.walk(video_dir))
        return start, end

    def get_class_label_id(frame_dir, label_dict):
        """Return the label id of the deepest path component that is a known class."""
        for component in reversed(Path(frame_dir).parts):
            if component in label_dict:
                return label_dict[component]
        raise ValueError(f"no class name from label_dict found in path: {frame_dir}")

    def get_class_name_video_id(video_dir):
        """Return ``<class_name><os.sep><video_dir_name>`` for a video directory."""
        video_path = Path(video_dir)
        return os.path.join(video_path.parent.name, video_path.name)

    # Build every line before touching the output file, so that an unknown
    # class cannot leave a partially written annotation file behind.
    annotation_lines = []
    for video_dir in get_video_dir_names(input_dir):
        # start_frame and end_frame are inclusive
        start_frame, end_frame = get_start_and_end_frame_numbers(video_dir)
        label_id = get_class_label_id(video_dir, label_dict)
        class_video_idx = get_class_name_video_id(video_dir)
        annotation_lines.append(
            class_video_idx + " " + str(start_frame) + " " + str(end_frame) + " " + str(label_id) + "\n"
        )

    # The context manager closes the file even if a write fails.
    with open(output_dir, "a") as annotations:
        annotations.writelines(annotation_lines)
    

In [None]:
# for all train, test, validation directories

# Write one annotation file per split folder. The previous call referenced
# INPUT_FRAMES_DIRECTORY and OUTPUT_DIRECTORY, which are not defined in this
# notebook, and never used the loop variable.
# NOTE(review): the file name "annotation.txt" and its location inside each
# split folder are assumptions - confirm what the consumer of the file expects.
for split_dir in DIRS:
    split_root = TARGET_DIR.joinpath(split_dir)
    generate_annotation_file(split_root, LABEL_DICT, split_root.joinpath("annotation.txt"))

In [None]:
# make train, validation, test directories

# copy internal class folders

# make list of all video folders and associated meta data

# generate train, validation, test tags on each one.

# def a function that can apply to every row of the pandas frame 
#that will put the associated video into the new directory path