In [None]:
from math import floor, ceil
from pathlib import Path
from termcolor import cprint, colored
import shutil
import pandas as pd

# Root of the dataset tree, resolved to an absolute path.
data_dir = Path("../data").resolve()

# Input tree that is searched for `_classes.csv` files by the sorting cell.
unsorted_path = data_dir / "unsorted"
# Split directories are created directly underneath the data directory.
out_path = data_dir
classes_file = data_dir / "classes.txt"
splits_file = data_dir / "splits.txt"

# classes.txt: one class name per line. Blank lines are ignored so they do not
# turn into an empty class name.
# The handle gets its own name so `classes_file` keeps pointing at the Path.
with open(classes_file) as classes_handle:
    CLASS_NAMES = [
        class_line
        for class_line in classes_handle.read().splitlines()
        if class_line.strip()
    ]

# splits.txt: each line is "<split name> <member>[,<member>...]".
# `splits` maps member -> split name; `split_names` keeps the split names in
# first-seen order without duplicates.
splits = {}
split_names = []
with open(splits_file) as splits_handle:
    for line in splits_handle.read().splitlines():
        # A blank line would otherwise fail the two-value unpack below.
        if not line.strip():
            continue
        name, members = line.split()
        for member in members.split(","):
            splits[member] = name
        if name not in split_names:
            split_names.append(name)

# Substrings of the annotation column that mark a segment as unusable.
errors = ['Partial', '!! BAD', 'Inaudible']

# Opened last so that a failure while reading the configuration files above
# does not leave a dangling handle. "w+" truncates any previous log. The
# handle is closed at the end of the sorting cell, so this cell has to be
# re-run before the sorting cell is run again.
log_file = open("log.txt", "w+")

In [None]:
def print_progress(curr, total):
    """Redraw a fixed-width text progress bar on the current terminal line.

    The bar is always 10 characters wide between the brackets: '|' for the
    completed part and '=' for the remainder. It is written after a carriage
    return and without a trailing newline, so that subsequent output can
    continue on (or overwrite) the same line.

    Args:
        curr: Number of items processed so far.
        total: Total number of items. A value of zero or less is rendered
            as an empty bar instead of raising ZeroDivisionError.
    """
    width = 10
    if total > 0:
        # Clamp so that curr > total (or a negative curr) cannot distort the bar.
        fraction = min(max(curr / total, 0.0), 1.0)
    else:
        fraction = 0.0
    filled = floor(fraction * width)
    bars = filled * '|'
    # Derive the remainder from `filled` so both parts always add up to
    # `width`; mixing floor() and ceil() made the width vary between calls.
    dashes = (width - filled) * '='
    print("\r[{}{}]".format(bars, dashes), end="")

In [None]:
def count_entries(class_files):
    """Return the total number of lines across all given files.

    Args:
        class_files: Iterable of paths (anything accepted by open()).

    Returns:
        The sum of the line counts of every file; 0 for an empty iterable.
    """
    total = 0
    for file in class_files:
        # Use a context manager so each handle is closed as soon as it has
        # been counted instead of being left for the garbage collector.
        with open(file) as handle:
            total += sum(1 for _ in handle)
    return total

In [None]:
# Make sure every split has its own directory under the output root.
# Existing directories are left untouched.
for split in split_names:
    split_dir = out_path / split
    split_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Sort the manually classified video segments into
# <out_path>/<split>/<recording>/<class>/<segment>.mp4.
#
# Every `_classes.csv` below `unsorted_path` is read without a header row:
#   column 0: integer segment number (file name is the zero-padded number)
#   column 1: class name
#   column 2: optional free-text annotation
manual_classes = list(unsorted_path.glob("**/_classes.csv"))
count = 0
copied = 0
skipped = 0
total = count_entries(manual_classes)

# The log handle is opened by the setup cell; the try/finally guarantees it is
# closed (and flushed) even when a row fails. Because it is closed here, the
# setup cell has to be re-run before this cell can be run again.
try:
    for csv in manual_classes:
        data = pd.read_csv(csv, header=None)

        # Directory with video segments corresponding to this csv file
        seg_dir = csv.parent / 'segments'

        # Set output root directory. The csv's directory, relative to the
        # unsorted root, is the key into the split map.
        curr = str(csv.relative_to(unsorted_path).parent)
        out_dir = out_path / splits[curr] / curr
        out_dir.mkdir(parents=True, exist_ok=True)

        # Number of speaker changes seen so far in this csv file
        curr_splits = 0

        # Create output directories for each class
        for c in CLASS_NAMES:
            (out_dir / c).mkdir(parents=True, exist_ok=True)

        # For each row of the manual classification file
        for index, row in data.iterrows():
            seg_id, label = row[0], row[1]
            # Files without any annotation have no third column at all.
            note = row.get(2)
            has_note = pd.notnull(note)
            note_text = str(note) if has_note else ""
            infile = seg_dir / "{:06d}.mp4".format(seg_id)

            print_progress(count, total)

            # If the annotation column contains an error type, skip
            if has_note and any(x in note_text for x in errors):
                cprint(" Skipped ({:06d}, {}, {})             ".format(seg_id, label, note), 'red', end="", flush=True)
                log_file.write("Skipped copying file {} with entry ({:06d}, {}, {}). Reason: Matched a known error subclass.\n".format(infile, seg_id, label, note))
                skipped += 1

            # If the annotation column contains a speaker change, start a new
            # output set for the following rows and skip this segment
            elif has_note and "Speaker Change" in note_text:
                print("", end="\r")
                cprint("Found Speaker Change in file {} at {:06d}. Splitting data into new set.".format(csv, seg_id), 'blue', flush=True)
                curr_splits += 1
                curr = str(csv.relative_to(unsorted_path).parent) + "-" + str(curr_splits)
                out_dir = out_dir.parent / curr
                out_dir.mkdir(parents=True, exist_ok=True)
                for c in CLASS_NAMES:
                    (out_dir / c).mkdir(parents=True, exist_ok=True)
                log_file.write("Split video dataset defined in {} at {} due to speaker change. New ID: {}\n".format(csv, seg_id, curr))
                skipped += 1

            # Unknown (or missing) class: show an error and skip. This is
            # checked before any copy because only the directories of known
            # classes exist, so copying into another class would fail.
            elif label not in CLASS_NAMES:
                print("", end="\r")
                cprint("Failed to copy ({:06d}, {}, {}): Unknown class '{}'".format(seg_id, label, note, label), 'red', flush=True)
                log_file.write("Skipped copying file {} with entry ({:06d}, {}, {}). Reason: Unknown class {}.\n".format(infile, seg_id, label, note, label))
                skipped += 1

            # Valid class: copy the segment into its class directory
            else:
                outfile = out_dir / label / "{:06d}.mp4".format(seg_id)
                if has_note:
                    # Any other annotation: show it, but copy anyway
                    cprint(" Copying ({:06d}, {}, {})             ".format(seg_id, label, note), 'yellow', end="", flush=True)
                else:
                    print(" Copying ({:06d}, {})...               ".format(seg_id, label), end="", flush=True)
                shutil.copy(str(infile), str(outfile))
                copied += 1
            count += 1

    print("\rDone. Copied {} files with {} classes. ({} skipped, see log file for details)".format(copied, len(CLASS_NAMES), skipped))
finally:
    log_file.close()