In [1]:
from math import floor, ceil
from pathlib import Path
from termcolor import cprint, colored
import shutil
import pandas as pd

# Root of the dataset tree; the paths below are all derived from it.
data_dir = Path("../data")

unsorted_path = data_dir / "unsorted"
out_path = data_dir
classes_file = data_dir / "classes.txt"

# Opened here and written/closed by the sorting cell further down.
log_file = open("log.txt", "w+")

# Bind the handle to its own name: reusing `classes_file` here would replace
# the Path with a file object that is already closed once the block exits.
with open(classes_file) as classes_handle:
    CLASS_NAMES = classes_handle.read().splitlines()

# Substrings that, when found in a row's third CSV column, mark it as unusable.
errors = ['Partial', '!! BAD', 'Inaudible']

In [2]:
def printcr():
    """Emit a single carriage return on stdout, with no trailing newline."""
    print(end="\r")

def print_progress(curr, total, width=10):
    """Print a fixed-width text progress bar such as ``[|||=======]``.

    The bar is written to stdout without a trailing newline.

    Args:
        curr: Number of items processed so far.
        total: Total number of items. A value of zero or less draws an
            empty bar instead of raising ``ZeroDivisionError``.
        width: Number of cells between the brackets (default 10).
    """
    if total > 0:
        # Clamp so an overshooting or negative counter cannot distort the bar.
        fraction = min(max(curr / total, 0.0), 1.0)
    else:
        fraction = 0.0

    filled = floor(fraction * width)
    # Derive the remainder from `filled` so both parts always sum to `width`.
    remaining = width - filled
    print("[{}{}]".format('|' * filled, '=' * remaining), end="")

In [3]:
# Collect every manual label sheet (`_classes.csv`) below the unsorted tree.
manual_classes = list(unsorted_path.glob("**/_classes.csv"))
count = 0    # rows handled so far; drives the progress bar
copied = 0
skipped = 0
total = 0

# Count the lines up front so the progress bar has a denominator. Each sheet
# is closed right after counting instead of leaking an open handle.
for csv in manual_classes:
    with open(csv) as sheet:
        total += sum(1 for _line in sheet)

try:
    for csv in manual_classes:
        data = pd.read_csv(csv, header=None)
        seg_dir = csv.parent / 'segments'
        # Mirror the sheet's position relative to the unsorted tree under out_path.
        out_dir = out_path / csv.relative_to(unsorted_path).parent
        out_dir.mkdir(parents=True, exist_ok=True)

        for c in CLASS_NAMES:
            (out_dir / c).mkdir(parents=True, exist_ok=True)

        for index, row in data.iterrows():
            # Column 0 is the segment number, column 1 the class label and
            # column 2 an optional free-text note.
            infile = seg_dir / "{:06d}.mp4".format(row[0])
            outfile = out_dir / row[1] / "{:06d}.mp4".format(row[0])
            has_note = pd.notnull(row[2])

            # Redraw the progress bar in place before reporting this row.
            printcr()
            print_progress(count, total)

            if has_note and any(x in row[2] for x in errors):
                # The note matches a known error marker: log it and do not copy.
                cprint(" Skipped ({:06d}, {}, {})             ".format(row[0], row[1], row[2]), 'red', end="", flush=True)
                log_file.write("Skipped copying file {} with entry ({:06d}, {}, {}). Reason: Matched a known error subclass.\n".format(infile, row[0], row[1], row[2]))
                skipped += 1
            elif has_note:
                # NOTE(review): rows with a non-error note are copied without
                # checking the label against CLASS_NAMES - confirm intended.
                cprint(" Copying ({:06d}, {}, {})             ".format(row[0], row[1], row[2]), 'yellow', end="", flush=True)
                shutil.copy(str(infile), str(outfile))
                copied += 1
            elif row[1] in CLASS_NAMES:
                print(" Copying ({:06d}, {})...               ".format(row[0], row[1]), end="", flush=True)
                shutil.copy(str(infile), str(outfile))
                copied += 1
            else:
                # Unknown label. This branch used to call the undefined
                # `cprintcr()` and raised NameError instead of logging the row.
                # The message ends with a newline so the failure stays visible.
                cprint(" Failed to copy ({:06d}, {}, {}): Unknown class '{}'".format(row[0], row[1], row[2], row[1]), 'red', flush=True)
                log_file.write("Skipped copying file {} with entry ({:06d}, {}, {}). Reason: Unknown class {}.\n".format(infile, row[0], row[1], row[2], row[1]))
                skipped += 1

            count += 1

    print("\rDone. Copied {} files with {} classes. ({} skipped, see log file for details)".format(copied, len(CLASS_NAMES), skipped))
finally:
    # Close the log even when a copy fails part-way through the run.
    log_file.close()

Done. Copied 3416 files with 2 classes. (226 skipped, see log file for details)
