In [1]:
import os
import pandas as pd
from datetime import datetime
from tqdm import tqdm

In [2]:
phase = "P1"
technical_area = "TA1"
task = "CD" #### MAKE SURE THIS IS YOUR TASK!!! ND, ED, CD, etc.
team = "COL"
dataset = "LDC2022E22-V1"

system_output_cols = ["file_id","is_processed","message","file_path"]
submission_root = "./unprocessed"

if task == "CD":
    doc_cols = ["file_id", "timestamp", "llr"]
# elif task == "YOUR_TASK":
#     doc_cols = ["COLUMNS", "FOR", "YOUR", "TASK"]

In [4]:
folders = [ f.path for f in os.scandir(submission_root) if f.is_dir() ]

for folder in folders:
    
    try:
        print("root folder:", folder)

        files = [ f.path for f in os.scandir(folder) ]
        assert len(files) == 2098, "submission folder should have exactly 2098 files"
        
        doc_files = []
        for file in files:
            if "system_output.index" not in file:
                doc_files.append(file.split("/")[-1].split(".")[-2])
        assert len(doc_files) == 2097, "submission folder should have exactly 2097 document output files"
        
        system_output_file = "{}/system_output.index.tab".format(folder)
        system_output_df = pd.read_csv(system_output_file, sep="\t")
        assert system_output_df.columns.tolist() == system_output_cols, "system output file must have columns {}".format(system_output_cols)
        assert system_output_df.shape[0] == 2097, "system output file must have 2097 rows"

        doc_files.sort()
        assert doc_files == system_output_df.sort_values(["file_id"])["file_id"].tolist(), "file_ids should be the same"

        print("checking...")
        for i in tqdm(range(system_output_df.shape[0])):
            row = system_output_df.iloc[i]
            assert row["file_id"]+".tab" == row["file_path"], "file paths should be FILEID.tab" 
            assert row["is_processed"] == True, "is_processed must be set to TRUE"
            doc_df = pd.read_csv("{}/{}".format(folder,row["file_path"]), sep="\t")
            assert doc_df.columns.tolist() == doc_cols, "document output file must have columns {}".format(doc_cols)
        print("tests passed!")
        
        submission_name = "CCU_{}_{}_{}_{}_{}_{}".format(
            phase,
            technical_area,
            task,
            team,
            dataset,
            datetime.now().strftime("%Y%m%d_%H%M%S")
        )
        print("Name this submission", submission_name, "\n") 
    except Exception as e:
        print("failed:", e, "\n")

root folder: ./unprocessed/entrainment_emotion_disgust-entrainment_mandarin-4-0.05
checking...


100%|████████████████████████████████████████████████████████████████| 2097/2097 [00:01<00:00, 1158.73it/s]


tests passed!
Name this submission CCU_P1_TA1_CD_COL_LDC2022E22-V1_20221121_123817 

root folder: ./unprocessed/entrainment-entrainment_mandarin-4-0.06
checking...


100%|████████████████████████████████████████████████████████████████| 2097/2097 [00:01<00:00, 1245.59it/s]


tests passed!
Name this submission CCU_P1_TA1_CD_COL_LDC2022E22-V1_20221121_123818 

root folder: ./unprocessed/entrainment-entrainment_mandarin-4-0.07
checking...


100%|█████████████████████████████████████████████████████████████████| 2097/2097 [00:03<00:00, 620.73it/s]


tests passed!
Name this submission CCU_P1_TA1_CD_COL_LDC2022E22-V1_20221121_123822 

root folder: ./unprocessed/entrainment_emotion_disgust_fear-entrainment_mandarin-4-0.04
checking...


100%|█████████████████████████████████████████████████████████████████| 2097/2097 [00:04<00:00, 478.66it/s]


tests passed!
Name this submission CCU_P1_TA1_CD_COL_LDC2022E22-V1_20221121_123826 

root folder: ./unprocessed/entrainment_emotion_disgust-entrainment_mandarin-4-0.04
checking...


100%|█████████████████████████████████████████████████████████████████| 2097/2097 [00:04<00:00, 471.38it/s]

tests passed!
Name this submission CCU_P1_TA1_CD_COL_LDC2022E22-V1_20221121_123831 




