## Data Preparation For Rebatching



In [None]:
import os
import json
import pandas as pd 

In [None]:
# Directory scanned for coded batches. It is walked two levels deep:
# <data_path>/<ra>/<batch>/, with the VIA JSON file inside each batch directory.
data_path = '../coded-books/'

In [None]:
def construct_batches_dataframe(data_path):
    """Constructs a dataframe of batches from the data in the given path.

    The expected layout is ``<data_path>/<ra>/<batch>/<name>.json``. Entries
    at the RA or batch level that are not directories (stray files such as
    ``.DS_Store`` or notes) are ignored. A batch directory that does not
    contain exactly one ``.json`` file is reported on stdout and skipped.

    Parameters
    ----------
    data_path : str
        The path to the data.

    Returns
    -------
    pandas.DataFrame
        A dataframe of batches. Each row represents a batch, ordered by RA
        name and then batch name. The columns are always present, even when
        no batch is found:
        - ra: The RA who coded the batch.
        - batch: The name of the batch.
        - batch_path: The path to the directory containing the images for the batch.
        - via_json_path: The path to the VIA JSON file for the batch.

    Raises
    ------
    OSError
        If ``data_path`` itself cannot be listed (e.g. it does not exist).
    """
    columns = ['ra', 'batch', 'batch_path', 'via_json_path']
    batches = []
    # os.listdir returns entries in arbitrary order; sort so the resulting
    # row order is reproducible across runs and machines.
    for ra in sorted(os.listdir(data_path)):
        ra_path = os.path.join(data_path, ra)
        if not os.path.isdir(ra_path):
            # A plain file here is not an RA folder; listing it would raise.
            continue
        for batch in sorted(os.listdir(ra_path)):
            batch_path = os.path.join(ra_path, batch)
            if not os.path.isdir(batch_path):
                # Likewise, only directories can be batches.
                continue
            json_files = [f for f in os.listdir(batch_path) if f.endswith('.json')]
            if len(json_files) != 1:
                print(f'Found {len(json_files)} JSON files in "{batch_path}". Skipping.')
                continue
            batches.append({
                'ra': ra,
                'batch': batch,
                'batch_path': batch_path,
                'via_json_path': os.path.join(batch_path, json_files[0]),
            })

    # Passing the columns explicitly keeps the schema stable when `batches`
    # is empty, so downstream column access does not raise KeyError.
    return pd.DataFrame(batches, columns=columns)

In [None]:
# Build the batch index (one row per batch) for everything under data_path.
batches_df = construct_batches_dataframe(data_path)