## Data Preparation For Rebatching



In [None]:
import os
import shutil
import json
import pandas as pd 

import matplotlib.pyplot as plt

In [None]:
# Root directory of the coded books. construct_batches_dataframe walks it as
# <data_path>/<ra>/<batch>/.
data_path = '../coded-books/'

In [None]:
def construct_batches_dataframe(data_path):
    """Constructs a dataframe of batches from the data in the given path.

    The directory is walked as ``<data_path>/<ra>/<batch>/``. A batch is only
    included when its directory contains exactly one ``.json`` file; any other
    batch is reported on stdout and skipped. Entries that are not directories
    (for example stray ``.DS_Store`` files) are ignored at both levels.

    Parameters
    ----------
    data_path : str
        The path to the data.

    Returns
    -------
    pandas.DataFrame
        A dataframe of batches. Each row represents a batch. Rows are ordered
        by RA and then by batch name. The columns are always present, even
        when no batch was found:
        - ra: The RA who coded the batch.
        - batch: The name of the batch.
        - batch_path: The path to the directory containing the images for the batch.
        - via_json_path: The path to the VIA JSON file for the batch.

    Raises
    ------
    ValueError
        If ``data_path`` is not a directory.
    """
    if not os.path.isdir(data_path):
        raise ValueError('data_path must be a directory')

    columns = ['ra', 'batch', 'batch_path', 'via_json_path']
    batches = []
    # os.listdir returns entries in arbitrary order; sorting keeps the result
    # (and anything derived from "first occurrence") reproducible.
    for ra in sorted(os.listdir(data_path)):
        ra_path = os.path.join(data_path, ra)
        if not os.path.isdir(ra_path):
            continue
        for batch in sorted(os.listdir(ra_path)):
            batch_path = os.path.join(ra_path, batch)
            if not os.path.isdir(batch_path):
                continue
            json_files = [f for f in os.listdir(batch_path) if f.endswith('.json')]
            if len(json_files) != 1:
                print(f'Found {len(json_files)} JSON files in "{batch_path}". Skipping.')
                continue
            batches.append({
                'ra': ra,
                'batch': batch,
                'batch_path': batch_path,
                'via_json_path': os.path.join(batch_path, json_files[0]),
            })

    # Passing the columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(batches, columns=columns)

In [None]:
def validate_via_img_metadata(coded_image):
    """Check that the image metadata in the VIA JSON file has the correct fields.

    The metadata is valid when it has a ``filename`` entry and a
    ``file_attributes`` dict containing ``title``, ``google``,
    ``identifiable`` and ``diversity``.

    Parameters
    ----------
    coded_image : dict
        The image metadata from the VIA JSON file.

    Returns
    -------
    bool
        True if the metadata is valid, False otherwise. A missing or non-dict
        ``file_attributes`` entry counts as invalid instead of raising.
    """
    if 'filename' not in coded_image:
        return False

    file_attributes = coded_image.get('file_attributes')
    if not isinstance(file_attributes, dict):
        return False

    required_attributes = ('title', 'google', 'identifiable', 'diversity')
    return all(name in file_attributes for name in required_attributes)

In [None]:
def construct_coded_images_dataframe(batches_df):
    """Constructs a dataframe of coded images from the given batches dataframe.

    Every VIA JSON file referenced by ``batches_df`` is read and each entry of
    its ``_via_img_metadata`` mapping becomes one row. Entries without a
    ``filename`` are reported and skipped. Entries whose metadata fails
    ``validate_via_img_metadata`` are reported and kept with default values
    (empty title and Google link, ``identifiable`` of ``'no'``, all diversity
    flags False).

    Parameters
    ----------
    batches_df : pandas.DataFrame
        A dataframe of batches. Each row represents a batch. The required
        columns are (extra columns are ignored):
        - ra: The RA who coded the batch.
        - batch: The name of the batch.
        - batch_path: The path to the directory containing the images for the batch.
        - via_json_path: The path to the VIA JSON file for the batch.

    Returns
    -------
    pandas.DataFrame
        A dataframe of coded images. Each row represents an image. The columns
        are always present, even when no image was found:
        - ra: The RA who coded the image.
        - batch: The name of the batch containing the image.
        - batch_path: The path of the batch containing the image.
        - image: The name of the image.
        - image_path: The path to the image.
        - title: The title of the book, lower-cased.
        - google: The Google Books link for the book.
        - identifiable: Whether the book is identifiable.
        - diversity_none: Whether "none" is among the coded diversity options.
        - diversity_bipoc: Whether "bipoc" is among the coded diversity options.
        - diversity_non_cis_man: Whether "non-cis man" is among the coded
                                 diversity options.
        - diversity_lgbq: Whether "lgbq" is among the coded diversity options.
        - diversity_non_christian: Whether "non-christian" is among the coded
                                   diversity options.
        - diversity_disabled: Whether "disabled" is among the coded diversity
                              options.
        - diversity_other: Whether "other" is among the coded diversity options.
        - diversity_ambiguous: Whether "ambiguous" is among the coded diversity
                               options.

    Raises
    ------
    ValueError
        If ``batches_df`` is missing one of the required columns.
    """
    required_columns = {'ra', 'batch', 'batch_path', 'via_json_path'}
    if not required_columns.issubset(batches_df.columns):
        raise ValueError('batches_df must have the columns: ra, batch, batch_path, via_json_path')

    # (output column suffix, option name looked up in the VIA "diversity" attribute)
    diversity_options = [
        ('none', 'none'),
        ('bipoc', 'bipoc'),
        ('non_cis_man', 'non-cis man'),
        ('lgbq', 'lgbq'),
        ('non_christian', 'non-christian'),
        ('disabled', 'disabled'),
        ('other', 'other'),
        ('ambiguous', 'ambiguous'),
    ]
    columns = [
        'ra', 'batch', 'batch_path', 'image', 'image_path',
        'title', 'google', 'identifiable',
    ] + [f'diversity_{suffix}' for suffix, _ in diversity_options]

    # Values written when an image's metadata is incomplete.
    default_attributes = {'title': '', 'google': '', 'identifiable': 'no', 'diversity': ()}

    coded_images = []
    for _, row in batches_df.iterrows():
        with open(row['via_json_path'], encoding='utf8') as f:
            via_json = json.load(f)

        for key, coded_image in via_json['_via_img_metadata'].items():
            # Without a filename there is nothing to point the row at, so check
            # this first and avoid announcing a default that is never written.
            if 'filename' not in coded_image:
                print(f'No filename for image "{row["via_json_path"]} -> {key}". No default avaliable. Skipping')
                continue

            if validate_via_img_metadata(coded_image):
                attributes = coded_image['file_attributes']
            else:
                print(f'Invalid metadata for image "{row["via_json_path"]} -> {key}". Writing default.')
                attributes = default_attributes

            image = coded_image['filename']
            record = {
                'ra': row['ra'],
                'batch': row['batch'],
                'batch_path': row['batch_path'],
                'image': image,
                'image_path': os.path.join(row['batch_path'], image),
                'title': attributes['title'].lower(),
                'google': attributes['google'],
                'identifiable': attributes['identifiable'],
            }
            # NOTE(review): membership is tested with `in`, exactly as before;
            # presumably "diversity" maps checked option names to values - confirm.
            for suffix, option in diversity_options:
                record[f'diversity_{suffix}'] = option in attributes['diversity']
            coded_images.append(record)

    # Passing the columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(coded_images, columns=columns)




In [None]:
# Discover the batches under data_path, then load every coded image they reference.
batches_df = construct_batches_dataframe(data_path)
coded_images_df = construct_coded_images_dataframe(batches_df)
# Preview the first 10 coded images.
coded_images_df.head(10)

In [None]:
# Split the coded images by whether their batch name contains 'training'.
is_training_batch = coded_images_df['batch'].str.contains('training')
training_data = coded_images_df[is_training_batch]
non_training_data = coded_images_df[~is_training_batch]

In [None]:
# Preview the first rows of the training subset.
training_data.head()

In [None]:
# Some image names appear in both training and non-training batches.
# This shouldn't have happened, so collect both sides of the overlap.
training_image_names = training_data['image']
non_training_image_names = non_training_data['image']

duplicates_in_non_training = non_training_data[non_training_image_names.isin(training_image_names)]
duplicates_in_training = training_data[training_image_names.isin(non_training_image_names)]

print(f'Number of non-training duplicates: {len(duplicates_in_non_training)}')
print(f'Number of training duplicates: {len(duplicates_in_training)}')

In [None]:
# Stack both sides of the overlap, ordered by image and then batch, and save
# them for inspection.
duplicates = (
    pd.concat([duplicates_in_non_training, duplicates_in_training])
    .sort_values(by=['image', 'batch'])
)
duplicates.to_csv('../duplicates.csv', index=False)

In [None]:
# Keep one row (the first occurrence) per distinct training image name.
# NOTE(review): a previous drop_duplicates(subset=['image', 'title']) result was
# overwritten by this statement before it was ever used, so it has been removed;
# the outcome is unchanged. If the intent was to exclude images that every RA
# coded with the same title, that filter is NOT implemented here - confirm.
training_data_needs_rebatching = training_data.drop_duplicates(subset=['image'])

In [None]:
# Non-training images whose 'identifiable' value is 'no' need rebatching.
coded_not_identifiable = non_training_data['identifiable'] == 'no'
non_training_data_needs_rebatching = non_training_data[coded_not_identifiable]

In [None]:
# Everything that needs rebatching: the deduplicated training images, the
# non-training images coded as not identifiable, and the non-training images
# that also appear in training batches.
rebatching_sources = [
    training_data_needs_rebatching,
    non_training_data_needs_rebatching,
    duplicates_in_non_training,
]
rebatching_df = (
    pd.concat(rebatching_sources, axis=0, ignore_index=True)[['ra', 'batch', 'image']]
    # There should not have been duplicates in this data, but there were.
    # We are going to rebatch all duplicates, keeping one row per image.
    .drop_duplicates(subset=['image'])
    .rename(columns={'batch': 'old_batch', 'ra': 'old_ra'})
)
rebatching_df.to_csv('../need-rebatching.csv', index=False)

## Checking Batches for Duplicate Images

In [None]:
batches_path = '../batches'

# Map every image file name to the batches that contain it. File names are
# always unique within a single directory listing, so a duplicate can only be
# the same name appearing in more than one batch.
batches_by_image = {}
for batch in sorted(os.listdir(batches_path)):
    batch_path = os.path.join(batches_path, batch)
    # Ignore stray files next to the batch directories; listing them would raise.
    if not os.path.isdir(batch_path):
        continue
    for image in os.listdir(batch_path):
        batches_by_image.setdefault(image, []).append(batch)

duplicated_images = {
    image: found_in
    for image, found_in in batches_by_image.items()
    if len(found_in) > 1
}

if duplicated_images:
    print('batches have duplicate images.')
    # List each offending image with the batches it was found in.
    for image, found_in in sorted(duplicated_images.items()):
        print(f'  {image}: {", ".join(found_in)}')

## Gathering Images That Need Rebatching

In [None]:
ra_batches_path = '../coded-books'
output_path = '../needs-rebatching-12-14-2022'

def create_path_from_row(row):
    """Build the path of an image from a row of the rebatching dataframe.

    Parameters
    ----------
    row : mapping
        Must provide the keys ``old_ra``, ``old_batch`` and ``image``.

    Returns
    -------
    str
        ``ra_batches_path`` joined with the row's RA, batch and image name.
    """
    path_parts = (row['old_ra'], row['old_batch'], row['image'])
    return os.path.join(ra_batches_path, *path_parts)

# Resolve where each image that needs rebatching currently lives. A list is
# built row by row so that an empty dataframe still yields an (empty) column.
rebatching_df['path'] = [create_path_from_row(row) for _, row in rebatching_df.iterrows()]

# Copy the images to the new directory, creating it (and any missing parent
# directories) first; an already existing directory is reused.
os.makedirs(output_path, exist_ok=True)
for source_image_path in rebatching_df['path']:
    shutil.copy(source_image_path, output_path)

## Coding Metrics
Generating insights into how the RAs are coding images.

In [None]:
# Bar chart with one group of bars per RA: the number of images the RA coded
# for each 'identifiable' value, with RAs ordered by their 'yes' count.
ra_batches = (
    coded_images_df.groupby(['ra', 'identifiable'])['image']
    .count()
    .unstack('identifiable')
    .fillna(0)
    .sort_values(by=['yes'], ascending=False)
)

# Chart styling.
plt.style.use('bmh')
colors = ['#323031', '#FFC857', '#DB3A34', '#084C61', '#177E89']
ax = ra_batches.plot.bar(stacked=False, figsize=(20, 10), color=colors)
ax.set_facecolor('#EEEEEE')
ax.set_title('Distribution of Codable Images by RA', fontsize=18)
ax.set_xlabel('RA', fontsize=14)
ax.set_ylabel('Number of Images', fontsize=14)
ax.legend(loc='upper right', fontsize=14)
for axis_name in ('x', 'y'):
    ax.tick_params(axis=axis_name, labelsize=12)

In [None]:
# Bar chart with one group of bars per RA: how many images the RA flagged for
# each diversity category, with RAs ordered by their 'diversity_none' count.
diversity_columns = [
    'diversity_none',
    'diversity_bipoc',
    'diversity_non_cis_man',
    'diversity_lgbq',
    'diversity_non_christian',
    'diversity_disabled',
    'diversity_other',
    'diversity_ambiguous',
]
# Summing the boolean flags gives the per-RA count for each category.
ra_batches = (
    coded_images_df.groupby(['ra'])[diversity_columns]
    .sum()
    .fillna(0)
    .sort_values(by=['diversity_none'], ascending=False)
)

# Chart styling.
plt.style.use('bmh')
colors = ['#f94144','#f9844a','#f9c74f','#90be6d','#43aa8b','#4d908e','#577590','#277da1']
ax = ra_batches.plot.bar(stacked=False, figsize=(20, 10), color=colors)
ax.set_facecolor('#EEEEEE')
ax.set_title('Distribution of Coded Diversity by RA', fontsize=18)
ax.set_xlabel('RA', fontsize=14)
ax.set_ylabel('Number of Images', fontsize=14)
ax.legend(loc='upper right', fontsize=14)
for axis_name in ('x', 'y'):
    ax.tick_params(axis=axis_name, labelsize=12)
