## Create List of Images Exported with Illustration Detector

In [None]:
import os
import pandas as pd
import glob

def get_file_names(root_folder, extension='.jpg'):
    """Walk ``root_folder`` and describe every file whose name ends with ``extension``.

    Args:
        root_folder: Directory to walk recursively.
        extension: File-name suffix to keep (case-sensitive match).

    Returns:
        A list with one dict per matching file, holding:
        ``filename`` (the bare file name), ``path`` (the last 36 characters of
        the containing folder path joined with the file name), ``barcode``
        (name of the containing folder) and ``lang_year`` (name of the
        containing folder's parent).
    """
    records = []
    for current_dir, _subdirs, names in os.walk(root_folder):
        # These three values depend only on the folder, not on the file.
        parent_dir, barcode = os.path.split(current_dir)
        tag = os.path.basename(parent_dir)
        # NOTE(review): the fixed 36-character tail presumably matches the
        # length of the project's "<sub-corpus>/<barcode>" layout - confirm.
        dir_tail = current_dir[-36:]
        records.extend(
            {
                'filename': name,
                'path': os.path.join(dir_tail, name),
                'barcode': barcode,
                'lang_year': tag,
            }
            for name in names
            if name.endswith(extension)
        )
    return records

# Build the table of all images exported by the illustration detector.
# Replace 'your_root_folder' with the path to your top-level directory
root_folder = 'your_root_folder'
# One dict per matching file with the keys 'filename', 'path', 'barcode', 'lang_year'
file_list = get_file_names(root_folder)

# Create a DataFrame from the file list (one row per file, one column per dict key)
df_files = pd.DataFrame(file_list)

# Display the DataFrame (a bare expression on the last line of a notebook cell is rendered)
df_files

## Import Discarded Images from ONiT Similarity Explorer

In [None]:
import json
import pandas as pd

def get_all_filenames(json_path):
    """Return every file name listed in a discarded-images JSON export.

    The JSON document is expected to be an object whose values are lists of
    image names without extension; the keys are ignored.

    Args:
        json_path: Path to the JSON file.

    Returns:
        A flat list of all names from all value lists, in file order, each
        with ``'.jpg'`` appended. Empty if the object has no entries.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's
    # default text encoding when reading it.
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    all_filenames = []
    for filenames in data.values():
        all_filenames.extend(filename + '.jpg' for filename in filenames)
    return all_filenames

# Example usage: point json_path at the discarded-images JSON export
json_path = 'your_json_path/discardedImages20230312.json'
# Flat list of discarded image names, each with '.jpg' appended
discarded_files_new = get_all_filenames(json_path)

# Number of discarded images
print(len(discarded_files_new))

## Remove Discarded Images from Full List

In [None]:
# Remove discarded files from loaded list with inverted mask (negated .isin() condition):
# a row is kept only if its 'filename' does not occur in discarded_files_new
images_cleaned = df_files[~df_files['filename'].isin(discarded_files_new)]

# Number of images remaining after the discarded ones are removed
print(len(images_cleaned))

## Import Barcodes from Metadata Lists

In [None]:
# Import barcodes from metadata lists
import os
import pandas as pd

def get_barcodes(path):
    """Collect shortened barcodes from every metadata spreadsheet below ``path``.

    Every file found while walking ``path`` is read with ``pandas.read_excel``
    and must contain a ``Barcode`` column. Empty cells are skipped. Each
    remaining value is converted to text and cut to its first 10 characters,
    so an entry holding several barcodes contributes only the first one
    (exception: journals - not included here).

    Args:
        path: Folder containing the metadata spreadsheets.

    Returns:
        A list of ``{'barcode': ..., 'corpus': ...}`` dicts, where ``corpus``
        is characters 11-13 of the spreadsheet's file name. The list is empty
        when ``path`` holds no files or does not exist.
    """
    barcodes = []
    long_bc_count = 0

    for folder, _, files in os.walk(path):
        for file in files:
            print('Processing ', file)
            # Join with the folder currently being walked so that files in
            # sub-folders resolve to their real location.
            file_path = os.path.join(folder, file)
            # NOTE(review): assumes the sub-corpus tag sits at a fixed
            # position in the file name - confirm against the naming scheme.
            corpus = file[11:14]
            with open(file_path, 'rb') as f:
                data = pd.read_excel(f)

            for barcode in data['Barcode']:
                if pd.isna(barcode):
                    continue
                # Cells may be read as numbers; work on the text form for
                # both the length check and the slice.
                barcode_text = str(barcode)
                if len(barcode_text) > 10:
                    long_bc_count += 1
                barcodes.append({'barcode': barcode_text[:10], 'corpus': corpus})

    # Report and return only after the whole tree has been walked.
    print('Number of barcodes > 10char: ', long_bc_count)
    return barcodes

# Folder holding the metadata spreadsheets; replace with your own path
path = 'path_to_folder'
# NOTE: despite its name, `filenames` holds the barcode records returned by
# get_barcodes() here; the name is reassigned to real file names in the copy step below.
filenames = get_barcodes(path)

# One row per barcode record
barcodes = pd.DataFrame(filenames)

# Save the full barcode list as CSV without the index column
barcodes.to_csv('ONiT_barcodes_full-list.csv', index=False)

In [None]:
# Remove barcode entries from cleaned list that are not present in the full barcodes list
## Note: multiple barcodes per entry are removed; a list of journal barcodes is missing here
# `barcodes` is the DataFrame built from the metadata lists in the previous cell
# (the name `barcodesNew` used here before is not defined anywhere in this notebook).
images_cleaned_new = images_cleaned[images_cleaned['barcode'].isin(barcodes['barcode'])]

# Number of images whose barcode is part of the metadata lists
print(len(images_cleaned_new))

## Create JSON

In [None]:
import json
import pandas as pd

# Load the four columns needed for the export, already in output order
read_file = pd.read_csv(r'ONiT_extracted-images_full.csv')[['filename','barcode','iiif', 'lang_year']]

# Create local_url column & adapt entries & structure
#read_file['local_url'] = 'http://localhost/images/' + read_file['barcode'] + '/' + read_file['filename']
# Strip the file extension to obtain the image id. The dot is escaped and the
# pattern anchored to the end of the name: an unescaped "." matches any
# character, which would also delete e.g. "_jpg" from the middle of a name.
read_file['filename'] = read_file['filename'].str.replace(r"\.jpg$", "", regex=True)
#read_file = read_file.drop('barcode', axis=1)
read_file = read_file.rename(columns={'filename': 'id', 'iiif': 'iiif_url', 'lang_year': 'corpus'})

# Convert to a list of dictionaries, one per image
neighbors_data = read_file[['id', 'barcode', 'iiif_url', 'corpus']].to_dict(orient='records')

# Write the records to a JSON file
output_file_path = 'your_output_file_path.json'

with open(output_file_path, 'w') as json_file:
    json.dump(neighbors_data, json_file, indent=2)

print(f"JSON file created at: {output_file_path}")

## Copy JPGs of Curated Collection to New Folder

In [None]:
import os
import shutil

def copy_files(source_folder, destination_folder, filenames):
    """Copy the named files from anywhere below ``source_folder`` into one flat folder.

    The source tree is walked once and every file whose name is contained in
    ``filenames`` (exact match on the bare file name) is copied, including
    its metadata, to ``destination_folder``. Names that are not found are
    ignored. If the same name occurs in several folders, the copy made last
    wins.

    Args:
        source_folder: Root of the directory tree to search.
        destination_folder: Target folder; created if it does not exist.
        filenames: Iterable of file names to copy.
    """
    if not os.path.exists(destination_folder):
        os.makedirs(destination_folder)
        print(f"Created folder: '{destination_folder}'")

    # A set gives constant-time lookups, so each file in the tree is checked
    # once instead of attempting a copy for every (folder, name) pair.
    wanted = set(filenames)

    for folder, _, files in os.walk(source_folder):
        # The destination may live inside the source tree; copying a file
        # onto itself would raise shutil.SameFileError.
        if os.path.samefile(folder, destination_folder):
            continue
        for filename in files:
            if filename not in wanted:
                continue
            source_path = os.path.join(folder, filename)
            destination_path = os.path.join(destination_folder, filename)
            try:
                shutil.copy2(source_path, destination_path)
            except FileNotFoundError:
                # Broken symlink, or the file vanished after being listed:
                # skip it, as a missing file has always been non-fatal here.
                continue
            print(f"File '{filename}' copied to '{destination_folder}'")

# Example usage: replace both placeholders with real folder paths
source_folder = 'your_source_folder'
destination_folder = 'your_target_folder'
# File names of the curated collection (images left after both filter steps)
filenames = images_cleaned_new['filename'].to_list()

#filenames
# Search source_folder recursively and copy every listed file into destination_folder
copy_files(source_folder, destination_folder, filenames)

## Data Analysis & Visualisation

In [None]:
import matplotlib.pyplot as plt

# Images per sub-corpus (language & century), kept in order of first appearance
lang_year_column = images_cleaned_new['lang_year']
lang_order = lang_year_column.unique()
lang_year_counts = lang_year_column.value_counts().loc[lang_order]

# One bar per sub-corpus on a 10 x 6 inch figure
plt.figure(figsize=(10, 6))
plt.bar(lang_year_counts.index, lang_year_counts.values)

# Title and axis labels
plt.title('Number of Images with Nature Representations per Sub-Corpus')
plt.xlabel('Sub-Corpus (Language/Century)')
plt.ylabel('Count')

# Render the figure
plt.show()

In [None]:
# Number of images for every (barcode, sub-corpus) combination
grouped_df = (
    images_cleaned_new
    .groupby(['barcode', 'lang_year'])
    .size()
    .reset_index(name='file_count')
)

# Sort the rows by sub-corpus, following the order stored in lang_order
grouped_df['lang_year'] = pd.Categorical(grouped_df['lang_year'], categories=lang_order, ordered=True)
grouped_df = grouped_df.sort_values(by=['lang_year'])

# Bar chart drawn through an explicit Axes object
fig, ax = plt.subplots(figsize=(10, 6))

# One ax.bar() call per sub-corpus so each gets its own colour and legend entry
for lang_year in lang_order:
    in_sub_corpus = grouped_df['lang_year'] == lang_year
    subset_df = grouped_df[in_sub_corpus]
    ax.bar(subset_df['barcode'], subset_df['file_count'], label=str(lang_year))

# Barcodes are too numerous to label: remove the x-axis ticks
ax.set_xticks([])

# Title, axis label and legend
ax.set_title('Number of Images per Barcode, Ordered by Language/Century')
ax.set_ylabel('File Count')
ax.legend(title='language/century')

# Render the figure
plt.show()

In [None]:
# Per sub-corpus: total number of images and number of distinct barcodes
grouped_2 = (
    images_cleaned_new
    .groupby('lang_year')
    .agg({'filename': 'count', 'barcode': 'nunique'})
    .reset_index()
)

# Image bars sit on the row positions, barcode bars one bar-width to the right
bar_width = 0.45
bar_positions_files = grouped_2.index
bar_positions_barcodes = [pos + bar_width for pos in bar_positions_files]

# Grouped bar chart drawn through an explicit Axes object
fig, ax = plt.subplots(figsize=(10, 6))
files_bars = ax.bar(bar_positions_files, grouped_2['filename'], width=bar_width, label='Images')
barcodes_bars = ax.bar(bar_positions_barcodes, grouped_2['barcode'], width=bar_width, label='Unique Barcodes')

# Write each count centred directly above its bar (image bars first, then barcode bars)
for bars, values in ((files_bars, grouped_2['filename']), (barcodes_bars, grouped_2['barcode'])):
    for bar, value in zip(bars, values):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height(),
                f'{value}', ha='center', va='bottom')

# Centre each tick between the two bars of its group and label it with the sub-corpus
ax.set_xticks([pos + bar_width/2 for pos in bar_positions_files])
ax.set_xticklabels(grouped_2['lang_year'])
ax.set_title('Number of Images and Unique Barcodes per Language/Century')
ax.set_xlabel('Language/Century')
ax.set_ylabel('Count')
ax.legend()

# Render the figure
plt.show()