In [1]:
import pandas as pd
import pathlib
from scripts.stat_utils import collect_study_stats, collect_databank_stats
from scripts.io_utils import walk

In [2]:
# Define study metadata directory (relative to the notebook's working directory)
studies_metadata_dir = pathlib.Path("../data/metadata")

# Collect metadata file paths; materialized as a list so it can be inspected
# and iterated more than once
metadata_files = list(walk(studies_metadata_dir))

# Calculate statistics for each image attribute within each study.
# The same list is handed to every call as the second argument; presumably
# collect_study_stats appends its result rows to it in place -- confirm in
# scripts/stat_utils.py.
all_results_list = []
for metadata_path in metadata_files:
    collect_study_stats(metadata_path, all_results_list)

# One row per (study, attribute); the remaining columns hold the statistics
# produced by collect_study_stats
stat_results_df = pd.DataFrame(
    data=all_results_list,
    columns=["Study_Name", "Attribute", "S", "H", "NME", "J", "E", "GC"],
)

# Make the output directory. parents=True also creates any missing
# intermediate directory instead of raising FileNotFoundError; exist_ok=True
# keeps the cell safe to re-run.
stats_dir = pathlib.Path("../data/statistics")
stats_dir.mkdir(parents=True, exist_ok=True)

# Save individual stats as a gzip-compressed parquet file
output_file = stats_dir / "individual_studies_diversity.parquet.gzip"
stat_results_df.to_parquet(output_file, compression="gzip")

In [3]:
# Collect databank stats from the studies' metadata directory.
# NOTE(review): the second na_cols entry used to be spelled "pizel_size_y";
# it is corrected here to mirror "pixel_size_x". Presumably these name
# metadata columns -- confirm against the metadata schema and
# collect_databank_stats in scripts/stat_utils.py.
databank_stats = collect_databank_stats(
    metadata_dir=studies_metadata_dir,
    na_cols=["pixel_size_x", "pixel_size_y"],
)

# Save databank stats as a gzip-compressed parquet file
output_file = pathlib.Path(stats_dir, "databank_diversity.parquet.gzip")
databank_stats.to_parquet(output_file, compression="gzip")

In [4]:
# Reload both saved tables from disk and preview their first rows, as a
# check that the parquet files written above can be read back.
study_stats = pd.read_parquet(
    pathlib.Path(stats_dir, "individual_studies_diversity.parquet.gzip")
)
print(study_stats.head())

# Note: this rebinds databank_stats to the copy read back from disk
databank_stats = pd.read_parquet(
    pathlib.Path(stats_dir, "databank_diversity.parquet.gzip")
)
print(databank_stats.head())

                              Study_Name       Attribute     S         H  NME  \
0  idr0080-way-perturbation_screenA_2701         well_id  6912  8.841014  1.0   
1  idr0080-way-perturbation_screenA_2701  imaging_method     1  0.000000  NaN   
2  idr0080-way-perturbation_screenA_2701        organism     1  0.000000  NaN   
3  idr0080-way-perturbation_screenA_2701   organism_part     1  0.000000  NaN   
4  idr0080-way-perturbation_screenA_2701       cell_line     3  1.098612  1.0   

     J    E   GC  
0  1.0  1.0  0.0  
1  NaN  1.0  0.0  
2  NaN  1.0  0.0  
3  NaN  1.0  0.0  
4  1.0  1.0  0.0  
    Attribute      S          H       NME         J         E        GC
0   screen_id      3   0.860492  0.812077  0.783253  0.674471  0.363842
1  study_name      3   0.860492  0.812077  0.783253  0.674471  0.363842
2    plate_id    334   5.494304  0.330586  0.945478  0.534196  0.371400
3  plate_name    334   5.494304  0.330586  0.945478  0.534196  0.371400
4     well_id  28320  10.251324  1.0000