# Generate a QC report for the preliminary data

The QC report consists of a table with the cell line, seeding density, and percentage failed single-cells

In [1]:
import pathlib
import pandas as pd
from pycytominer import annotate
import pprint

In [2]:
# Set round of data to be processed
round_id = "Round_4_data"

# path for platemap directory
platemap_dir = pathlib.Path("../0.download_data/metadata/platemaps")

# load in barcode platemap
barcode_platemap = pd.read_csv(
    pathlib.Path(f"{platemap_dir}/Barcode_platemap_pilot_data.csv")
)

# path for qc results (indices)
qc_results_dir = pathlib.Path("./qc_results")

# Path to dir with converted data from single-cell QC
converted_dir = pathlib.Path(f"./data/converted_profiles/{round_id}")

# output path for reports
output_dir = pathlib.Path("./qc_report")
output_dir.mkdir(parents=True, exist_ok=True)

# extract the plate names from the file name
plate_names = [file.stem.split("_")[0] for file in converted_dir.glob("*.parquet")]

In [3]:
# create plate info dictionary
plate_info_dictionary = {
    name: {
        "converted_path": (
            str(
                pathlib.Path(
                    list(converted_dir.rglob(f"{name}_converted.parquet"))[0]
                ).resolve(strict=True)
            )
            if list(converted_dir.rglob(f"{name}_converted.parquet"))
            else None
        ),
        "qc_results_path": (
            str(
                pathlib.Path(
                    list(qc_results_dir.rglob(f"{name}_failed_qc_indices.csv.gz"))[0]
                ).resolve(strict=True)
            )
            if list(qc_results_dir.rglob(f"{name}_failed_qc_indices.csv.gz"))
            else None
        ),
        # Find the platemap file based on barcode match and append .csv
        "platemap_path": (
            str(
                pathlib.Path(
                    list(
                        platemap_dir.rglob(
                            f"{barcode_platemap.loc[barcode_platemap['barcode'] == name, 'platemap_file'].values[0]}.csv"
                        )
                    )[0]
                ).resolve(strict=True)
            )
            if name in barcode_platemap["barcode"].values
            else None
        ),
        # Get the time_point based on the barcode match
        "time_point": (
            barcode_platemap.loc[
                barcode_platemap["barcode"] == name, "time_point"
            ].values[0]
            if name in barcode_platemap["barcode"].values
            else None
        ),
    }
    for name in plate_names
}

# Display the dictionary to verify the entries
pprint.pprint(plate_info_dictionary, indent=4)

{   'BR00147482': {   'converted_path': '/media/18tbdrive/1.Github_Repositories/pediatric_cancer_atlas_profiling/3.preprocessing_features/data/converted_profiles/Round_4_data/BR00147482_converted.parquet',
                      'platemap_path': '/media/18tbdrive/1.Github_Repositories/pediatric_cancer_atlas_profiling/0.download_data/metadata/platemaps/Assay_Plate9_platemap.csv',
                      'qc_results_path': '/media/18tbdrive/1.Github_Repositories/pediatric_cancer_atlas_profiling/3.preprocessing_features/qc_results/Round_4_data/BR00147482_failed_qc_indices.csv.gz',
                      'time_point': 72},
    'BR00147483': {   'converted_path': '/media/18tbdrive/1.Github_Repositories/pediatric_cancer_atlas_profiling/3.preprocessing_features/data/converted_profiles/Round_4_data/BR00147483_converted.parquet',
                      'platemap_path': '/media/18tbdrive/1.Github_Repositories/pediatric_cancer_atlas_profiling/0.download_data/metadata/platemaps/Assay_Plate9_platemap.cs

In [4]:
# Set metadata columns to load in for the converted df
metadata_cols = [
    "Metadata_ImageNumber",
    "Image_Metadata_Plate",
    "Image_Metadata_Well",
    "Image_Metadata_Site",
    "Metadata_Nuclei_Location_Center_X",
    "Metadata_Nuclei_Location_Center_Y",
]

qc_report_list = []  # Initialize an empty list to store per-plate QC reports

# Generate QC report across plates
for plate, info in plate_info_dictionary.items():
    print(f"Generating QC report for {plate}")

    # Load in only the metadata columns from the converted dataframe and create a column called failed_qc
    converted_df = pd.read_parquet(info["converted_path"], columns=metadata_cols)
    converted_df["failed_qc"] = False  # Initialize all rows as False

    # Load in the qc_results_path and use the indices to change the rows that match to failing QC
    qc_failed_indices = pd.read_csv(info["qc_results_path"], compression="gzip")

    # Update failed_qc for rows matching the indices in qc_failed_indices
    converted_df.loc[qc_failed_indices["original_indices"], "failed_qc"] = True

    # Make sure that this worked by asserting that the number of True failed rows matches the number of failed indices
    num_failed_qc = converted_df["failed_qc"].sum()
    num_qc_failed_indices = len(qc_failed_indices)

    assert (
        num_failed_qc == num_qc_failed_indices
    ), f"Mismatch: {num_failed_qc} != {num_qc_failed_indices}"

    # Load platemap
    platemap_df = pd.read_csv(info["platemap_path"])

    # Add cell line and seeding density metadata
    annotated_df = annotate(
        profiles=converted_df,
        platemap=platemap_df,
        join_on=["Metadata_well", "Image_Metadata_Well"],
    )

    # Add 'Metadata_time_point' column based on the plate's time_point from dict
    annotated_df["Metadata_time_point"] = info["time_point"]

    # Group by cell line and seeding density, and calculate total nuclei segmented and failed QC
    failure_stats = (
        annotated_df.groupby(
            [
                "Metadata_cell_line",
                "Metadata_seeding_density",
                "Metadata_time_point",
                "Metadata_condition",
                "Metadata_Plate",
            ]
        )
        .agg(
            total_nuclei_segmented=("failed_qc", "count"),
            total_failed_qc=("failed_qc", "sum"),
            percentage_failing_cells=("failed_qc", "mean"),
        )
        .reset_index()
    )

    # Convert to percentage
    failure_stats["percentage_failing_cells"] *= 100

    # Append to list
    qc_report_list.append(failure_stats)

# Concatenate all reports into a single DataFrame
qc_report_df = pd.concat(qc_report_list, ignore_index=True)

# Save QC report as parquet file
qc_report_df.to_parquet(pathlib.Path(f"{output_dir}/{round_id}_qc_report.parquet"))

# Filter the QC report to only include rows for a cell line
filter_qc_report_df = qc_report_df[qc_report_df["Metadata_cell_line"] == "U2-OS"]

# Display filtered QC report info
print(filter_qc_report_df.shape)
filter_qc_report_df

Generating QC report for BR00148739
Generating QC report for BR00148741
Generating QC report for BR00147482
Generating QC report for BR00148752
Generating QC report for BR00148802
Generating QC report for BR00148746
Generating QC report for BR00148751
Generating QC report for BR00148740
Generating QC report for BR00148753
Generating QC report for BR00147484
Generating QC report for BR00148801
Generating QC report for BR00147483
Generating QC report for BR00148744
Generating QC report for BR00148800
Generating QC report for BR00148745
(120, 8)


Unnamed: 0,Metadata_cell_line,Metadata_seeding_density,Metadata_time_point,Metadata_condition,Metadata_Plate,total_nuclei_segmented,total_failed_qc,percentage_failing_cells
50,U2-OS,1000,24,synthemax_PFA,BR00148739,1431,10,0.698812
51,U2-OS,2000,24,synthemax_PFA,BR00148739,1679,7,0.416915
52,U2-OS,4000,24,synthemax_PFA,BR00148739,3525,24,0.680851
53,U2-OS,8000,24,synthemax_PFA,BR00148739,8763,70,0.798813
54,U2-OS,12000,24,synthemax_PFA,BR00148739,10187,56,0.549720
...,...,...,...,...,...,...,...,...
612,U2-OS,1000,48,synthemax_PFA,BR00148745,565,98,17.345133
613,U2-OS,2000,48,synthemax_PFA,BR00148745,2766,194,7.013738
614,U2-OS,4000,48,synthemax_PFA,BR00148745,5255,408,7.764034
615,U2-OS,8000,48,synthemax_PFA,BR00148745,10473,646,6.168242
