In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from matplotlib.backends.backend_pdf import PdfPages

In [None]:
#from bulk, quality controlled data
qc_df = pd.read_parquet("../3.preprocessing_features/qc_report/qc_report.parquet")
pearson_df = pd.read_parquet("../5.optimization/results/pearson_correlation.parquet")

In [3]:
pearson_df.head()

Unnamed: 0,Metadata_cell_line,Metadata_seeding_density,Metadata_time_point,Shuffled,pearsons_correlation
0,A673,1000,24,False,0.782851
1,A673,1000,24,True,0.015424
2,A673,1000,48,False,0.514848
3,A673,1000,48,True,0.216561
4,A673,1000,72,False,0.788962


In [4]:
qc_df_sorted = qc_df.sort_values(by="Metadata_cell_line")
qc_df_sorted.head(50)

Unnamed: 0,Metadata_cell_line,Metadata_seeding_density,Metadata_time_point,total_nuclei_segmented,total_failed_qc,percentage_failing_cells
226,A673,12000,24,3090,1123,36.343042
223,A673,2000,24,1097,221,20.145852
222,A673,1000,24,563,80,14.209591
167,A673,12000,48,9433,2868,30.403901
35,A673,1000,72,1103,291,26.382593
36,A673,2000,72,2728,736,26.979472
224,A673,4000,24,1801,511,28.373126
166,A673,8000,48,5329,1809,33.946331
37,A673,4000,72,9280,2153,23.200431
164,A673,2000,48,1442,496,34.396671


In [5]:
pearson_df.columns = pearson_df.columns.str.strip()
qc_df.columns = qc_df.columns.str.strip()
print("Columns in pearson_df:", pearson_df.columns)
print("Columns in qc_df:", qc_df.columns)

# Check data types of the columns we want to merge on
print("Data types in pearson_df:\n", pearson_df.dtypes)
print("Data types in qc_df:\n", qc_df.dtypes)

Columns in pearson_df: Index(['Metadata_cell_line', 'Metadata_seeding_density', 'Metadata_time_point',
       'Shuffled', 'pearsons_correlation'],
      dtype='object')
Columns in qc_df: Index(['Metadata_cell_line', 'Metadata_seeding_density', 'Metadata_time_point',
       'total_nuclei_segmented', 'total_failed_qc',
       'percentage_failing_cells'],
      dtype='object')
Data types in pearson_df:
 Metadata_cell_line           object
Metadata_seeding_density      int64
Metadata_time_point           int64
Shuffled                     object
pearsons_correlation        float64
dtype: object
Data types in qc_df:
 Metadata_cell_line           object
Metadata_seeding_density      int64
Metadata_time_point           int64
total_nuclei_segmented        int64
total_failed_qc               int64
percentage_failing_cells    float64
dtype: object


In [6]:
# Merge pearson_df and qc_df
merged_df = pd.merge(
    pearson_df[pearson_df["Shuffled"] == "False"],
    qc_df,
    on=["Metadata_cell_line", "Metadata_seeding_density", "Metadata_time_point"],
    how="inner"
)

# save df
merged_df.to_parquet("../5.optimization/results/merged_pearson_qc_data.parquet")
print("Merged dataframe saved to results/merged_pearson_qc_data.parquet")

Merged dataframe saved to results/merged_pearson_qc_data.parquet


In [7]:
merged_df.head()

Unnamed: 0,Metadata_cell_line,Metadata_seeding_density,Metadata_time_point,Shuffled,pearsons_correlation,total_nuclei_segmented,total_failed_qc,percentage_failing_cells
0,A673,1000,24,False,0.782851,563,80,14.209591
1,A673,1000,48,False,0.514848,287,79,27.526132
2,A673,1000,72,False,0.788962,1103,291,26.382593
3,A673,2000,24,False,0.818574,1097,221,20.145852
4,A673,2000,48,False,0.787276,1442,496,34.396671


In [8]:
custom_palette = sns.color_palette("Set1", n_colors=5)
# Create a PdfPages object to save all plots in a single PDF
with PdfPages('../5.optimization/results/pearson_vs_percentage_failing_cells.pdf') as pdf:
    # Plot for each cell line
    for cell_line in merged_df["Metadata_cell_line"].unique():
        # Filter data for the current cell line
        cell_line_df = merged_df[merged_df["Metadata_cell_line"] == cell_line]
        
        # Generate the scatter plot
        plt.figure(figsize=(8, 6))
        scatter = sns.scatterplot(
            data=cell_line_df,
            x="pearsons_correlation",
            y="percentage_failing_cells",
            hue="Metadata_seeding_density",
            palette=custom_palette,  # Choose a color palette for seeding density
            style="Metadata_time_point",  # Different styles for each time point
            markers=["o", "X", "s"],  # Customize markers
            s=100,
        )
        
        # Set plot title and labels
        plt.title(f"Pearson Correlation vs Percentage Failing Cells\nCell Line: {cell_line}", fontsize=16)
        plt.xlabel("Pearson Correlation")
        plt.ylabel("Percentage Failing Cells")
        
        # Reverse the y-axis (since higher failing percentage is bad)
        plt.gca().invert_yaxis()
        
        # Move the legend outside of the plot
        plt.legend(title='Seeding Density', bbox_to_anchor=(1.05, 1), loc='upper left')
        
        # Save the plot to the PDF
        pdf.savefig(bbox_inches="tight", transparent=True)
        plt.close()  # Close the figure to avoid overlap in the next plot
    
    print("Plots saved to results/pearson_vs_percentage_failing_cells.pdf")

Plots saved to results/pearson_vs_percentage_failing_cells.pdf
