In [15]:
import pathlib
import sys

import pandas as pd

In [16]:
from comparators.PearsonsCorrelation import PearsonsCorrelation
from comparison_tools.PairwiseCompareManager import PairwiseCompareManager

In [17]:
# output path for bulk profiles
output_dir = pathlib.Path("../3.preprocessing_features/data/bulk_profiles")
output_dir.mkdir(parents=True, exist_ok=True)

# extract the plate names from the file name
plate_names = [file.stem.split("_")[0] for file in output_dir.glob("*.parquet")]

for plate in plate_names:
    output_feature_select_file = str(
            pathlib.Path(f"{output_dir}/{plate}_bulk_feature_selected.parquet")
        )
    print(output_feature_select_file)

../3.preprocessing_features/data/bulk_profiles/BR00143979_bulk_feature_selected.parquet
../3.preprocessing_features/data/bulk_profiles/BR00143980_bulk_feature_selected.parquet
../3.preprocessing_features/data/bulk_profiles/BR00143981_bulk_feature_selected.parquet
../3.preprocessing_features/data/bulk_profiles/BR00143978_bulk_feature_selected.parquet
../3.preprocessing_features/data/bulk_profiles/BR00143978_bulk_feature_selected.parquet
../3.preprocessing_features/data/bulk_profiles/BR00143977_bulk_feature_selected.parquet
../3.preprocessing_features/data/bulk_profiles/BR00143977_bulk_feature_selected.parquet
../3.preprocessing_features/data/bulk_profiles/BR00143976_bulk_feature_selected.parquet
../3.preprocessing_features/data/bulk_profiles/BR00143981_bulk_feature_selected.parquet
../3.preprocessing_features/data/bulk_profiles/BR00143981_bulk_feature_selected.parquet
../3.preprocessing_features/data/bulk_profiles/BR00143980_bulk_feature_selected.parquet
../3.preprocessing_features/data

In [18]:
results = []

for plate in plate_names:
    output_feature_select_file = str(
            pathlib.Path(f"{output_dir}/{plate}_bulk_feature_selected.parquet")
        )
    plate_df = pd.read_parquet(output_feature_select_file)
    feat_cols = plate_df.columns[~plate_df.columns.str.contains("Metadata")].tolist()

    pearsons_comparator = PearsonsCorrelation()

    comparer = PairwiseCompareManager(
        _df=plate_df.copy(),
        _comparator=pearsons_comparator,
        _same_columns=["Metadata_cell_line", "Metadata_seeding_density", "Metadata_time_point"],
        _different_columns=["Metadata_Well"],
        _feat_cols=feat_cols,
        _drop_cols=["Metadata_Concentration", "Metadata_Well"],
    )

    micdf = comparer()
    results.append(micdf)

In [26]:
# Combine all results into a single dataframe
combined_df = pd.concat(results, ignore_index=True)

Unnamed: 0,pearsons_correlation,Metadata_cell_line__antehoc_group0,Metadata_cell_line__antehoc_group1,Metadata_seeding_density__antehoc_group0,Metadata_seeding_density__antehoc_group1,Metadata_time_point__antehoc_group0,Metadata_time_point__antehoc_group1
3972,0.997004,PA1,PA1,12000,12000,24,24
6558,0.997004,PA1,PA1,12000,12000,24,24
2101,0.997004,PA1,PA1,12000,12000,24,24
5305,0.997004,PA1,PA1,12000,12000,24,24
5303,0.996497,PA1,PA1,12000,12000,24,24
2099,0.996497,PA1,PA1,12000,12000,24,24
3970,0.996497,PA1,PA1,12000,12000,24,24
6556,0.996497,PA1,PA1,12000,12000,24,24
6554,0.996354,PA1,PA1,12000,12000,24,24
2097,0.996354,PA1,PA1,12000,12000,24,24


In [None]:
best_results = (
    combined_df.loc[
        combined_df.groupby("Metadata_cell_line__antehoc_group0")["pearsons_correlation"].idxmax(),
        [
            "Metadata_cell_line__antehoc_group0", 
            "Metadata_seeding_density__antehoc_group0", 
            "Metadata_time_point__antehoc_group0", 
            "pearsons_correlation"
        ]
    ]
)
best_results.head(50)

Unnamed: 0,Metadata_cell_line__antehoc_group0,Metadata_seeding_density__antehoc_group0,Metadata_time_point__antehoc_group0,pearsons_correlation
229,A673,8000,72,0.96343
1917,CHP212,12000,24,0.98356
818,DAOY,1000,48,0.963171
869,G292,12000,48,0.972649
2011,G401,12000,24,0.983014
2041,G402,12000,24,0.989128
402,IMR32,4000,72,0.9811
1461,KNS-42,8000,24,0.984281
1497,KP-N-YN,12000,24,0.973118
636,NB-1,12000,72,0.989935


In [27]:
worst_results = (
    combined_df.loc[
        combined_df.groupby("Metadata_cell_line__antehoc_group0")["pearsons_correlation"].idxmin(),
        [
            "Metadata_cell_line__antehoc_group0", 
            "Metadata_seeding_density__antehoc_group0", 
            "Metadata_time_point__antehoc_group0", 
            "pearsons_correlation"
        ]
    ]
)
worst_results.head(50)

Unnamed: 0,Metadata_cell_line__antehoc_group0,Metadata_seeding_density__antehoc_group0,Metadata_time_point__antehoc_group0,pearsons_correlation
759,A673,1000,48,0.462673
241,CHP212,1000,72,0.29976
1943,DAOY,8000,24,0.549704
851,G292,2000,48,0.353834
1996,G401,4000,24,0.529593
377,G402,4000,72,0.760906
2058,IMR32,4000,24,0.495112
552,KNS-42,1000,72,0.482886
597,KP-N-YN,8000,72,0.115889
1506,NB-1,1000,24,0.842538
