This notebook performs data splits on the wells on the bulk data.

In [1]:
import argparse
import pathlib

import pandas as pd
import toml

In [2]:
argparser = argparse.ArgumentParser()
argparser.add_argument("--cell_type", default="all")

args = argparser.parse_args()

cell_type = args.cell_type

In [3]:
# set path to the import data
data_path = pathlib.Path(
    f"../../../data/{cell_type}_preprocessed_sc_norm_aggregated.parquet"
)
# results path
results_path = pathlib.Path("../results/").resolve()
results_path.mkdir(exist_ok=True)

# dataframe with only the labeled data we want (exclude certain phenotypic classes)
data_df = pd.read_parquet(data_path)
data_df.head()

Unnamed: 0,Metadata_Well,oneb_Metadata_Treatment_Dose_Inhibitor_Dose,Cytoplasm_AreaShape_Compactness,Cytoplasm_AreaShape_Eccentricity,Cytoplasm_AreaShape_Extent,Cytoplasm_AreaShape_FormFactor,Cytoplasm_AreaShape_MajorAxisLength,Cytoplasm_AreaShape_MinorAxisLength,Cytoplasm_AreaShape_Orientation,Cytoplasm_AreaShape_Zernike_0_0,...,Nuclei_Texture_InverseDifferenceMoment_CorrER_3_01_256,Nuclei_Texture_InverseDifferenceMoment_CorrER_3_02_256,Nuclei_Texture_InverseDifferenceMoment_CorrER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_CorrMito_3_00_256,Nuclei_Texture_InverseDifferenceMoment_CorrPM_3_00_256,Nuclei_Texture_InverseDifferenceMoment_CorrPM_3_02_256,Nuclei_Texture_SumEntropy_CorrPM_3_01_256,Nuclei_Texture_SumVariance_CorrER_3_01_256,Nuclei_Texture_SumVariance_CorrMito_3_03_256,Nuclei_Texture_SumVariance_CorrPM_3_03_256
0,B13,Media,0.001581,0.049096,0.009659,-0.032662,0.136661,0.090671,-0.022868,-0.005781,...,-0.013838,-0.022186,-0.026609,-0.037155,-0.086089,-0.087688,0.070222,0.003325,-0.006576,-0.007685
1,B14,DMSO_0.100_%_DMSO_0.025_%,-0.062052,-0.001451,0.03448,0.036254,0.035743,0.029159,-0.013366,0.027159,...,0.029123,0.042585,0.037382,-0.016886,-0.021182,-0.005578,0.009521,-0.017836,-0.010545,0.018704
2,B15,DMSO_0.100_%_DMSO_0.025_%,-0.06221,0.001338,0.043384,0.030471,-0.003881,0.004393,0.010505,0.029986,...,0.005497,-0.004856,-0.020013,0.07373,-0.058383,-0.061821,0.063773,0.009585,-0.009521,-0.008554
3,B16,LPS_0.010_ug_per_ml_DMSO_0.025_%,-0.032242,0.047022,-0.004604,0.003074,0.11299,0.068219,-0.00844,-0.010655,...,-0.082061,-0.074079,-0.078062,0.029767,-0.0747,-0.069611,0.067694,-0.009791,-0.012226,-0.008081
4,B17,LPS_0.010_ug_per_ml_DMSO_0.025_%,-0.046176,0.037852,0.034321,0.023957,0.161078,0.118943,0.000147,0.018533,...,-0.08405,-0.090688,-0.084021,0.100577,-0.084613,-0.085133,0.070831,0.022704,-0.017325,-0.006467


In [4]:
# get all of the features
features = data_df.columns.to_list()

nuclei_features = []
pm_features = []
gsdmd_features = []
mito_features = []
er_features = []
other_features = []
metadata_features = []
correlation_features = []

# Separate the features into distinct types
# code loop modified from:
# https://github.com/WayScience/nuclear_speckles_analysis/blob/main/1.regression_modelling/1.train_models.ipynb
# Thank you Jenna Tomkinson for the code!
for feature in features:
    # check if the feature contains "Metadata"
    if "Metadata" in feature:
        metadata_features.append(feature)
    else:
        parts = feature.split("_")

        if "Correlation" in parts:  # Check if it's a correlation feature
            correlation_features.append(feature)

        else:  # Non-correlation features
            if "CorrDNA" in parts:
                nuclei_features.append(feature)
            elif "CorrPM" in parts:
                pm_features.append(feature)
            elif "CorrGasdermin" in parts:
                gsdmd_features.append(feature)
            elif "CorrMito" in parts:
                mito_features.append(feature)
            elif "CorrER" in parts:
                er_features.append(feature)
            else:
                other_features.append(feature)

In [5]:
print(f"nuclei_features: {len(nuclei_features)}")
print(f"pm_features: {len(pm_features)}")
print(f"gsdmd_features: {len(gsdmd_features)}")
print(f"mito_features: {len(mito_features)}")
print(f"er_features: {len(er_features)}")
print(f"other_features: {len(other_features)}")
print(f"correlation_features: {len(correlation_features)}")
print(f"metadata_features: {len(metadata_features)}")
print(f"total features: {len(features)}")
# check if all featues are accounted for
assert len(features) == len(nuclei_features) + len(pm_features) + len(
    gsdmd_features
) + len(mito_features) + len(er_features) + len(other_features) + len(
    correlation_features
) + len(
    metadata_features
)

nuclei_features: 235
pm_features: 255
gsdmd_features: 68
mito_features: 239
er_features: 175
other_features: 115
correlation_features: 140
metadata_features: 2
total features: 1229


In [6]:
# create the combinations of features
# out of 5 channels, we can have 0, 1, 2, 3, 4, or 5 channels
# even with 0 channels, we still have the non channel features (object-based)
# these are areshape features
# set the feature combination lists
dict_of_feature_combinations = {
    "No_channels": other_features,
    "CorrDNA": nuclei_features,
    "CorrPM": pm_features,
    "CorrGasdermin": gsdmd_features,
    "CorrMito": mito_features,
    "CorrER": er_features,
    "CorrDNA_CorrPM": nuclei_features + pm_features,
    "CorrDNA_CorrGasdermin": nuclei_features + gsdmd_features,
    "CorrDNA_CorrMito": nuclei_features + mito_features,
    "CorrDNA_CorrER": nuclei_features + er_features,
    "CorrPM_CorrGasdermin": pm_features + gsdmd_features,
    "CorrPM_CorrMito": pm_features + mito_features,
    "CorrPM_CorrER": pm_features + er_features,
    "CorrGasdermin_CorrMito": gsdmd_features + mito_features,
    "CorrGasdermin_CorrER": gsdmd_features + er_features,
    "CorrMito_CorrER": mito_features + er_features,
    "CorrDNA_CorrPM_CorrGasdermin": nuclei_features + pm_features + gsdmd_features,
    "CorrDNA_CorrPM_CorrMito": nuclei_features + pm_features + mito_features,
    "CorrDNA_CorrPM_CorrER": nuclei_features + pm_features + er_features,
    "CorrDNA_CorrGasdermin_CorrMito": nuclei_features + gsdmd_features + mito_features,
    "CorrDNA_CorrGasdermin_CorrER": nuclei_features + gsdmd_features + er_features,
    "CorrDNA_CorrMito_CorrER": nuclei_features + mito_features + er_features,
    "CorrPM_CorrGasdermin_CorrMito": pm_features + gsdmd_features + mito_features,
    "CorrPM_CorrGasdermin_CorrER": pm_features + gsdmd_features + er_features,
    "CorrPM_CorrMito_CorrER": pm_features + mito_features + er_features,
    "CorrGasdermin_CorrMito_CorrER": gsdmd_features + mito_features + er_features,
    "CorrDNA_CorrPM_CorrGasdermin_CorrMito": nuclei_features
    + pm_features
    + gsdmd_features
    + mito_features,
    "CorrDNA_CorrPM_CorrGasdermin_CorrER": nuclei_features
    + pm_features
    + gsdmd_features
    + er_features,
    "CorrDNA_CorrPM_CorrMito_CorrER": nuclei_features
    + pm_features
    + mito_features
    + er_features,
    "CorrDNA_CorrGasdermin_CorrMito_CorrER": nuclei_features
    + gsdmd_features
    + mito_features
    + er_features,
    "CorrPM_CorrGasdermin_CorrMito_CorrER": pm_features
    + gsdmd_features
    + mito_features
    + er_features,
    "All_channels": nuclei_features
    + pm_features
    + gsdmd_features
    + mito_features
    + er_features
    + other_features
    + correlation_features,
}
# loop through each feature combination and add the metadata features
for combination in dict_of_feature_combinations:
    if combination == "No_channels":
        temp_correlation_features = other_features
    elif "_" in combination:
        channels = combination.split("_")
        temp_correlation_features = []
        for feature in correlation_features:
            if all(channel not in feature for channel in channels):
                temp_correlation_features.append(feature)
    else:
        temp_correlation_features = []
        for feature in correlation_features:
            if combination not in feature:
                temp_correlation_features.append(feature)

    num_featues = len(dict_of_feature_combinations[combination])
    dict_of_feature_combinations[combination] += temp_correlation_features

    print(
        f"{len(dict_of_feature_combinations[combination]) - num_featues} correlation features added to {combination}"
    )
    dict_of_feature_combinations[combination] += metadata_features

115 correlation features added to No_channels
83 correlation features added to CorrDNA
86 correlation features added to CorrPM
106 correlation features added to CorrGasdermin
96 correlation features added to CorrMito
94 correlation features added to CorrER
44 correlation features added to CorrDNA_CorrPM
56 correlation features added to CorrDNA_CorrGasdermin
54 correlation features added to CorrDNA_CorrMito
45 correlation features added to CorrDNA_CorrER
58 correlation features added to CorrPM_CorrGasdermin
53 correlation features added to CorrPM_CorrMito
50 correlation features added to CorrPM_CorrER
69 correlation features added to CorrGasdermin_CorrMito
74 correlation features added to CorrGasdermin_CorrER
52 correlation features added to CorrMito_CorrER
23 correlation features added to CorrDNA_CorrPM_CorrGasdermin
26 correlation features added to CorrDNA_CorrPM_CorrMito
16 correlation features added to CorrDNA_CorrPM_CorrER
34 correlation features added to CorrDNA_CorrGasdermin_Corr

In [7]:
# save the dict to a toml file

toml_path = pathlib.Path(f"../results/feature_combinations_{cell_type}.toml")
with open(toml_path, "w") as f:
    toml.dump(dict_of_feature_combinations, f)

# write the keys to a txt file with each key on a new line
# this is for easy retrieval of the keys in bash
txt_path = pathlib.Path("../results/feature_combinations_keys.txt")
with open(txt_path, "w") as f:
    for key in dict_of_feature_combinations:
        f.write(f"{key}\n")