In [3]:
import argparse
import pathlib

import pandas as pd
import toml

In [4]:
argparser = argparse.ArgumentParser()
argparser.add_argument("--cell_type", default="all")

args = argparser.parse_args()

cell_type = args.cell_type

In [5]:
# set path to the import data
data_path = pathlib.Path(
    f"../../../data/{cell_type}_preprocessed_sc_norm_aggregated_nomic.parquet"
)
# results path
results_path = pathlib.Path("../results/").resolve()
results_path.mkdir(exist_ok=True)

# dataframe with only the labeled data we want (exclude certain phenotypic classes)
data_df = pd.read_parquet(data_path)
data_df.head()

Unnamed: 0,Metadata_Well,Cytoplasm_AreaShape_Compactness,Cytoplasm_AreaShape_FormFactor,Cytoplasm_AreaShape_MajorAxisLength,Cytoplasm_AreaShape_MinorAxisLength,Cytoplasm_AreaShape_Orientation,Cytoplasm_AreaShape_Zernike_0_0,Cytoplasm_AreaShape_Zernike_1_1,Cytoplasm_AreaShape_Zernike_2_0,Cytoplasm_AreaShape_Zernike_2_2,...,uPA [NSU],VCAM-1 [NSU],VEGF Receptor 2 (Flk-1) [NSU],VEGF-A (165) [NSU],VEGF-C [NSU],VEGF-D [NSU],VEGFR-1 [NSU],WISP-1 (CCN4) [NSU],XCL1 (Lymphotactin) [NSU],oneb_Treatment_Dose_Inhibitor_Dose
0,B02,0.100173,-0.059734,0.218567,0.111938,0.00742,-0.100946,-0.030356,-0.070701,0.013108,...,0.396902,0.385081,1.0,0.0,0.430111,0.538503,0.784695,0.468448,0.237545,LPS_0.010_ug_per_ml_DMSO_0.025_%
1,B03,0.137279,-0.097646,0.205644,0.108021,-0.002159,-0.141895,-0.059932,-0.091195,-0.011037,...,0.256691,0.327491,0.390866,0.406489,0.412096,0.10483,0.812933,0.518536,0.244397,LPS_0.010_ug_per_ml_DMSO_0.025_%
2,B04,0.071345,-0.053566,0.055404,0.013373,0.004443,-0.111708,-0.084402,-0.043409,-0.030164,...,0.555221,0.357476,0.346884,0.477553,0.427658,0.642061,0.24938,0.627712,0.31835,LPS_Nigericin_100.000_ug_per_ml_1.000_uM_DMSO_...
3,B05,0.110685,-0.084346,0.107954,0.071923,0.00415,-0.121376,-0.075382,-0.052805,-0.038156,...,0.308536,0.588899,0.828371,0.484102,0.294634,0.673648,0.236793,0.557634,0.350429,LPS_Nigericin_100.000_ug_per_ml_1.000_uM_DMSO_...
4,B06,-0.021771,0.018442,-0.048689,-0.07049,-0.005284,-0.008255,-0.012815,-0.017174,0.003785,...,0.469875,0.395392,0.560129,0.504521,0.490444,0.258834,0.238358,0.524276,0.25067,DMSO_0.100_%_DMSO_0.025_%


In [6]:
# get all of the features
features = data_df.columns.to_list()

nuclei_features = []
pm_features = []
gsdmd_features = []
mito_features = []
er_features = []
other_features = []
metadata_features = []
correlation_features = []
secretome_features = []

# Separate the features into distinct types
# code loop modified from:
# https://github.com/WayScience/nuclear_speckles_analysis/blob/main/1.regression_modelling/1.train_models.ipynb
# Thank you Jenna Tomkinson for the code!
for feature in features:
    # check if the feature contains "Metadata"
    if "Metadata" in feature:
        metadata_features.append(feature)
    elif "[NSU]" in feature:
        secretome_features.append(feature)
    else:
        parts = feature.split("_")

        if "Correlation" in parts:  # Check if it's a correlation feature
            correlation_features.append(feature)

        else:  # Non-correlation features
            if "CorrDNA" in parts:
                nuclei_features.append(feature)
            elif "CorrPM" in parts:
                pm_features.append(feature)
            elif "CorrGasdermin" in parts:
                gsdmd_features.append(feature)
            elif "CorrMito" in parts:
                mito_features.append(feature)
            elif "CorrER" in parts:
                er_features.append(feature)
            else:
                other_features.append(feature)

In [7]:
print(f"nuclei_features: {len(nuclei_features)}")
print(f"pm_features: {len(pm_features)}")
print(f"gsdmd_features: {len(gsdmd_features)}")
print(f"mito_features: {len(mito_features)}")
print(f"er_features: {len(er_features)}")
print(f"other_features: {len(other_features)}")
print(f"correlation_features: {len(correlation_features)}")
print(f"metadata_features: {len(metadata_features)}")
print(f"secretome_features: {len(secretome_features)}")
print(f"total features: {len(features)}")
# check if all featues are accounted for
assert len(features) == len(nuclei_features) + len(pm_features) + len(
    gsdmd_features
) + len(mito_features) + len(er_features) + len(other_features) + len(
    correlation_features
) + len(
    metadata_features
) + len(
    secretome_features
)

nuclei_features: 251
pm_features: 243
gsdmd_features: 86
mito_features: 220
er_features: 184
other_features: 108
correlation_features: 108
metadata_features: 2
secretome_features: 187
total features: 1389


In [8]:
# create the combinations of features
# out of 5 channels, we can have 0, 1, 2, 3, 4, or 5 channels
# even with 0 channels, we still have the non channel features
# these are areshape features
# set the feature combination lists
dict_of_feature_combinations = {
    "No_channels": other_features,
    "CorrDNA": nuclei_features,
    "CorrPM": pm_features,
    "CorrGasdermin": gsdmd_features,
    "CorrMito": mito_features,
    "CorrER": er_features,
    "CorrDNA_CorrPM": nuclei_features + pm_features,
    "CorrDNA_CorrGasdermin": nuclei_features + gsdmd_features,
    "CorrDNA_CorrMito": nuclei_features + mito_features,
    "CorrDNA_CorrER": nuclei_features + er_features,
    "CorrPM_CorrGasdermin": pm_features + gsdmd_features,
    "CorrPM_CorrMito": pm_features + mito_features,
    "CorrPM_CorrER": pm_features + er_features,
    "CorrGasdermin_CorrMito": gsdmd_features + mito_features,
    "CorrGasdermin_CorrER": gsdmd_features + er_features,
    "CorrMito_CorrER": mito_features + er_features,
    "CorrDNA_CorrPM_CorrGasdermin": nuclei_features + pm_features + gsdmd_features,
    "CorrDNA_CorrPM_CorrMito": nuclei_features + pm_features + mito_features,
    "CorrDNA_CorrPM_CorrER": nuclei_features + pm_features + er_features,
    "CorrDNA_CorrGasdermin_CorrMito": nuclei_features + gsdmd_features + mito_features,
    "CorrDNA_CorrGasdermin_CorrER": nuclei_features + gsdmd_features + er_features,
    "CorrDNA_CorrMito_CorrER": nuclei_features + mito_features + er_features,
    "CorrPM_CorrGasdermin_CorrMito": pm_features + gsdmd_features + mito_features,
    "CorrPM_CorrGasdermin_CorrER": pm_features + gsdmd_features + er_features,
    "CorrPM_CorrMito_CorrER": pm_features + mito_features + er_features,
    "CorrGasdermin_CorrMito_CorrER": gsdmd_features + mito_features + er_features,
    "CorrDNA_CorrPM_CorrGasdermin_CorrMito": nuclei_features
    + pm_features
    + gsdmd_features
    + mito_features,
    "CorrDNA_CorrPM_CorrGasdermin_CorrER": nuclei_features
    + pm_features
    + gsdmd_features
    + er_features,
    "CorrDNA_CorrPM_CorrMito_CorrER": nuclei_features
    + pm_features
    + mito_features
    + er_features,
    "CorrDNA_CorrGasdermin_CorrMito_CorrER": nuclei_features
    + gsdmd_features
    + mito_features
    + er_features,
    "CorrPM_CorrGasdermin_CorrMito_CorrER": pm_features
    + gsdmd_features
    + mito_features
    + er_features,
    "All_channels": nuclei_features
    + pm_features
    + gsdmd_features
    + mito_features
    + er_features
    + other_features
    + correlation_features,
}

for combination in dict_of_feature_combinations:
    if combination == "No_channels":
        temp_correlation_features = other_features
    elif "_" in combination:
        channels = combination.split("_")
        temp_correlation_features = []
        for feature in correlation_features:
            if all(channel not in feature for channel in channels):
                temp_correlation_features.append(feature)
    else:
        temp_correlation_features = []
        for feature in correlation_features:
            if combination not in feature:
                temp_correlation_features.append(feature)

    num_featues = len(dict_of_feature_combinations[combination])
    dict_of_feature_combinations[combination] += temp_correlation_features

    print(
        f"{len(dict_of_feature_combinations[combination]) - num_featues} correlation features added to {combination}"
    )
    dict_of_feature_combinations[combination] += secretome_features
    dict_of_feature_combinations[combination] += metadata_features

108 correlation features added to No_channels
63 correlation features added to CorrDNA
69 correlation features added to CorrPM
89 correlation features added to CorrGasdermin
69 correlation features added to CorrMito
70 correlation features added to CorrER
37 correlation features added to CorrDNA_CorrPM
44 correlation features added to CorrDNA_CorrGasdermin
37 correlation features added to CorrDNA_CorrMito
36 correlation features added to CorrDNA_CorrER
52 correlation features added to CorrPM_CorrGasdermin
39 correlation features added to CorrPM_CorrMito
38 correlation features added to CorrPM_CorrER
55 correlation features added to CorrGasdermin_CorrMito
59 correlation features added to CorrGasdermin_CorrER
35 correlation features added to CorrMito_CorrER
20 correlation features added to CorrDNA_CorrPM_CorrGasdermin
20 correlation features added to CorrDNA_CorrPM_CorrMito
17 correlation features added to CorrDNA_CorrPM_CorrER
23 correlation features added to CorrDNA_CorrGasdermin_CorrM

In [9]:
# save the dict to a toml file

toml_path = pathlib.Path(f"../results/channel_feature_combinations_{cell_type}.toml")
with open(toml_path, "w") as f:
    toml.dump(dict_of_feature_combinations, f)

# write the keys to a txt file with each key on a new line
# this is for easy retrieval of the keys in bash
txt_path = pathlib.Path("../results/feature_combinations_keys.txt")
with open(txt_path, "w") as f:
    for key in dict_of_feature_combinations:
        f.write(f"{key}\n")