In [8]:
import sys
import pathlib
from collections import defaultdict

import pandas as pd
from pycytominer import feature_select

sys.path.append("../../")
from src import utils

In [9]:
# input/output directories -- both must already exist (strict resolve raises otherwise)
data_dir = pathlib.Path("../../data").resolve(strict=True)
results_dir = pathlib.Path("../../results").resolve(strict=True)

# feature-selection outputs get their own sub-folder under results
fs_dir = results_dir.joinpath("0.feature_selection").resolve()
fs_dir.mkdir(exist_ok=True)

# input files (must exist)
suppl_meta_path = data_dir.joinpath("41467_2023_36829_MOESM5_ESM.csv.gz").resolve(
    strict=True
)
screen_anno_path = data_dir.joinpath("idr0133-screenA-annotation.csv.gz").resolve(
    strict=True
)

# screen table; the leading 31 columns are split off as the metadata view
image_profile_df = pd.read_csv(screen_anno_path)
meta_colnames = image_profile_df.columns[:31]
meta_df = image_profile_df[meta_colnames]
compounds_df = meta_df[["Compound Name", "Compound Class"]]

# supplementary table: keep only the injury category and compound alias columns
suppl_meta_df = pd.read_csv(suppl_meta_path)
cell_injury_df = suppl_meta_df[["Cellular injury category", "Compound alias"]]

In [10]:
# group compound aliases by the injury category they are annotated with
injury_and_compounds = defaultdict(list)
for injury, compound in cell_injury_df.to_numpy().tolist():
    injury_and_compounds[injury].append(compound)

# for every injury category, pull the screen rows whose compound name is in
# that category's compound list and tag them with the category
injury_profiles = []
for injury_type, compound_list in injury_and_compounds.items():
    is_listed_compound = image_profile_df["Compound Name"].isin(compound_list)
    sel_profile = image_profile_df.loc[is_listed_compound]

    # the injury label becomes the very first column of the selection
    sel_profile.insert(0, "injury_type", injury_type)
    injury_profiles.append(sel_profile)

In [11]:
# stack the per-injury selections into one dataframe
injured_df = pd.concat(injury_profiles)

# drop rows whose injury label is missing, then renumber the index
injured_df = injured_df.dropna(subset=["injury_type"]).reset_index(drop=True)

# the "Plate" column is what gets counted here, so report it as plates
# (the previous label said "wells" while counting unique plate values)
print("Number of plates", injured_df["Plate"].nunique(dropna=False))

# display df
print("shape:", injured_df.shape)
injured_df.head()

Number of wells 84
shape: (6848, 404)


Unnamed: 0,injury_type,Plate,Well,Characteristics [Organism],Term Source 1 REF,Term Source 1 Accession,Characteristics [Cell Line],Term Source 2 REF,Term Source 2 Accession,Experimental Condition [Treatment time (h)],...,Nuclei_Texture_InverseDifferenceMoment_DNA_5_0,Nuclei_Texture_InverseDifferenceMoment_RNA_5_0,Nuclei_Texture_SumAverage_AGP_5_0,Nuclei_Texture_SumAverage_DNA_10_0,Nuclei_Texture_SumAverage_Mito_5_0,Nuclei_Texture_SumAverage_RNA_5_0,Nuclei_Texture_SumEntropy_DNA_10_0,Nuclei_Texture_SumEntropy_DNA_20_0,Nuclei_Texture_SumEntropy_DNA_5_0,Nuclei_Texture_SumVariance_DNA_20_0
0,Cytoskeletal,BR00110363,E17,Homo sapiens,NCBITaxon,NCBITaxon_9606,U2OS,EFO,EFO_0002869,24,...,0.561075,0.139535,0.188096,-1.035562,0.655389,0.182888,-0.004066,0.130472,-0.418286,0.283484
1,Cytoskeletal,BR00110363,E18,Homo sapiens,NCBITaxon,NCBITaxon_9606,U2OS,EFO,EFO_0002869,24,...,0.642707,0.052501,0.130166,-1.304556,0.438742,0.187985,0.088121,0.289709,-0.451626,0.461128
2,Cytoskeletal,BR00110363,E19,Homo sapiens,NCBITaxon,NCBITaxon_9606,U2OS,EFO,EFO_0002869,24,...,0.599857,0.184587,0.111444,-1.462714,0.821791,0.22949,0.121207,0.165713,-0.342221,0.388047
3,Cytoskeletal,BR00110363,E20,Homo sapiens,NCBITaxon,NCBITaxon_9606,U2OS,EFO,EFO_0002869,24,...,0.513671,0.137843,0.165498,-1.005157,0.264772,0.169579,0.142331,0.264883,-0.161366,0.337277
4,Cytoskeletal,BR00110363,E21,Homo sapiens,NCBITaxon,NCBITaxon_9606,U2OS,EFO,EFO_0002869,24,...,0.402869,0.083364,0.181626,-1.068167,0.469826,0.411077,0.427186,0.45869,-0.012347,0.658387


In [12]:
# separating meta and feature columns: the first 32 column names are kept as
# metadata, every remaining column name is treated as a feature
meta = injured_df.columns[:32].tolist()
features = injured_df.columns[32:].tolist()

In [13]:
# remove samples with missing feature values via the project helper
# (cut_off=0 -- presumably "no NaN tolerated per sample"; confirm in src/utils)
injured_df = utils.drop_na_samples(
    profile=injured_df,
    features=features,
    cut_off=0,
)

# report the new shape and preview the first rows
print("Shape after removing samples: ", injured_df.shape)
injured_df.head()

Shape after removing samples:  (6846, 404)


Unnamed: 0,Compound PubChem CID,Well,Compound IUPAC,Plate,Compound BRD,Mahalanobis distance,Experimental Condition [Treatment time (h)],Relative well cellcount,Control Type,Compound Concentration (microMolar),...,Nuclei_Texture_InverseDifferenceMoment_DNA_5_0,Nuclei_Texture_InverseDifferenceMoment_RNA_5_0,Nuclei_Texture_SumAverage_AGP_5_0,Nuclei_Texture_SumAverage_DNA_10_0,Nuclei_Texture_SumAverage_Mito_5_0,Nuclei_Texture_SumAverage_RNA_5_0,Nuclei_Texture_SumEntropy_DNA_10_0,Nuclei_Texture_SumEntropy_DNA_20_0,Nuclei_Texture_SumEntropy_DNA_5_0,Nuclei_Texture_SumVariance_DNA_20_0
0,4122.0,E17,methyl N-[6-(thiophene-2-carbonyl)-1H-benzimid...,BR00110363,BRD-K12539581-001-24-5,20.81,24,0.25,Positive,20.0,...,0.561075,0.139535,0.188096,-1.035562,0.655389,0.182888,-0.004066,0.130472,-0.418286,0.283484
1,4122.0,E18,methyl N-[6-(thiophene-2-carbonyl)-1H-benzimid...,BR00110363,BRD-K12539581-001-24-5,20.16,24,0.25,Positive,10.0,...,0.642707,0.052501,0.130166,-1.304556,0.438742,0.187985,0.088121,0.289709,-0.451626,0.461128
2,4122.0,E19,methyl N-[6-(thiophene-2-carbonyl)-1H-benzimid...,BR00110363,BRD-K12539581-001-24-5,20.42,24,0.25,Positive,5.0,...,0.599857,0.184587,0.111444,-1.462714,0.821791,0.22949,0.121207,0.165713,-0.342221,0.388047
3,4122.0,E20,methyl N-[6-(thiophene-2-carbonyl)-1H-benzimid...,BR00110363,BRD-K12539581-001-24-5,19.93,24,0.23,Positive,2.5,...,0.513671,0.137843,0.165498,-1.005157,0.264772,0.169579,0.142331,0.264883,-0.161366,0.337277
4,4122.0,E21,methyl N-[6-(thiophene-2-carbonyl)-1H-benzimid...,BR00110363,BRD-K12539581-001-24-5,17.58,24,0.21,Positive,1.25,...,0.402869,0.083364,0.181626,-1.068167,0.469826,0.411077,0.427186,0.45869,-0.012347,0.658387


In [15]:
# operations handed to pycytominer's feature_select
all_operations = [
    "variance_threshold",
    "correlation_threshold",
    "drop_na_columns",
    "blocklist",
    "drop_outliers",
]

# run feature selection on the injury profiles, restricted to the feature columns
fs_injury_df = feature_select(
    profiles=injured_df,
    features=features,
    operation=all_operations,
    na_cutoff=0.0,
    corr_threshold=0.90,
    corr_method="pearson",
    freq_cut=0.05,
    outlier_cutoff=100,
)

# write the feature-selected profile into the feature-selection results folder
fs_profile_path = fs_dir / "cell_injury_profile_fs.csv.gz"
fs_injury_df.to_csv(fs_profile_path, compression="gzip", index=False)

In [16]:
# report the shape of the feature-selected profile and preview its first rows
print("Feature selected profile shape:", fs_injury_df.shape)
fs_injury_df.head()

Feature selected profile shape: (6846, 378)


Unnamed: 0,Compound PubChem CID,Well,Compound IUPAC,Plate,Compound BRD,Mahalanobis distance,Experimental Condition [Treatment time (h)],Relative well cellcount,Control Type,Compound Concentration (microMolar),...,Nuclei_Texture_InverseDifferenceMoment_DNA_20_0,Nuclei_Texture_InverseDifferenceMoment_DNA_5_0,Nuclei_Texture_InverseDifferenceMoment_RNA_5_0,Nuclei_Texture_SumAverage_AGP_5_0,Nuclei_Texture_SumAverage_DNA_10_0,Nuclei_Texture_SumAverage_Mito_5_0,Nuclei_Texture_SumAverage_RNA_5_0,Nuclei_Texture_SumEntropy_DNA_10_0,Nuclei_Texture_SumEntropy_DNA_20_0,Nuclei_Texture_SumVariance_DNA_20_0
0,4122.0,E17,methyl N-[6-(thiophene-2-carbonyl)-1H-benzimid...,BR00110363,BRD-K12539581-001-24-5,20.81,24,0.25,Positive,20.0,...,0.09746,0.561075,0.139535,0.188096,-1.035562,0.655389,0.182888,-0.004066,0.130472,0.283484
1,4122.0,E18,methyl N-[6-(thiophene-2-carbonyl)-1H-benzimid...,BR00110363,BRD-K12539581-001-24-5,20.16,24,0.25,Positive,10.0,...,0.065539,0.642707,0.052501,0.130166,-1.304556,0.438742,0.187985,0.088121,0.289709,0.461128
2,4122.0,E19,methyl N-[6-(thiophene-2-carbonyl)-1H-benzimid...,BR00110363,BRD-K12539581-001-24-5,20.42,24,0.25,Positive,5.0,...,0.101799,0.599857,0.184587,0.111444,-1.462714,0.821791,0.22949,0.121207,0.165713,0.388047
3,4122.0,E20,methyl N-[6-(thiophene-2-carbonyl)-1H-benzimid...,BR00110363,BRD-K12539581-001-24-5,19.93,24,0.23,Positive,2.5,...,0.072294,0.513671,0.137843,0.165498,-1.005157,0.264772,0.169579,0.142331,0.264883,0.337277
4,4122.0,E21,methyl N-[6-(thiophene-2-carbonyl)-1H-benzimid...,BR00110363,BRD-K12539581-001-24-5,17.58,24,0.21,Positive,1.25,...,-0.13916,0.402869,0.083364,0.181626,-1.068167,0.469826,0.411077,0.427186,0.45869,0.658387


In [20]:
# distinct injury types present in the feature-selected profile
cell_injuries = pd.unique(fs_injury_df["injury_type"])
print("number of cell injury types", len(cell_injuries))
cell_injuries

number of cell injury types 14


array(['Cytoskeletal', 'Hsp90', 'Kinase', 'Genotoxin', 'Miscellaneous',
       'Redox', 'HDAC', 'mTOR', 'Proteasome', 'Saponin', 'Mitochondria',
       'Ferroptosis', 'Tannin', 'Nonspecific reactive'], dtype=object)