In [1]:
import pathlib
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import pyarrow as pa
import papermill as pm

In [2]:
# Parameters
# Cell-type label; it is interpolated into the input and output parquet file names
# used by the cells below.
# NOTE(review): presumably overridden by papermill at execution time — confirm this
# cell carries the "parameters" tag.
celltype = "PBMC"

In [3]:
# Define inputs: locate the parquet file for the selected cell type and load it
# into a pandas DataFrame through pyarrow.
feature_file = pathlib.Path("../data") / f"{celltype}_sc_norm_fs.parquet"
feature_df = pq.read_table(feature_file).to_pandas()

In [4]:
# Strip the "µM" unit suffix from every Metadata_inducer1_concentration value so that
# only the numeric text remains. The step is skipped when the column is absent.
# regex=False requests a literal substring replacement explicitly, because the default
# of `regex` in Series.str.replace differs between pandas 1.x and pandas 2.x.
if "Metadata_inducer1_concentration" in feature_df.columns:
    feature_df["Metadata_inducer1_concentration"] = feature_df[
        "Metadata_inducer1_concentration"
    ].str.replace("µM", "", regex=False)

In [5]:
# Show the distinct inducer1 concentration values as the cell output.
# Unlike the unit-stripping cell, this lookup is not guarded, so it raises a KeyError
# when the column is missing.
feature_df["Metadata_inducer1_concentration"].unique()

array(['0.010', '0.100', '100.000', None, '1.000', '10.000', '2.5',
       '2.500', '5.000', '20.000'], dtype=object)

In [6]:
# Output location for the preprocessed frame, named after the selected cell type.
feature_df_out_path = (
    pathlib.Path("../data") / f"{celltype}_preprocessed_sc_norm.parquet"
)

In [7]:
# Print the (rows, columns) shape, then display the first rows as the cell output.
print(feature_df.shape)
feature_df.head()

(5598682, 1266)


Unnamed: 0,Metadata_cell_type,Metadata_Well,Metadata_number_of_singlecells,Metadata_incubation inducer (h),Metadata_inhibitor,Metadata_inhibitor_concentration,Metadata_inhibitor_concentration_unit,Metadata_inducer1,Metadata_inducer1_concentration,Metadata_inducer1_concentration_unit,...,Nuclei_Texture_InverseDifferenceMoment_CorrER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_CorrMito_3_02_256,Nuclei_Texture_InverseDifferenceMoment_CorrPM_3_00_256,Nuclei_Texture_InverseDifferenceMoment_CorrPM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_CorrPM_3_02_256,Nuclei_Texture_InverseDifferenceMoment_CorrPM_3_03_256,Nuclei_Texture_SumEntropy_CorrPM_3_01_256,Nuclei_Texture_SumVariance_CorrPM_3_01_256,Nuclei_Texture_Variance_CorrER_3_00_256,Nuclei_Texture_Variance_CorrGasdermin_3_00_256
0,PBMC,B02,34618,6,DMSO,0.025,%,LPS,0.01,µg/ml,...,1.326095,0.584946,0.13338,0.729722,0.2677,-0.122157,0.10825,-0.068544,-0.205295,-0.197257
1,PBMC,B02,34618,6,DMSO,0.025,%,LPS,0.01,µg/ml,...,0.908872,0.230972,0.994988,0.866751,0.636846,0.874833,-0.929131,-0.158906,-0.198572,-0.185623
2,PBMC,B02,34618,6,DMSO,0.025,%,LPS,0.01,µg/ml,...,0.129185,1.223301,0.79899,0.717814,0.883606,0.699406,-1.227278,-0.163521,-0.16514,-0.174762
3,PBMC,B02,34618,6,DMSO,0.025,%,LPS,0.01,µg/ml,...,-1.393037,0.476057,-2.821609,-1.176345,-1.087953,-2.883989,2.15391,0.586555,0.258023,0.218118
4,PBMC,B02,34618,6,DMSO,0.025,%,LPS,0.01,µg/ml,...,-0.874583,-0.399795,-0.078365,0.232356,0.014621,0.20652,-0.557309,-0.14898,-0.12286,-0.110851


In [8]:
# Costes features behave with great variance across all of the data, so every column
# whose name matches "Costes" is removed before the shape and preview are shown.
costes_columns = feature_df.filter(regex="Costes").columns
feature_df = feature_df.drop(columns=costes_columns)
print(feature_df.shape)
feature_df.head()

(5598682, 1264)


Unnamed: 0,Metadata_cell_type,Metadata_Well,Metadata_number_of_singlecells,Metadata_incubation inducer (h),Metadata_inhibitor,Metadata_inhibitor_concentration,Metadata_inhibitor_concentration_unit,Metadata_inducer1,Metadata_inducer1_concentration,Metadata_inducer1_concentration_unit,...,Nuclei_Texture_InverseDifferenceMoment_CorrER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_CorrMito_3_02_256,Nuclei_Texture_InverseDifferenceMoment_CorrPM_3_00_256,Nuclei_Texture_InverseDifferenceMoment_CorrPM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_CorrPM_3_02_256,Nuclei_Texture_InverseDifferenceMoment_CorrPM_3_03_256,Nuclei_Texture_SumEntropy_CorrPM_3_01_256,Nuclei_Texture_SumVariance_CorrPM_3_01_256,Nuclei_Texture_Variance_CorrER_3_00_256,Nuclei_Texture_Variance_CorrGasdermin_3_00_256
0,PBMC,B02,34618,6,DMSO,0.025,%,LPS,0.01,µg/ml,...,1.326095,0.584946,0.13338,0.729722,0.2677,-0.122157,0.10825,-0.068544,-0.205295,-0.197257
1,PBMC,B02,34618,6,DMSO,0.025,%,LPS,0.01,µg/ml,...,0.908872,0.230972,0.994988,0.866751,0.636846,0.874833,-0.929131,-0.158906,-0.198572,-0.185623
2,PBMC,B02,34618,6,DMSO,0.025,%,LPS,0.01,µg/ml,...,0.129185,1.223301,0.79899,0.717814,0.883606,0.699406,-1.227278,-0.163521,-0.16514,-0.174762
3,PBMC,B02,34618,6,DMSO,0.025,%,LPS,0.01,µg/ml,...,-1.393037,0.476057,-2.821609,-1.176345,-1.087953,-2.883989,2.15391,0.586555,0.258023,0.218118
4,PBMC,B02,34618,6,DMSO,0.025,%,LPS,0.01,µg/ml,...,-0.874583,-0.399795,-0.078365,0.232356,0.014621,0.20652,-0.557309,-0.14898,-0.12286,-0.110851


In [9]:
# Swap every "/" found in string values for "_per_" so that labels built from these
# values do not break file-name interpolation later on.
# The replacement is requested for the whole frame, not only for the dosage column.
slash_pattern = "/"
slash_substitute = "_per_"
feature_df = feature_df.replace(
    to_replace=slash_pattern, value=slash_substitute, regex=True
)

In [10]:
# Missing concentrations are replaced with 0 in each of the three concentration columns.
for concentration_column in (
    "Metadata_inducer1_concentration",
    "Metadata_inducer2_concentration",
    "Metadata_inhibitor_concentration",
):
    feature_df[concentration_column] = feature_df[concentration_column].fillna(0)

#### Combine Inducer1 and Inducer2 into one column

In [11]:
# Rows without a second inducer keep the inducer1 value on its own; rows with a second
# inducer get the two values joined by "_". The same rule is applied to the
# concentrations. Metadata_inducer2 is not modified here, so the masks are built once.
inducer2_missing = feature_df["Metadata_inducer2"].isnull()
inducer2_present = feature_df["Metadata_inducer2"].notnull()
inducer2_masks = [inducer2_missing, inducer2_present]

# treatment column merge
inducer1_name = feature_df["Metadata_inducer1"].astype(str)
combined_inducer_name = (
    feature_df["Metadata_inducer1"] + "_" + feature_df["Metadata_inducer2"]
).astype(str)
feature_df["Metadata_Treatment"] = np.select(
    condlist=inducer2_masks,
    choicelist=[inducer1_name, combined_inducer_name],
)

# dose column merge
# NOTE(review): the concentrations are still strings at this point, so values such as
# '2.5' and '2.500' produce different Metadata_Dose labels — confirm this is intended.
inducer1_dose = feature_df["Metadata_inducer1_concentration"].astype(str)
inducer2_dose = feature_df["Metadata_inducer2_concentration"].astype(str)
combined_dose = (inducer1_dose + "_" + inducer2_dose).astype(str)
feature_df["Metadata_Dose"] = np.select(
    condlist=inducer2_masks,
    choicelist=[inducer1_dose, combined_dose],
)

In [12]:
# Convert the inducer1 concentration column to a numeric dtype. This happens after
# Metadata_Dose was assembled, so that column keeps the original string values.
inducer1_concentration = feature_df["Metadata_inducer1_concentration"]
feature_df["Metadata_inducer1_concentration"] = pd.to_numeric(inducer1_concentration)

## N Beta Column condition generation
Columns generated to be used for linear modeling, where each term separated by '__' will be a beta coefficient.

In [13]:
# String pieces shared by the four label columns below. A single "_" joins values that
# belong to the same beta term; a double "__" separates one beta term from the next.
treatment_label = feature_df["Metadata_Treatment"]
dose_label = feature_df["Metadata_Dose"].astype(str)
inhibitor_label = feature_df["Metadata_inhibitor"].astype(str)
inhibitor_dose_label = feature_df["Metadata_inhibitor_concentration"].astype(str)

# one beta: inducer, inducer concentration, inhibitor, and inhibitor concentration
# all combined into a single beta term
feature_df["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"] = (
    treatment_label + "_" + dose_label + "_" + inhibitor_label + "_" + inhibitor_dose_label
).astype(str)

# two beta: inducer, inhibitor, and inhibitor concentration as the 1st beta term,
# inducer concentration as the 2nd beta term
feature_df["twob_Metadata_Treatment_Dose_Inhibitor_Dose"] = (
    treatment_label + "_" + inhibitor_label + "_" + inhibitor_dose_label + "__" + dose_label
).astype(str)

# three beta: inducer as the 1st beta term, inducer concentration as the 2nd,
# inhibitor together with inhibitor concentration as the 3rd
feature_df["threeb_Metadata_Treatment_Dose_Inhibitor_Dose"] = (
    treatment_label + "__" + dose_label + "__" + inhibitor_label + "_" + inhibitor_dose_label
).astype(str)

# four beta: inducer, inducer concentration, inhibitor, and inhibitor concentration
# each as a separate beta term
feature_df["fourb_Metadata_Treatment_Dose_Inhibitor_Dose"] = (
    treatment_label + "__" + dose_label + "__" + inhibitor_label + "__" + inhibitor_dose_label
).astype(str)

In [14]:
# Convert the preprocessed frame to an Arrow table and write it to the output path.
pq.write_table(pa.Table.from_pandas(feature_df), feature_df_out_path)