In [1]:
import pathlib
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import pyarrow as pa

In [2]:
# Define inputs
# Relative path of the parquet file that holds the feature table to preprocess.
feature_file = pathlib.Path(
    "../../Extracted_Features_(CSV_files)/SHSY5Y_sc_norm.parquet"
)
# Read the parquet file with pyarrow and convert the full table to a pandas DataFrame.
feature_df = pq.read_table(feature_file).to_pandas()

In [3]:
# define output file path
# Relative path of the parquet file that will receive the preprocessed DataFrame.
feature_df_out_path = pathlib.Path(
    "../../Extracted_Features_(CSV_files)/SHSY5Y_preprocessed_df_sc_norm.parquet"
)

In [4]:
# Report the (rows, columns) shape, then show the first rows as the cell output.
print(feature_df.shape)
feature_df.head()

(597902, 2926)


Unnamed: 0,Metadata_cell_type,Metadata_Well,Metadata_number_of_singlecells,Metadata_incubation inducer (h),Metadata_inhibitor,Metadata_inhibitor_concentration,Metadata_inhibitor_concentration_unit,Metadata_inducer1,Metadata_inducer1_concentration,Metadata_inducer1_concentration_unit,...,Nuclei_Texture_Variance_CorrGasdermin_3_02_256,Nuclei_Texture_Variance_CorrGasdermin_3_03_256,Nuclei_Texture_Variance_CorrMito_3_00_256,Nuclei_Texture_Variance_CorrMito_3_01_256,Nuclei_Texture_Variance_CorrMito_3_02_256,Nuclei_Texture_Variance_CorrMito_3_03_256,Nuclei_Texture_Variance_CorrPM_3_00_256,Nuclei_Texture_Variance_CorrPM_3_01_256,Nuclei_Texture_Variance_CorrPM_3_02_256,Nuclei_Texture_Variance_CorrPM_3_03_256
0,SH-SY5Y,B13,3765,6,Media ctr,,,media ctr,,,...,0.025742,0.016018,0.294581,0.313403,0.340904,0.304771,0.077819,0.076181,0.088,0.085886
1,SH-SY5Y,B13,3765,6,Media ctr,,,media ctr,,,...,-0.040428,-0.038309,1.540182,1.422456,1.459884,1.458574,0.014013,0.010482,0.008044,0.008553
2,SH-SY5Y,B13,3765,6,Media ctr,,,media ctr,,,...,-0.049183,-0.052954,0.002619,0.02267,0.01416,-0.006548,0.019567,0.035026,0.028281,0.011552
3,SH-SY5Y,B13,3765,6,Media ctr,,,media ctr,,,...,-0.057027,-0.058113,-0.018521,-0.023614,-0.013422,-0.010141,-0.005182,-0.002528,0.003988,-0.004412
4,SH-SY5Y,B13,3765,6,Media ctr,,,media ctr,,,...,-0.029618,-0.0241,-0.052448,-0.061118,-0.0535,-0.041029,0.010896,0.002573,0.008965,0.018884


In [5]:
# Costes features vary widely across the whole dataset, so every column whose
# name contains "Costes" is dropped before further processing.
costes_columns = feature_df.filter(like="Costes").columns
feature_df = feature_df.drop(columns=costes_columns)

# Confirm the reduced column count and preview the result.
print(feature_df.shape)
feature_df.head()

(597902, 2866)


Unnamed: 0,Metadata_cell_type,Metadata_Well,Metadata_number_of_singlecells,Metadata_incubation inducer (h),Metadata_inhibitor,Metadata_inhibitor_concentration,Metadata_inhibitor_concentration_unit,Metadata_inducer1,Metadata_inducer1_concentration,Metadata_inducer1_concentration_unit,...,Nuclei_Texture_Variance_CorrGasdermin_3_02_256,Nuclei_Texture_Variance_CorrGasdermin_3_03_256,Nuclei_Texture_Variance_CorrMito_3_00_256,Nuclei_Texture_Variance_CorrMito_3_01_256,Nuclei_Texture_Variance_CorrMito_3_02_256,Nuclei_Texture_Variance_CorrMito_3_03_256,Nuclei_Texture_Variance_CorrPM_3_00_256,Nuclei_Texture_Variance_CorrPM_3_01_256,Nuclei_Texture_Variance_CorrPM_3_02_256,Nuclei_Texture_Variance_CorrPM_3_03_256
0,SH-SY5Y,B13,3765,6,Media ctr,,,media ctr,,,...,0.025742,0.016018,0.294581,0.313403,0.340904,0.304771,0.077819,0.076181,0.088,0.085886
1,SH-SY5Y,B13,3765,6,Media ctr,,,media ctr,,,...,-0.040428,-0.038309,1.540182,1.422456,1.459884,1.458574,0.014013,0.010482,0.008044,0.008553
2,SH-SY5Y,B13,3765,6,Media ctr,,,media ctr,,,...,-0.049183,-0.052954,0.002619,0.02267,0.01416,-0.006548,0.019567,0.035026,0.028281,0.011552
3,SH-SY5Y,B13,3765,6,Media ctr,,,media ctr,,,...,-0.057027,-0.058113,-0.018521,-0.023614,-0.013422,-0.010141,-0.005182,-0.002528,0.003988,-0.004412
4,SH-SY5Y,B13,3765,6,Media ctr,,,media ctr,,,...,-0.029618,-0.0241,-0.052448,-0.061118,-0.0535,-0.041029,0.010896,0.002573,0.008965,0.018884


In [6]:
# Swap every "/" inside string values for "_per_" so that values later used to
# build file names do not contain a path separator. The replacement is applied
# to the whole DataFrame, not just a single dosage column.
feature_df = feature_df.replace(regex="/", value="_per_")

In [None]:
# Missing concentrations are replaced with 0 for both inducers and the inhibitor.
for concentration_column in (
    "Metadata_inducer1_concentration",
    "Metadata_inducer2_concentration",
    "Metadata_inhibitor_concentration",
):
    feature_df[concentration_column] = feature_df[concentration_column].fillna(0)

#### Combine Inducer1 and Inducer2 into one column

In [None]:
# Rows without a second inducer keep the inducer1 value on its own; rows with a
# second inducer get the two values joined by "_". The same rule is applied to
# the treatment names and to the concentrations.
inducer2_missing = feature_df["Metadata_inducer2"].isnull()

# treatment column merge
feature_df["Metadata_Treatment"] = np.where(
    inducer2_missing,
    feature_df["Metadata_inducer1"].astype(str),
    (feature_df["Metadata_inducer1"] + "_" + feature_df["Metadata_inducer2"]).astype(
        str
    ),
)

# dose column merge
inducer1_dose = feature_df["Metadata_inducer1_concentration"].astype(str)
inducer2_dose = feature_df["Metadata_inducer2_concentration"].astype(str)
feature_df["Metadata_Dose"] = np.where(
    inducer2_missing,
    inducer1_dose,
    inducer1_dose + "_" + inducer2_dose,
)

In [None]:
# Convert the inducer1 concentration column to a numeric dtype
# (pd.to_numeric raises if a value cannot be parsed).
feature_df["Metadata_inducer1_concentration"] = feature_df[
    "Metadata_inducer1_concentration"
].pipe(pd.to_numeric)

## N Beta Column condition generation
Columns generated for use in linear modeling; terms separated by '__' will each be a separate beta coefficient.

In [None]:
# Components shared by all four label columns. The treatment is used as stored;
# the other three are cast to strings before being concatenated.
treatment = feature_df["Metadata_Treatment"]
dose = feature_df["Metadata_Dose"].astype(str)
inhibitor = feature_df["Metadata_inhibitor"].astype(str)
inhibitor_dose = feature_df["Metadata_inhibitor_concentration"].astype(str)

# one beta: treatment, dose, inhibitor, and inhibitor concentration together
# form a single beta term
feature_df["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"] = (
    treatment + "_" + dose + "_" + inhibitor + "_" + inhibitor_dose
).astype(str)

# two beta: treatment, inhibitor, and inhibitor concentration form the 1st beta
# term; the dose is the 2nd beta term
feature_df["twob_Metadata_Treatment_Dose_Inhibitor_Dose"] = (
    treatment + "_" + inhibitor + "_" + inhibitor_dose + "__" + dose
).astype(str)

# three beta: treatment is the 1st beta term, dose the 2nd, and inhibitor with
# its concentration the 3rd
feature_df["threeb_Metadata_Treatment_Dose_Inhibitor_Dose"] = (
    treatment + "__" + dose + "__" + inhibitor + "_" + inhibitor_dose
).astype(str)

# four beta: treatment, dose, inhibitor, and inhibitor concentration are each
# their own beta term
feature_df["fourb_Metadata_Treatment_Dose_Inhibitor_Dose"] = (
    treatment + "__" + dose + "__" + inhibitor + "__" + inhibitor_dose
).astype(str)

In [None]:
# Persist the preprocessed frame: convert it to an Arrow table, then write that
# table to the configured parquet output path.
feature_df_table = pa.Table.from_pandas(feature_df)
pq.write_table(feature_df_table, where=feature_df_out_path)