In [1]:
import sys
import pathlib
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
import plotly.express as px
from pycytominer.cyto_utils import infer_cp_features
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import pdfkit
import pyarrow as pa
import pyarrow.parquet as pq

# import Union
from typing import Union

# Add the parent of the working directory to the module search path so that
# modules located there can be imported from this notebook.
sys.path.append("..")
# from ..utils.utils import df_stats
# NOTE(review): `plt` is already bound by the earlier
# `from matplotlib import pyplot as plt`; this import repeats it.
import matplotlib.pyplot as plt

In [2]:
# Define inputs
# Location of the input Parquet feature file, relative to the working directory.
feature_file = pathlib.Path(
    "../../Extracted_Features_(CSV_files)/SHSY5Y_sc_norm.parquet"
)
# Read the full Parquet file with pyarrow and convert it to a pandas DataFrame.
feature_df = pq.read_table(feature_file).to_pandas()
# feature_df = pd.read_csv(feature_file, engine="pyarrow")

In [3]:
# Define the output file path. The processed `feature_df` is written to this
# location as Parquet by the `pq.write_table` call later in the notebook.
feature_df_out_path = pathlib.Path(
    "../../Extracted_Features_(CSV_files)/SHSY5Y_preprocessed_df_sc_norm.parquet"
)

In [4]:
# Report the (rows, columns) shape and preview the first five rows of the
# freshly loaded frame.
print(feature_df.shape)
feature_df.head()

(597902, 2926)


Unnamed: 0,Metadata_cell_type,Metadata_Well,Metadata_number_of_singlecells,Metadata_incubation inducer (h),Metadata_inhibitor,Metadata_inhibitor_concentration,Metadata_inhibitor_concentration_unit,Metadata_inducer1,Metadata_inducer1_concentration,Metadata_inducer1_concentration_unit,...,Nuclei_Texture_Variance_CorrGasdermin_3_02_256,Nuclei_Texture_Variance_CorrGasdermin_3_03_256,Nuclei_Texture_Variance_CorrMito_3_00_256,Nuclei_Texture_Variance_CorrMito_3_01_256,Nuclei_Texture_Variance_CorrMito_3_02_256,Nuclei_Texture_Variance_CorrMito_3_03_256,Nuclei_Texture_Variance_CorrPM_3_00_256,Nuclei_Texture_Variance_CorrPM_3_01_256,Nuclei_Texture_Variance_CorrPM_3_02_256,Nuclei_Texture_Variance_CorrPM_3_03_256
0,SH-SY5Y,B13,3765,6,Media ctr,,,media ctr,,,...,0.025742,0.016018,0.294581,0.313403,0.340904,0.304771,0.077819,0.076181,0.088,0.085886
1,SH-SY5Y,B13,3765,6,Media ctr,,,media ctr,,,...,-0.040428,-0.038309,1.540182,1.422456,1.459884,1.458574,0.014013,0.010482,0.008044,0.008553
2,SH-SY5Y,B13,3765,6,Media ctr,,,media ctr,,,...,-0.049183,-0.052954,0.002619,0.02267,0.01416,-0.006548,0.019567,0.035026,0.028281,0.011552
3,SH-SY5Y,B13,3765,6,Media ctr,,,media ctr,,,...,-0.057027,-0.058113,-0.018521,-0.023614,-0.013422,-0.010141,-0.005182,-0.002528,0.003988,-0.004412
4,SH-SY5Y,B13,3765,6,Media ctr,,,media ctr,,,...,-0.029618,-0.0241,-0.052448,-0.061118,-0.0535,-0.041029,0.010896,0.002573,0.008965,0.018884


In [5]:
# Costes features vary strongly across the whole data set, so every column
# whose name contains "Costes" is removed from the frame.
costes_columns = feature_df.filter(regex="Costes").columns
feature_df = feature_df.drop(columns=costes_columns)

# Show the reduced shape, then display the frame itself.
print(feature_df.shape)
feature_df

(597902, 2866)


Unnamed: 0,Metadata_cell_type,Metadata_Well,Metadata_number_of_singlecells,Metadata_incubation inducer (h),Metadata_inhibitor,Metadata_inhibitor_concentration,Metadata_inhibitor_concentration_unit,Metadata_inducer1,Metadata_inducer1_concentration,Metadata_inducer1_concentration_unit,...,Nuclei_Texture_Variance_CorrGasdermin_3_02_256,Nuclei_Texture_Variance_CorrGasdermin_3_03_256,Nuclei_Texture_Variance_CorrMito_3_00_256,Nuclei_Texture_Variance_CorrMito_3_01_256,Nuclei_Texture_Variance_CorrMito_3_02_256,Nuclei_Texture_Variance_CorrMito_3_03_256,Nuclei_Texture_Variance_CorrPM_3_00_256,Nuclei_Texture_Variance_CorrPM_3_01_256,Nuclei_Texture_Variance_CorrPM_3_02_256,Nuclei_Texture_Variance_CorrPM_3_03_256
0,SH-SY5Y,B13,3765,6,Media ctr,,,media ctr,,,...,0.025742,0.016018,0.294581,0.313403,0.340904,0.304771,0.077819,0.076181,0.088000,0.085886
1,SH-SY5Y,B13,3765,6,Media ctr,,,media ctr,,,...,-0.040428,-0.038309,1.540182,1.422456,1.459884,1.458574,0.014013,0.010482,0.008044,0.008553
2,SH-SY5Y,B13,3765,6,Media ctr,,,media ctr,,,...,-0.049183,-0.052954,0.002619,0.022670,0.014160,-0.006548,0.019567,0.035026,0.028281,0.011552
3,SH-SY5Y,B13,3765,6,Media ctr,,,media ctr,,,...,-0.057027,-0.058113,-0.018521,-0.023614,-0.013422,-0.010141,-0.005182,-0.002528,0.003988,-0.004412
4,SH-SY5Y,B13,3765,6,Media ctr,,,media ctr,,,...,-0.029618,-0.024100,-0.052448,-0.061118,-0.053500,-0.041029,0.010896,0.002573,0.008965,0.018884
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
597897,SH-SY5Y,O23,3555,6,Media ctr,,,media ctr,,,...,-0.066729,-0.065583,-0.116521,-0.123996,-0.121936,-0.117364,-0.025364,-0.035732,-0.036173,-0.029662
597898,SH-SY5Y,O23,3555,6,Media ctr,,,media ctr,,,...,-0.010271,-0.016745,-0.107070,-0.107569,-0.105465,-0.108141,-0.054601,-0.053046,-0.054858,-0.053933
597899,SH-SY5Y,O23,3555,6,Media ctr,,,media ctr,,,...,-0.050146,-0.051077,-0.084737,-0.084647,-0.084973,-0.085631,-0.029157,-0.024882,-0.027573,-0.028409
597900,SH-SY5Y,O23,3555,6,Media ctr,,,media ctr,,,...,0.051504,0.054596,0.122236,0.133660,0.127649,0.132234,0.356736,0.355430,0.357942,0.355784


In [6]:
# Replace every "/" inside string values with "_per_" so that later use of
# these strings (e.g. when they end up in file names) does not break on the
# path separator.
# Note: the replacement is applied to the whole DataFrame, not only to the
# treatment dosage column; `regex=True` makes it match "/" anywhere inside a
# string value rather than only values equal to "/".
feature_df = feature_df.replace(to_replace="/", value="_per_", regex=True)

In [7]:
# Adapted from: https://github.com/WayScience/NF1_SchwannCell_data/blob/main/5_analyze_data/notebooks/linear_model/fit_linear_model.ipynb
# Count, for each well, the rows that have a non-null "Metadata_Plate" value
# and expose that count as "Metadata_number_of_singlecells".
# NOTE(review): `cell_count_df` is not referenced again in the cells shown in
# this notebook, and `feature_df` already carries a column with the same
# name -- confirm whether this table is still needed.
cells_per_well = feature_df.groupby("Metadata_Well")["Metadata_Plate"].count()
cell_count_df = cells_per_well.rename("Metadata_number_of_singlecells").reset_index()

In [8]:
# Preview the first five rows of `feature_df` again at this point.
feature_df.head()

Unnamed: 0,Metadata_cell_type,Metadata_Well,Metadata_number_of_singlecells,Metadata_incubation inducer (h),Metadata_inhibitor,Metadata_inhibitor_concentration,Metadata_inhibitor_concentration_unit,Metadata_inducer1,Metadata_inducer1_concentration,Metadata_inducer1_concentration_unit,...,Nuclei_Texture_Variance_CorrGasdermin_3_02_256,Nuclei_Texture_Variance_CorrGasdermin_3_03_256,Nuclei_Texture_Variance_CorrMito_3_00_256,Nuclei_Texture_Variance_CorrMito_3_01_256,Nuclei_Texture_Variance_CorrMito_3_02_256,Nuclei_Texture_Variance_CorrMito_3_03_256,Nuclei_Texture_Variance_CorrPM_3_00_256,Nuclei_Texture_Variance_CorrPM_3_01_256,Nuclei_Texture_Variance_CorrPM_3_02_256,Nuclei_Texture_Variance_CorrPM_3_03_256
0,SH-SY5Y,B13,3765,6,Media ctr,,,media ctr,,,...,0.025742,0.016018,0.294581,0.313403,0.340904,0.304771,0.077819,0.076181,0.088,0.085886
1,SH-SY5Y,B13,3765,6,Media ctr,,,media ctr,,,...,-0.040428,-0.038309,1.540182,1.422456,1.459884,1.458574,0.014013,0.010482,0.008044,0.008553
2,SH-SY5Y,B13,3765,6,Media ctr,,,media ctr,,,...,-0.049183,-0.052954,0.002619,0.02267,0.01416,-0.006548,0.019567,0.035026,0.028281,0.011552
3,SH-SY5Y,B13,3765,6,Media ctr,,,media ctr,,,...,-0.057027,-0.058113,-0.018521,-0.023614,-0.013422,-0.010141,-0.005182,-0.002528,0.003988,-0.004412
4,SH-SY5Y,B13,3765,6,Media ctr,,,media ctr,,,...,-0.029618,-0.0241,-0.052448,-0.061118,-0.0535,-0.041029,0.010896,0.002573,0.008965,0.018884


In [9]:
# Allow pandas to render up to 100 columns before it truncates wide frames.
pd.options.display.max_columns = 100

In [10]:
# Missing concentrations are replaced with 0 in each of the three
# concentration columns.
concentration_columns = [
    "Metadata_inducer1_concentration",
    "Metadata_inducer2_concentration",
    "Metadata_inhibitor_concentration",
]
for column_name in concentration_columns:
    feature_df[column_name] = feature_df[column_name].fillna(0)

#### Combine inducer1 and inducer2 into a single treatment column and a single dose column

In [11]:
# A row either has a second inducer or it does not, so a single boolean mask
# is enough to choose between the "inducer1 only" and the combined value.
has_inducer2 = feature_df["Metadata_inducer2"].notnull()

# treatment column merge:
#   no inducer2 -> "<inducer1>"
#   inducer2    -> "<inducer1>_<inducer2>"
single_treatment = feature_df["Metadata_inducer1"].astype(str)
combined_treatment = (
    feature_df["Metadata_inducer1"] + "_" + feature_df["Metadata_inducer2"]
).astype(str)
feature_df["Metadata_Treatment"] = np.where(
    has_inducer2, combined_treatment, single_treatment
)

# dose column merge, following the same rule on the concentration columns:
#   no inducer2 -> "<inducer1 concentration>"
#   inducer2    -> "<inducer1 concentration>_<inducer2 concentration>"
inducer1_dose = feature_df["Metadata_inducer1_concentration"].astype(str)
inducer2_dose = feature_df["Metadata_inducer2_concentration"].astype(str)
combined_dose = inducer1_dose + "_" + inducer2_dose
feature_df["Metadata_Dose"] = np.where(has_inducer2, combined_dose, inducer1_dose)

In [12]:
# Convert the inducer1 concentration column to a numeric dtype;
# `pd.to_numeric` raises if a value cannot be parsed as a number.
# "Metadata_Dose" was already built from the string form of this column, so
# it is not affected by the conversion.
# NOTE(review): only inducer1 is converted here -- confirm that the inducer2
# and inhibitor concentration columns are already numeric.
inducer1_concentration = feature_df["Metadata_inducer1_concentration"]
feature_df["Metadata_inducer1_concentration"] = pd.to_numeric(inducer1_concentration)

## Generate the N-beta condition columns (`oneb_`, `twob_`, `threeb_`, `fourb_`)

In [13]:
def _concat_as_str(*pieces):
    """Add the given Series / separator strings left to right and cast the result to str."""
    combined = pieces[0]
    for piece in pieces[1:]:
        combined = combined + piece
    return combined.astype(str)


# The four condition columns are built from the same four components and
# differ only in component order and in the separators ("_" vs "__").
treatment = feature_df["Metadata_Treatment"]
dose = feature_df["Metadata_Dose"].astype(str)
inhibitor = feature_df["Metadata_inhibitor"].astype(str)
inhibitor_dose = feature_df["Metadata_inhibitor_concentration"].astype(str)

# <treatment>_<dose>_<inhibitor>_<inhibitor dose>
feature_df["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"] = _concat_as_str(
    treatment, "_", dose, "_", inhibitor, "_", inhibitor_dose
)

# <treatment>_<inhibitor>_<inhibitor dose>__<dose>
feature_df["twob_Metadata_Treatment_Dose_Inhibitor_Dose"] = _concat_as_str(
    treatment, "_", inhibitor, "_", inhibitor_dose, "__", dose
)

# <treatment>__<dose>__<inhibitor>_<inhibitor dose>
feature_df["threeb_Metadata_Treatment_Dose_Inhibitor_Dose"] = _concat_as_str(
    treatment, "__", dose, "__", inhibitor, "_", inhibitor_dose
)

# <treatment>__<dose>__<inhibitor>__<inhibitor dose>
feature_df["fourb_Metadata_Treatment_Dose_Inhibitor_Dose"] = _concat_as_str(
    treatment, "__", dose, "__", inhibitor, "__", inhibitor_dose
)

In [None]:
# feature_df.to_csv(feature_df_out_path, index=False)
# Convert the processed frame to an Arrow table and write it to
# `feature_df_out_path` as a Parquet file.
feature_df_table = pa.Table.from_pandas(feature_df)
pq.write_table(feature_df_table, feature_df_out_path)