In [1]:
import ast
import itertools
import pathlib
import sys
import warnings

import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import toml
from joblib import dump
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.utils import parallel_backend, shuffle

In [2]:
# Parameters
# cell line identifier; used to build the input data path and the output directory
cell_type = "SHSY5Y"
# when True, single-cell profiles are averaged per well before the split
aggregation = True
# when True, the nELISA (nomic) table is loaded and merged in by well
nomic = True
# when False, the other values in this cell are overwritten from the TOML config
flag = True
# label of the control class in oneb_Metadata_Treatment_Dose_Inhibitor_Dose
control = "DMSO_0.100_%_DMSO_0.025_%"
# label of the treatment class in the same column
treatment = "LPS_100.000_ug_per_ml_DMSO_0.025_%"

In [3]:
MODEL_TYPE = "binary_classification"
# `flag` comes from the parameters cell; when it is falsy the parameter values
# are replaced by the ones stored in the TOML config.
if not flag:
    # read in toml file and get parameters
    toml_path = pathlib.Path("../1.train_models/single_class_config.toml")
    # the context manager closes the file on exit, so no explicit close() is needed
    with open(toml_path, "r") as f:
        config = toml.load(f)
    lr_params = config["logistic_regression_params"]
    control = lr_params["control"]
    # NOTE(review): the key is plural ("treatments") but later cells use the value
    # as a single label (path component, isin([control, treatment])) — confirm the
    # config holds one string here.
    treatment = lr_params["treatments"]
    # ast.literal_eval expects a string such as "True"; presumably the config
    # stores these flags as quoted strings — TODO confirm
    aggregation = ast.literal_eval(lr_params["aggregation"])
    nomic = ast.literal_eval(lr_params["nomic"])
    cell_type = lr_params["cell_type"]
    print(aggregation, nomic, cell_type)

In [4]:
# Locate the preprocessed single-cell profile for the chosen cell type;
# strict=True makes resolve() raise right here if the parquet file is missing.
path = pathlib.Path(f"../../data/{cell_type}_preprocessed_sc_norm.parquet")
path = path.resolve(strict=True)

data_df = pd.read_parquet(path=path)

# preview the first two rows
data_df.head(n=2)

Unnamed: 0,Metadata_cell_type,Metadata_Well,Metadata_number_of_singlecells,Metadata_incubation inducer (h),Metadata_inhibitor,Metadata_inhibitor_concentration,Metadata_inhibitor_concentration_unit,Metadata_inducer1,Metadata_inducer1_concentration,Metadata_inducer1_concentration_unit,...,Nuclei_Texture_SumEntropy_CorrPM_3_01_256,Nuclei_Texture_SumVariance_CorrGasdermin_3_01_256,Nuclei_Texture_SumVariance_CorrMito_3_03_256,Nuclei_Texture_SumVariance_CorrPM_3_03_256,Metadata_Treatment,Metadata_Dose,oneb_Metadata_Treatment_Dose_Inhibitor_Dose,twob_Metadata_Treatment_Dose_Inhibitor_Dose,threeb_Metadata_Treatment_Dose_Inhibitor_Dose,fourb_Metadata_Treatment_Dose_Inhibitor_Dose
0,SH-SY5Y,B13,3765,6,Media_ctr,,,media_ctr,0.0,,...,1.851482,0.024721,0.307472,0.092086,media_ctr,0.0_None,media_ctr_0.0_0_Media_ctr_0.0_0,media_ctr_Media_ctr_nan__0.0_None,media_ctr__0.0_None__None_Media_ctr_nan,media_ctr__0.0_None__None_Media_ctr__nan
1,SH-SY5Y,B13,3765,6,Media_ctr,,,media_ctr,0.0,,...,0.897731,-0.041156,1.443262,0.009843,media_ctr,0.0_None,media_ctr_0.0_0_Media_ctr_0.0_0,media_ctr_Media_ctr_nan__0.0_None,media_ctr__0.0_None__None_Media_ctr_nan,media_ctr__0.0_None__None_Media_ctr__nan


In [5]:
# distinct treatment/dose/inhibitor labels present in the loaded data
data_df.loc[:, "oneb_Metadata_Treatment_Dose_Inhibitor_Dose"].unique()

array(['media_ctr_0.0_0_Media_ctr_0.0_0', 'DMSO_0.100_%_DMSO_1.000_%',
       'DMSO_0.100_%_Z-VAD-FMK_100.000_uM',
       'DMSO_0.100_%_Z-VAD-FMK_30.000_uM', 'DMSO_0.100_%_DMSO_0.025_%',
       'Thapsigargin_1.000_uM_DMSO_0.025_%',
       'Thapsigargin_10.000_uM_DMSO_0.025_%',
       'Topotecan_5.000_nM_DMSO_0.025_%',
       'Topotecan_10.000_nM_DMSO_0.025_%',
       'Topotecan_20.000_nM_DMSO_0.025_%',
       'LPS_0.010_ug_per_ml_DMSO_0.025_%',
       'LPS_0.100_ug_per_ml_DMSO_0.025_%',
       'LPS_1.000_ug_per_ml_DMSO_0.025_%',
       'LPS_10.000_ug_per_ml_DMSO_0.025_%',
       'LPS_10.000_ug_per_ml_Disulfiram_0.100_uM',
       'LPS_10.000_ug_per_ml_Disulfiram_1.000_uM',
       'LPS_10.000_ug_per_ml_Disulfiram_2.500_uM',
       'LPS_Nigericin_100.000_ug_per_ml_1.000_uM_DMSO_0.025_%',
       'LPS_Nigericin_100.000_ug_per_ml_3.000_uM_DMSO_0.025_%',
       'LPS_Nigericin_100.000_ug_per_ml_10.000_uM_DMSO_0.025_%',
       'Disulfiram_0.100_uM_DMSO_0.025_%',
       'Disulfiram_1.000_uM_DMSO

In [6]:
# set save path
# Every aggregation/nomic combination writes into the same directory:
#   ./indexes/<cell_type>/<MODEL_TYPE>/<control>_<treatment>
# (the combinations are distinguished by the file name chosen when saving).
save_path = pathlib.Path(f"./indexes/{cell_type}/{MODEL_TYPE}/{control}_{treatment}")

print(save_path)
# create save path if it doesn't exist
save_path.mkdir(parents=True, exist_ok=True)

indexes/SHSY5Y/binary_classification/DMSO_0.100_%_DMSO_0.025_%_LPS_100.000_ug_per_ml_DMSO_0.025_%


In [None]:
# Load the nELISA (nomic) table only when requested; otherwise df_nomic is None.
if not nomic:
    df_nomic = None
else:
    nomic_df_path = pathlib.Path(
        f"../../2.Nomic_nELISA_Analysis/Data/clean/Plate2/nELISA_plate_430420_{cell_type}.csv"
    )
    df_nomic = pd.read_csv(nomic_df_path)
    # remove every column whose name contains "[pgML]"
    pgml_columns = [name for name in df_nomic.columns if "[pgML]" in name]
    df_nomic = df_nomic.drop(columns=pgml_columns)
    # positional clean-up: first drop the columns at positions 3..24, then the
    # first two of what is left. The column at position 2 (after the [pgML]
    # removal) therefore survives — presumably the `position_x` well column
    # used for merging later; TODO confirm.
    df_nomic = df_nomic.drop(columns=df_nomic.columns[3:25])
    df_nomic = df_nomic.drop(columns=df_nomic.columns[:2])

In [None]:
# Columns whose name contains "Metadata" anywhere (the regex is unanchored), so
# e.g. "oneb_Metadata_Treatment_Dose_Inhibitor_Dose" is captured as well.
metadata = data_df.filter(regex="Metadata")

# well identifier of every row
metadata_well = metadata["Metadata_Well"]

# non-metadata columns only, with the well identifier re-attached by row index
data = data_df.drop(columns=metadata.columns).merge(
    metadata_well, left_index=True, right_index=True
)

In [None]:
# Step 1 (optional): collapse the single cells to one mean profile per well.
if aggregation:
    # every column with "Metadata" in its name
    metadata = data_df.filter(regex="Metadata")
    # non-metadata columns plus the well identifier to group on
    features_with_well = pd.concat(
        [data_df.drop(columns=metadata.columns), metadata["Metadata_Well"]], axis=1
    )
    # groupby well and take mean of each well (result is indexed by Metadata_Well)
    well_means = features_with_well.groupby("Metadata_Well").mean()
    # one metadata row per well (drop_duplicates keeps the first occurrence)
    metadata = metadata.drop_duplicates(subset=["Metadata_Well"])
    # re-attach the per-well metadata to the aggregated profiles
    data_df = pd.merge(
        well_means, metadata, left_on="Metadata_Well", right_on="Metadata_Well"
    )

# Step 2 (optional): attach the nELISA measurements by well position.
# pd.merge defaults to an inner join, so wells missing from either table are dropped.
if nomic:
    data_df = pd.merge(
        data_df, df_nomic, left_on="Metadata_Well", right_on="position_x"
    ).drop(columns=["position_x"])

In [None]:
# feature matrix: every column of data_df that is not listed in `metadata`
data_x = data_df.drop(columns=metadata.columns)
# label vector: the combined treatment/dose/inhibitor string of each row
labeled_data = data_df.loc[:, "oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]

In [None]:
# https://github.com/WayScience/phenotypic_profiling_model/blob/main/1.split_data/split_data.ipynb

In [None]:
# keep only the rows labelled as the chosen control or the chosen treatment
classes_to_keep = [control, treatment]
is_kept = data_df["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"].isin(classes_to_keep)
data_df = data_df.loc[is_kept]

In [None]:
# fraction of rows held out for testing (ex 0.15 = 15%)
test_ratio = 0.25

# stratify on the label column so both splits keep the class proportions;
# random_state is fixed so the split is reproducible
stratify_labels = data_df[["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]]
training_data, testing_data = train_test_split(
    data_df, test_size=test_ratio, random_state=0, stratify=stratify_labels
)

# index values of data_df that landed in each split
train_indexes, test_indexes = (
    split.index.to_numpy() for split in (training_data, testing_data)
)

print(f"Training data has shape: {training_data.shape}")
print(f"Testing data has shape: {testing_data.shape}")

In [None]:
# One record per row of the split: its index value in data_df and the split
# ("train" or "test") it belongs to. Train records come first, then test records.
split_records = [
    {"labeled_data_index": row_index, "label": split_label}
    for split_label, split_indexes in (("train", train_indexes), ("test", test_indexes))
    for row_index in split_indexes
]

# make index data a dataframe and sort it by labeled data index
index_data = pd.DataFrame(split_records).sort_values(by=["labeled_data_index"])

In [None]:
# save indexes as tsv file
# the file name encodes which inputs went into the split: (aggregated?, nomic?)
split_file_names = {
    (True, True): "aggregated_sc_and_nomic_data_split_indexes.tsv",
    (True, False): "aggregated_sc_data_split_indexes.tsv",
    (False, True): "sc_and_nomic_data_split_indexes.tsv",
    (False, False): "sc_split_indexes.tsv",
}
index_file_name = split_file_names[(bool(aggregation), bool(nomic))]
index_data.to_csv(f"{save_path}/{index_file_name}", sep="\t")