In [1]:
import pyarrow.parquet as pq
import pandas as pd
import numpy as np
import pathlib
import warnings

from sklearn.utils import shuffle, parallel_backend
from sklearn.exceptions import ConvergenceWarning
from joblib import dump

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    StratifiedKFold,
    GridSearchCV,
)
import toml
import ast
import itertools
import sys

In [2]:
# read in toml file and get parameters
toml_path = pathlib.Path("single_class_config.toml")
with open(toml_path, "r") as f:
    config = toml.load(f)
f.close()
control = config["logistic_regression_params"]["control"]
treatment = config["logistic_regression_params"]["treatments"]
aggregation = ast.literal_eval(config["logistic_regression_params"]["aggregation"])
nomic = ast.literal_eval(config["logistic_regression_params"]["nomic"])
cell_type = config["logistic_regression_params"]["cell_type"]
print(aggregation, nomic, cell_type)

True True SHSY5Y


In [3]:
# load training data from indexes and features dataframe
# data_split_path = pathlib.Path(f"../0.split_data/indexes/data_split_indexes.tsv")
data_path = pathlib.Path("../../data/SHSY5Y_preprocessed_sc_norm.parquet")

# dataframe with only the labeled data we want (exclude certain phenotypic classes)
data_df = pq.read_table(data_path).to_pandas()

# import nomic data
nomic_df_path = pathlib.Path(
    f"../../2.Nomic_nELISA_Analysis/Data/clean/Plate2/nELISA_plate_430420_{cell_type}.csv"
)
df_nomic = pd.read_csv(nomic_df_path)

# clean up nomic data
df_nomic = df_nomic.drop(columns=[col for col in df_nomic.columns if "[pgML]" in col])
# drop first 25 columns (Metadata that is not needed)
df_nomic = df_nomic.drop(columns=df_nomic.columns[3:25])
df_nomic = df_nomic.drop(columns=df_nomic.columns[0:2])

In [4]:
metadata = data_df.filter(regex="Metadata")
data_df = data_df.drop(metadata.columns, axis=1)
data_df = pd.concat([data_df, metadata["Metadata_Well"]], axis=1)
# groupby well and take mean of each well
data_df = data_df.groupby("Metadata_Well").mean()
# drop duplicate rows in the metadata_well column
metadata = metadata.drop_duplicates(subset=["Metadata_Well"])
# get the metadata for each well
data_df = pd.merge(data_df, metadata, left_on="Metadata_Well", right_on="Metadata_Well")
data_df = pd.merge(data_df, df_nomic, left_on="Metadata_Well", right_on="position_x")
data_df = data_df.drop(columns=["position_x"])

In [5]:
if (aggregation == True) and (nomic == True):
    data_split_path = pathlib.Path(
        f"../0.split_data/indexes/aggregated_sc_and_nomic_data_split_indexes.tsv"
    )
    data_split_indexes = pd.read_csv(data_split_path, sep="\t", index_col=0)
    # subset each column that contains metadata
    metadata = data_df.filter(regex="Metadata")
    data_df = data_df.drop(metadata.columns, axis=1)
    data_df = pd.concat([data_df, metadata["Metadata_Well"]], axis=1)
    # groupby well and take mean of each well
    data_df = data_df.groupby("Metadata_Well").mean()
    # drop duplicate rows in the metadata_well column
    metadata = metadata.drop_duplicates(subset=["Metadata_Well"])
    # get the metadata for each well
    data_df = pd.merge(
        data_df, metadata, left_on="Metadata_Well", right_on="Metadata_Well"
    )
    data_df = pd.merge(
        data_df, df_nomic, left_on="Metadata_Well", right_on="position_x"
    )
    data_df = data_df.drop(columns=["position_x"])
elif (aggregation == True) and (nomic == False):
    data_split_path = pathlib.Path(
        f"../0.split_data/indexes/aggregated_sc_data_split_indexes.tsv"
    )
    data_split_indexes = pd.read_csv(data_split_path, sep="\t", index_col=0)
    # subset each column that contains metadata
    metadata = data_df.filter(regex="Metadata")
    data_df = data_df.drop(metadata.columns, axis=1)
    data_df = pd.concat([data_df, metadata["Metadata_Well"]], axis=1)
    # groupby well and take mean of each well
    data_df = data_df.groupby("Metadata_Well").mean()
    # drop duplicate rows in the metadata_well column
    metadata = metadata.drop_duplicates(subset=["Metadata_Well"])
    # get the metadata for each well
    data_df = pd.merge(
        data_df, metadata, left_on="Metadata_Well", right_on="Metadata_Well"
    )
    data_df = pd.merge(
        data_df, df_nomic, left_on="Metadata_Well", right_on="position_x"
    )
    data_df = data_df.drop(columns=["position_x"])
elif (aggregation == False) and (nomic == True):
    data_split_path = pathlib.Path(
        f"../0.split_data/indexes/sc_and_nomic_data_split_indexes.tsv"
    )
    data_split_indexes = pd.read_csv(data_split_path, sep="\t", index_col=0)
    data_df = pd.merge(
        data_df, df_nomic, left_on="Metadata_Well", right_on="position_x"
    )
    data_df = data_df.drop(columns=["position_x"])
elif aggregation == False and nomic == False:
    data_split_path = pathlib.Path(f"../0.split_data/indexes/sc_split_indexes.tsv")
    data_split_indexes = pd.read_csv(data_split_path, sep="\t", index_col=0)
else:
    print("Error")

In [6]:
# if nomic == True:
#     # import nomic data
#     nomic_df_path = pathlib.Path(
#         f"../../2.Nomic_nELISA_Analysis/Data/clean/Plate2/nELISA_plate_430420_{cell_type}.csv"
#     )
#     df_nomic = pd.read_csv(nomic_df_path)
#     # drop columns that contain [pgML] but keep the NSU (Normalized Standard Units) columns
#     df_nomic = df_nomic.drop(
#         columns=[col for col in df_nomic.columns if "[pgML]" in col]
#     )
#     # drop first 25 columns (Metadata that is not needed)
#     df_nomic = df_nomic.drop(columns=df_nomic.columns[3:25])
#     df_nomic = df_nomic.drop(columns=df_nomic.columns[0:2])

#     data_df = pd.merge(data_df, df_nomic, left_on="Metadata_Well", right_on="position_x")
# elif nomic == False:
#     pass
# else:
#     print('Error')

In [7]:
# subset data_df by indexes in data_split_indexes
training_data = data_df.loc[data_split_indexes["labeled_data_index"]]

In [8]:
training_data

Unnamed: 0,Metadata_Well,Cytoplasm_AreaShape_Compactness,Cytoplasm_AreaShape_Eccentricity,Cytoplasm_AreaShape_Extent,Cytoplasm_AreaShape_FormFactor,Cytoplasm_AreaShape_MajorAxisLength,Cytoplasm_AreaShape_MinorAxisLength,Cytoplasm_AreaShape_Orientation,Cytoplasm_AreaShape_Zernike_0_0,Cytoplasm_AreaShape_Zernike_1_1,...,TWEAK [NSU]_y,uPA [NSU]_y,VCAM-1 [NSU]_y,VEGF Receptor 2 (Flk-1) [NSU]_y,VEGF-A (165) [NSU]_y,VEGF-C [NSU]_y,VEGF-D [NSU]_y,VEGFR-1 [NSU]_y,WISP-1 (CCN4) [NSU]_y,XCL1 (Lymphotactin) [NSU]_y
0,B13,0.000263,0.050292,0.011215,-0.032031,0.139148,0.092653,-0.022733,-0.004550,-0.019608,...,-0.225271,2.011257,-1.313502,-0.528295,26.484897,-0.751687,-0.891735,-1.624976,1.324507,-0.622436
1,B14,-0.063223,-0.001418,0.035864,0.036794,0.037936,0.031201,-0.012884,0.028338,0.019985,...,-0.894582,1.230590,-0.966358,-0.353436,31.847475,-1.216944,-2.187799,0.107920,1.367803,0.180895
2,B15,-0.062009,0.001236,0.044042,0.030464,-0.002026,0.006311,0.010789,0.030538,0.022751,...,0.114074,2.302291,1.364515,-0.062003,30.013129,-0.001147,0.460551,0.005704,-0.403832,-1.415898
3,B16,-0.031699,0.047344,-0.003990,0.002975,0.115183,0.070404,-0.007908,-0.010212,-0.004997,...,-1.402026,1.120312,0.248420,-1.413591,28.235605,-1.411162,-0.597302,0.644832,-0.112562,-1.287083
4,B17,-0.045468,0.038261,0.034279,0.023820,0.163262,0.120615,-0.000391,0.018250,-0.015776,...,-1.075493,1.131858,-0.763617,-1.051842,25.473988,-1.297297,0.987359,-0.184818,1.174355,-1.467287
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149,O19,-0.030737,0.025373,0.019060,0.013307,0.087081,0.062723,0.030207,0.011448,-0.006952,...,-0.638728,2.325251,1.088603,-2.413005,30.040230,0.150468,2.537893,-2.072348,1.365336,0.546843
150,O20,-0.008031,0.149927,-0.087296,-0.039525,0.227378,-0.005209,0.012483,-0.121338,-0.041426,...,0.191073,1.324194,0.085482,-2.060541,26.633495,-0.656278,1.078025,-0.021887,0.382966,-0.827056
151,O21,-0.039158,0.083196,-0.033546,-0.022291,0.151747,0.022673,0.001550,-0.070861,-0.024675,...,-0.295899,2.450049,0.839780,-0.387338,30.082969,-0.812300,1.301290,0.240628,0.458829,-0.726808
152,O22,-0.078584,0.011797,0.072764,0.058422,0.033983,0.037911,0.019646,0.063310,0.006814,...,-0.328278,2.187440,0.442932,-0.330779,34.600973,-1.784995,-0.022841,0.475823,0.490670,-1.055123


In [9]:
training_data["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"].unique()

array(['media ctr_0_Media ctr_0.0', 'DMSO_0.100_DMSO_0.025',
       'LPS_0.010_DMSO_0.025', 'LPS_Nigericin_100.000_1.0_DMSO_0.025',
       'DMSO_0.100_DMSO_1.0', 'LPS_0.100_DMSO_0.025',
       'LPS_Nigericin_100.000_3.0_DMSO_0.025',
       'DMSO_0.100_Z-VAD-FMK_100.0', 'Thapsigargin_1.000_DMSO_0.025',
       'LPS_1.000_DMSO_0.025', 'LPS_Nigericin_100.000_10.0_DMSO_0.025',
       'DMSO_0.100_Z-VAD-FMK_30.0', 'Thapsigargin_10.000_DMSO_0.025',
       'LPS_10.000_DMSO_0.025', 'Disulfiram_0.100_DMSO_0.025',
       'Topotecan_5.000_DMSO_0.025', 'LPS_10.000_Disulfiram_0.1',
       'Disulfiram_1.000_DMSO_0.025', 'Topotecan_10.000_DMSO_0.025',
       'LPS_10.000_Disulfiram_1.0', 'Disulfiram_2.500_DMSO_0.025',
       'Topotecan_20.000_DMSO_0.025', 'LPS_10.000_Disulfiram_2.5',
       'H2O2_100.000_DMSO_0.025', 'LPS_10.000_Z-VAD-FMK_100.0',
       'LPS_100.000_DMSO_0.025', 'H2O2_100.000_Disulfiram_1.0',
       'LPS_Nigericin_1.000_1.0_DMSO_0.025',
       'H2O2_100.000_Z-VAD-FMK_100.0',
       'LPS

In [10]:
control
treatment

'LPS_100.000_DMSO_0.025'

In [11]:
# get oneb_Metadata_Treatment_Dose_Inhibitor_Dose  =='DMSO_0.100_DMSO_0.025' and 'LPS_100.000_DMSO_0.025 and Thapsigargin_10.000_DMSO_0.025'
training_data = training_data[
    training_data["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"].isin(
        [control, treatment]
    )
]

In [12]:
training_data

Unnamed: 0,Metadata_Well,Cytoplasm_AreaShape_Compactness,Cytoplasm_AreaShape_Eccentricity,Cytoplasm_AreaShape_Extent,Cytoplasm_AreaShape_FormFactor,Cytoplasm_AreaShape_MajorAxisLength,Cytoplasm_AreaShape_MinorAxisLength,Cytoplasm_AreaShape_Orientation,Cytoplasm_AreaShape_Zernike_0_0,Cytoplasm_AreaShape_Zernike_1_1,...,TWEAK [NSU]_y,uPA [NSU]_y,VCAM-1 [NSU]_y,VEGF Receptor 2 (Flk-1) [NSU]_y,VEGF-A (165) [NSU]_y,VEGF-C [NSU]_y,VEGF-D [NSU]_y,VEGFR-1 [NSU]_y,WISP-1 (CCN4) [NSU]_y,XCL1 (Lymphotactin) [NSU]_y
1,B14,-0.063223,-0.001418,0.035864,0.036794,0.037936,0.031201,-0.012884,0.028338,0.019985,...,-0.894582,1.23059,-0.966358,-0.353436,31.847475,-1.216944,-2.187799,0.10792,1.367803,0.180895
2,B15,-0.062009,0.001236,0.044042,0.030464,-0.002026,0.006311,0.010789,0.030538,0.022751,...,0.114074,2.302291,1.364515,-0.062003,30.013129,-0.001147,0.460551,0.005704,-0.403832,-1.415898
12,C14,-0.041941,0.009593,0.016816,0.024724,-0.004871,-0.008984,-0.005426,0.025566,0.021008,...,-1.491348,0.753051,-0.353624,-1.750361,30.798287,-1.081958,0.381804,0.434433,1.076929,-0.690716
13,C15,-0.006042,0.063114,-0.011234,-0.020169,0.08216,0.026797,0.011032,-0.029677,-0.007507,...,-1.443101,1.668837,-0.8097,-1.622407,31.579312,-0.575102,1.046585,0.759533,1.462224,-1.570872
78,I14,-0.075586,-0.016629,0.070873,0.057914,-0.026897,-1.8e-05,-0.005342,0.08084,0.028966,...,-0.457974,1.789284,-0.078388,-1.130214,37.210404,0.136881,-0.230386,0.3931,1.519351,-0.957626
79,I15,-0.024548,0.008616,-0.002599,-0.010955,-0.030797,-0.031666,-0.015508,-0.003642,0.019646,...,-0.317506,1.276972,-0.184904,-0.250085,32.890864,-0.768775,0.201755,-0.051009,-0.73964,-0.182375
89,J14,0.344647,-0.060299,-0.227045,-0.159461,-0.075467,-0.04793,0.01425,-0.192829,-0.142429,...,0.369673,1.684895,0.885664,-2.195317,31.049379,-0.52378,-1.417898,1.516971,-0.951626,-0.961993
90,J15,-0.069172,0.002143,0.069379,0.037843,0.032119,0.029286,0.004375,0.05482,0.035552,...,-0.458767,1.322881,1.159109,-1.146321,34.579637,-1.392158,-1.62785,0.085441,-0.331116,0.629028
91,J16,-0.109533,-0.005858,0.157345,0.099615,0.222094,0.251497,-0.002614,0.141421,0.014662,...,-0.841824,3.584168,0.595476,-1.306669,37.195157,-0.515906,-1.393269,-0.998885,0.609292,-0.667795
92,J17,-0.119163,0.01234,0.144394,0.09486,0.214214,0.225011,0.026657,0.139912,0.039543,...,-0.356108,4.114721,-1.058368,-1.528527,35.417427,0.481774,-2.281248,-0.090374,-1.690186,0.137963


In [13]:
# at random downsample the DMSO treatment to match the number of wells in the LPS treatment
# get the number of wells in the LPS treatment
lps_wells = training_data[
    training_data["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"] == treatment
].shape[0]
# get the number of wells in the DMSO treatment
dmso_wells = training_data[
    training_data["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"] == control
].shape[0]
# downsample the DMSO treatment to match the number of wells in the LPS treatment
dmso_holdout = training_data[
    training_data["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"] == control
].sample(n=lps_wells)
# remove the downsampled DMSO wells from the data
training_data = training_data.drop(dmso_holdout.index)

In [14]:
training_data

Unnamed: 0,Metadata_Well,Cytoplasm_AreaShape_Compactness,Cytoplasm_AreaShape_Eccentricity,Cytoplasm_AreaShape_Extent,Cytoplasm_AreaShape_FormFactor,Cytoplasm_AreaShape_MajorAxisLength,Cytoplasm_AreaShape_MinorAxisLength,Cytoplasm_AreaShape_Orientation,Cytoplasm_AreaShape_Zernike_0_0,Cytoplasm_AreaShape_Zernike_1_1,...,TWEAK [NSU]_y,uPA [NSU]_y,VCAM-1 [NSU]_y,VEGF Receptor 2 (Flk-1) [NSU]_y,VEGF-A (165) [NSU]_y,VEGF-C [NSU]_y,VEGF-D [NSU]_y,VEGFR-1 [NSU]_y,WISP-1 (CCN4) [NSU]_y,XCL1 (Lymphotactin) [NSU]_y
1,B14,-0.063223,-0.001418,0.035864,0.036794,0.037936,0.031201,-0.012884,0.028338,0.019985,...,-0.894582,1.23059,-0.966358,-0.353436,31.847475,-1.216944,-2.187799,0.10792,1.367803,0.180895
12,C14,-0.041941,0.009593,0.016816,0.024724,-0.004871,-0.008984,-0.005426,0.025566,0.021008,...,-1.491348,0.753051,-0.353624,-1.750361,30.798287,-1.081958,0.381804,0.434433,1.076929,-0.690716
79,I15,-0.024548,0.008616,-0.002599,-0.010955,-0.030797,-0.031666,-0.015508,-0.003642,0.019646,...,-0.317506,1.276972,-0.184904,-0.250085,32.890864,-0.768775,0.201755,-0.051009,-0.73964,-0.182375
90,J15,-0.069172,0.002143,0.069379,0.037843,0.032119,0.029286,0.004375,0.05482,0.035552,...,-0.458767,1.322881,1.159109,-1.146321,34.579637,-1.392158,-1.62785,0.085441,-0.331116,0.629028
91,J16,-0.109533,-0.005858,0.157345,0.099615,0.222094,0.251497,-0.002614,0.141421,0.014662,...,-0.841824,3.584168,0.595476,-1.306669,37.195157,-0.515906,-1.393269,-0.998885,0.609292,-0.667795
92,J17,-0.119163,0.01234,0.144394,0.09486,0.214214,0.225011,0.026657,0.139912,0.039543,...,-0.356108,4.114721,-1.058368,-1.528527,35.417427,0.481774,-2.281248,-0.090374,-1.690186,0.137963
95,J20,-0.086101,0.005216,0.113798,0.060357,0.263558,0.272056,-0.022083,0.090666,0.016849,...,-1.535165,3.223288,-0.463249,0.215164,33.847974,-0.542996,-1.077504,0.072386,-0.78723,-1.10854
96,J21,-0.121875,-0.025362,0.148067,0.112572,0.216279,0.285047,-0.033124,0.150929,0.029143,...,-0.805656,5.585244,-0.722061,-1.265843,36.133497,-0.077595,0.897004,-0.453149,-0.18713,-1.321516


In [15]:
# define metadata columns
# subset each column that contains metadata
metadata = training_data.filter(regex="Metadata")
# drop all metadata columns
data_x = training_data.drop(metadata.columns, axis=1)
labeled_data = training_data["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]

In [16]:
# specify model types, feature types, and phenotypic classes
model_types = ["final", "shuffled_baseline"]
feature_types = ["CP"]
phenotypic_classes = [f"{treatment}"]
# create stratified data sets forLPS k-fold cross validation
straified_k_folds = StratifiedKFold(n_splits=4, shuffle=False)

# create logistic regression model with following parameters
log_reg_model = LogisticRegression(
    penalty="elasticnet",
    solver="saga",
    max_iter=10,
    n_jobs=-1,
    random_state=0,
    class_weight="balanced",
)

# specify parameters to tune for
parameters = {"C": np.logspace(-3, 3, 7), "l1_ratio": np.linspace(0, 1, 11)}
print(f"Parameters being tested during grid search: {parameters}\n")

# create grid search with cross validation with hypertuning params
grid_search_cv = GridSearchCV(
    log_reg_model, parameters, cv=straified_k_folds, n_jobs=-1, scoring="f1_weighted"
)

# train model on training data on all combinations of model types, feature types, and phenotypic classes
for model_type, feature_type, phenotypic_class in itertools.product(
    model_types, feature_types, phenotypic_classes
):
    phenotypic_class_counts = training_data.loc[
        training_data["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"] == phenotypic_class
    ].shape[0]
    print(
        f"Training {model_type} model on {feature_type} features for {phenotypic_class} with {phenotypic_class_counts} samples"
    )

    if model_type == "shuffled_baseline":
        for column in data_x:
            np.random.shuffle(data_x[column].values)

    with parallel_backend("multiprocessing"):
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore", category=ConvergenceWarning, module="sklearn"
            )
            grid_search_cv = grid_search_cv.fit(data_x, labeled_data)

    # print info for best estimator
    print(f"Best parameters: {grid_search_cv.best_params_}")
    print(f"Score of best estimator: {grid_search_cv.best_score_}\n")

    if (aggregation == True) and (nomic == True):
        results_dir = f"./models/single_class/{cell_type}/aggregated_with_nomic/{control}__{treatment}"
    elif (aggregation == True) and (nomic == False):
        results_dir = (
            f"./models/single_class/{cell_type}/aggregated/{control}__{treatment}"
        )
    elif (aggregation == False) and (nomic == True):
        results_dir = (
            f"./models/single_class/{cell_type}/sc_with_nomic/{control}__{treatment}"
        )
    elif (aggregation == False) and (nomic == False):
        results_dir = f"./models/single_class/{cell_type}/sc/{control}__{treatment}"
    else:
        print("Error")

    # create results directory if it doesn't exist
    pathlib.Path(results_dir).mkdir(parents=True, exist_ok=True)

    # save final estimator
    dump(
        grid_search_cv.best_estimator_,
        f"{results_dir}/{model_type}__{feature_type}.joblib",
    )

Parameters being tested during grid search: {'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]), 'l1_ratio': array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])}

Training final model on CP features for LPS_100.000_DMSO_0.025 with 4 samples
Best parameters: {'C': 0.001, 'l1_ratio': 0.0}
Score of best estimator: 1.0

Training shuffled_baseline model on CP features for LPS_100.000_DMSO_0.025 with 4 samples
Best parameters: {'C': 0.001, 'l1_ratio': 0.0}
Score of best estimator: 0.6666666666666667



In [17]:
# save condfig copy specific to this model to the folder with the results
# use pathlib
config_copy_path = pathlib.Path(f"{results_dir}/{model_type}__{feature_type}.toml")
with open(config_copy_path, "w") as f:
    toml.dump(config, f)
f.close()