In [1]:
import ast
import itertools
import pathlib
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import seaborn as sns
import toml
from joblib import dump, load
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
)
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.utils import parallel_backend, shuffle

In [2]:
# Parameters
# Default run configuration read by the cells below.
# Conditions compared by the binary classifier.
control = "DMSO_0.100_%_DMSO_0.025_%"
treatment = "LPS_100.000_ug_per_ml_DMSO_0.025_%"
# Cell line plus the two switches that select which data variant is evaluated.
cell_type = "SHSY5Y"
aggregation, nomic = True, True
# When this is False the following cells overwrite the values above with the
# entries of the training TOML config.
flag = True

In [3]:
# Model family being evaluated; interpolated into the split-index and model
# paths built further down.
MODEL_TYPE = "binary_classification"
if not flag:
    # flag is off: take the run parameters from the training TOML config
    # instead of the values assigned in the parameters cell.
    toml_path = pathlib.Path("../1.train_models/single_class_config.toml")
    # TOML documents are UTF-8, so do not depend on the platform default encoding.
    with open(toml_path, "r", encoding="utf-8") as f:
        config = toml.load(f)
    lr_params = config["logistic_regression_params"]
    control = lr_params["control"]
    # NOTE(review): the config key is plural ("treatments") while the rest of
    # the notebook uses `treatment` as a single condition name - confirm the
    # config holds one string here.
    treatment = lr_params["treatments"]
    # literal_eval turns the config text for these two switches into Python
    # literals (presumably "True"/"False" - verify against the config file).
    aggregation = ast.literal_eval(lr_params["aggregation"])
    nomic = ast.literal_eval(lr_params["nomic"])
    cell_type = lr_params["cell_type"]
    print(aggregation, nomic, cell_type)

In [4]:
# NOTE(review): this cell repeats the config override performed by the previous
# cell; running both is harmless but one of them is redundant.
if not flag:
    # flag is off: take the run parameters from the training TOML config.
    toml_path = pathlib.Path("../1.train_models/single_class_config.toml")
    # The with-block closes the file itself, so no explicit close() is needed.
    with open(toml_path, "r", encoding="utf-8") as f:
        config = toml.load(f)
    lr_params = config["logistic_regression_params"]
    control = lr_params["control"]
    # NOTE(review): key is plural ("treatments") - confirm it holds a single
    # condition name.
    treatment = lr_params["treatments"]
    # literal_eval parses the config text for these switches into Python literals.
    aggregation = ast.literal_eval(lr_params["aggregation"])
    nomic = ast.literal_eval(lr_params["nomic"])
    cell_type = lr_params["cell_type"]
    print(aggregation, nomic, cell_type)

In [5]:
# Read the preprocessed profiles for the selected cell type into pandas.
data_path = pathlib.Path(f"../../data/{cell_type}_preprocessed_sc_norm.parquet")
data_df = pq.read_table(data_path).to_pandas()

# Read the nELISA (nomic) table for the same cell type.
nomic_df_path = pathlib.Path(
    f"../../2.Nomic_nELISA_Analysis/Data/clean/Plate2/nELISA_plate_430420_{cell_type}.csv"
)
df_nomic = pd.read_csv(nomic_df_path)

# Remove every column whose name contains "[pgML]".
pgml_columns = [name for name in df_nomic.columns if "[pgML]" in name]
df_nomic = df_nomic.drop(columns=pgml_columns)

# Positional clean-up in two passes: first the columns at positions 3..24,
# then the two leading columns of what remains. The column that sat at
# position 2 is therefore kept.
# NOTE(review): presumably the kept column is "position_x", which a later cell
# uses as the merge key - confirm against the CSV layout.
trailing_metadata = df_nomic.columns[3:25]
df_nomic = df_nomic.drop(columns=trailing_metadata)
leading_pair = df_nomic.columns[0:2]
df_nomic = df_nomic.drop(columns=leading_pair)

In [6]:
# Load the train/test split table that matches the (aggregation, nomic)
# configuration, then bring data_df into the same shape: optionally averaged
# per well and optionally joined with the nomic table.
split_file_names = {
    (True, True): "aggregated_sc_and_nomic_data_split_indexes.tsv",
    (True, False): "aggregated_sc_data_split_indexes.tsv",
    (False, True): "sc_and_nomic_data_split_indexes.tsv",
    (False, False): "sc_split_indexes.tsv",
}
try:
    split_file_name = split_file_names[(aggregation, nomic)]
except (KeyError, TypeError) as err:
    # Fail here instead of printing "Error" and hitting an undefined
    # data_split_indexes in the next cell.
    raise ValueError(
        f"aggregation and nomic must each be True or False, got {aggregation!r} and {nomic!r}"
    ) from err

data_split_path = pathlib.Path(
    f"../0.split_data/indexes/{cell_type}/{MODEL_TYPE}/{control}_{treatment}/{split_file_name}"
)
data_split_indexes = pd.read_csv(data_split_path, sep="\t", index_col=0)

if aggregation:
    # subset each column that contains metadata
    metadata = data_df.filter(regex="Metadata")
    # keep only the well identifier next to the feature columns
    data_df = data_df.drop(metadata.columns, axis=1)
    data_df = pd.concat([data_df, metadata["Metadata_Well"]], axis=1)
    # groupby well and take mean of each well
    data_df = data_df.groupby("Metadata_Well").mean()
    # one metadata row per well (first occurrence is kept)
    metadata = metadata.drop_duplicates(subset=["Metadata_Well"])
    # re-attach the metadata for each well
    data_df = pd.merge(
        data_df, metadata, left_on="Metadata_Well", right_on="Metadata_Well"
    )

if nomic:
    # join the nomic measurements on the well position, then drop the
    # now-redundant key column
    data_df = pd.merge(
        data_df, df_nomic, left_on="Metadata_Well", right_on="position_x"
    )
    data_df = data_df.drop(columns=["position_x"])

# NOTE(review): this cell reassigns data_df, so re-running it without re-running
# the loading cell aggregates/merges a second time.
data_df

Unnamed: 0,Metadata_Well,Cytoplasm_AreaShape_Compactness,Cytoplasm_AreaShape_Eccentricity,Cytoplasm_AreaShape_Extent,Cytoplasm_AreaShape_FormFactor,Cytoplasm_AreaShape_MajorAxisLength,Cytoplasm_AreaShape_MinorAxisLength,Cytoplasm_AreaShape_Orientation,Cytoplasm_AreaShape_Zernike_0_0,Cytoplasm_AreaShape_Zernike_1_1,...,TWEAK [NSU],uPA [NSU],VCAM-1 [NSU],VEGF Receptor 2 (Flk-1) [NSU],VEGF-A (165) [NSU],VEGF-C [NSU],VEGF-D [NSU],VEGFR-1 [NSU],WISP-1 (CCN4) [NSU],XCL1 (Lymphotactin) [NSU]
0,B13,0.000263,0.050292,0.011215,-0.032031,0.139148,0.092653,-0.022733,-0.004550,-0.019608,...,-0.225271,2.011257,-1.313502,-0.528295,26.484897,-0.751687,-0.891735,-1.624976,1.324507,-0.622436
1,B14,-0.063223,-0.001418,0.035864,0.036794,0.037936,0.031201,-0.012884,0.028338,0.019985,...,-0.894582,1.230590,-0.966358,-0.353436,31.847475,-1.216944,-2.187799,0.107920,1.367803,0.180895
2,B15,-0.062009,0.001236,0.044042,0.030464,-0.002026,0.006311,0.010789,0.030538,0.022751,...,0.114074,2.302291,1.364515,-0.062003,30.013129,-0.001147,0.460551,0.005704,-0.403832,-1.415898
3,B16,-0.031699,0.047344,-0.003990,0.002975,0.115183,0.070404,-0.007908,-0.010212,-0.004997,...,-1.402026,1.120312,0.248420,-1.413591,28.235605,-1.411162,-0.597302,0.644832,-0.112562,-1.287083
4,B17,-0.045468,0.038261,0.034279,0.023820,0.163262,0.120615,-0.000391,0.018250,-0.015776,...,-1.075493,1.131858,-0.763617,-1.051842,25.473988,-1.297297,0.987359,-0.184818,1.174355,-1.467287
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149,O19,-0.030737,0.025373,0.019060,0.013307,0.087081,0.062723,0.030207,0.011448,-0.006952,...,-0.638728,2.325251,1.088603,-2.413005,30.040230,0.150468,2.537893,-2.072348,1.365336,0.546843
150,O20,-0.008031,0.149927,-0.087296,-0.039525,0.227378,-0.005209,0.012483,-0.121338,-0.041426,...,0.191073,1.324194,0.085482,-2.060541,26.633495,-0.656278,1.078025,-0.021887,0.382966,-0.827056
151,O21,-0.039158,0.083196,-0.033546,-0.022291,0.151747,0.022673,0.001550,-0.070861,-0.024675,...,-0.295899,2.450049,0.839780,-0.387338,30.082969,-0.812300,1.301290,0.240628,0.458829,-0.726808
152,O22,-0.078584,0.011797,0.072764,0.058422,0.033983,0.037911,0.019646,0.063310,0.006814,...,-0.328278,2.187440,0.442932,-0.330779,34.600973,-1.784995,-0.022841,0.475823,0.490670,-1.055123


In [7]:
# Index the split table by its "labeled_data_index" values while keeping that
# column, so the label assignment in the next cell aligns on those values.
data_split_indexes = data_split_indexes.set_index("labeled_data_index", drop=False)

In [8]:
# Pick the rows of data_df listed in the split table, then attach each row's
# split label; the label Series is aligned to the selected rows by index.
selected_rows = data_df.loc[data_split_indexes["labeled_data_index"]]
data_all = selected_rows.assign(label=data_split_indexes["label"])

In [9]:
# Keep only the rows whose treatment/dose annotation equals the control or the
# treatment condition.
conditions_of_interest = [control, treatment]
is_selected = data_all["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"].isin(
    conditions_of_interest
)
data_all = data_all[is_selected]

In [10]:
# Count the rows of each condition and, when the control group is the larger
# one, draw a seeded random sample of control rows with the treatment group's
# size.
seed = 0
condition_column = data_all["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]
# number of rows annotated with the treatment condition
trt_wells = data_all[condition_column == treatment].shape[0]
# number of rows annotated with the control condition
dmso_wells = data_all[condition_column == control].shape[0]
if dmso_wells > trt_wells:
    dmso_holdout = data_all[condition_column == control].sample(
        n=trt_wells, random_state=seed
    )
    # NOTE(review): data_all is NOT modified here. The sampled control rows are
    # only stored in dmso_holdout; the earlier code followed this with a no-op
    # self-assignment, so no rows were ever removed. Confirm whether balancing
    # the evaluation data was intended before changing this.

In [11]:
# Build the model directory from the run parameters. The (aggregation, nomic)
# pair selects the data-variant sub-directory.
model_subdirs = {
    (True, True): "aggregated_with_nomic",
    (True, False): "aggregated",
    (False, True): "sc_with_nomic",
    (False, False): "sc",
}
try:
    model_subdir = model_subdirs[(aggregation, nomic)]
except (KeyError, TypeError) as err:
    # Fail here rather than printing "Error" and leaving model_path undefined.
    raise ValueError(
        f"aggregation and nomic must each be True or False, got {aggregation!r} and {nomic!r}"
    ) from err
model_path = pathlib.Path(
    f"models/single_class/{cell_type}/{model_subdir}/{MODEL_TYPE}/{control}__{treatment}"
)

In [12]:
# Value lists combined by the evaluation loop below.
# The only phenotypic class is the treatment condition of this run.
phenotypic_classes = [treatment]
# Feature set label used in the saved model file names.
feature_types = ["CP"]
# The fitted model and its shuffled baseline.
model_types = ["final", "shuffled_baseline"]

In [13]:
# Separate the metadata columns from the model inputs. Columns whose name
# contains "Metadata" are removed from data_x; the "label" column does not
# match that pattern and therefore stays in data_x.
metadata = data_all.filter(regex="Metadata")
data_x = data_all.drop(columns=metadata.columns)

# The treatment/dose annotation is the target the models predict.
treatment_column = "oneb_Metadata_Treatment_Dose_Inhibitor_Dose"
labeled_data = data_all[treatment_column]
evaluation_types = ["train", "test"]

# True targets for each split, selected through the "label" column.
is_train = data_all["label"] == "train"
is_test = data_all["label"] == "test"
train_labeled_data = data_all.loc[is_train, treatment_column]
test_labeled_data = data_all.loc[is_test, treatment_column]

In [14]:
# Build the figure and results directories for this run and create them.
# The (aggregation, nomic) pair selects the data-variant sub-directory.
output_subdirs = {
    (True, True): "aggregated_with_nomic",
    (True, False): "aggregated",
    (False, True): "sc_with_nomic",
    (False, False): "sc",
}
try:
    output_subdir = output_subdirs[(aggregation, nomic)]
except (KeyError, TypeError) as err:
    # Fail here rather than printing "Error" and then hitting an undefined
    # figure_path at the mkdir call.
    raise ValueError(
        f"aggregation and nomic must each be True or False, got {aggregation!r} and {nomic!r}"
    ) from err
figure_path = pathlib.Path(
    f"./figures/single_class/{cell_type}/{output_subdir}/{control}__{treatment}"
)
results_path = pathlib.Path(
    f"./results/single_class/{cell_type}/{output_subdir}/{control}__{treatment}"
)
# Create both directories, including missing parents; existing ones are fine.
figure_path.mkdir(parents=True, exist_ok=True)
results_path.mkdir(parents=True, exist_ok=True)

In [15]:
# Run every saved model on the train and test splits and gather the per-row
# predictions in a single table, compiled_predictions.
data_x = data_x.reset_index(drop=True)

prediction_columns = [
    "Phenotypic_Class_Predicted",
    "Phenotypic_Class_True",
    "data_split",
    "shuffled",
    "feature_type",
]

# The split subsets do not depend on the model, so build them once. The
# "label" column only marks the split and is removed before predicting.
train_data_x = data_x.loc[data_x["label"] == "train"].drop("label", axis=1)
test_data_x = data_x.loc[data_x["label"] == "test"].drop("label", axis=1)
split_data = {
    "train": (train_data_x, train_labeled_data),
    "test": (test_data_x, test_labeled_data),
}

# Collect one frame per (model, split) and concatenate once at the end; this
# avoids repeatedly concatenating onto an initially empty frame.
prediction_frames = []
for model_type, feature_type, phenotypic_class, evaluation_type in itertools.product(
    model_types, feature_types, phenotypic_classes, evaluation_types
):
    print(model_type, feature_type, phenotypic_class, evaluation_type)
    # joblib's load unpickles the file, so only point this at model files
    # produced by this project.
    model = load(f"../1.train_models/{model_path}/{model_type}__{feature_type}.joblib")
    print(model)

    if evaluation_type not in split_data:
        # only the train and test splits are defined; anything else is skipped
        continue
    split_x, split_y = split_data[evaluation_type]

    predictions = model.predict(split_x)
    # Probabilities and scores are computed but not stored in the table; they
    # remain available as notebook variables for the last iteration only.
    probabilities = model.predict_proba(split_x)
    accuracy = accuracy_score(split_y, predictions)
    f1 = f1_score(split_y, predictions, average="weighted")

    # predictions is positional while split_y carries its own index; the frame
    # takes split_y's index, and the index is discarded at the final concat.
    prediction_frames.append(
        pd.DataFrame(
            {
                "Phenotypic_Class_Predicted": predictions,
                "Phenotypic_Class_True": split_y,
                "data_split": evaluation_type,
                "shuffled": "shuffled" in model_type,
                "feature_type": feature_type,
            }
        )
    )

if prediction_frames:
    compiled_predictions = pd.concat(prediction_frames, axis=0, ignore_index=True)
else:
    # nothing was evaluated: keep an empty table with the expected columns
    compiled_predictions = pd.DataFrame(columns=prediction_columns)

final CP LPS_100.000_ug_per_ml_DMSO_0.025_% train
LogisticRegression(C=0.001, class_weight='balanced', l1_ratio=0.0, max_iter=10,
                   n_jobs=-1, penalty='elasticnet', random_state=0,
                   solver='saga')
final CP LPS_100.000_ug_per_ml_DMSO_0.025_% test
LogisticRegression(C=0.001, class_weight='balanced', l1_ratio=0.0, max_iter=10,
                   n_jobs=-1, penalty='elasticnet', random_state=0,
                   solver='saga')
shuffled_baseline CP LPS_100.000_ug_per_ml_DMSO_0.025_% train
LogisticRegression(C=0.001, class_weight='balanced', l1_ratio=0.1, max_iter=10,
                   n_jobs=-1, penalty='elasticnet', random_state=0,
                   solver='saga')
shuffled_baseline CP LPS_100.000_ug_per_ml_DMSO_0.025_% test
LogisticRegression(C=0.001, class_weight='balanced', l1_ratio=0.1, max_iter=10,
                   n_jobs=-1, penalty='elasticnet', random_state=0,
                   solver='saga')


  compiled_predictions = pd.concat(
  compiled_predictions = pd.concat(
  compiled_predictions = pd.concat(


In [16]:
# Save the compiled predictions table as CSV, without the row index, in the
# results folder of this run.
predictions_csv = f"{results_path}/compiled_predictions.csv"
compiled_predictions.to_csv(predictions_csv, index=False)

In [17]:
# Display the compiled predictions table as the cell output.
compiled_predictions

Unnamed: 0,Phenotypic_Class_Predicted,Phenotypic_Class_True,data_split,shuffled,feature_type
0,DMSO_0.100_%_DMSO_0.025_%,DMSO_0.100_%_DMSO_0.025_%,train,False,CP
1,DMSO_0.100_%_DMSO_0.025_%,DMSO_0.100_%_DMSO_0.025_%,train,False,CP
2,DMSO_0.100_%_DMSO_0.025_%,DMSO_0.100_%_DMSO_0.025_%,train,False,CP
3,DMSO_0.100_%_DMSO_0.025_%,DMSO_0.100_%_DMSO_0.025_%,train,False,CP
4,DMSO_0.100_%_DMSO_0.025_%,DMSO_0.100_%_DMSO_0.025_%,train,False,CP
5,DMSO_0.100_%_DMSO_0.025_%,DMSO_0.100_%_DMSO_0.025_%,train,False,CP
6,LPS_100.000_ug_per_ml_DMSO_0.025_%,LPS_100.000_ug_per_ml_DMSO_0.025_%,train,False,CP
7,LPS_100.000_ug_per_ml_DMSO_0.025_%,LPS_100.000_ug_per_ml_DMSO_0.025_%,train,False,CP
8,LPS_100.000_ug_per_ml_DMSO_0.025_%,LPS_100.000_ug_per_ml_DMSO_0.025_%,train,False,CP
9,DMSO_0.100_%_DMSO_0.025_%,DMSO_0.100_%_DMSO_0.025_%,test,False,CP
