In [1]:
import ast
import itertools
import pathlib
import sys
import warnings

import joblib
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import toml
from joblib import dump
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import ElasticNetCV, LogisticRegression, MultiTaskElasticNetCV

# import mse
from sklearn.metrics import mean_squared_error, r2_score

# import RepeatedKFold
from sklearn.model_selection import (
    GridSearchCV,
    RepeatedKFold,
    StratifiedKFold,
    train_test_split,
)
from sklearn.utils import parallel_backend, shuffle

In [2]:
# Parameters
# Run-level settings used throughout the notebook. When `flag` is False, the
# following cell replaces control, treatment, aggregation, nomic and cell_type
# with the values stored in single_class_config.toml.
cell_type = "PBMC"  # interpolated into the nomic CSV, split-index and model paths
aggregation = True  # True selects the per-well aggregated data branch
nomic = True  # True selects the branch that merges df_nomic into data_df
flag = True  # False means "read the parameters from the TOML config instead"
control = "DMSO_0.100_DMSO_0.025"  # control value of oneb_Metadata_Treatment_Dose_Inhibitor_Dose
treatment = "LPS_100.000_DMSO_0.025"  # treatment value of oneb_Metadata_Treatment_Dose_Inhibitor_Dose

In [3]:
MODEL_TYPE = "regression"
if flag == False:
    # flag is off: take the run parameters from the TOML config file rather
    # than from the parameters cell
    toml_path = pathlib.Path("single_class_config.toml")
    with open(toml_path, "r") as f:
        config = toml.load(f)
    # every parameter lives in the same config section
    lr_params = config["logistic_regression_params"]
    control = lr_params["control"]
    treatment = lr_params["treatments"]
    # the two switches are stored as text and parsed into Python literals
    aggregation = ast.literal_eval(lr_params["aggregation"])
    nomic = ast.literal_eval(lr_params["nomic"])
    cell_type = lr_params["cell_type"]

In [4]:
# Load the single-cell feature table and the nomic (nELISA) table.

# parquet file holding the single-cell features for the control/treatment subset
data_path = pathlib.Path(
    "../../data/PBMC_subset_sc_norm_DMSO_0.100_DMSO_0.025_LPS_100.000_DMSO_0.025.parquet"
)
data_df = pq.read_table(data_path).to_pandas()

# nomic table for the selected cell type
nomic_df_path = pathlib.Path(
    f"../../2.Nomic_nELISA_Analysis/Data/clean/Plate2/nELISA_plate_430420_{cell_type}_cleanup4correlation.csv"
)
df_nomic = pd.read_csv(nomic_df_path)

# remove every nomic column whose name contains "[pgML]"
pgml_columns = [col for col in df_nomic.columns if "[pgML]" in col]
df_nomic = df_nomic.drop(columns=pgml_columns)

In [5]:
# Prepare `data_df` and load the train/test split indexes that belong to the
# requested combination of `aggregation` and `nomic`.

# split-index file written for each (aggregation, nomic) combination
split_index_file_names = {
    (True, True): "aggregated_sc_and_nomic_data_split_indexes.tsv",
    (True, False): "aggregated_sc_data_split_indexes.tsv",
    (False, True): "sc_and_nomic_data_split_indexes.tsv",
    (False, False): "sc_split_indexes.tsv",
}
split_index_file_name = split_index_file_names.get((aggregation, nomic))
if split_index_file_name is None:
    # Fail here instead of printing "Error": continuing would leave
    # data_split_indexes undefined and break the next cell with a NameError.
    raise ValueError(
        "aggregation and nomic must each be True or False, got "
        f"aggregation={aggregation!r}, nomic={nomic!r}"
    )

data_split_path = pathlib.Path(
    f"../0.split_data/indexes/{cell_type}/{MODEL_TYPE}/{control}_{treatment}/{split_index_file_name}"
)
data_split_indexes = pd.read_csv(data_split_path, sep="\t", index_col=0)

if aggregation:
    # separate the metadata columns from the feature columns
    metadata = data_df.filter(regex="Metadata")
    data_df = data_df.drop(metadata.columns, axis=1)
    data_df = pd.concat([data_df, metadata["Metadata_Well"]], axis=1)
    # average the features of every well
    data_df = data_df.groupby("Metadata_Well").mean()
    # keep one metadata row per well and attach it to the aggregated features;
    # this is needed with and without nomic data because the treatment column
    # used by the later cells is a metadata column
    metadata = metadata.drop_duplicates(subset=["Metadata_Well"])
    data_df = pd.merge(
        data_df, metadata, left_on="Metadata_Well", right_on="Metadata_Well"
    )

if nomic:
    # attach the nomic columns, matching on well position and treatment
    data_df = pd.merge(
        data_df,
        df_nomic,
        left_on=["Metadata_Well", "oneb_Metadata_Treatment_Dose_Inhibitor_Dose"],
        right_on=["Metadata_position_x", "oneb_Metadata_Treatment_Dose_Inhibitor_Dose"],
    )
    # the well position duplicates Metadata_Well after the merge
    data_df = data_df.drop(columns=["Metadata_position_x"])

In [6]:
# split the index table into its train rows and its test rows
split_labels = data_split_indexes["label"]
train_indexes = data_split_indexes.loc[split_labels == "train"]
test_indexes = data_split_indexes.loc[split_labels == "test"]

In [7]:
# pull the rows of data_df named by each split's "labeled_data_index" column
train_row_labels = train_indexes["labeled_data_index"]
test_row_labels = test_indexes["labeled_data_index"]
training_data = data_df.loc[train_row_labels]
testing_data = data_df.loc[test_row_labels]

In [8]:
# keep only the rows whose treatment is the selected control or treatment
treatment_col_name = "oneb_Metadata_Treatment_Dose_Inhibitor_Dose"
selected_treatments = [control, treatment]
training_data = training_data[
    training_data[treatment_col_name].isin(selected_treatments)
]
testing_data = testing_data[
    testing_data[treatment_col_name].isin(selected_treatments)
]

In [9]:
# Randomly select control rows and remove them from the training data.
seed = 0
treatment_labels = training_data["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]
treated_rows = training_data[treatment_labels == treatment]
control_rows = training_data[treatment_labels == control]
# number of treatment rows and number of control rows
trt_wells = treated_rows.shape[0]
dmso_wells = control_rows.shape[0]
# draw as many control rows as there are treatment rows ...
# NOTE(review): this removes `trt_wells` control rows, so the classes end up
# balanced only when dmso_wells == 2 * trt_wells, and sampling raises when
# dmso_wells < trt_wells. `dmso_wells` is not used - confirm the intended
# behaviour against the training notebook.
dmso_holdout = control_rows.sample(n=trt_wells, random_state=seed)
# ... and drop exactly those rows from the training data
training_data = training_data.drop(dmso_holdout.index)

In [10]:
def _split_features_and_targets(frame):
    """Split `frame` into feature columns and "NSU" target columns.

    Columns matching "Metadata" are removed first; of the remaining columns,
    those matching "NSU" become the targets and the rest the features.

    Returns a tuple (features, targets, target_columns).
    """
    metadata_columns = frame.filter(regex="Metadata").columns
    non_metadata = frame.drop(metadata_columns, axis=1)
    target_columns = non_metadata.filter(regex="NSU").columns
    targets = frame[target_columns]
    features = non_metadata.drop(target_columns, axis=1)
    return features, targets, target_columns


# training split: features, targets and the treatment of every row
train_data_x, train_data_y, train_data_y_cols = _split_features_and_targets(
    training_data
)
train_treatments = training_data["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]

# testing split: features, targets and the treatment of every row
test_data_x, test_data_y, test_data_y_cols = _split_features_and_targets(
    testing_data
)
test_treatments = testing_data["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]

# metadata columns of the testing split
metadata = testing_data.filter(regex="Metadata")

In [15]:
# display the training targets (the "NSU" columns selected from training_data)
train_data_y

Unnamed: 0,Activin A [NSU],AITRL (GITR Ligand) [NSU],Amphiregulin [NSU],Amyloid beta [NSU],APRIL [NSU],BAFF [NSU],BCMA (TNFRSF17) [NSU],BDNF [NSU],BMP2 [NSU],BMP3 [NSU],...,TWEAK [NSU],uPA [NSU],VCAM-1 [NSU],VEGF Receptor 2 (Flk-1) [NSU],VEGF-A (165) [NSU],VEGF-C [NSU],VEGF-D [NSU],VEGFR-1 [NSU],WISP-1 (CCN4) [NSU],XCL1 (Lymphotactin) [NSU]
0,0.09771,0.461685,0.270477,0.514695,0.479281,0.270494,0.708849,0.134432,0.350986,0.216932,...,0.386063,0.469875,0.395392,0.560129,0.504521,0.490444,0.258834,0.238358,0.524276,0.25067
4,0.06186,0.196318,0.236491,0.474891,0.174672,0.824721,0.704521,0.254823,0.443939,0.268677,...,0.755683,0.374554,0.486915,0.389375,0.369421,0.680276,0.182956,0.263281,0.213596,0.064645
6,0.706485,0.477823,0.806104,0.303776,0.25443,0.09428,0.670885,0.250955,0.470768,0.169451,...,0.736028,0.428286,0.288884,0.527908,0.210755,0.448465,0.422773,0.535603,0.209011,0.170498
8,0.060998,0.596601,0.129926,0.30261,0.559309,0.087533,0.54111,0.350256,0.52826,0.313411,...,0.254542,0.630644,0.586271,0.258029,0.561051,0.551671,0.582053,0.087565,0.140992,0.234191
10,0.83782,0.574834,0.869218,0.394956,0.118325,0.192057,0.584911,0.411283,0.340821,0.253736,...,0.328239,0.315633,0.364173,0.607592,0.176816,0.37892,0.310344,0.651217,0.679571,0.222324
11,0.686719,0.169058,0.759551,0.469769,0.464808,0.094883,0.345938,0.355706,0.262622,0.226736,...,0.360981,0.527316,0.405934,0.619578,0.329964,0.57783,0.0,0.456104,0.255216,0.255703


In [11]:
# report the shape of each feature / target table: train x, train y, test x, test y
print(
    *(
        frame.shape
        for frame in (train_data_x, train_data_y, test_data_x, test_data_y)
    )
)

(6, 1245) (6, 187) (3, 1245) (3, 187)


In [12]:
# Build the directory of the saved models from the run parameters.

# model sub-directory used for each (aggregation, nomic) combination
model_subdirectories = {
    (True, True): "aggregated_with_nomic",
    (True, False): "aggregated",
    (False, True): "sc_with_nomic",
    (False, False): "sc",
}
model_subdirectory = model_subdirectories.get((aggregation, nomic))
if model_subdirectory is None:
    # Fail here instead of printing "Error": continuing would leave model_path
    # undefined and break the evaluation cell with a NameError.
    raise ValueError(
        "aggregation and nomic must each be True or False, got "
        f"aggregation={aggregation!r}, nomic={nomic!r}"
    )

model_path = pathlib.Path(
    f"models/single_class/{cell_type}/{model_subdirectory}/{MODEL_TYPE}/{control}__{treatment}"
)

In [13]:
# Evaluate every saved model (final and shuffled baseline) on the train and
# test splits and collect the predictions in one long-format table:
# one row per (sample, target) pair.
shuffles = ["final", "shuffled_baseline"]
feature_types = ["CP"]
evaluation_types = ["train", "test"]

# seeded generator so the shuffled baseline is reproducible between runs
shuffle_rng = np.random.default_rng(0)

# features, targets and per-row treatment of each evaluation split
evaluation_splits = {
    "train": (train_data_x, train_data_y, train_treatments),
    "test": (test_data_x, test_data_y, test_treatments),
}

# NOTE(review): the traceback recorded with this cell shows the saved model was
# fit with "[NSU]" columns among its features, while train_data_x/test_data_x
# have those columns removed - reconcile with the training notebook.
prediction_frames = []
# `shuffle_type` (not `shuffle`) keeps sklearn.utils.shuffle from being shadowed
for shuffle_type, feature_type, evaluation_type in itertools.product(
    shuffles, feature_types, evaluation_types
):
    # joblib files are pickles: only load models produced by this project
    model = joblib.load(
        f"../1.train_models/{model_path}/{shuffle_type}__{feature_type}_all_nomic.joblib"
    )
    data_x, data_y, treatments = evaluation_splits[evaluation_type]

    if shuffle_type == "shuffled_baseline":
        # shuffle every feature column independently on a copy, so the
        # original feature tables stay intact when the cell is re-run
        data_x = pd.DataFrame(
            shuffle_rng.permuted(data_x.to_numpy(), axis=0),
            index=data_x.index,
            columns=data_x.columns,
        )

    predictions = model.predict(data_x)
    # both scores use scikit-learn's default averaging over the targets
    MSE = mean_squared_error(data_y, predictions)
    r2 = r2_score(data_y, predictions)

    # flatten (samples x targets) row by row into one row per pair
    actual_values = data_y.to_numpy()
    predicted_values = np.asarray(predictions).reshape(actual_values.shape)
    n_samples, n_targets = actual_values.shape
    prediction_frames.append(
        pd.DataFrame(
            {
                "Prediction": predicted_values.ravel(),
                "Actual_value": actual_values.ravel(),
                "data_split": evaluation_type,
                "shuffled": "shuffled" in shuffle_type,
                "feature_type": feature_type,
                "MSE": MSE,
                "r2": r2,
                # oneb_Metadata_Treatment_Dose_Inhibitor_Dose of the sample
                "treatment": np.repeat(treatments.to_numpy(), n_targets),
                # name of the predicted target column
                "target": np.tile(data_y.columns.to_numpy(), n_samples),
            }
        )
    )

# concatenate once instead of growing the frame inside the loop
compiled_predictions = pd.concat(prediction_frames, axis=0, ignore_index=True)

ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- AITRL (GITR Ligand) [NSU]
- APRIL [NSU]
- Activin A [NSU]
- Amphiregulin [NSU]
- Amyloid beta [NSU]
- ...
