In [1]:
import ast
import itertools
import math
import pathlib
import sys
import warnings

import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import toml
from joblib import dump
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import ElasticNetCV, LogisticRegression, MultiTaskElasticNetCV

# import mse
from sklearn.metrics import mean_squared_error, r2_score

# import RepeatedKFold
from sklearn.model_selection import (
    GridSearchCV,
    RepeatedKFold,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.utils import parallel_backend, shuffle

In [2]:
# Parameters
# Cell line; interpolated into the profile, nELISA, split-index and output paths below.
cell_type = "SHSY5Y"
# True: single-cell features are averaged per well before modelling.
aggregation = True
# True: the nELISA (nomic) table is merged onto the feature table.
nomic = True
# NOTE(review): `flag` is not read anywhere in the visible notebook — confirm it is still needed.
flag = True
# True selects the shuffled-feature baseline run, False the final-model run.
# NOTE: this assignment rebinds the name `shuffle` imported from sklearn.utils
# in the import cell, so that function is no longer reachable under this name.
shuffle = True

In [3]:
# Normalise the `shuffle` parameter into the run label used in output file names
# ("shuffled_baseline" or "final").
#
# The boolean parameter is replaced by a non-empty string, so a plain truthiness
# test would turn an already-normalised "final" into "shuffled_baseline" if this
# cell were executed a second time. Values that are already one of the two labels
# are therefore left untouched, which makes the cell safe to re-run.
if shuffle not in ("shuffled_baseline", "final"):
    shuffle = "shuffled_baseline" if shuffle else "final"

In [4]:
# Sub-directory name interpolated into the data-split index path for this notebook.
MODEL_TYPE = "regression"

In [6]:
# Read the preprocessed single-cell profile for the selected cell line.
# The train/test split table is not read here; it is loaded further down, once
# the data layout (aggregated or not, with or without nELISA) is known.
data_path = pathlib.Path(f"../../../data/{cell_type}_preprocessed_sc_norm.parquet")
data_df = pq.read_table(data_path).to_pandas()

# Read the cleaned nELISA (nomic) table for the same cell line.
nomic_df_path = pathlib.Path(
    f"../../../2.Nomic_nELISA_Analysis/Data/clean/Plate2/nELISA_plate_430420_{cell_type}_cleanup4correlation.csv"
)
df_nomic = pd.read_csv(nomic_df_path)

# Discard every column whose name carries the "[pgML]" tag.
# The metadata columns of the nELISA table are deliberately kept: they hold the
# keys used later when this table is merged onto the profile.
pgml_columns = [name for name in df_nomic.columns if "[pgML]" in name]
df_nomic = df_nomic.drop(columns=pgml_columns)

In [7]:
# sanity check: summary statistics of one nELISA readout column after clean-up
print(df_nomic["Activin A [NSU]"].describe())

count    154.000000
mean       0.512790
std        0.191118
min        0.000000
25%        0.372157
50%        0.498080
75%        0.635876
max        1.000000
Name: Activin A [NSU], dtype: float64


In [8]:
# Load the train/test split table that matches the requested data layout and
# bring `data_df` into that layout: optionally averaged per well, optionally
# joined with the nELISA table.
#
# NOTE: this cell rebinds `data_df`. Re-running it without re-running the load
# cell first would operate on an already aggregated/merged frame.

# `in (True, False)` compares with ==, so 1/0 are accepted exactly as the former
# `== True` / `== False` tests accepted them. Anything else used to fall through
# to a bare print("Error") and fail later with a NameError; fail here instead.
if aggregation not in (True, False) or nomic not in (True, False):
    raise ValueError(
        f"aggregation and nomic must be booleans, got {aggregation!r} and {nomic!r}"
    )

# (aggregation, nomic) -> file name of the matching split-index table
split_index_files = {
    (True, True): "aggregated_sc_and_nomic_data_split_indexes.tsv",
    (True, False): "aggregated_sc_data_split_indexes.tsv",
    (False, True): "sc_and_nomic_data_split_indexes.tsv",
    (False, False): "sc_split_indexes.tsv",
}
data_split_path = pathlib.Path(
    f"../../0.split_data/indexes/{cell_type}/{MODEL_TYPE}/"
    f"{split_index_files[(bool(aggregation), bool(nomic))]}"
)
data_split_indexes = pd.read_csv(data_split_path, sep="\t", index_col=0)

if aggregation:
    # subset each column that contains metadata
    metadata = data_df.filter(regex="Metadata")
    # keep only the well identifier next to the feature columns ...
    data_df = data_df.drop(metadata.columns, axis=1)
    data_df = pd.concat([data_df, metadata["Metadata_Well"]], axis=1)
    # ... and average every feature within each well
    data_df = data_df.groupby("Metadata_Well").mean()
    # one metadata row per well (the first occurrence is kept)
    metadata = metadata.drop_duplicates(subset=["Metadata_Well"])
    # re-attach the per-well metadata to the aggregated features
    data_df = pd.merge(
        data_df, metadata, left_on="Metadata_Well", right_on="Metadata_Well"
    )

if nomic:
    # join the nELISA readouts on well position and treatment/dose label
    data_df = pd.merge(
        data_df,
        df_nomic,
        left_on=["Metadata_Well", "oneb_Metadata_Treatment_Dose_Inhibitor_Dose"],
        right_on=["Metadata_position_x", "oneb_Metadata_Treatment_Dose_Inhibitor_Dose"],
    )
    # the nELISA well-position key duplicates Metadata_Well after the join
    data_df = data_df.drop(columns=["Metadata_position_x"])

In [9]:
# select the rows of the split table that are labelled "train"
# (only the training rows are selected here; test rows are not used in this cell)
train_indexes = data_split_indexes.loc[data_split_indexes["label"] == "train"]

In [10]:
# subset data_df to the training rows listed in the split table
# .loc is label-based, so "labeled_data_index" must hold index labels of data_df
# as it stands after the aggregation/merge cell — presumably the default
# 0..n-1 index produced by pd.merge; TODO confirm against the split notebook
training_data = data_df.loc[train_indexes["labeled_data_index"]]

In [11]:
# preview the first rows of the training subset
training_data.head()

Unnamed: 0,Metadata_Well,Cytoplasm_AreaShape_Compactness,Cytoplasm_AreaShape_Eccentricity,Cytoplasm_AreaShape_Extent,Cytoplasm_AreaShape_FormFactor,Cytoplasm_AreaShape_MajorAxisLength,Cytoplasm_AreaShape_MinorAxisLength,Cytoplasm_AreaShape_Orientation,Cytoplasm_AreaShape_Zernike_0_0,Cytoplasm_AreaShape_Zernike_1_1,...,uPA [NSU],VCAM-1 [NSU],VEGF Receptor 2 (Flk-1) [NSU],VEGF-A (165) [NSU],VEGF-C [NSU],VEGF-D [NSU],VEGFR-1 [NSU],WISP-1 (CCN4) [NSU],XCL1 (Lymphotactin) [NSU],fourb_Metadata_Treatment_Dose_Inhibitor_Dose_y
1,B15,-0.062009,0.001236,0.044042,0.030464,-0.002026,0.006311,0.010789,0.030538,0.022751,...,0.214721,0.723799,0.592903,0.287432,0.375582,0.577606,0.554267,0.457302,0.298807,DMSO__0.100__DMSO__0.025
4,B18,-0.050341,0.055021,0.077588,0.038459,0.425906,0.355382,0.015133,0.035574,-0.010523,...,0.345951,0.274663,0.756694,0.372978,0.242493,0.428503,0.503385,0.550132,0.358411,LPS_Nigericin__100.000_1.0__DMSO__0.025
5,B19,-0.062131,0.044846,0.080372,0.043515,0.339065,0.290019,-0.008782,0.062351,-0.025408,...,0.245971,0.256898,0.416341,0.391095,0.159457,0.484938,0.696098,0.47062,0.589472,LPS_Nigericin__100.000_1.0__DMSO__0.025
6,B20,-0.019812,0.073886,0.005299,0.001903,0.233947,0.140215,0.016051,-0.021427,0.014053,...,0.226773,0.617068,0.419034,0.156464,0.31866,0.773322,0.298497,0.740774,0.485919,LPS__0.010__DMSO__0.025
10,C14,-0.041941,0.009593,0.016816,0.024724,-0.004871,-0.008984,-0.005426,0.025566,0.021008,...,0.070474,0.354318,0.257079,0.337151,0.231814,0.56643,0.653679,0.68856,0.509965,DMSO__0.100__DMSO__0.025


In [12]:
# # get oneb_Metadata_Treatment_Dose_Inhibitor_Dose  =='DMSO_0.100_DMSO_0.025' and 'LPS_100.000_DMSO_0.025 and Thapsigargin_10.000_DMSO_0.025'
# training_data = training_data[
#     training_data["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"].isin(
#         [control, treatment]
#     )
# ]

In [13]:
# TODO: hold out specific treatments, using a different hold-out percentage for each treatment;
# the held-out data then serves as the test set

In [14]:
# # at random downsample the DMSO treatment to match the number of wells in the LPS treatment
# seed = 0
# # get the number of wells in the LPS treatment
# trt_wells = training_data[
#     training_data["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"] == treatment
# ].shape[0]
# # get the number of wells in the DMSO treatment
# dmso_wells = training_data[
#     training_data["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"] == control
# ].shape[0]
# # downsample the DMSO treatment to match the number of wells in the LPS treatment
# dmso_holdout = training_data[
#     training_data["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"] == control
# ].sample(n=trt_wells, random_state=seed)
# # remove the downsampled DMSO wells from the data
# training_data = training_data.drop(dmso_holdout.index)

In [15]:
# Split the training subset into predictors (train_x) and targets (train_y).

# treatment/dose label of every training row
labeled_data = training_data["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]

# every column whose name contains "Metadata" is set aside and removed from the data
metadata = training_data.filter(regex="Metadata")
data_x = training_data.drop(columns=metadata.columns)

# targets are the remaining columns with "NSU" in their name;
# predictors are whatever is left once those are removed
data_y_cols = data_x.filter(regex="NSU").columns
train_x = data_x.drop(columns=data_y_cols)
train_y = training_data.loc[:, data_y_cols]

In [16]:
from sklearn.model_selection import LeaveOneOut

# Leave-one-out splitter, reused as the `cv` argument in the model-fitting cell.
loo = LeaveOneOut()

# Number of folds the splitter yields for the predictors and for the targets;
# only the final expression is rendered as the cell output.
n_folds_features = loo.get_n_splits(train_x)
n_folds_targets = loo.get_n_splits(train_y)
n_folds_targets

44

In [17]:
# Fit one ElasticNetCV regressor per target column of `train_y` and save each
# fitted model next to a small TOML record of the settings used for the run.

# ---- run-level settings: identical for every target, so validated once ----
# Previously an unexpected value only printed "Error" and the cell then failed
# further down with a NameError; fail early with a clear message instead.
if shuffle not in ("shuffled_baseline", "final"):
    raise ValueError(
        f"shuffle must be 'shuffled_baseline' or 'final', got {shuffle!r}"
    )
if aggregation not in (True, False) or nomic not in (True, False):
    raise ValueError(
        f"aggregation and nomic must be booleans, got {aggregation!r} and {nomic!r}"
    )

# (aggregation, nomic) -> output sub-directory
results_subdirs = {
    (True, True): "aggregated_with_nomic",
    (True, False): "aggregated",
    (False, True): "sc_with_nomic",
    (False, False): "sc",
}
# kept as a plain string with a trailing slash, as before
results_dir = (
    f"../models/regression/{cell_type}/"
    f"{results_subdirs[(bool(aggregation), bool(nomic))]}/"
)
# create results directory if it doesn't exist
pathlib.Path(results_dir).mkdir(parents=True, exist_ok=True)

# Seeded generator so the shuffled baseline is reproducible on Restart & Run All.
shuffle_rng = np.random.default_rng(0)

for cytokine in train_y.columns:
    train_data_y = train_y[cytokine]
    model = ElasticNetCV(
        random_state=0,
        max_iter=100000,
        cv=loo,
        l1_ratio=[0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9, 0.99],
        alphas=[0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
        fit_intercept=True,
        selection="random",
    )

    if shuffle == "shuffled_baseline":
        print("Shuffling data")
        # Permute every feature column independently into a *new* frame.
        # The earlier in-place np.random.shuffle(train_x[column].values) was
        # unseeded and only worked when `.values` happened to be a writable
        # view of the frame; `train_x` itself is now left untouched.
        fit_x = pd.DataFrame(
            {
                column: shuffle_rng.permutation(train_x[column].to_numpy())
                for column in train_x.columns
            },
            index=train_x.index,
        )
    else:
        print("Not shuffling data")
        fit_x = train_x

    with parallel_backend("multiprocessing"):
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore", category=ConvergenceWarning, module="sklearn"
            )
            # fit the elastic-net model; alpha and l1_ratio are chosen by the
            # leave-one-out search built into ElasticNetCV
            model.fit(fit_x, train_data_y)
            # outer leave-one-out evaluation: the whole ElasticNetCV search is
            # refitted on every fold, which makes this the expensive step
            scores = cross_val_score(
                model,
                fit_x,
                train_data_y,
                scoring="neg_mean_absolute_error",
                cv=loo,
                n_jobs=-1,
            )
            # NOTE: the scorer is *negative* MAE, so the printed values are <= 0
            print(scores)
            print(f"Mean MAE: {scores.mean()}")
            print(f"Std MAE: {scores.std()}")
            # R2 of the fitted model on the same data it was trained on
            print(f"R2: {model.score(fit_x, train_data_y)}")

    # `shuffle` is one of the two validated labels, so the file stem can be
    # built directly; the resulting names are the same as before.
    run_stem = f"{cytokine}_{shuffle}__all_nomic"

    # save final estimator
    dump(
        model,
        f"{results_dir}/{run_stem}.joblib",
    )

    # save a copy of the settings used for this model next to it
    config_copy_path = pathlib.Path(f"{results_dir}/{run_stem}.toml")

    # Serialise with the toml library: the former hand-written lines produced
    # `aggregation=True` and `feature=all`, neither of which is valid TOML
    # (booleans must be lowercase and strings must be quoted).
    with open(config_copy_path, "w") as config_file:
        toml.dump(
            {
                "model_type": shuffle,
                "aggregation": bool(aggregation),
                "nomic": bool(nomic),
                "cell_type": cell_type,
                "feature": "all",
            },
            config_file,
        )

Shuffling data


KeyboardInterrupt: 