In [1]:
import itertools
import pathlib

import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import toml
from joblib import dump
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.utils import parallel_backend, shuffle

In [2]:
# Parameters
cell_type = "SHSY5Y"
aggregation = True
nomic = True
flag = True

In [3]:
MODEL_TYPE = "regression"

In [4]:
# toml file path
TOML_PATH = pathlib.Path("../splits.toml")
# read toml file via toml
data_splits_by_treatments = toml.load(TOML_PATH)

# define the 100% test set data treatments
test_100_percent = data_splits_by_treatments["splits"]["data_splits_100"]
test_75_percent = data_splits_by_treatments["splits"]["data_splits_75"]

In [5]:
path = pathlib.Path(f"../../data/{cell_type}_preprocessed_sc_norm.parquet")

data_df = pq.read_table(path).to_pandas()

data_df.head()

Unnamed: 0,Metadata_cell_type,Metadata_Well,Metadata_number_of_singlecells,Metadata_incubation inducer (h),Metadata_inhibitor,Metadata_inhibitor_concentration,Metadata_inhibitor_concentration_unit,Metadata_inducer1,Metadata_inducer1_concentration,Metadata_inducer1_concentration_unit,...,Nuclei_Texture_SumEntropy_CorrPM_3_01_256,Nuclei_Texture_SumVariance_CorrGasdermin_3_01_256,Nuclei_Texture_SumVariance_CorrMito_3_03_256,Nuclei_Texture_SumVariance_CorrPM_3_03_256,Metadata_Treatment,Metadata_Dose,oneb_Metadata_Treatment_Dose_Inhibitor_Dose,twob_Metadata_Treatment_Dose_Inhibitor_Dose,threeb_Metadata_Treatment_Dose_Inhibitor_Dose,fourb_Metadata_Treatment_Dose_Inhibitor_Dose
0,SH-SY5Y,B13,3765,6,Media_ctr,0.0,,media_ctr,0.0,,...,1.851482,0.024721,0.307472,0.092086,media_ctr,0,media_ctr_0_Media_ctr_0.0,media_ctr_Media_ctr_0.0__0,media_ctr__0__Media_ctr_0.0,media_ctr__0__Media_ctr__0.0
1,SH-SY5Y,B13,3765,6,Media_ctr,0.0,,media_ctr,0.0,,...,0.897731,-0.041156,1.443262,0.009843,media_ctr,0,media_ctr_0_Media_ctr_0.0,media_ctr_Media_ctr_0.0__0,media_ctr__0__Media_ctr_0.0,media_ctr__0__Media_ctr__0.0
2,SH-SY5Y,B13,3765,6,Media_ctr,0.0,,media_ctr,0.0,,...,0.82597,-0.044386,-0.020445,0.000848,media_ctr,0,media_ctr_0_Media_ctr_0.0,media_ctr_Media_ctr_0.0__0,media_ctr__0__Media_ctr_0.0,media_ctr__0__Media_ctr__0.0
3,SH-SY5Y,B13,3765,6,Media_ctr,0.0,,media_ctr,0.0,,...,0.546308,-0.058328,-0.009632,-0.005811,media_ctr,0,media_ctr_0_Media_ctr_0.0,media_ctr_Media_ctr_0.0__0,media_ctr__0__Media_ctr_0.0,media_ctr__0__Media_ctr__0.0
4,SH-SY5Y,B13,3765,6,Media_ctr,0.0,,media_ctr,0.0,,...,1.034724,-0.035518,-0.038205,0.01769,media_ctr,0,media_ctr_0_Media_ctr_0.0,media_ctr_Media_ctr_0.0__0,media_ctr__0__Media_ctr_0.0,media_ctr__0__Media_ctr__0.0


In [6]:
if nomic == True:
    # import nomic data
    nomic_df_path = pathlib.Path(
        f"../../2.Nomic_nELISA_Analysis/Data/clean/Plate2/nELISA_plate_430420_{cell_type}_cleanup4correlation.csv"
    )
    df_nomic = pd.read_csv(nomic_df_path)

    # drop columns that contain [pgML]
    df_nomic = df_nomic.drop(
        columns=[col for col in df_nomic.columns if "[pgML]" in col]
    )
    # drop first 25 columns
    # df_nomic = df_nomic.drop(columns=df_nomic.columns[3:25])
    # df_nomic = df_nomic.drop(columns=df_nomic.columns[0:2])
else:
    df_nomic = None

In [7]:
# subset each column that contains metadata
metadata = data_df.filter(regex="Metadata")

# get all columns that are not metadata except for metadata_Well
data = data_df.drop(metadata.columns, axis=1)

# get the metadata_Well column
metadata_well = metadata[
    ["Metadata_Well", "oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]
]

data_df = pd.merge(data, metadata_well, left_index=True, right_index=True)

In [8]:
if (aggregation == True) and (nomic == True):

    # subset each column that contains metadata
    metadata = data_df.filter(regex="Metadata")
    data_df = data_df.drop(metadata.columns, axis=1)
    data_df = pd.concat([data_df, metadata["Metadata_Well"]], axis=1)
    # groupby well and take mean of each well
    data_df = data_df.groupby("Metadata_Well").mean()
    # drop duplicate rows in the metadata_well column
    metadata = metadata.drop_duplicates(subset=["Metadata_Well"])
    # get the metadata for each well
    data_df = pd.merge(
        data_df, metadata, left_on="Metadata_Well", right_on="Metadata_Well"
    )
    data_df = pd.merge(
        data_df,
        df_nomic,
        left_on=["Metadata_Well", "oneb_Metadata_Treatment_Dose_Inhibitor_Dose"],
        right_on=["Metadata_position_x", "oneb_Metadata_Treatment_Dose_Inhibitor_Dose"],
    )
    data_df = data_df.drop(columns=["Metadata_position_x"])
    # drop all metadata columns
    labeled_data = data_df["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]
    data_x = data_df.drop(metadata.columns, axis=1)

elif (aggregation == True) and (nomic == False):
    # subset each column that contains metadata
    metadata = data.filter(regex="Metadata")
    data_df = data_df.drop(metadata.columns, axis=1)
    data_df = pd.concat([data_df, metadata["Metadata_Well"]], axis=1)
    # groupby well and take mean of each well
    data_df = data_df.groupby("Metadata_Well").mean()
    # drop duplicate rows in the metadata_well column
    metadata = metadata.drop_duplicates(subset=["Metadata_Well"])
    # get the metadata for each well
    data_df = pd.merge(
        data_df,
        df_nomic,
        left_on=["Metadata_Well", "oneb_Metadata_Treatment_Dose_Inhibitor_Dose"],
        right_on=["Metadata_position_x", "oneb_Metadata_Treatment_Dose_Inhibitor_Dose"],
    )
elif (aggregation == False) and (nomic == True):
    data_df = pd.merge(
        data_df,
        df_nomic,
        left_on=["Metadata_Well", "oneb_Metadata_Treatment_Dose_Inhibitor_Dose"],
        right_on=["Metadata_position_x", "oneb_Metadata_Treatment_Dose_Inhibitor_Dose"],
    )
    data_df = data_df.drop(columns=["Metadata_position_x"])
elif aggregation == False and nomic == False:
    pass
else:
    print("Error")

This model and code is both inspired and reused from: https://github.com/WayScience/phenotypic_profiling_model/blob/main/1.split_data/split_data.ipynb
The bulk of this work was done by **Roshan Kern** I have only made minor changes to the code to make it more modular and easier to use for my purposes.

In [9]:
# get oneb_Metadata_Treatment_Dose_Inhibitor_Dose  =='DMSO_0.100_DMSO_0.025' and 'LPS_100.000_DMSO_0.025 and Thapsigargin_10.000_DMSO_0.025'
# data_df = data_df[
#     data_df["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"].isin([control, treatment])
# ]

In [10]:
data_df["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"].unique()

array(['DMSO_0.100_DMSO_0.025', 'LPS_0.010_DMSO_0.025',
       'LPS_Nigericin_100.000_1.0_DMSO_0.025', 'LPS_0.100_DMSO_0.025',
       'LPS_Nigericin_100.000_3.0_DMSO_0.025',
       'Thapsigargin_1.000_DMSO_0.025', 'LPS_1.000_DMSO_0.025',
       'LPS_Nigericin_100.000_10.0_DMSO_0.025',
       'Thapsigargin_10.000_DMSO_0.025', 'LPS_10.000_DMSO_0.025',
       'Disulfiram_0.100_DMSO_0.025', 'Topotecan_5.000_DMSO_0.025',
       'Disulfiram_1.000_DMSO_0.025', 'Topotecan_10.000_DMSO_0.025',
       'Disulfiram_2.500_DMSO_0.025', 'Topotecan_20.000_DMSO_0.025',
       'H2O2_100.000_DMSO_0.025', 'LPS_100.000_DMSO_0.025',
       'LPS_Nigericin_1.000_1.0_DMSO_0.025',
       'LPS_Nigericin_1.000_3.0_DMSO_0.025', 'Flagellin_0.100_DMSO_0.025',
       'LPS_Nigericin_1.000_10.0_DMSO_0.025',
       'Flagellin_1.000_DMSO_0.025'], dtype=object)

In [11]:
# variable test and train set splits
# 100% test set
# subset the following treatments for test set
test_set_all = data_df[
    data_df["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"].isin(test_100_percent)
]
# 75% test set and 25% train set
test_set_75 = data_df[
    data_df["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"].isin(test_75_percent)
]

# 50% test set and 50% train set
# get all treatments that are not in the_test_set_all and the test_set_75
test_set_50 = data_df[
    ~data_df["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"].isin(
        test_set_all["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]
    )
]
test_set_50 = test_set_50[
    ~test_set_50["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"].isin(
        test_set_75["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]
    )
]

print(test_set_all.shape, test_set_75.shape, test_set_50.shape)

(8, 1441) (8, 1441) (84, 1441)


In [12]:
# get the train test splits from each group
# 100% test set
test_set_all

# 75% test set and 25% train set
test_ratio = 0.75
training_data_set_75, testing_data_set_75 = train_test_split(
    test_set_75,
    test_size=test_ratio,
    stratify=test_set_75["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"],
    random_state=0,
)

# 50% test set and 50% train set
test_ratio = 0.5
training_data_set_50, testing_data_set_50 = train_test_split(
    test_set_50,
    test_size=test_ratio,
    stratify=test_set_50["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"],
    random_state=0,
)

# verify that the correct splits have been made
# 100% test set
print(f"Shape for the 100% test set: {test_set_all.shape}\n")

# 75% test set and 25% train set
print(
    f"Shape for the 75% test set: {training_data_set_75.shape};\nShape for the 75% train set: {testing_data_set_75.shape}\n"
)

# 50% test set and 50% train set
print(
    f"Shape for the 50% test set: {training_data_set_50.shape};\nShape for the 50% train set: {testing_data_set_50.shape}"
)

Shape for the 100% test set: (8, 1441)

Shape for the 75% test set: (2, 1441);
Shape for the 75% train set: (6, 1441)

Shape for the 50% test set: (42, 1441);
Shape for the 50% train set: (42, 1441)


In [13]:
# combine all testing sets together while preserving the index
testing_data_set = pd.concat(
    [test_set_all, testing_data_set_75, testing_data_set_50], axis=0
)
testing_data_set = testing_data_set.sort_index()
testing_data_set

# combine all training sets together while preserving the index
training_data_set = pd.concat([training_data_set_75, training_data_set_50], axis=0)
training_data_set = training_data_set.sort_index()
training_data_set

print(
    f"Testing set length: {len(testing_data_set)}\nTraining set length: {len(training_data_set)}"
)

# get the indexes for the training and testing sets
testing_data_set_index = testing_data_set.index
training_data_set_index = training_data_set.index

Testing set length: 56
Training set length: 44


In [14]:
# # ratio of data to be used for testing (ex 0.15 = 15%)
# test_ratio = 0.5

# # get indexes of training and testing data
# training_data, testing_data = train_test_split(
#     data_df,
#     test_size=test_ratio,
#     stratify=data_df[["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]],
#     random_state=0,
# )
# train_indexes = training_data.index.to_numpy()
# test_indexes = testing_data.index.to_numpy()

# print(f"Training data has shape: {training_data.shape}")
# print(f"Testing data has shape: {testing_data.shape}")

In [15]:
# create pandas dataframe with all indexes and their respective labels, stratified by phenotypic class
index_data = []
for index in training_data_set_index:
    index_data.append({"labeled_data_index": index, "label": "train"})
for index in testing_data_set_index:
    index_data.append({"labeled_data_index": index, "label": "test"})

# make index data a dataframe and sort it by labeled data index
index_data = pd.DataFrame(index_data).sort_values(["labeled_data_index"])

In [16]:
# set save path
if aggregation == True:
    if nomic == True:
        save_path = pathlib.Path(f"./indexes/{cell_type}/{MODEL_TYPE}")
    elif nomic == False:
        save_path = pathlib.Path(f"./indexes/{cell_type}/{MODEL_TYPE}")
elif aggregation == False:
    if nomic == True:
        save_path = pathlib.Path(f"./indexes/{cell_type}/{MODEL_TYPE}")
    elif nomic == False:
        save_path = pathlib.Path(f"./indexes/{cell_type}/{MODEL_TYPE}")
else:
    print("Error")

print(save_path)
# create save path if it doesn't exist
save_path.mkdir(parents=True, exist_ok=True)

indexes/SHSY5Y/regression


In [17]:
# save indexes as tsv file
if aggregation == True:
    if nomic == True:
        index_data.to_csv(
            f"{save_path}/aggregated_sc_and_nomic_data_split_indexes.tsv", sep="\t"
        )
    elif nomic == False:
        index_data.to_csv(f"{save_path}/aggregated_sc_data_split_indexes.tsv", sep="\t")
elif aggregation == False:
    if nomic == True:
        index_data.to_csv(f"{save_path}/sc_and_nomic_data_split_indexes.tsv", sep="\t")
    elif nomic == False:
        index_data.to_csv(f"{save_path}/sc_split_indexes.tsv", sep="\t")
else:
    print("Error")