In [18]:
import argparse
import itertools
import pathlib

import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import toml
from joblib import dump
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split

In [None]:
argparser = argparse.ArgumentParser()
argparser.add_argument("--cell_type", default="all")

args = argparser.parse_args()

cell_type = args.cell_type

In [20]:
# Parameters
aggregation = True
nomic = True

In [21]:
MODEL_TYPE = "regression"

In [22]:
# toml file path
TOML_PATH = pathlib.Path("../splits.toml")
# read toml file via toml
data_splits_by_treatments = toml.load(TOML_PATH)

# define the 100% test set data treatments
test_100_percent = data_splits_by_treatments["splits"]["data_splits_100"]
test_75_percent = data_splits_by_treatments["splits"]["data_splits_75"]

In [23]:
path = pathlib.Path(
    f"../../../data/{cell_type}_preprocessed_sc_norm_aggregated.parquet"
)

data_df = pq.read_table(path).to_pandas()

data_df.head()

Unnamed: 0,Metadata_Well,Cytoplasm_AreaShape_Compactness,Cytoplasm_AreaShape_Eccentricity,Cytoplasm_AreaShape_Extent,Cytoplasm_AreaShape_FormFactor,Cytoplasm_AreaShape_MajorAxisLength,Cytoplasm_AreaShape_MinorAxisLength,Cytoplasm_AreaShape_Orientation,Cytoplasm_AreaShape_Zernike_0_0,Cytoplasm_AreaShape_Zernike_1_1,...,uPA [NSU],VCAM-1 [NSU],VEGF Receptor 2 (Flk-1) [NSU],VEGF-A (165) [NSU],VEGF-C [NSU],VEGF-D [NSU],VEGFR-1 [NSU],WISP-1 (CCN4) [NSU],XCL1 (Lymphotactin) [NSU],fourb_Metadata_Treatment_Dose_Inhibitor_Dose
0,B14,-0.063223,-0.001418,0.035864,0.036794,0.037936,0.031201,-0.012884,0.028338,0.019985,...,0.114937,0.222551,0.534935,0.403588,0.213858,0.201769,0.577968,0.733987,0.76376,DMSO__0.100__DMSO__0.025
1,B15,-0.062009,0.001236,0.044042,0.030464,-0.002026,0.006311,0.010789,0.030538,0.022751,...,0.214721,0.723799,0.592903,0.287432,0.375582,0.577606,0.554267,0.457302,0.298807,DMSO__0.100__DMSO__0.025
2,B16,-0.031699,0.047344,-0.00399,0.002975,0.115183,0.070404,-0.007908,-0.010212,-0.004997,...,0.104669,0.483786,0.324065,0.174874,0.188023,0.427482,0.702465,0.502791,0.336315,LPS__0.010__DMSO__0.025
3,B17,-0.045468,0.038261,0.034279,0.02382,0.163262,0.120615,-0.000391,0.01825,-0.015776,...,0.105744,0.26615,0.396018,0.0,0.20317,0.652367,0.51009,0.703775,0.283843,LPS__0.010__DMSO__0.025
4,B18,-0.050341,0.055021,0.077588,0.038459,0.425906,0.355382,0.015133,0.035574,-0.010523,...,0.345951,0.274663,0.756694,0.372978,0.242493,0.428503,0.503385,0.550132,0.358411,LPS_Nigericin__100.000_1.0__DMSO__0.025


In [24]:
# subset each column that contains metadata
metadata = data_df.filter(regex="Metadata")

# get all columns that are not metadata except for metadata_Well
data = data_df.drop(metadata.columns, axis=1)

# get the metadata_Well column
metadata_well = metadata[
    ["Metadata_Well", "oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]
]

data_df = pd.merge(data, metadata_well, left_index=True, right_index=True)

This model and code is both inspired and reused from: https://github.com/WayScience/phenotypic_profiling_model/blob/main/1.split_data/split_data.ipynb
The bulk of this work was done by **Roshan Kern** I have only made minor changes to the code to make it more modular and easier to use for my purposes.

In [25]:
data_df["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"].unique()

array(['DMSO_0.100_DMSO_0.025', 'LPS_0.010_DMSO_0.025',
       'LPS_Nigericin_100.000_1.0_DMSO_0.025', 'LPS_0.100_DMSO_0.025',
       'LPS_Nigericin_100.000_3.0_DMSO_0.025',
       'Thapsigargin_1.000_DMSO_0.025', 'LPS_1.000_DMSO_0.025',
       'LPS_Nigericin_100.000_10.0_DMSO_0.025',
       'Thapsigargin_10.000_DMSO_0.025', 'LPS_10.000_DMSO_0.025',
       'Disulfiram_0.100_DMSO_0.025', 'Topotecan_5.000_DMSO_0.025',
       'Disulfiram_1.000_DMSO_0.025', 'Topotecan_10.000_DMSO_0.025',
       'Disulfiram_2.500_DMSO_0.025', 'Topotecan_20.000_DMSO_0.025',
       'H2O2_100.000_DMSO_0.025', 'LPS_100.000_DMSO_0.025',
       'LPS_Nigericin_1.000_1.0_DMSO_0.025',
       'LPS_Nigericin_1.000_3.0_DMSO_0.025', 'Flagellin_0.100_DMSO_0.025',
       'LPS_Nigericin_1.000_10.0_DMSO_0.025',
       'Flagellin_1.000_DMSO_0.025'], dtype=object)

In [26]:
# variable test and train set splits
# 100% test set
# subset the following treatments for test set
test_set_all = data_df[
    data_df["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"].isin(test_100_percent)
]
# 75% test set and 25% train set
test_set_75 = data_df[
    data_df["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"].isin(test_75_percent)
]

# 50% test set and 50% train set
# get all treatments that are not in the_test_set_all and the test_set_75
test_set_50 = data_df[
    ~data_df["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"].isin(
        test_set_all["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]
    )
]
test_set_50 = test_set_50[
    ~test_set_50["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"].isin(
        test_set_75["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]
    )
]

print(test_set_all.shape, test_set_75.shape, test_set_50.shape)

(8, 1440) (8, 1440) (84, 1440)


In [27]:
# get the train test splits from each group
# 100% test set
test_set_all

# 75% test set and 25% train set
test_ratio = 0.75
training_data_set_75, testing_data_set_75 = train_test_split(
    test_set_75,
    test_size=test_ratio,
    stratify=test_set_75["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"],
    random_state=0,
)

# 50% test set and 50% train set
test_ratio = 0.5
training_data_set_50, testing_data_set_50 = train_test_split(
    test_set_50,
    test_size=test_ratio,
    stratify=test_set_50["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"],
    random_state=0,
)

# verify that the correct splits have been made
# 100% test set
print(f"Shape for the 100% test set: {test_set_all.shape}\n")

# 75% test set and 25% train set
print(
    f"Shape for the 75% test set: {training_data_set_75.shape};\nShape for the 75% train set: {testing_data_set_75.shape}\n"
)

# 50% test set and 50% train set
print(
    f"Shape for the 50% test set: {training_data_set_50.shape};\nShape for the 50% train set: {testing_data_set_50.shape}"
)

Shape for the 100% test set: (8, 1440)

Shape for the 75% test set: (2, 1440);
Shape for the 75% train set: (6, 1440)

Shape for the 50% test set: (42, 1440);
Shape for the 50% train set: (42, 1440)


In [28]:
# combine all testing sets together while preserving the index
testing_data_set = pd.concat(
    [test_set_all, testing_data_set_75, testing_data_set_50], axis=0
)
testing_data_set = testing_data_set.sort_index()
testing_data_set

# combine all training sets together while preserving the index
training_data_set = pd.concat([training_data_set_75, training_data_set_50], axis=0)
training_data_set = training_data_set.sort_index()
training_data_set

print(
    f"Testing set length: {len(testing_data_set)}\nTraining set length: {len(training_data_set)}"
)

# get the indexes for the training and testing sets
testing_data_set_index = testing_data_set.index
training_data_set_index = training_data_set.index

Testing set length: 56
Training set length: 44


In [29]:
# create pandas dataframe with all indexes and their respective labels, stratified by phenotypic class
index_data = []
for index in training_data_set_index:
    index_data.append({"labeled_data_index": index, "label": "train"})
for index in testing_data_set_index:
    index_data.append({"labeled_data_index": index, "label": "test"})

# make index data a dataframe and sort it by labeled data index
index_data = pd.DataFrame(index_data).sort_values(["labeled_data_index"])

In [30]:
# set save path
if aggregation == True:
    if nomic == True:
        save_path = pathlib.Path(f"../indexes/{cell_type}/{MODEL_TYPE}")
    elif nomic == False:
        save_path = pathlib.Path(f"../indexes/{cell_type}/{MODEL_TYPE}")
elif aggregation == False:
    if nomic == True:
        save_path = pathlib.Path(f"../indexes/{cell_type}/{MODEL_TYPE}")
    elif nomic == False:
        save_path = pathlib.Path(f"../indexes/{cell_type}/{MODEL_TYPE}")
else:
    print("Error")

print(save_path)
# create save path if it doesn't exist
save_path.mkdir(parents=True, exist_ok=True)

../indexes/SHSY5Y/regression


In [31]:
# save indexes as tsv file
if aggregation == True:
    if nomic == True:
        index_data.to_csv(
            f"{save_path}/aggregated_sc_and_nomic_data_split_indexes.tsv", sep="\t"
        )
    elif nomic == False:
        index_data.to_csv(f"{save_path}/aggregated_sc_data_split_indexes.tsv", sep="\t")
elif aggregation == False:
    if nomic == True:
        index_data.to_csv(f"{save_path}/sc_and_nomic_data_split_indexes.tsv", sep="\t")
    elif nomic == False:
        index_data.to_csv(f"{save_path}/sc_split_indexes.tsv", sep="\t")
else:
    print("Error")