In [1]:
import pyarrow.parquet as pq
import pandas as pd
import numpy as np
import pathlib
import warnings

from sklearn.utils import shuffle, parallel_backend
from sklearn.exceptions import ConvergenceWarning
from joblib import dump

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    StratifiedKFold,
    GridSearchCV,
)
import itertools
import sys
import ast
import toml

In [2]:
# location of the parquet file to load (relative to this notebook);
# the file name suggests preprocessed, normalized SH-SY5Y single-cell profiles
path = pathlib.Path("../../data/SHSY5Y_preprocessed_sc_norm.parquet")

# read the parquet file with pyarrow and convert the table to a pandas DataFrame
df = pq.read_table(source=path).to_pandas()

In [3]:
# read in toml file and get parameters
toml_path = pathlib.Path("../1.train_models/single_class_config.toml")
# the `with` block closes the file on exit, so no explicit close() is needed
with open(toml_path, "r") as f:
    config = toml.load(f)

# all parameters used here live in the "logistic_regression_params" table
lr_params = config["logistic_regression_params"]
# "aggregation" and "nomic" are run through ast.literal_eval, so the config is
# expected to hold them as literal text (presumably "True"/"False" — confirm
# against the config file)
aggregation = ast.literal_eval(lr_params["aggregation"])
nomic = ast.literal_eval(lr_params["nomic"])
# cell_type is used as-is (it is interpolated into a file name further down)
cell_type = lr_params["cell_type"]

In [5]:
# Load the nELISA table only when the config asks for it; otherwise df_nomic is None.
if nomic == True:
    nomic_df_path = pathlib.Path(
        f"../../2.Nomic_nELISA_Analysis/Data/clean/Plate2/nELISA_plate_430420_{cell_type}.csv"
    )
    df_nomic = pd.read_csv(nomic_df_path)

    # remove every column whose name contains "[pgML]"
    pgml_columns = [col for col in df_nomic.columns if "[pgML]" in col]
    df_nomic = df_nomic.drop(columns=pgml_columns)

    # remove the columns at positions 3..24, then the first two columns of what
    # remains; the column at position 2 is kept
    # NOTE(review): the kept column is presumably the well position used as the
    # merge key later ("position_x") — confirm against the CSV layout
    df_nomic = df_nomic.drop(columns=df_nomic.columns[3:25])
    df_nomic = df_nomic.drop(columns=df_nomic.columns[0:2])
else:
    df_nomic = None

In [6]:
# subset each column whose name contains "Metadata"
metadata = df.filter(regex="Metadata")

# the well identifier of every row; used as the grouping key when aggregating
metadata_well = metadata["Metadata_Well"]

if aggregation == True:
    # The feature-only frame is built inside this branch because it is only
    # needed for aggregation; building it unconditionally copies and merges the
    # full frame for nothing when aggregation is off.

    # all non-metadata columns, plus Metadata_Well so rows can be grouped per well
    data = df.drop(metadata.columns, axis=1)
    data = pd.merge(data, metadata_well, left_index=True, right_index=True)

    # keep a single metadata row per well
    metadata = metadata.drop_duplicates(subset=["Metadata_Well"])
    # aggregate the features by taking the mean within each well
    data = data.groupby("Metadata_Well").mean()
    # add the metadata back to the data on the Metadata_Well column
    data = pd.merge(data, metadata, left_index=True, right_on="Metadata_Well")
else:
    # no aggregation: use the loaded frame as-is (metadata columns included)
    data = df

In [26]:
# Join the nELISA columns onto the profiles: rows are matched where
# data["Metadata_Well"] equals df_nomic["position_x"].
# NOTE(review): this rebinds `data` to the merged frame, so re-running the cell
# without re-running the cells above merges the nELISA columns a second time.
if nomic == True:
    data = data.merge(df_nomic, left_on="Metadata_Well", right_on="position_x")
data

Unnamed: 0,Metadata_cell_type,Metadata_Well,Metadata_number_of_singlecells,Metadata_incubation inducer (h),Metadata_inhibitor,Metadata_inhibitor_concentration,Metadata_inhibitor_concentration_unit,Metadata_inducer1,Metadata_inducer1_concentration,Metadata_inducer1_concentration_unit,...,TWEAK [NSU]_y,uPA [NSU]_y,VCAM-1 [NSU]_y,VEGF Receptor 2 (Flk-1) [NSU]_y,VEGF-A (165) [NSU]_y,VEGF-C [NSU]_y,VEGF-D [NSU]_y,VEGFR-1 [NSU]_y,WISP-1 (CCN4) [NSU]_y,XCL1 (Lymphotactin) [NSU]_y
0,SH-SY5Y,B13,3765,6,Media ctr,0.0,,media ctr,0.0,,...,-0.225271,2.011257,-1.313502,-0.528295,26.484897,-0.751687,-0.891735,-1.624976,1.324507,-0.622436
1,SH-SY5Y,B13,3765,6,Media ctr,0.0,,media ctr,0.0,,...,-0.225271,2.011257,-1.313502,-0.528295,26.484897,-0.751687,-0.891735,-1.624976,1.324507,-0.622436
2,SH-SY5Y,B13,3765,6,Media ctr,0.0,,media ctr,0.0,,...,-0.225271,2.011257,-1.313502,-0.528295,26.484897,-0.751687,-0.891735,-1.624976,1.324507,-0.622436
3,SH-SY5Y,B13,3765,6,Media ctr,0.0,,media ctr,0.0,,...,-0.225271,2.011257,-1.313502,-0.528295,26.484897,-0.751687,-0.891735,-1.624976,1.324507,-0.622436
4,SH-SY5Y,B13,3765,6,Media ctr,0.0,,media ctr,0.0,,...,-0.225271,2.011257,-1.313502,-0.528295,26.484897,-0.751687,-0.891735,-1.624976,1.324507,-0.622436
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
597897,SH-SY5Y,O23,3555,6,Media ctr,0.0,,media ctr,0.0,,...,0.890262,0.276895,1.572519,0.586528,28.875726,-1.432632,2.668866,-0.527891,1.833440,-1.061733
597898,SH-SY5Y,O23,3555,6,Media ctr,0.0,,media ctr,0.0,,...,0.890262,0.276895,1.572519,0.586528,28.875726,-1.432632,2.668866,-0.527891,1.833440,-1.061733
597899,SH-SY5Y,O23,3555,6,Media ctr,0.0,,media ctr,0.0,,...,0.890262,0.276895,1.572519,0.586528,28.875726,-1.432632,2.668866,-0.527891,1.833440,-1.061733
597900,SH-SY5Y,O23,3555,6,Media ctr,0.0,,media ctr,0.0,,...,0.890262,0.276895,1.572519,0.586528,28.875726,-1.432632,2.668866,-0.527891,1.833440,-1.061733


In [5]:
# feature matrix: `data` without any of the columns listed in `metadata`
data_x = data.drop(columns=metadata.columns)
# label column (its name contains "Metadata", so it is not part of data_x)
labeled_data = data["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]

In [6]:
# https://github.com/WayScience/phenotypic_profiling_model/blob/main/1.split_data/split_data.ipynb

In [7]:
# fraction of the rows held out for testing (e.g. 0.15 means 15%)
test_ratio = 0.25

# split the full frame, stratifying on the treatment/dose label column so each
# split gets the same label proportions; random_state is fixed for repeatability
stratify_labels = data[["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]]
training_data, testing_data = train_test_split(
    data, test_size=test_ratio, stratify=stratify_labels, random_state=1
)

# row indexes (from `data`) that ended up in each split
train_indexes = training_data.index.to_numpy()
test_indexes = testing_data.index.to_numpy()

print(f"Training data has shape: {training_data.shape}")
print(f"Testing data has shape: {testing_data.shape}")

Training data has shape: (448426, 1276)
Testing data has shape: (149476, 1276)


In [8]:
# Build a table recording which split ("train" or "test") every row index was
# assigned to. The columns are assembled in one step instead of appending one
# dict per row in a Python loop; row order before sorting is unchanged (all
# training indexes first, then all testing indexes).
index_data = pd.DataFrame(
    {
        "labeled_data_index": np.concatenate([train_indexes, test_indexes]),
        "label": ["train"] * len(train_indexes) + ["test"] * len(test_indexes),
    }
)

# sort the table by the original row index
index_data = index_data.sort_values(["labeled_data_index"])

In [9]:
# make sure the output directory exists
results_dir = pathlib.Path("indexes/")
results_dir.mkdir(parents=True, exist_ok=True)

# (aggregation, nomic, output file) — one entry per supported configuration
split_index_paths = (
    (True, True, f"{results_dir}/aggregated_sc_and_nomic_data_split_indexes.tsv"),
    (True, False, f"{results_dir}/aggregated_sc_data_split_indexes.tsv"),
    (False, True, f"{results_dir}/sc_and_nomic_data_split_indexes.tsv"),
    (False, False, f"{results_dir}/sc_split_indexes.tsv"),
)

# save the indexes as a tab-separated file named after the active configuration;
# nothing is written if the flags match none of the entries
for aggregation_flag, nomic_flag, output_path in split_index_paths:
    if aggregation == aggregation_flag and nomic == nomic_flag:
        index_data.to_csv(output_path, sep="\t")
        break