In [1]:
import pyarrow.parquet as pq
import pandas as pd
import numpy as np
import pathlib
import warnings

from sklearn.utils import shuffle, parallel_backend
from sklearn.exceptions import ConvergenceWarning
from joblib import dump

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    StratifiedKFold,
    GridSearchCV,
)
import itertools
import sys
import ast
import toml

In [2]:
# Parquet file holding the preprocessed profiles
# (presumably normalized single-cell data, judging by the file name)
path = pathlib.Path("../../data/SHSY5Y_preprocessed_sc_norm.parquet")

# read the file with pyarrow, then convert the Arrow table to a pandas DataFrame
df = pq.read_table(source=path).to_pandas()

In [3]:
# Read the run parameters from the TOML config kept with the training step.
toml_path = pathlib.Path("../1.train_models/single_class_config.toml")
# The context manager closes the file when the block exits, so no explicit
# close() call is needed afterwards.
with open(toml_path, "r") as f:
    config = toml.load(f)

# `aggregation` and `nomic` go through ast.literal_eval, which parses a string
# holding a Python literal (presumably "True"/"False" — confirm in the TOML).
aggregation = ast.literal_eval(config["logistic_regression_params"]["aggregation"])
nomic = ast.literal_eval(config["logistic_regression_params"]["nomic"])
# cell type is used as-is (it is interpolated into file names later on)
cell_type = config["logistic_regression_params"]["cell_type"]

In [4]:
if nomic == True:
    # nELISA plate CSV for the configured cell type
    nomic_df_path = pathlib.Path(
        f"../../2.Nomic_nELISA_Analysis/Data/clean/Plate2/nELISA_plate_430420_{cell_type}.csv"
    )
    df_nomic = (
        pd.read_csv(nomic_df_path)
        # discard every column whose name contains "[pgML]"
        .pipe(
            lambda frame: frame.drop(
                columns=[name for name in frame.columns if "[pgML]" in name]
            )
        )
        # of the remaining columns, discard positions 3 through 24 ...
        .pipe(lambda frame: frame.drop(columns=frame.columns[3:25]))
        # ... then positions 0 and 1, so only position 2 (presumably the well
        # position, `position_x`) and the columns from position 25 on survive
        .pipe(lambda frame: frame.drop(columns=frame.columns[0:2]))
    )
else:
    # no nELISA data requested for this run
    df_nomic = None

In [5]:
# Preview the nELISA table (it is None, and nothing is shown, when nomic is disabled)
df_nomic

Unnamed: 0,position_x,Activin A [NSU],AITRL (GITR Ligand) [NSU],Amphiregulin [NSU],Amyloid beta [NSU],APRIL [NSU],BAFF [NSU],BCMA (TNFRSF17) [NSU],BDNF [NSU],BMP2 [NSU],...,TWEAK [NSU],uPA [NSU],VCAM-1 [NSU],VEGF Receptor 2 (Flk-1) [NSU],VEGF-A (165) [NSU],VEGF-C [NSU],VEGF-D [NSU],VEGFR-1 [NSU],WISP-1 (CCN4) [NSU],XCL1 (Lymphotactin) [NSU]
0,C13,0.050305,-0.046947,1.361487,0.553065,3.161210,0.323546,0.200321,-4.628914,-1.455375,...,-1.053944,1.931219,0.439120,-1.538831,30.157265,-0.733615,0.413442,-0.897618,-1.841281,0.048389
1,D13,-0.136531,1.014299,-1.829310,-2.860160,3.326872,-0.293770,0.520861,1.004449,-0.906373,...,-0.871663,3.274879,-0.738697,0.816301,29.581649,-0.939235,-0.385612,-1.410167,-1.406394,-0.863555
2,E13,1.240569,0.229261,1.995245,-2.499106,-4.232099,-1.807823,-0.656196,0.746356,-0.001466,...,-0.251360,1.923855,-0.523146,0.204165,31.313849,-2.824671,1.220794,-1.488981,1.496881,-0.107094
3,F13,0.629592,1.492948,-1.857380,-0.308989,-0.640024,0.713347,-0.554004,-1.260493,0.067077,...,-1.318833,2.165008,-0.282784,-0.779901,28.934441,-0.304395,0.177668,-0.300231,0.083451,-1.798261
4,G13,-0.375887,0.390727,-0.026048,-0.563376,1.802794,0.231624,1.355083,-0.230052,-0.209977,...,-1.003948,1.439554,0.407104,-0.753367,32.680499,-1.739692,-1.222531,0.841332,-0.464893,0.077351
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149,M15,-0.182526,-0.449384,-1.877599,-0.522611,-2.868667,0.952490,-0.253642,-1.289233,-0.228855,...,-0.241167,1.149727,-1.670979,-0.802934,34.971722,-0.551051,-3.609578,0.868698,1.002614,-0.481627
150,B13,1.565601,0.584670,0.294442,0.405039,-5.985005,-0.071077,-0.532917,1.011322,-0.742379,...,-0.225271,2.011257,-1.313502,-0.528295,26.484897,-0.751687,-0.891735,-1.624976,1.324507,-0.622436
151,I13,0.573625,-0.063392,0.155447,-1.606968,1.601579,0.382720,1.028351,-2.033671,-0.306703,...,-1.885210,2.713198,0.031470,-0.883777,34.473422,-0.743824,0.045387,-0.535662,-0.498182,-0.772981
152,O22,-0.911440,-0.830525,-0.216470,0.747194,-3.469261,0.241577,-2.145295,2.626943,-1.119424,...,-0.328278,2.187440,0.442932,-0.330779,34.600973,-1.784995,-0.022841,0.475823,0.490670,-1.055123


In [6]:
# Assemble `data`, the table that is split into train/test sets further down.

# every column whose name matches "Metadata"
metadata = df.filter(regex="Metadata")

# well identifier of every row in df
metadata_well = metadata["Metadata_Well"]

if aggregation:
    # Non-metadata columns plus the well identifier. Built only on this branch:
    # the non-aggregated branch uses df unchanged, so doing this work there
    # would just be thrown away.
    data = df.drop(columns=metadata.columns)
    data = pd.merge(data, metadata_well, left_index=True, right_index=True)

    # keep a single metadata row per well (the first one encountered)
    metadata = metadata.drop_duplicates(subset=["Metadata_Well"])
    # average every remaining column within each well
    data = data.groupby("Metadata_Well").mean()
    # add the metadata back; the grouped index holds the well identifiers,
    # which are matched against the Metadata_Well column
    data = pd.merge(data, metadata, left_index=True, right_on="Metadata_Well")
else:
    # keep every row of the original table
    data = df

if nomic:
    # Attach the nELISA columns by well. pd.merge defaults to an inner join, so
    # rows whose well has no match in df_nomic are dropped from `data`.
    data = pd.merge(data, df_nomic, left_on="Metadata_Well", right_on="position_x")

In [7]:
# Preview the assembled table before it is split
data

Unnamed: 0,Cytoplasm_AreaShape_Compactness,Cytoplasm_AreaShape_Eccentricity,Cytoplasm_AreaShape_Extent,Cytoplasm_AreaShape_FormFactor,Cytoplasm_AreaShape_MajorAxisLength,Cytoplasm_AreaShape_MinorAxisLength,Cytoplasm_AreaShape_Orientation,Cytoplasm_AreaShape_Zernike_0_0,Cytoplasm_AreaShape_Zernike_1_1,Cytoplasm_AreaShape_Zernike_2_0,...,TWEAK [NSU],uPA [NSU],VCAM-1 [NSU],VEGF Receptor 2 (Flk-1) [NSU],VEGF-A (165) [NSU],VEGF-C [NSU],VEGF-D [NSU],VEGFR-1 [NSU],WISP-1 (CCN4) [NSU],XCL1 (Lymphotactin) [NSU]
0,0.000263,0.050292,0.011215,-0.032031,0.139148,0.092653,-0.022733,-0.004550,-0.019608,0.030791,...,-0.225271,2.011257,-1.313502,-0.528295,26.484897,-0.751687,-0.891735,-1.624976,1.324507,-0.622436
1,-0.063223,-0.001418,0.035864,0.036794,0.037936,0.031201,-0.012884,0.028338,0.019985,0.020941,...,-0.894582,1.230590,-0.966358,-0.353436,31.847475,-1.216944,-2.187799,0.107920,1.367803,0.180895
2,-0.062009,0.001236,0.044042,0.030464,-0.002026,0.006311,0.010789,0.030538,0.022751,0.021311,...,0.114074,2.302291,1.364515,-0.062003,30.013129,-0.001147,0.460551,0.005704,-0.403832,-1.415898
3,-0.031699,0.047344,-0.003990,0.002975,0.115183,0.070404,-0.007908,-0.010212,-0.004997,0.067747,...,-1.402026,1.120312,0.248420,-1.413591,28.235605,-1.411162,-0.597302,0.644832,-0.112562,-1.287083
4,-0.045468,0.038261,0.034279,0.023820,0.163262,0.120615,-0.000391,0.018250,-0.015776,0.079127,...,-1.075493,1.131858,-0.763617,-1.051842,25.473988,-1.297297,0.987359,-0.184818,1.174355,-1.467287
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149,-0.030737,0.025373,0.019060,0.013307,0.087081,0.062723,0.030207,0.011448,-0.006952,0.048732,...,-0.638728,2.325251,1.088603,-2.413005,30.040230,0.150468,2.537893,-2.072348,1.365336,0.546843
150,-0.008031,0.149927,-0.087296,-0.039525,0.227378,-0.005209,0.012483,-0.121338,-0.041426,-0.011089,...,0.191073,1.324194,0.085482,-2.060541,26.633495,-0.656278,1.078025,-0.021887,0.382966,-0.827056
151,-0.039158,0.083196,-0.033546,-0.022291,0.151747,0.022673,0.001550,-0.070861,-0.024675,0.016000,...,-0.295899,2.450049,0.839780,-0.387338,30.082969,-0.812300,1.301290,0.240628,0.458829,-0.726808
152,-0.078584,0.011797,0.072764,0.058422,0.033983,0.037911,0.019646,0.063310,0.006814,0.056728,...,-0.328278,2.187440,0.442932,-0.330779,34.600973,-1.784995,-0.022841,0.475823,0.490670,-1.055123


In [8]:
# treatment label of every row (the same column the split is stratified on)
labeled_data = data["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]

# feature table: `data` without the columns listed in `metadata`
# NOTE(review): when the nomic table was merged in, its `position_x` join key is
# not one of the metadata columns and therefore stays in data_x — confirm that
# this is intended.
data_x = data.drop(columns=metadata.columns)

In [9]:
# Preview the treatment label of each row
labeled_data

0                     media ctr_0_Media ctr_0.0
1                         DMSO_0.100_DMSO_0.025
2                         DMSO_0.100_DMSO_0.025
3                          LPS_0.010_DMSO_0.025
4                          LPS_0.010_DMSO_0.025
                         ...                   
149                   media ctr_0_Media ctr_0.0
150    LPS_Nigericin_1.000_10.0_Z-VAD-FMK_100.0
151    LPS_Nigericin_1.000_10.0_Z-VAD-FMK_100.0
152                   media ctr_0_Media ctr_0.0
153                   media ctr_0_Media ctr_0.0
Name: oneb_Metadata_Treatment_Dose_Inhibitor_Dose, Length: 154, dtype: object

In [10]:
# https://github.com/WayScience/phenotypic_profiling_model/blob/main/1.split_data/split_data.ipynb

In [11]:
# fraction of rows held out for testing (0.25 = 25%)
test_ratio = 0.25

# Stratified split on the treatment column so both sets see the same label
# proportions; random_state is fixed so the split is reproducible.
training_data, testing_data = train_test_split(
    data,
    stratify=data[["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]],
    test_size=test_ratio,
    random_state=1,
)

# index labels of the rows that landed in each set
train_indexes, test_indexes = (
    split.index.to_numpy() for split in (training_data, testing_data)
)

print(f"Training data has shape: {training_data.shape}")
print(f"Testing data has shape: {testing_data.shape}")

Training data has shape: (115, 1464)
Testing data has shape: (39, 1464)


In [12]:
# One record per row index, tagged with the split ("train" or "test") it was
# assigned to; training records come first, then the whole frame is sorted by
# the row index.
index_data = pd.DataFrame(
    [
        {"labeled_data_index": row_index, "label": split_label}
        for split_label, split_indexes in (
            ("train", train_indexes),
            ("test", test_indexes),
        )
        for row_index in split_indexes
    ]
).sort_values(["labeled_data_index"])

In [13]:
# directory that receives the split indexes (created when missing)
results_dir = pathlib.Path("indexes/")
results_dir.mkdir(parents=True, exist_ok=True)

# (aggregation flag, nomic flag, output file) — one row per combination
split_index_files = [
    (True, True, f"{results_dir}/aggregated_sc_and_nomic_data_split_indexes.tsv"),
    (True, False, f"{results_dir}/aggregated_sc_data_split_indexes.tsv"),
    (False, True, f"{results_dir}/sc_and_nomic_data_split_indexes.tsv"),
    (False, False, f"{results_dir}/sc_split_indexes.tsv"),
]

# pick the file matching the current flags; None when no row matches
output_file = next(
    (
        target
        for aggregation_flag, nomic_flag, target in split_index_files
        if aggregation == aggregation_flag and nomic == nomic_flag
    ),
    None,
)

# save indexes as a tab-separated file (nothing is written without a match)
if output_file is not None:
    index_data.to_csv(output_file, sep="\t")