# Spliting Data
Here, we utilize the feature-selected profiles generated in the preceding module notebook [here](../0.freature_selection/), focusing on dividing the data into training, testing, and holdout sets for machine learning training.

In [1]:
import sys
import json
import pathlib
import warnings

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

sys.path.append("../../")  # noqa
from src.utils import split_meta_and_features  # noqa

# ignoring warnings
warnings.catch_warnings(action="ignore")



## Paramters

Below are the parameters defined that are used in this notebook

In [2]:
# setting seed constants
seed = 0
np.random.seed(seed)
compartments = ["Cells", "Cytoplasm", "Nuclei"]

# directory to get all the inputs for this notebook
data_dir = pathlib.Path("../../data").resolve(strict=True)
results_dir = pathlib.Path("../../results").resolve(strict=True)
fs_dir = (results_dir / "0.feature_selection").resolve(strict=True)

# directory to store all the output of this notebook
data_split_dir = (results_dir / "1.data_splits").resolve()
data_split_dir.mkdir(exist_ok=True)

In [3]:
# data paths
fs_profile_path = (fs_dir / "shared_cell_injury_profile_fs.csv.gz").resolve(strict=True)

# load data
fs_profile_df = pd.read_csv(fs_profile_path)

# splitting meta and feature column names
fs_meta, fs_feats = split_meta_and_features(fs_profile_df, compartments=compartments)

# display
print("fs profile with control: ", fs_profile_df.shape)
fs_profile_df.head()

fs profile with control:  (16701, 239)


Unnamed: 0,Compound BRD (short),Compound Class,Compound SMILES,Mahalanobis distance,Characteristics [Cell Line],Control Type,Comment [Image File Path],Experimental Condition [Experimental Batch],Compound BRD,Compound Unichem URL,...,Nuclei_RadialDistribution_RadialCV_ER_2of4,Nuclei_RadialDistribution_RadialCV_ER_3of4,Nuclei_RadialDistribution_RadialCV_ER_4of4,Nuclei_RadialDistribution_RadialCV_Mito_1of4,Nuclei_RadialDistribution_RadialCV_Mito_2of4,Nuclei_RadialDistribution_RadialCV_Mito_3of4,Nuclei_RadialDistribution_RadialCV_Mito_4of4,Nuclei_RadialDistribution_RadialCV_RNA_1of4,Nuclei_RadialDistribution_RadialCV_RNA_2of4,Nuclei_RadialDistribution_RadialCV_RNA_3of4
0,,Solvent control,CS(=O)C,7.51,U2OS,Negative,/incoming/BR00110363/,1,,https://www.ebi.ac.uk/unichem/rest/inchikey/IA...,...,0.061971,0.024875,-0.014376,0.082153,0.049476,0.013555,0.041612,0.04222,0.12682,0.077685
1,,Solvent control,CS(=O)C,6.21,U2OS,Negative,/incoming/BR00110363/,1,,https://www.ebi.ac.uk/unichem/rest/inchikey/IA...,...,0.041521,-0.000281,-0.007509,0.01636,0.033403,-0.01525,0.002146,0.076559,0.076925,0.051081
2,,Solvent control,CS(=O)C,10.94,U2OS,Negative,/incoming/BR00110363/,1,,https://www.ebi.ac.uk/unichem/rest/inchikey/IA...,...,0.039369,0.000791,-0.031197,-0.065423,-0.087133,-0.086666,-0.155381,0.083111,0.093994,0.040758
3,,Solvent control,CS(=O)C,7.59,U2OS,Negative,/incoming/BR00110363/,1,,https://www.ebi.ac.uk/unichem/rest/inchikey/IA...,...,0.020103,0.015091,-0.013667,0.010676,-0.00671,0.003637,-0.018047,-0.015474,-0.017643,0.011583
4,,Solvent control,CS(=O)C,5.28,U2OS,Negative,/incoming/BR00110363/,1,,https://www.ebi.ac.uk/unichem/rest/inchikey/IA...,...,0.000525,-0.026855,-0.023019,-0.058229,-0.06446,-0.046734,-0.071742,-0.03314,-0.04065,-0.019959


## Exploring the data set

Below is a  exploration of the selected features dataset. The aim is to identify treatments, extract metadata, and gain a understanding of the experiment's design.

Below demonstrates the amount of wells does each treatment have. 

In [4]:
# displying the amount of wells per treatments
well_treatments_counts_df = (
    fs_profile_df["Compound Name"].value_counts().to_frame().reset_index()
)

well_treatments_counts_df

Unnamed: 0,Compound Name,count
0,DMSO,9855
1,Wortmannin,600
2,Colchicine,512
3,Nocodazole,504
4,Radicicol,504
...,...,...
139,Carmustine,24
140,Thio-TEPA,24
141,Chlorambucil,24
142,Ebselen oxide,24


Below we show the amount of wells does a specific cell celluar injury has

In [5]:
# Displaying how many how wells does each cell injury have
cell_injury_well_counts = (
    fs_profile_df["injury_type"].value_counts().to_frame().reset_index()
)
cell_injury_well_counts

Unnamed: 0,injury_type,count
0,Control,9855
1,Cytoskeletal,1472
2,Miscellaneous,1302
3,Kinase,1104
4,Genotoxin,944
5,Hsp90,552
6,Redox,312
7,Saponin,288
8,HDAC,168
9,Proteasome,144


Next we wanted to extract some metadata regarding how many compound and wells are treated with a given compounds

This will be saved in the `results/0.data_splits` directory

In [6]:
meta_injury = []
for injury_type, df in fs_profile_df.groupby("injury_type"):
    # extract n_wells, n_compounds and unique compounds per injury_type
    n_wells = df.shape[0]
    unique_compounds = list(df["Compound Name"].unique())
    n_compounds = len(unique_compounds)

    # store information
    meta_injury.append([injury_type, n_wells, n_compounds, unique_compounds])

injury_meta_df = pd.DataFrame(
    meta_injury, columns=["injury_type", "n_wells", "n_compounds", "compound_list"]
).sort_values("n_wells", ascending=False)
injury_meta_df.to_csv(data_split_dir / "injury_well_counts_table.csv", index=False)

# display
print("shape:", injury_meta_df.shape)
injury_meta_df

shape: (15, 4)


Unnamed: 0,injury_type,n_wells,n_compounds,compound_list
0,Control,9855,1,[DMSO]
1,Cytoskeletal,1472,15,"[Nocodazole, Colchicine, Paclitaxel, Vinblasti..."
7,Miscellaneous,1302,39,"[L-Buthionine-(S,R)-sulfoximine, CDDO Im, Cino..."
6,Kinase,1104,13,"[Wortmannin, Staurosporine, PI-103, BEZ-235, A..."
3,Genotoxin,944,22,"[Camptothecin, CX-5461, Doxorubicin, Cladribin..."
5,Hsp90,552,3,"[Radicicol, Geldanamycin, 17-AAG]"
11,Redox,312,12,"[Menadione, PKF118-310, 4-Amino-1-naphthol (HC..."
12,Saponin,288,11,"[Digitonin, Saikosaponin A, Polygalasaponin F,..."
4,HDAC,168,5,"[AR-42, SAHA, ITF 2357, Panobinostat, Apicidin]"
8,Mitochondria,144,4,"[Antimycin A, CCCP, Rotenone, Oligomycin A]"


> Barchart showing the number of wells that are labeled with a given injury

Next, we construct the profile metadata. This provides a structured overview of how the treatments assicoated with injuries were applied, detailing the treatments administered to each plate.

This will be saved in the `results/0.data_splits` directory

In [7]:
injury_meta_dict = {}
for injury, df in fs_profile_df.groupby("injury_type"):
    # collecting treatment metadata
    plates = df["Plate"].unique().tolist()
    treatment_meta = {}
    treatment_meta["n_plates"] = len(plates)
    treatment_meta["n_wells"] = df.shape[0]
    treatment_meta["n_treatments"] = len(df["Compound Name"].unique())
    treatment_meta["associated_plates"] = plates

    # counting treatments
    treatment_counter = {}
    for treatment, df2 in df.groupby("Compound Name"):
        if treatment is np.nan:
            continue
        n_treatments = df2.shape[0]
        treatment_counter[treatment] = n_treatments

    # storing treatment counts
    treatment_meta["treatments"] = treatment_counter
    injury_meta_dict[injury] = treatment_meta

# save dictionary into a json file
with open(data_split_dir / "injury_metadata.json", mode="w") as stream:
    json.dump(injury_meta_dict, stream)

Here we build a plate metadata infromations where we look at the type of treatments and amount of wells with the treatment that are present in the dataset

This will be saved in `results/0.data_splits`

In [8]:
plate_meta = {}
for plate_id, df in fs_profile_df.groupby("Plate"):
    unique_compounds = list(df["Compound Name"].unique())
    n_treatments = len(unique_compounds)

    # counting treatments
    treatment_counter = {}
    for treatment, df2 in df.groupby("Compound Name"):
        n_treatments = df2.shape[0]
        treatment_counter[treatment] = n_treatments

    plate_meta[plate_id] = treatment_counter

# save dictionary into a json file
with open(data_split_dir / "cell_injury_plate_info.json", mode="w") as stream:
    json.dump(plate_meta, stream)

Set numerical labels for the treatment

In [9]:
# creating a dictionary that contains the numeric-encoded labels and write out as json file
main_labeler = {}
injury_labels_encoder = {
    name: idx for idx, name in enumerate(fs_profile_df["injury_type"].unique().tolist())
}
injury_labels_decoder = {
    idx: name for idx, name in enumerate(fs_profile_df["injury_type"].unique().tolist())
}
main_labeler["encoder"] = injury_labels_encoder
main_labeler["decoder"] = injury_labels_decoder

# write out as json file
with open(data_split_dir / "injury_codes.json", mode="w") as file_buffer:
    json.dump(main_labeler, file_buffer)

# display main_labeler
main_labeler

{'encoder': {'Control': 0,
  'Cytoskeletal': 1,
  'Hsp90': 2,
  'Kinase': 3,
  'Genotoxin': 4,
  'Miscellaneous': 5,
  'Redox': 6,
  'HDAC': 7,
  'mTOR': 8,
  'Proteasome': 9,
  'Saponin': 10,
  'Mitochondria': 11,
  'Ferroptosis': 12,
  'Tannin': 13,
  'Nonspecific reactive': 14},
 'decoder': {0: 'Control',
  1: 'Cytoskeletal',
  2: 'Hsp90',
  3: 'Kinase',
  4: 'Genotoxin',
  5: 'Miscellaneous',
  6: 'Redox',
  7: 'HDAC',
  8: 'mTOR',
  9: 'Proteasome',
  10: 'Saponin',
  11: 'Mitochondria',
  12: 'Ferroptosis',
  13: 'Tannin',
  14: 'Nonspecific reactive'}}

Now that we have assigned numerical labels to each type of cell injury, we can replace the corresponding injury names with these numerical values to meet the requirements of machine learning algorithms.

In [10]:
# updating main dataframe with numerical labels that represents cell injury
# this will be saved as an "injury_code"
injury_code = fs_profile_df["injury_type"].apply(
    lambda injury: injury_labels_encoder[injury]
)

# add the injury code into the main data set
fs_profile_df.insert(0, "injury_code", injury_code)

# # display new injury column
print(fs_profile_df["injury_type"].unique())
print(fs_profile_df["injury_code"].unique())

['Control' 'Cytoskeletal' 'Hsp90' 'Kinase' 'Genotoxin' 'Miscellaneous'
 'Redox' 'HDAC' 'mTOR' 'Proteasome' 'Saponin' 'Mitochondria' 'Ferroptosis'
 'Tannin' 'Nonspecific reactive']
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]


## Data Splitting 
---

### Holdout Dataset

Here we collected out holdout dataset. The holdout dataset is a subset of the dataset that is not used during model training or tuning. Instead, it is reserved solely for evaluating the model's performance after it has been trained.

In this notebook, we will include three different types of held-out datasets before proceeding with our machine learning training and evaluation.
 - Plate hold out 
 - treatment hold out 
 - well hold out 

Each of these held outdata will be stored in the `results/1.data_splits` directory 


### Plate Holdout

Plates are randomly selected based on their Plate ID and save them as our `plate_holdout` data.

In [11]:
# plate
seed = 0
n_plates = 10

# setting random seed globally
np.random.seed(seed)

# selecting plates randomly from a list
selected_plates = (
    np.random.choice(fs_profile_df["Plate"].unique().tolist(), (n_plates, 1))
    .flatten()
    .tolist()
)
plate_holdout_df = fs_profile_df.loc[fs_profile_df["Plate"].isin(selected_plates)]

# take the indices of the held out data frame and use it to drop those samples from
# the main dataset. And then check if those indices are dropped
plate_idx_to_drop = plate_holdout_df.index.tolist()
fs_profile_df = fs_profile_df.drop(plate_idx_to_drop)
assert all(
    [
        True if num not in fs_profile_df.index.tolist() else False
        for num in plate_idx_to_drop
    ]
), "index to be dropped found in the main dataframe"

# saving the holdout data
plate_holdout_df.to_csv(
    data_split_dir / "plate_holdout.csv.gz", index=False, compression="gzip"
)

# display
print("plate holdout shape:", plate_holdout_df.shape)
plate_holdout_df.head()

plate holdout shape: (1948, 240)


Unnamed: 0,injury_code,Compound BRD (short),Compound Class,Compound SMILES,Mahalanobis distance,Characteristics [Cell Line],Control Type,Comment [Image File Path],Experimental Condition [Experimental Batch],Compound BRD,...,Nuclei_RadialDistribution_RadialCV_ER_2of4,Nuclei_RadialDistribution_RadialCV_ER_3of4,Nuclei_RadialDistribution_RadialCV_ER_4of4,Nuclei_RadialDistribution_RadialCV_Mito_1of4,Nuclei_RadialDistribution_RadialCV_Mito_2of4,Nuclei_RadialDistribution_RadialCV_Mito_3of4,Nuclei_RadialDistribution_RadialCV_Mito_4of4,Nuclei_RadialDistribution_RadialCV_RNA_1of4,Nuclei_RadialDistribution_RadialCV_RNA_2of4,Nuclei_RadialDistribution_RadialCV_RNA_3of4
1044,0,,Solvent control,CS(=O)C,8.05,U2OS,Negative,/incoming/BR00110368/,1,,...,0.004947,0.01087,-0.012711,-0.026305,0.005659,0.020612,0.02298,-0.017959,0.019123,0.027852
1045,0,,Solvent control,CS(=O)C,5.96,U2OS,Negative,/incoming/BR00110368/,1,,...,0.050258,0.077955,0.073336,-0.027207,0.001036,0.020167,-0.005732,0.135607,0.181091,0.097072
1046,0,,Solvent control,CS(=O)C,4.49,U2OS,Negative,/incoming/BR00110368/,1,,...,0.017767,0.036134,0.033406,-0.02304,4.2e-05,0.028691,-0.023482,-0.047025,0.042399,0.082567
1047,0,,Solvent control,CS(=O)C,6.66,U2OS,Negative,/incoming/BR00110368/,1,,...,-0.016906,0.002204,-0.009299,-0.019047,-0.005583,-0.028728,-0.013437,-0.007416,-0.01005,0.018247
1048,0,,Solvent control,CS(=O)C,3.81,U2OS,Negative,/incoming/BR00110368/,1,,...,-0.039018,-0.015167,-0.02162,-0.091378,-0.071885,-0.03067,-0.06563,0.054054,0.026117,0.001689


### Treatment holdout

To establish our treatment holdout, we first need to find the number of treatments and wells associated with a specific cell injury, considering the removal of randomly selected plates from the previous step.

To determine which cell injuries should be considered for a single treatment holdout, we establish a threshold of 10 unique compounds. This means that a cell injury type must have at least 10 unique compounds to qualify for selection in the treatment holdout. Any cell injury types failing to meet this criterion will be disregarded.

Once the cell injuries are identified for treatment holdout, we select our holdout treatment by grouping each injury type and choosing the treatment with the fewest wells. This becomes our treatment holdout dataset.

In [12]:
injury_treatment_metadata = (
    fs_profile_df.groupby(["injury_type", "Compound Name"])
    .size()
    .reset_index(name="n_wells")
)
injury_treatment_metadata

Unnamed: 0,injury_type,Compound Name,n_wells
0,Control,DMSO,8783
1,Cytoskeletal,ARQ 621,12
2,Cytoskeletal,Citreoviridin,18
3,Cytoskeletal,Citrinin,18
4,Cytoskeletal,Colchicine,457
...,...,...,...
139,Tannin,Corilagin,18
140,Tannin,Gallotannin,24
141,Tannin,Punicalagin,18
142,mTOR,Rapamycin,42


In [13]:
# setting random seed
min_treatments_per_injury = 10

# Filter out the injury types for which we can select a complete treatment.
# We are using a threshold of 10. If an injury type is associated with fewer than 10 compounds,
# we do not conduct treatment holdout on those injury types.
accepted_injuries = []
for injury_type, df in injury_treatment_metadata.groupby("injury_type"):
    n_treatments = df.shape[0]
    if n_treatments >= min_treatments_per_injury:
        accepted_injuries.append(df)

accepted_injuries = pd.concat(accepted_injuries)

# Next, we select the treatment that will be held out within each injury type.
# We group treatments based on injury type and choose the treatment with the fewest wells
# as our holdout.
selected_treatments_to_holdout = []
for injury_type, df in accepted_injuries.groupby("injury_type"):
    held_treatment = df.min().iloc[1]
    selected_treatments_to_holdout.append([injury_type, held_treatment])

# convert to dataframe
selected_treatments_to_holdout = pd.DataFrame(
    selected_treatments_to_holdout, columns="injury_type held_treatment".split()
)

print("Below are the accepted cell injuries and treatments to be held out")
selected_treatments_to_holdout

Below are the accepted cell injuries and treatments to be held out


Unnamed: 0,injury_type,held_treatment
0,Cytoskeletal,ARQ 621
1,Genotoxin,Aphidicolin
2,Kinase,AZD 1152-HQPA
3,Miscellaneous,Aloisine RP106
4,Redox,4-Amino-1-naphthol (HCl)
5,Saponin,Bacopasaponin C


In [14]:
# select all wells that have the treatments to be heldout
treatment_holdout_df = fs_profile_df.loc[
    fs_profile_df["Compound Name"].isin(
        selected_treatments_to_holdout["held_treatment"]
    )
]

# take the indices of the held out data frame and use it to drop those samples from
# the main dataset. And then check if those indices are dropped
treatment_idx_to_drop = treatment_holdout_df.index.tolist()
fs_profile_df = fs_profile_df.drop(treatment_idx_to_drop)
assert all(
    [
        True if num not in fs_profile_df.index.tolist() else False
        for num in treatment_idx_to_drop
    ]
), "index to be dropped found in the main dataframe"
# saving the holdout data
treatment_holdout_df.to_csv(
    data_split_dir / "treatment_holdout.csv.gz", index=False, compression="gzip"
)

# display
print("Treatment holdout shape:", treatment_holdout_df.shape)
treatment_holdout_df.head()

Treatment holdout shape: (126, 240)


Unnamed: 0,injury_code,Compound BRD (short),Compound Class,Compound SMILES,Mahalanobis distance,Characteristics [Cell Line],Control Type,Comment [Image File Path],Experimental Condition [Experimental Batch],Compound BRD,...,Nuclei_RadialDistribution_RadialCV_ER_2of4,Nuclei_RadialDistribution_RadialCV_ER_3of4,Nuclei_RadialDistribution_RadialCV_ER_4of4,Nuclei_RadialDistribution_RadialCV_Mito_1of4,Nuclei_RadialDistribution_RadialCV_Mito_2of4,Nuclei_RadialDistribution_RadialCV_Mito_3of4,Nuclei_RadialDistribution_RadialCV_Mito_4of4,Nuclei_RadialDistribution_RadialCV_RNA_1of4,Nuclei_RadialDistribution_RadialCV_RNA_2of4,Nuclei_RadialDistribution_RadialCV_RNA_3of4
10865,1,BRD-K43436647,Cytoskeletal,C#CC[C@H](C1=NC2=C(C=CC(=C2)Cl)C(=O)N1NC3=CC=C...,26.08,U2OS,,/incoming/BR00114093/,2,BRD-K43436647-001-03-9,...,0.781917,0.939894,1.211027,0.303891,0.47888,1.285548,0.704287,-0.691839,-0.545945,0.714014
10866,1,BRD-K43436647,Cytoskeletal,C#CC[C@H](C1=NC2=C(C=CC(=C2)Cl)C(=O)N1NC3=CC=C...,12.94,U2OS,,/incoming/BR00114093/,2,BRD-K43436647-001-03-9,...,0.552652,0.663508,0.415782,0.736578,0.725775,0.935914,0.512768,-0.199308,0.022528,0.264811
10867,1,BRD-K43436647,Cytoskeletal,C#CC[C@H](C1=NC2=C(C=CC(=C2)Cl)C(=O)N1NC3=CC=C...,10.33,U2OS,,/incoming/BR00114093/,2,BRD-K43436647-001-03-9,...,0.322742,0.317817,0.258732,0.384653,0.391773,0.408019,0.204817,-0.07979,0.088065,0.188274
10868,1,BRD-K43436647,Cytoskeletal,C#CC[C@H](C1=NC2=C(C=CC(=C2)Cl)C(=O)N1NC3=CC=C...,4.15,U2OS,,/incoming/BR00114093/,2,BRD-K43436647-001-03-9,...,0.034607,0.131182,0.158356,0.090093,0.116435,0.120137,0.183495,-0.027594,-0.00477,0.034306
10869,1,BRD-K43436647,Cytoskeletal,C#CC[C@H](C1=NC2=C(C=CC(=C2)Cl)C(=O)N1NC3=CC=C...,4.61,U2OS,,/incoming/BR00114093/,2,BRD-K43436647-001-03-9,...,0.039577,0.081793,0.077952,0.08591,0.078307,0.142999,0.177868,0.046739,0.066569,0.108963


### Well holdout 

To generate the well hold out data, each plate was iterated and random wells were selected. However, an additional step was condcuting which was to seperate the control wells and the treated wells, due to the large label imbalance with the controls. Therefore, 5 wells were randomly selected and 10 wells were randomly selected from each individual plate

In [15]:
# parameters
seed = 0
n_controls = 5
n_samples = 10

# setting random seed globally
np.random.seed(seed)

# collecting randomly select wells based on treatment
wells_heldout_df = []
for treatment, df in fs_profile_df.groupby("Plate", as_index=False):
    # separate control wells and rest of all wells since there is a huge label imbalance
    # selected 5 control wells and 10 random wells from the plate
    df_control = df.loc[df["Compound Name"] == "DMSO"].sample(
        n=n_controls, random_state=seed
    )
    df_treated = df.loc[df["Compound Name"] != "DMSO"].sample(
        n=n_samples, random_state=seed
    )

    # concatenate those together
    well_heldout = pd.concat([df_control, df_treated])

    wells_heldout_df.append(well_heldout)

# genearte treatment holdout dataframe
wells_heldout_df = pd.concat(wells_heldout_df)

# take the indices of the held out data frame and use it to drop those samples from
# the main dataset. And then check if those indices are dropped
wells_idx_to_drop = wells_heldout_df.index.tolist()
fs_profile_df = fs_profile_df.drop(wells_idx_to_drop)
assert all(
    [
        True if num not in fs_profile_df.index.tolist() else False
        for num in treatment_idx_to_drop
    ]
), "index to be dropped found in the main dataframe"

# saving the holdout data
wells_heldout_df.to_csv(
    data_split_dir / "wells_holdout.csv.gz", index=False, compression="gzip"
)

# display
print("Wells holdout shape:", wells_heldout_df.shape)
wells_heldout_df.head()

Wells holdout shape: (1125, 240)


Unnamed: 0,injury_code,Compound BRD (short),Compound Class,Compound SMILES,Mahalanobis distance,Characteristics [Cell Line],Control Type,Comment [Image File Path],Experimental Condition [Experimental Batch],Compound BRD,...,Nuclei_RadialDistribution_RadialCV_ER_2of4,Nuclei_RadialDistribution_RadialCV_ER_3of4,Nuclei_RadialDistribution_RadialCV_ER_4of4,Nuclei_RadialDistribution_RadialCV_Mito_1of4,Nuclei_RadialDistribution_RadialCV_Mito_2of4,Nuclei_RadialDistribution_RadialCV_Mito_3of4,Nuclei_RadialDistribution_RadialCV_Mito_4of4,Nuclei_RadialDistribution_RadialCV_RNA_1of4,Nuclei_RadialDistribution_RadialCV_RNA_2of4,Nuclei_RadialDistribution_RadialCV_RNA_3of4
4994,0,,Solvent control,CS(=O)C,2.02,U2OS,Negative,/incoming/BR00109990/,1,,...,0.046747,0.11275,0.106888,-0.010461,0.01002,0.064024,0.064756,0.052397,0.07559,0.10408
5058,0,,Solvent control,CS(=O)C,0.97,U2OS,Negative,/incoming/BR00109990/,1,,...,0.160839,0.080362,0.09143,0.087356,0.084203,0.050642,0.078312,0.093513,0.171366,0.061648
5050,0,,Solvent control,CS(=O)C,0.58,U2OS,Negative,/incoming/BR00109990/,1,,...,0.141853,0.014751,0.015401,0.069577,0.05588,0.028956,-0.052656,0.099781,0.084154,0.07687
5035,0,,Solvent control,CS(=O)C,0.82,U2OS,Negative,/incoming/BR00109990/,1,,...,-0.007644,-0.003606,0.004558,-0.010846,-0.006748,0.018612,0.00824,0.051558,0.035722,0.014961
4991,0,,Solvent control,CS(=O)C,2.26,U2OS,Negative,/incoming/BR00109990/,1,,...,-0.095556,-0.047512,-0.035205,-0.094716,-0.077216,-0.044001,-0.061641,-0.041822,-0.047356,0.016481


## Saving training dataset

Once the data holdout has been generated, the next step is to save the training dataset that will serve as the basis for training the multi-class logistic regression model.

In [16]:
# Showing the amount of data we have after removing the holdout data
meta_injury = []
for injury_type, df in fs_profile_df.groupby("injury_type"):
    # extract n_wells, n_compounds and unique compounds per injury_type
    n_wells = df.shape[0]
    injury_code = df["injury_code"].unique()[0]
    unique_compounds = list(df["Compound Name"].unique())
    n_compounds = len(unique_compounds)

    # store information
    meta_injury.append(
        [injury_type, injury_code, n_wells, n_compounds, unique_compounds]
    )

# creating data frame
injury_meta_df = pd.DataFrame(
    meta_injury,
    columns=["injury_type", "injury_code", "n_wells", "n_compounds", "compound_list"],
).sort_values("n_wells", ascending=False)

# display
injury_meta_df

Unnamed: 0,injury_type,injury_code,n_wells,n_compounds,compound_list
0,Control,0,8408,1,[DMSO]
1,Cytoskeletal,1,1102,14,"[Nocodazole, Colchicine, Paclitaxel, Vinblasti..."
7,Miscellaneous,5,1006,38,"[L-Buthionine-(S,R)-sulfoximine, CDDO Im, Cino..."
6,Kinase,3,750,12,"[Wortmannin, Staurosporine, PI-103, BEZ-235, S..."
3,Genotoxin,4,738,21,"[Camptothecin, CX-5461, Doxorubicin, Cladribin..."
5,Hsp90,2,418,3,"[Radicicol, Geldanamycin, 17-AAG]"
11,Redox,6,215,11,"[Menadione, PKF118-310, Dunnione, MGR2, SIN-1 ..."
12,Saponin,10,163,10,"[Digitonin, Saikosaponin A, Polygalasaponin F,..."
4,HDAC,7,138,5,"[AR-42, SAHA, ITF 2357, Panobinostat, Apicidin]"
10,Proteasome,9,117,4,"[Carfilzomib, Bortezomib, (S)-MG132, (R)-MG132]"


In [17]:
# shape of the update training and testing dataset after removing holdout
print("training shape after removing holdouts", fs_profile_df.shape)
fs_profile_df.head()

training shape after removing holdouts (13502, 240)


Unnamed: 0,injury_code,Compound BRD (short),Compound Class,Compound SMILES,Mahalanobis distance,Characteristics [Cell Line],Control Type,Comment [Image File Path],Experimental Condition [Experimental Batch],Compound BRD,...,Nuclei_RadialDistribution_RadialCV_ER_2of4,Nuclei_RadialDistribution_RadialCV_ER_3of4,Nuclei_RadialDistribution_RadialCV_ER_4of4,Nuclei_RadialDistribution_RadialCV_Mito_1of4,Nuclei_RadialDistribution_RadialCV_Mito_2of4,Nuclei_RadialDistribution_RadialCV_Mito_3of4,Nuclei_RadialDistribution_RadialCV_Mito_4of4,Nuclei_RadialDistribution_RadialCV_RNA_1of4,Nuclei_RadialDistribution_RadialCV_RNA_2of4,Nuclei_RadialDistribution_RadialCV_RNA_3of4
0,0,,Solvent control,CS(=O)C,7.51,U2OS,Negative,/incoming/BR00110363/,1,,...,0.061971,0.024875,-0.014376,0.082153,0.049476,0.013555,0.041612,0.04222,0.12682,0.077685
1,0,,Solvent control,CS(=O)C,6.21,U2OS,Negative,/incoming/BR00110363/,1,,...,0.041521,-0.000281,-0.007509,0.01636,0.033403,-0.01525,0.002146,0.076559,0.076925,0.051081
2,0,,Solvent control,CS(=O)C,10.94,U2OS,Negative,/incoming/BR00110363/,1,,...,0.039369,0.000791,-0.031197,-0.065423,-0.087133,-0.086666,-0.155381,0.083111,0.093994,0.040758
3,0,,Solvent control,CS(=O)C,7.59,U2OS,Negative,/incoming/BR00110363/,1,,...,0.020103,0.015091,-0.013667,0.010676,-0.00671,0.003637,-0.018047,-0.015474,-0.017643,0.011583
4,0,,Solvent control,CS(=O)C,5.28,U2OS,Negative,/incoming/BR00110363/,1,,...,0.000525,-0.026855,-0.023019,-0.058229,-0.06446,-0.046734,-0.071742,-0.03314,-0.04065,-0.019959


In [18]:
# split the data into trianing and testing sets
meta_cols, feat_cols = split_meta_and_features(fs_profile_df)
X = fs_profile_df[feat_cols]
y = fs_profile_df["injury_code"]

# spliting dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.80, random_state=seed, stratify=y
)

# saving training dataset as csv file
X_train.to_csv(data_split_dir / "X_train.csv.gz", compression="gzip", index=False)
X_test.to_csv(data_split_dir / "X_test.csv.gz", compression="gzip", index=False)
y_train.to_csv(data_split_dir / "y_train.csv.gz", compression="gzip", index=False)
y_test.to_csv(data_split_dir / "y_test.csv.gz", compression="gzip", index=False)

# display data split sizes
print("X training size", X_train.shape)
print("X testing size", X_test.shape)
print("y training size", y_train.shape)
print("y testing size", y_test.shape)

X training size (10801, 207)
X testing size (2701, 207)
y training size (10801,)
y testing size (2701,)


In [19]:
# save metadata after holdout
cell_injury_metadata = fs_profile_df[fs_meta]
cell_injury_metadata.to_csv(
    data_split_dir / "cell_injury_metadata_after_holdout.csv.gz",
    compression="gzip",
    index=False,
)

# display
print("Metadata shape", cell_injury_metadata.shape)
cell_injury_metadata.head()

Metadata shape (13502, 32)


Unnamed: 0,Compound BRD (short),Compound Class,Compound SMILES,Mahalanobis distance,Characteristics [Cell Line],Control Type,Comment [Image File Path],Experimental Condition [Experimental Batch],Compound BRD,Compound Unichem URL,...,Term Source 1 REF,Compound Name,Compound IUPAC,Term Source 1 Accession,Mahalanobis distance significant,Comment [Image Prefix],Characteristics [Organism],Compound Manuscript Number,Compound InChIKey,Compound PubChem URL
0,,Solvent control,CS(=O)C,7.51,U2OS,Negative,/incoming/BR00110363/,1,,https://www.ebi.ac.uk/unichem/rest/inchikey/IA...,...,NCBITaxon,DMSO,methylsulfinylmethane,NCBITaxon_9606,No,r02c02,Homo sapiens,502,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,https://pubchem.ncbi.nlm.nih.gov/compound/679
1,,Solvent control,CS(=O)C,6.21,U2OS,Negative,/incoming/BR00110363/,1,,https://www.ebi.ac.uk/unichem/rest/inchikey/IA...,...,NCBITaxon,DMSO,methylsulfinylmethane,NCBITaxon_9606,No,r02c03,Homo sapiens,502,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,https://pubchem.ncbi.nlm.nih.gov/compound/679
2,,Solvent control,CS(=O)C,10.94,U2OS,Negative,/incoming/BR00110363/,1,,https://www.ebi.ac.uk/unichem/rest/inchikey/IA...,...,NCBITaxon,DMSO,methylsulfinylmethane,NCBITaxon_9606,No,r02c04,Homo sapiens,502,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,https://pubchem.ncbi.nlm.nih.gov/compound/679
3,,Solvent control,CS(=O)C,7.59,U2OS,Negative,/incoming/BR00110363/,1,,https://www.ebi.ac.uk/unichem/rest/inchikey/IA...,...,NCBITaxon,DMSO,methylsulfinylmethane,NCBITaxon_9606,No,r02c05,Homo sapiens,502,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,https://pubchem.ncbi.nlm.nih.gov/compound/679
4,,Solvent control,CS(=O)C,5.28,U2OS,Negative,/incoming/BR00110363/,1,,https://www.ebi.ac.uk/unichem/rest/inchikey/IA...,...,NCBITaxon,DMSO,methylsulfinylmethane,NCBITaxon_9606,No,r02c06,Homo sapiens,502,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,https://pubchem.ncbi.nlm.nih.gov/compound/679
