# Spliting Data
Here, we utilize the feature-selected profiles generated in the preceding module notebook [here](../0.freature_selection/), focusing on dividing the data into training, testing, and holdout sets for machine learning training.

In [1]:
import json
import pathlib
import warnings

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# ignoring warnings
warnings.catch_warnings(action="ignore")



## Paramters

Below are the parameters defined that are used in this notebook

----

In [2]:
# directory to get all the inputs for this notebook
data_dir = pathlib.Path("../../data").resolve(strict=True)
results_dir = pathlib.Path("../../results").resolve(strict=True)
fs_dir = (results_dir / "0.feature_selection").resolve(strict=True)

# directory to store all the output of this notebook

data_split_dir = (results_dir / "1.data_splits").resolve()
data_split_dir.mkdir(exist_ok=True)

In [3]:
# data paths
fs_profile_path = (fs_dir / "cell_injury_profile_fs.csv.gz").resolve(strict=True)

# load data
fs_profile_df = pd.read_csv(fs_profile_path)

# display
print("fs profile with control: ", fs_profile_df.shape)
fs_profile_df.head()

fs profile with control:  (16701, 378)


  fs_profile_df = pd.read_csv(fs_profile_path)


Unnamed: 0,Compound BRD (short),Mahalanobis distance significant,Channels,Compound SMILES,Compound PubChem URL,Compound IUPAC,Compound PubChem CID,Characteristics [Cell Line],Comment [Image File Path],Well,...,Nuclei_Texture_InverseDifferenceMoment_DNA_20_0,Nuclei_Texture_InverseDifferenceMoment_DNA_5_0,Nuclei_Texture_InverseDifferenceMoment_RNA_5_0,Nuclei_Texture_SumAverage_AGP_5_0,Nuclei_Texture_SumAverage_DNA_10_0,Nuclei_Texture_SumAverage_Mito_5_0,Nuclei_Texture_SumAverage_RNA_5_0,Nuclei_Texture_SumEntropy_DNA_10_0,Nuclei_Texture_SumEntropy_DNA_20_0,Nuclei_Texture_SumVariance_DNA_20_0
0,,No,"Ch1 (blue): Nuclei, Ch2 (green): ER, Ch3 (yell...",CS(=O)C,https://pubchem.ncbi.nlm.nih.gov/compound/679,methylsulfinylmethane,679.0,U2OS,/incoming/BR00110363/,B2,...,-0.011258,9.8e-05,0.057244,0.160847,-0.083034,-0.02329,-0.066369,-0.015235,-0.035909,-0.032067
1,,No,"Ch1 (blue): Nuclei, Ch2 (green): ER, Ch3 (yell...",CS(=O)C,https://pubchem.ncbi.nlm.nih.gov/compound/679,methylsulfinylmethane,679.0,U2OS,/incoming/BR00110363/,B3,...,0.064689,0.025857,0.099848,0.017477,0.0213,0.058137,-0.09728,-0.073545,-0.044883,-0.01524
2,,No,"Ch1 (blue): Nuclei, Ch2 (green): ER, Ch3 (yell...",CS(=O)C,https://pubchem.ncbi.nlm.nih.gov/compound/679,methylsulfinylmethane,679.0,U2OS,/incoming/BR00110363/,B4,...,0.020937,0.04106,0.119247,0.111741,0.041592,0.224199,-0.088845,0.000327,-0.003115,-0.014406
3,,No,"Ch1 (blue): Nuclei, Ch2 (green): ER, Ch3 (yell...",CS(=O)C,https://pubchem.ncbi.nlm.nih.gov/compound/679,methylsulfinylmethane,679.0,U2OS,/incoming/BR00110363/,B5,...,0.006589,0.022156,0.036473,-0.013141,0.00869,0.06086,0.044924,0.040528,0.070877,0.072871
4,,No,"Ch1 (blue): Nuclei, Ch2 (green): ER, Ch3 (yell...",CS(=O)C,https://pubchem.ncbi.nlm.nih.gov/compound/679,methylsulfinylmethane,679.0,U2OS,/incoming/BR00110363/,B6,...,-0.028361,0.007213,0.023068,0.110361,0.054405,0.030157,0.06648,0.03891,0.048559,0.056829


## Exploring the data set

Below is a  exploration of the selected features dataset. The aim is to identify treatments, extract metadata, and gain a understanding of the experiment's design.

Below demonstrates the amount of wells does each treatment have. 

In [4]:
# displying the amount of wells per treatments
well_treatments_counts_df = (
    fs_profile_df["Compound Name"].value_counts().to_frame().reset_index()
)
well_treatments_counts_df

Unnamed: 0,Compound Name,count
0,DMSO,9855
1,Wortmannin,600
2,Colchicine,512
3,Nocodazole,504
4,Radicicol,504
...,...,...
139,Carmustine,24
140,Thio-TEPA,24
141,Chlorambucil,24
142,Ebselen oxide,24


Below we show the amount of wells does a specific cell celluar injury has

In [5]:
# Displaying how many how wells does each cell injury have
cell_injury_well_counts = (
    fs_profile_df["injury_type"].value_counts().to_frame().reset_index()
)
cell_injury_well_counts

Unnamed: 0,injury_type,count
0,Control,9855
1,Cytoskeletal,1472
2,Miscellaneous,1302
3,Kinase,1104
4,Genotoxin,944
5,Hsp90,552
6,Redox,312
7,Saponin,288
8,HDAC,168
9,Proteasome,144


Here, we're storing the metadata and feature column names into a JSON file to simplify loading during feature engineering processes.

This will be saved in the `results/0.data_splits` directory

In [6]:
# collecting metadata and feature column names
feature_cols = fs_profile_df.columns[32:].tolist()
raw_features = {
    "compartments": list(set([name.split("_")[0] for name in feature_cols])),
    "meta_features": fs_profile_df.columns[:32].tolist(),
    "feature_cols": feature_cols,
}

# saving into JSON file
with open(data_split_dir / "raw_feature_names.json", mode="w") as stream:
    json.dump(raw_features, stream)

Next we wanted to extract some metadata regarding how many compound and wells are treated with a given compounds

This will be saved in the `results/0.data_splits` directory

In [7]:
meta_injury = []
for injury_type, df in fs_profile_df.groupby("injury_type"):
    # extract n_wells, n_compounds and unique compounds per injury_type
    n_wells = df.shape[0]
    unique_compounds = list(df["Compound Name"].unique())
    n_compounds = len(unique_compounds)

    # store information
    meta_injury.append([injury_type, n_wells, n_compounds, unique_compounds])

injury_meta_df = pd.DataFrame(
    meta_injury, columns=["injury_type", "n_wells", "n_compounds", "compound_list"]
).sort_values("n_wells", ascending=False)
injury_meta_df.to_csv(data_split_dir / "injury_well_counts_table.csv", index=False)

# display
print("shape:", injury_meta_df.shape)
injury_meta_df

shape: (15, 4)


Unnamed: 0,injury_type,n_wells,n_compounds,compound_list
0,Control,9855,1,[DMSO]
1,Cytoskeletal,1472,15,"[Nocodazole, Colchicine, Paclitaxel, Vinblasti..."
7,Miscellaneous,1302,39,"[L-Buthionine-(S,R)-sulfoximine, CDDO Im, Cino..."
6,Kinase,1104,13,"[Wortmannin, Staurosporine, PI-103, BEZ-235, A..."
3,Genotoxin,944,22,"[Camptothecin, CX-5461, Doxorubicin, Cladribin..."
5,Hsp90,552,3,"[Radicicol, Geldanamycin, 17-AAG]"
11,Redox,312,12,"[Menadione, PKF118-310, 4-Amino-1-naphthol (HC..."
12,Saponin,288,11,"[Digitonin, Saikosaponin A, Polygalasaponin F,..."
4,HDAC,168,5,"[AR-42, SAHA, ITF 2357, Panobinostat, Apicidin]"
8,Mitochondria,144,4,"[Antimycin A, CCCP, Rotenone, Oligomycin A]"


> Barchart showing the number of wells that are labeled with a given injury

Next, we construct the profile metadata. This provides a structured overview of how the treatments assicoated with injuries were applied, detailing the treatments administered to each plate.

This will be saved in the `results/0.data_splits` directory

In [8]:
injury_meta_dict = {}
for injury, df in fs_profile_df.groupby("injury_type"):
    # collecting treatment metadata
    plates = df["Plate"].unique().tolist()
    treatment_meta = {}
    treatment_meta["n_plates"] = len(plates)
    treatment_meta["n_wells"] = df.shape[0]
    treatment_meta["n_treatments"] = len(df["Compound Name"].unique())
    treatment_meta["associated_plates"] = plates

    # counting treatments
    treatment_counter = {}
    for treatment, df2 in df.groupby("Compound Name"):
        if treatment is np.nan:
            continue
        n_treatments = df2.shape[0]
        treatment_counter[treatment] = n_treatments

    # storing treatment counts
    treatment_meta["treatments"] = treatment_counter
    injury_meta_dict[injury] = treatment_meta

# save dictionary into a json file
with open(data_split_dir / "injury_metadata.json", mode="w") as stream:
    json.dump(injury_meta_dict, stream)

Here we build a plate metadata infromations where we look at the type of treatments and amount of wells with the treatment that are present in the dataset

This will be saved in `results/0.data_splits`

In [9]:
plate_meta = {}
for plate_id, df in fs_profile_df.groupby("Plate"):
    unique_compounds = list(df["Compound Name"].unique())
    n_treatments = len(unique_compounds)

    # counting treatments
    treatment_counter = {}
    for treatment, df2 in df.groupby("Compound Name"):
        n_treatments = df2.shape[0]
        treatment_counter[treatment] = n_treatments

    plate_meta[plate_id] = treatment_counter

# save dictionary into a json file
with open(data_split_dir / "plate_info.json", mode="w") as stream:
    json.dump(plate_meta, stream)

Set numerical labels for the treatment

In [10]:
# creating a dictionary that contains the numeric-encoded labels and write out as json file
main_labeler = {}
injury_labels_encoder = {
    name: idx for idx, name in enumerate(fs_profile_df["injury_type"].unique().tolist())
}
injury_labels_decoder = {
    idx: name for idx, name in enumerate(fs_profile_df["injury_type"].unique().tolist())
}
main_labeler["encoder"] = injury_labels_encoder
main_labeler["decoder"] = injury_labels_decoder

# write out as json file
with open(data_split_dir / "injury_codes.json", mode="w") as file_buffer:
    json.dump(main_labeler, file_buffer)

# display main_labeler
main_labeler

{'encoder': {'Control': 0,
  'Cytoskeletal': 1,
  'Hsp90': 2,
  'Kinase': 3,
  'Genotoxin': 4,
  'Miscellaneous': 5,
  'Redox': 6,
  'HDAC': 7,
  'mTOR': 8,
  'Proteasome': 9,
  'Saponin': 10,
  'Mitochondria': 11,
  'Ferroptosis': 12,
  'Tannin': 13,
  'Nonspecific reactive': 14},
 'decoder': {0: 'Control',
  1: 'Cytoskeletal',
  2: 'Hsp90',
  3: 'Kinase',
  4: 'Genotoxin',
  5: 'Miscellaneous',
  6: 'Redox',
  7: 'HDAC',
  8: 'mTOR',
  9: 'Proteasome',
  10: 'Saponin',
  11: 'Mitochondria',
  12: 'Ferroptosis',
  13: 'Tannin',
  14: 'Nonspecific reactive'}}

Now that we have assigned numerical labels to each type of cell injury, we can replace the corresponding injury names with these numerical values to meet the requirements of machine learning algorithms.

In [11]:
# updating main dataframe with numerical labels that represents cell injury
fs_profile_df["injury_type"] = fs_profile_df["injury_type"].apply(
    lambda injury: injury_labels_encoder[injury]
)

# display new injury column
fs_profile_df["injury_type"].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

## Data Splitting 
---

### Holdout Dataset

Here we collected out holdout dataset. The holdout dataset is a subset of the dataset that is not used during model training or tuning. Instead, it is reserved solely for evaluating the model's performance after it has been trained.

In this notebook, we will include three different types of held-out datasets before proceeding with our machine learning training and evaluation.
 - Plate hold out 
 - treatment hold out 
 - well hold out 

Each of these held outdata will be stored in the `results/1.data_splits` directory 


## Holdout plate

Plates are randomly selected based on their Plate ID and save them as our `plate_holdout` data.

In [12]:
# plate
seed = 0
n_plates = 10

# setting random seed globally
np.random.seed(seed)

# selecting plates randomly from a list
selected_plates = (
    np.random.choice(fs_profile_df["Plate"].unique().tolist(), (n_plates, 1))
    .flatten()
    .tolist()
)
plate_holdout_df = fs_profile_df.loc[fs_profile_df["Plate"].isin(selected_plates)]

# take the indices of the held out data frame and use it to drop those samples from
# the main dataset. And then check if those indices are dropped
plate_idx_to_drop = plate_holdout_df.index.tolist()
fs_profile_df = fs_profile_df.drop(plate_idx_to_drop)
assert all(
    [
        True if num not in fs_profile_df.index.tolist() else False
        for num in plate_idx_to_drop
    ]
), "index to be dropped found in the main dataframe"

# saving the holdout data
plate_holdout_df.to_csv(
    data_split_dir / "plate_holdout.csv.gz", index=False, compression="gzip"
)

# display
print("plate holdout shape:", plate_holdout_df.shape)
plate_holdout_df.head()

plate holdout shape: (1948, 378)


Unnamed: 0,Compound BRD (short),Mahalanobis distance significant,Channels,Compound SMILES,Compound PubChem URL,Compound IUPAC,Compound PubChem CID,Characteristics [Cell Line],Comment [Image File Path],Well,...,Nuclei_Texture_InverseDifferenceMoment_DNA_20_0,Nuclei_Texture_InverseDifferenceMoment_DNA_5_0,Nuclei_Texture_InverseDifferenceMoment_RNA_5_0,Nuclei_Texture_SumAverage_AGP_5_0,Nuclei_Texture_SumAverage_DNA_10_0,Nuclei_Texture_SumAverage_Mito_5_0,Nuclei_Texture_SumAverage_RNA_5_0,Nuclei_Texture_SumEntropy_DNA_10_0,Nuclei_Texture_SumEntropy_DNA_20_0,Nuclei_Texture_SumVariance_DNA_20_0
1044,,No,"Ch1 (blue): Nuclei, Ch2 (green): ER, Ch3 (yell...",CS(=O)C,https://pubchem.ncbi.nlm.nih.gov/compound/679,methylsulfinylmethane,679.0,U2OS,/incoming/BR00110368/,B2,...,-0.062886,0.011554,0.087596,0.163291,0.058129,-0.00701,0.100495,0.093309,0.108031,0.139935
1045,,No,"Ch1 (blue): Nuclei, Ch2 (green): ER, Ch3 (yell...",CS(=O)C,https://pubchem.ncbi.nlm.nih.gov/compound/679,methylsulfinylmethane,679.0,U2OS,/incoming/BR00110368/,B3,...,-0.018239,0.05969,0.11874,0.031366,-0.00188,0.124516,-0.115299,0.06554,0.095688,0.097536
1046,,No,"Ch1 (blue): Nuclei, Ch2 (green): ER, Ch3 (yell...",CS(=O)C,https://pubchem.ncbi.nlm.nih.gov/compound/679,methylsulfinylmethane,679.0,U2OS,/incoming/BR00110368/,B4,...,-0.054246,0.013374,0.010342,0.070002,0.002531,0.079322,0.127617,0.071349,0.025576,0.05115
1047,,No,"Ch1 (blue): Nuclei, Ch2 (green): ER, Ch3 (yell...",CS(=O)C,https://pubchem.ncbi.nlm.nih.gov/compound/679,methylsulfinylmethane,679.0,U2OS,/incoming/BR00110368/,B5,...,-0.015467,0.01659,0.077109,0.00767,0.039326,0.022608,0.012423,0.076461,0.076174,0.098298
1048,,No,"Ch1 (blue): Nuclei, Ch2 (green): ER, Ch3 (yell...",CS(=O)C,https://pubchem.ncbi.nlm.nih.gov/compound/679,methylsulfinylmethane,679.0,U2OS,/incoming/BR00110368/,B6,...,-0.00636,0.040196,0.104461,0.081361,0.013528,0.012501,-0.044112,0.043685,0.063887,0.06643


### Holdout out a treatment holdout plate

To create our treatment holdout dataset, we group all wells treated with the same compound, then randomly select 15 wells per treatment group.

In [13]:
#### Plate heldout dataset
seed = 0
n_samples = 15

# collecting randomly select wells based on treatment
treatment_holdout_df = []
for treatment, df in fs_profile_df.groupby("Compound Name", as_index=False):
    heldout_treatment = df.sample(n=10, random_state=seed, replace="True")
    treatment_holdout_df.append(heldout_treatment)

# genearte treatment holdout dataframe
treatment_holdout_df = pd.concat(treatment_holdout_df)

# take the indices of the held out data frame and use it to drop those samples from
# the main dataset. And then check if those indices are dropped
treatment_idx_to_drop = treatment_holdout_df.index.tolist()
fs_profile_df = fs_profile_df.drop(treatment_idx_to_drop)
assert all(
    [
        True if num not in fs_profile_df.index.tolist() else False
        for num in treatment_idx_to_drop
    ]
), "index to be dropped found in the main dataframe"

# saving the holdout data
treatment_holdout_df.to_csv(
    data_split_dir / "treatment_holdout.csv.gz", index=False, compression="gzip"
)

# display
print("Treatment holdout shape:", treatment_holdout_df.shape)
treatment_holdout_df.head()

Treatment holdout shape: (1440, 378)


Unnamed: 0,Compound BRD (short),Mahalanobis distance significant,Channels,Compound SMILES,Compound PubChem URL,Compound IUPAC,Compound PubChem CID,Characteristics [Cell Line],Comment [Image File Path],Well,...,Nuclei_Texture_InverseDifferenceMoment_DNA_20_0,Nuclei_Texture_InverseDifferenceMoment_DNA_5_0,Nuclei_Texture_InverseDifferenceMoment_RNA_5_0,Nuclei_Texture_SumAverage_AGP_5_0,Nuclei_Texture_SumAverage_DNA_10_0,Nuclei_Texture_SumAverage_Mito_5_0,Nuclei_Texture_SumAverage_RNA_5_0,Nuclei_Texture_SumEntropy_DNA_10_0,Nuclei_Texture_SumEntropy_DNA_20_0,Nuclei_Texture_SumVariance_DNA_20_0
16429,BRD-K23853216,No,"Ch1 (blue): Nuclei, Ch2 (green): ER, Ch3 (yell...",COC(=O)[C@H]1CC2=C([C@@H](N1C(=O)CCl)C3=CC=C(C...,https://pubchem.ncbi.nlm.nih.gov/compound/1750826,"methyl (1S,3R)-2-(2-chloroacetyl)-1-(4-methoxy...",1750826.0,U2OS,/incoming/BR00114106/,D3,...,0.065993,0.153958,-0.233228,0.381282,-0.164482,0.809675,0.639935,-0.129022,-0.091392,0.154064
16432,BRD-K23853216,No,"Ch1 (blue): Nuclei, Ch2 (green): ER, Ch3 (yell...",COC(=O)[C@H]1CC2=C([C@@H](N1C(=O)CCl)C3=CC=C(C...,https://pubchem.ncbi.nlm.nih.gov/compound/1750826,"methyl (1S,3R)-2-(2-chloroacetyl)-1-(4-methoxy...",1750826.0,U2OS,/incoming/BR00114106/,D6,...,0.010663,-1e-05,-0.049759,0.122486,-0.033164,-0.235909,-0.213589,-0.113143,-0.131685,-0.115575
16450,BRD-K23853216,No,"Ch1 (blue): Nuclei, Ch2 (green): ER, Ch3 (yell...",COC(=O)[C@H]1CC2=C([C@@H](N1C(=O)CCl)C3=CC=C(C...,https://pubchem.ncbi.nlm.nih.gov/compound/1750826,"methyl (1S,3R)-2-(2-chloroacetyl)-1-(4-methoxy...",1750826.0,U2OS,/incoming/BR00114084/,D6,...,0.061096,0.02605,-0.052255,0.152578,0.083841,-0.151704,-0.08022,-0.117791,-0.095516,-0.080927
16393,BRD-K23853216,No,"Ch1 (blue): Nuclei, Ch2 (green): ER, Ch3 (yell...",COC(=O)[C@H]1CC2=C([C@@H](N1C(=O)CCl)C3=CC=C(C...,https://pubchem.ncbi.nlm.nih.gov/compound/1750826,"methyl (1S,3R)-2-(2-chloroacetyl)-1-(4-methoxy...",1750826.0,U2OS,/incoming/BR00114104/,D3,...,-0.141511,-0.007705,-0.639508,0.542587,-0.656619,1.580346,0.828654,-0.140379,-0.237727,0.064642
16396,BRD-K23853216,No,"Ch1 (blue): Nuclei, Ch2 (green): ER, Ch3 (yell...",COC(=O)[C@H]1CC2=C([C@@H](N1C(=O)CCl)C3=CC=C(C...,https://pubchem.ncbi.nlm.nih.gov/compound/1750826,"methyl (1S,3R)-2-(2-chloroacetyl)-1-(4-methoxy...",1750826.0,U2OS,/incoming/BR00114104/,D6,...,0.162191,0.089688,0.017946,0.077309,-0.065394,-0.091664,-0.245128,-0.196558,-0.147914,-0.160256


### Generating well holdout data

To generate the well hold out data, each plate was iterated and random wells were selected. However, an additional step was condcuting which was to seperate the control wells and the treated wells, due to the large label imbalance with the controls. Therefore, 5 wells were randomly selected and 10 wells were randomly selected from each individual plate

In [14]:
# parameters
seed = 0
n_controls = 5
n_samples = 10

# setting random seed globally
np.random.seed(seed)

# collecting randomly select wells based on treatment
wells_heldout_df = []
for treatment, df in fs_profile_df.groupby("Plate", as_index=False):
    # separate control wells and rest of all wells since there is a huge label imbalance
    # selected 5 control wells and 10 random wells from the plate
    df_control = df.loc[df["Compound Name"] == "DMSO"].sample(
        n=n_controls, random_state=seed
    )
    df_treated = df.loc[df["Compound Name"] != "DMSO"].sample(
        n=n_samples, random_state=seed
    )

    # concatenate those together
    well_heldout = pd.concat([df_control, df_treated])

    wells_heldout_df.append(well_heldout)

# genearte treatment holdout dataframe
wells_heldout_df = pd.concat(wells_heldout_df)

# take the indices of the held out data frame and use it to drop those samples from
# the main dataset. And then check if those indices are dropped
wells_idx_to_drop = wells_heldout_df.index.tolist()
fs_profile_df = fs_profile_df.drop(wells_idx_to_drop)
assert all(
    [
        True if num not in fs_profile_df.index.tolist() else False
        for num in treatment_idx_to_drop
    ]
), "index to be dropped found in the main dataframe"

# saving the holdout data
wells_heldout_df.to_csv(
    data_split_dir / "wells_holdout.csv.gz", index=False, compression="gzip"
)

# display
print("Wells holdout shape:", wells_heldout_df.shape)
wells_heldout_df.head()

Wells holdout shape: (1125, 378)


Unnamed: 0,Compound BRD (short),Mahalanobis distance significant,Channels,Compound SMILES,Compound PubChem URL,Compound IUPAC,Compound PubChem CID,Characteristics [Cell Line],Comment [Image File Path],Well,...,Nuclei_Texture_InverseDifferenceMoment_DNA_20_0,Nuclei_Texture_InverseDifferenceMoment_DNA_5_0,Nuclei_Texture_InverseDifferenceMoment_RNA_5_0,Nuclei_Texture_SumAverage_AGP_5_0,Nuclei_Texture_SumAverage_DNA_10_0,Nuclei_Texture_SumAverage_Mito_5_0,Nuclei_Texture_SumAverage_RNA_5_0,Nuclei_Texture_SumEntropy_DNA_10_0,Nuclei_Texture_SumEntropy_DNA_20_0,Nuclei_Texture_SumVariance_DNA_20_0
4994,,No,"Ch1 (blue): Nuclei, Ch2 (green): ER, Ch3 (yell...",CS(=O)C,https://pubchem.ncbi.nlm.nih.gov/compound/679,methylsulfinylmethane,679.0,U2OS,/incoming/BR00109990/,B12,...,-0.033483,-0.030215,-0.000435,-0.079334,0.020319,0.165035,-0.031037,0.02598,-0.000591,0.000401
5058,,No,"Ch1 (blue): Nuclei, Ch2 (green): ER, Ch3 (yell...",CS(=O)C,https://pubchem.ncbi.nlm.nih.gov/compound/679,methylsulfinylmethane,679.0,U2OS,/incoming/BR00109990/,K10,...,0.020964,-0.018817,0.024526,-0.005947,0.038915,0.069078,-0.073076,-0.027233,-0.000291,0.014113
5050,,No,"Ch1 (blue): Nuclei, Ch2 (green): ER, Ch3 (yell...",CS(=O)C,https://pubchem.ncbi.nlm.nih.gov/compound/679,methylsulfinylmethane,679.0,U2OS,/incoming/BR00109990/,J18,...,0.094199,0.064575,0.161873,-0.041443,-0.06339,0.193849,-0.124109,0.034587,0.073446,0.099643
5035,,No,"Ch1 (blue): Nuclei, Ch2 (green): ER, Ch3 (yell...",CS(=O)C,https://pubchem.ncbi.nlm.nih.gov/compound/679,methylsulfinylmethane,679.0,U2OS,/incoming/BR00109990/,G9,...,0.034564,-0.03705,-0.037351,0.060268,0.012708,0.061491,-0.040638,-0.093101,-0.116318,-0.111381
4991,,No,"Ch1 (blue): Nuclei, Ch2 (green): ER, Ch3 (yell...",CS(=O)C,https://pubchem.ncbi.nlm.nih.gov/compound/679,methylsulfinylmethane,679.0,U2OS,/incoming/BR00109990/,B9,...,0.037054,0.029132,-0.070626,0.132449,-0.002434,-0.023357,0.01658,-0.085577,-0.061243,-0.080604


In [15]:
# Showing the amount of data we have after removing the holdout data
meta_injury = []
for injury_type, df in fs_profile_df.groupby("injury_type"):
    # extract n_wells, n_compounds and unique compounds per injury_type
    n_wells = df.shape[0]
    unique_compounds = list(df["Compound Name"].unique())
    n_compounds = len(unique_compounds)

    # store information
    meta_injury.append([injury_type, n_wells, n_compounds, unique_compounds])

# creating data frame
injury_meta_df = pd.DataFrame(
    meta_injury, columns=["injury_type", "n_wells", "n_compounds", "compound_list"]
).sort_values("n_wells", ascending=False)
injury_meta_df.to_csv(data_split_dir / "injury_well_counts_table.csv", index=False)

# display
injury_meta_df

Unnamed: 0,injury_type,n_wells,n_compounds,compound_list
0,0,8398,1,[DMSO]
1,1,980,15,"[Nocodazole, Colchicine, Paclitaxel, Vinblasti..."
5,5,705,39,"[L-Buthionine-(S,R)-sulfoximine, CDDO Im, Cino..."
3,3,658,13,"[Wortmannin, Staurosporine, PI-103, BEZ-235, A..."
4,4,622,22,"[Camptothecin, CX-5461, Doxorubicin, Cladribin..."
2,2,387,3,"[Radicicol, Geldanamycin, 17-AAG]"
6,6,142,12,"[Menadione, PKF118-310, 4-Amino-1-naphthol (HC..."
10,10,94,11,"[Digitonin, Saikosaponin A, Polygalasaponin F,..."
7,7,90,5,"[AR-42, SAHA, ITF 2357, Panobinostat, Apicidin]"
11,11,86,4,"[Antimycin A, CCCP, Rotenone, Oligomycin A]"


In [16]:
# shape of the update training and testing dataset after removing holdout
print("training shape after removing holdouts", fs_profile_df.shape)
fs_profile_df.head()

training shape after removing holdouts (12447, 378)


Unnamed: 0,Compound BRD (short),Mahalanobis distance significant,Channels,Compound SMILES,Compound PubChem URL,Compound IUPAC,Compound PubChem CID,Characteristics [Cell Line],Comment [Image File Path],Well,...,Nuclei_Texture_InverseDifferenceMoment_DNA_20_0,Nuclei_Texture_InverseDifferenceMoment_DNA_5_0,Nuclei_Texture_InverseDifferenceMoment_RNA_5_0,Nuclei_Texture_SumAverage_AGP_5_0,Nuclei_Texture_SumAverage_DNA_10_0,Nuclei_Texture_SumAverage_Mito_5_0,Nuclei_Texture_SumAverage_RNA_5_0,Nuclei_Texture_SumEntropy_DNA_10_0,Nuclei_Texture_SumEntropy_DNA_20_0,Nuclei_Texture_SumVariance_DNA_20_0
0,,No,"Ch1 (blue): Nuclei, Ch2 (green): ER, Ch3 (yell...",CS(=O)C,https://pubchem.ncbi.nlm.nih.gov/compound/679,methylsulfinylmethane,679.0,U2OS,/incoming/BR00110363/,B2,...,-0.011258,9.8e-05,0.057244,0.160847,-0.083034,-0.02329,-0.066369,-0.015235,-0.035909,-0.032067
1,,No,"Ch1 (blue): Nuclei, Ch2 (green): ER, Ch3 (yell...",CS(=O)C,https://pubchem.ncbi.nlm.nih.gov/compound/679,methylsulfinylmethane,679.0,U2OS,/incoming/BR00110363/,B3,...,0.064689,0.025857,0.099848,0.017477,0.0213,0.058137,-0.09728,-0.073545,-0.044883,-0.01524
2,,No,"Ch1 (blue): Nuclei, Ch2 (green): ER, Ch3 (yell...",CS(=O)C,https://pubchem.ncbi.nlm.nih.gov/compound/679,methylsulfinylmethane,679.0,U2OS,/incoming/BR00110363/,B4,...,0.020937,0.04106,0.119247,0.111741,0.041592,0.224199,-0.088845,0.000327,-0.003115,-0.014406
3,,No,"Ch1 (blue): Nuclei, Ch2 (green): ER, Ch3 (yell...",CS(=O)C,https://pubchem.ncbi.nlm.nih.gov/compound/679,methylsulfinylmethane,679.0,U2OS,/incoming/BR00110363/,B5,...,0.006589,0.022156,0.036473,-0.013141,0.00869,0.06086,0.044924,0.040528,0.070877,0.072871
4,,No,"Ch1 (blue): Nuclei, Ch2 (green): ER, Ch3 (yell...",CS(=O)C,https://pubchem.ncbi.nlm.nih.gov/compound/679,methylsulfinylmethane,679.0,U2OS,/incoming/BR00110363/,B6,...,-0.028361,0.007213,0.023068,0.110361,0.054405,0.030157,0.06648,0.03891,0.048559,0.056829


### Splitting the data 

Splitting the data and saving them into csv files:
Files are split into test and training dataset.


In [17]:
# spliting the meta features and the feature column names
# loading feature columns json file
with open(data_split_dir / "raw_feature_names.json") as stream:
    feature_info = json.load(stream)

# selecing columns for splitting
y_col = "injury_type"
X_cols = feature_info["feature_cols"]

In [18]:
# spliting the dataset
seed = 0

X = fs_profile_df[X_cols]
y = fs_profile_df[y_col]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=seed, stratify=y
)

In [19]:
X_train.to_csv(data_split_dir / "X_train.csv.gz", index=False, compression="gzip")
y_train.to_csv(data_split_dir / "y_train.csv.gz", index=False, compression="gzip")
X_test.to_csv(data_split_dir / "X_test.csv.gz", index=False, compression="gzip")
y_test.to_csv(data_split_dir / "y_test.csv.gz", index=False, compression="gzip")