# Split Plate 4 data into training, testing, and holdout data

In [1]:
import pathlib
import random

import pandas as pd
from sklearn.model_selection import train_test_split

## Set paths and variables

In [2]:
# Seed Python's `random` module once up front so random selections are repeatable
random.seed(0)

# Feature-selected single-cell profiles for plate 4 (must already exist on disk)
norm_data_file = pathlib.Path(
    "../3.process_cfret_features/data/single_cell_profiles/localhost231120090001_sc_feature_selected.parquet"
)
path_to_norm_data = norm_data_file.resolve(strict=True)

# Annotated single-cell profiles for plate 4 (must already exist on disk)
annot_data_file = pathlib.Path(
    "../3.process_cfret_features/data/single_cell_profiles/localhost231120090001_sc_annotated.parquet"
)
path_to_annot_data = annot_data_file.resolve(strict=True)

# Folder that receives every split written by this notebook; created if missing
output_dir = pathlib.Path("./data")
output_dir.mkdir(exist_ok=True)

## Load in feature selected data and annotated data

We want to include the number of adjacent neighbors as both a metadata and feature column.

To do this, we load the annotated data, rename the "Cells_Neighbors_NumberOfNeighbors_Adjacent" column to "Metadata_Neighbors_Adjacent", and join it onto the normalized data frame.

In [3]:
# Read the feature-selected (normalized) profiles for plate 4
plate_4_df = pd.read_parquet(path_to_norm_data)

# Columns used to line up rows between the two data frames
join_keys = ["Metadata_Well", "Metadata_Site", "Metadata_Nuclei_Number_Object_Number"]

# Read only the join keys plus the neighbors feature from the annotated profiles,
# and give the neighbors feature a "Metadata_" prefix in the same step
neighbors_df = pd.read_parquet(
    path_to_annot_data,
    columns=[*join_keys, "Cells_Neighbors_NumberOfNeighbors_Adjacent"],
).rename(
    columns={
        "Cells_Neighbors_NumberOfNeighbors_Adjacent": "Metadata_Neighbors_Adjacent"
    }
)

# Attach the renamed neighbors column to the normalized data frame
plate_4_df = plate_4_df.merge(neighbors_df, how="inner", on=join_keys)

print(plate_4_df.shape)
plate_4_df.head()

(16860, 645)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_cell_type,Metadata_heart_failure_type,Metadata_treatment,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,...,Nuclei_Texture_InverseDifferenceMoment_ER_3_01_256,Nuclei_Texture_InverseDifferenceMoment_ER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_02_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256,Nuclei_Texture_SumVariance_ER_3_01_256,Nuclei_Texture_SumVariance_Mitochondria_3_03_256,Nuclei_Texture_SumVariance_PM_3_01_256,Metadata_Neighbors_Adjacent
0,B,2,2,Healthy,,,787.816143,113.743274,832.342493,109.226914,...,-0.461174,-0.479672,-0.329932,-0.864552,-0.838569,-0.127637,-0.252706,-0.132274,-0.157145,2
1,B,2,2,Healthy,,,612.015315,258.122523,583.551435,220.881555,...,0.403113,0.371748,0.4036,0.186117,0.516979,-0.224873,-0.306135,-0.231927,-0.16219,3
2,B,2,2,Healthy,,,54.977129,271.567823,69.468928,289.697152,...,0.011678,0.368805,0.442829,0.658614,-0.956635,-0.654911,-0.322824,-0.249662,2.336911,4
3,B,2,2,Healthy,,,199.395062,268.792593,193.914743,269.641581,...,0.689609,0.695857,0.535811,0.878447,0.687421,0.963423,-0.332691,-0.247825,-0.341341,3
4,B,2,2,Healthy,,,531.072041,276.144082,524.886555,279.036043,...,0.706931,0.885813,0.575168,0.52139,1.579863,1.387026,-0.328504,-0.256229,-0.371318,3


## Split out holdout data first into two different CSVs

1. Remove all wells from DMSO treated healthy heart #7 and remove all wells from one failing heart (random)
2. Remove one well from each heart (both failing and healthy)

### Add DMSO treated heart 7 cells to holdout df

In [4]:
# Build one boolean mask per condition, then keep the rows that satisfy both
is_heart_7 = plate_4_df["Metadata_heart_number"].eq(7)
is_dmso_treated = plate_4_df["Metadata_treatment"].eq("DMSO")
holdout_df = plate_4_df.loc[is_heart_7 & is_dmso_treated]

# Report how many rows were selected and preview them
print(
    "The shape of the holdout data frame after removing DMSO heart 7 cells is",
    holdout_df.shape,
)
holdout_df.head()

The shape of the holdout data frame after removing DMSO heart 7 cells is (939, 645)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_cell_type,Metadata_heart_failure_type,Metadata_treatment,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,...,Nuclei_Texture_InverseDifferenceMoment_ER_3_01_256,Nuclei_Texture_InverseDifferenceMoment_ER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_02_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256,Nuclei_Texture_SumVariance_ER_3_01_256,Nuclei_Texture_SumVariance_Mitochondria_3_03_256,Nuclei_Texture_SumVariance_PM_3_01_256,Metadata_Neighbors_Adjacent
2593,B,10,7,Healthy,,DMSO,469.048643,155.781674,440.719879,177.033188,...,-0.044555,-0.540168,0.043843,-0.198925,0.595058,0.222201,-0.239421,-0.21136,-0.212959,3
2594,B,10,7,Healthy,,DMSO,836.303219,234.888889,847.646056,205.670039,...,0.121577,-0.289647,0.072589,0.365061,0.872407,0.133743,-0.200897,-0.179418,-0.273608,2
2595,B,10,7,Healthy,,DMSO,203.850902,252.441595,235.742035,269.541554,...,0.39245,0.410012,1.053545,0.622,0.751569,0.536901,-0.313279,-0.248133,-0.274165,5
2596,B,10,7,Healthy,,DMSO,763.702873,283.754937,750.886693,290.192565,...,1.151524,1.095538,1.362403,1.26948,1.28422,1.22018,-0.351635,-0.256424,-0.362906,4
2597,B,10,7,Healthy,,DMSO,494.459596,325.438763,467.58829,329.575066,...,0.388081,0.610565,1.648195,1.669826,1.895788,1.860167,-0.290718,-0.266486,-0.376898,7


### Add all rows from one random failing heart to holdout df 

In [5]:
# Re-seed inside this cell so that re-running it on its own always picks the same heart
random.seed(0)

# Collect the heart numbers that have at least one "Failing" cell.
# `.unique()` returns an array; it is converted to a plain list (same order, same
# elements) because `random.choice` is documented for sequences and a list avoids
# depending on how it happens to treat array objects.
failing_heart_numbers = list(
    plate_4_df.loc[
        plate_4_df["Metadata_cell_type"] == "Failing", "Metadata_heart_number"
    ].unique()
)

# Select a random heart from the list of failing hearts
random_heart_number = random.choice(failing_heart_numbers)

# Find all rows from the selected failing heart to be added to the holdout data frame
random_failing_heart_rows = plate_4_df[
    (plate_4_df["Metadata_heart_number"] == random_heart_number)
    & (plate_4_df["Metadata_cell_type"] == "Failing")
]
holdout_df = pd.concat([holdout_df, random_failing_heart_rows], ignore_index=True)

# Save holdout_df (DMSO heart 7 rows + the random failing heart) as "holdout1_data"
holdout_df.to_csv(f"{output_dir}/holdout1_data.csv", index=False)

# Check shape and output
print(
    "There were",
    random_failing_heart_rows.shape[0],
    "rows from heart number",
    random_heart_number,
)
print(
    "The shape of the holdout data frame after removing one random failing heart is",
    holdout_df.shape,
)
holdout_df.head()

There were 2069 rows from heart number 29
The shape of the holdout data frame after removing one random failing heart is (3008, 645)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_cell_type,Metadata_heart_failure_type,Metadata_treatment,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,...,Nuclei_Texture_InverseDifferenceMoment_ER_3_01_256,Nuclei_Texture_InverseDifferenceMoment_ER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_02_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256,Nuclei_Texture_SumVariance_ER_3_01_256,Nuclei_Texture_SumVariance_Mitochondria_3_03_256,Nuclei_Texture_SumVariance_PM_3_01_256,Metadata_Neighbors_Adjacent
0,B,10,7,Healthy,,DMSO,469.048643,155.781674,440.719879,177.033188,...,-0.044555,-0.540168,0.043843,-0.198925,0.595058,0.222201,-0.239421,-0.21136,-0.212959,3
1,B,10,7,Healthy,,DMSO,836.303219,234.888889,847.646056,205.670039,...,0.121577,-0.289647,0.072589,0.365061,0.872407,0.133743,-0.200897,-0.179418,-0.273608,2
2,B,10,7,Healthy,,DMSO,203.850902,252.441595,235.742035,269.541554,...,0.39245,0.410012,1.053545,0.622,0.751569,0.536901,-0.313279,-0.248133,-0.274165,5
3,B,10,7,Healthy,,DMSO,763.702873,283.754937,750.886693,290.192565,...,1.151524,1.095538,1.362403,1.26948,1.28422,1.22018,-0.351635,-0.256424,-0.362906,4
4,B,10,7,Healthy,,DMSO,494.459596,325.438763,467.58829,329.575066,...,0.388081,0.610565,1.648195,1.669826,1.895788,1.860167,-0.290718,-0.266486,-0.376898,7


### Generate random well per heart number and add to holdout data frame

In [6]:
# Re-seed inside this cell so that re-running it on its own always picks the same wells
random.seed(0)

# Remove the rows already held out (the random failing heart and DMSO-treated heart 7)
# so the random wells are drawn only from the remaining data
plate_4_df_filtered = plate_4_df[
    ~(
        (
            (plate_4_df["Metadata_heart_number"] == random_heart_number)
            & (plate_4_df["Metadata_cell_type"] == "Failing")
        )
        | (
            (plate_4_df["Metadata_heart_number"] == 7)
            & (plate_4_df["Metadata_treatment"] == "DMSO")
        )
    )
]

# Pick one random well per heart number.
# The unique wells are sorted first so the choice does not depend on row order.
random_wells = (
    plate_4_df_filtered.groupby("Metadata_heart_number")["Metadata_Well"]
    .apply(lambda x: random.choice(sorted(x.unique())))
    .reset_index(name="Random_Metadata_Well")
)

# Select rows by matching (heart number, well) PAIRS.
# Testing the heart and the well with two independent `isin` checks would also
# select cells from a different heart that happen to sit in a well chosen for
# another heart. Mapping each row's heart to its chosen well and comparing with the
# row's own well only keeps exact pairs. Row labels of plate_4_df_filtered are kept.
well_by_heart = random_wells.set_index("Metadata_heart_number")["Random_Metadata_Well"]
is_heldout_well = (
    plate_4_df_filtered["Metadata_heart_number"].map(well_by_heart)
    == plate_4_df_filtered["Metadata_Well"]
)
matched_rows = plate_4_df_filtered[is_heldout_well]

# Save the random-well rows on their own as "holdout2_data" before combining them
# with the rest of the holdout data
matched_rows.to_csv(f"{output_dir}/holdout2_data.csv", index=False)

# Add matching rows to the holdout data frame
holdout_df = pd.concat([holdout_df, matched_rows], ignore_index=True)

# Check shape and output
print("There were", matched_rows.shape[0], "rows matching the random wells per heart")
print(
    "The shape of the holdout data frame after removing a random well per heart is",
    holdout_df.shape,
)
holdout_df.head()

There were 1469 rows matching the random wells per heart
The shape of the holdout data frame after removing a random well per heart is (4477, 645)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_cell_type,Metadata_heart_failure_type,Metadata_treatment,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,...,Nuclei_Texture_InverseDifferenceMoment_ER_3_01_256,Nuclei_Texture_InverseDifferenceMoment_ER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_02_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256,Nuclei_Texture_SumVariance_ER_3_01_256,Nuclei_Texture_SumVariance_Mitochondria_3_03_256,Nuclei_Texture_SumVariance_PM_3_01_256,Metadata_Neighbors_Adjacent
0,B,10,7,Healthy,,DMSO,469.048643,155.781674,440.719879,177.033188,...,-0.044555,-0.540168,0.043843,-0.198925,0.595058,0.222201,-0.239421,-0.21136,-0.212959,3
1,B,10,7,Healthy,,DMSO,836.303219,234.888889,847.646056,205.670039,...,0.121577,-0.289647,0.072589,0.365061,0.872407,0.133743,-0.200897,-0.179418,-0.273608,2
2,B,10,7,Healthy,,DMSO,203.850902,252.441595,235.742035,269.541554,...,0.39245,0.410012,1.053545,0.622,0.751569,0.536901,-0.313279,-0.248133,-0.274165,5
3,B,10,7,Healthy,,DMSO,763.702873,283.754937,750.886693,290.192565,...,1.151524,1.095538,1.362403,1.26948,1.28422,1.22018,-0.351635,-0.256424,-0.362906,4
4,B,10,7,Healthy,,DMSO,494.459596,325.438763,467.58829,329.575066,...,0.388081,0.610565,1.648195,1.669826,1.895788,1.860167,-0.290718,-0.266486,-0.376898,7


## Remove all holdout data from the plate_4_df prior to splitting

In [7]:
# Remove every holdout row from the data that will be split into train/test.
#
# Instead of re-typing the holdout conditions a third time (which can silently drift
# from what was actually written to the holdout CSVs), reuse the frames built above:
#   - plate_4_df_filtered already excludes the DMSO heart 7 rows and the random
#     failing heart rows
#   - matched_rows holds the random-well holdout rows and is a row subset of
#     plate_4_df_filtered, so its index labels identify exactly the rows to drop
plate_4_df = plate_4_df_filtered.drop(index=matched_rows.index)

# Sanity check: exactly the random-well rows were removed (this would fail if index
# labels were duplicated and the drop removed more rows than intended)
assert len(plate_4_df) == len(plate_4_df_filtered) - len(matched_rows)

print(plate_4_df.shape)
plate_4_df.head()

(12383, 645)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_cell_type,Metadata_heart_failure_type,Metadata_treatment,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,...,Nuclei_Texture_InverseDifferenceMoment_ER_3_01_256,Nuclei_Texture_InverseDifferenceMoment_ER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_02_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256,Nuclei_Texture_SumVariance_ER_3_01_256,Nuclei_Texture_SumVariance_Mitochondria_3_03_256,Nuclei_Texture_SumVariance_PM_3_01_256,Metadata_Neighbors_Adjacent
0,B,2,2,Healthy,,,787.816143,113.743274,832.342493,109.226914,...,-0.461174,-0.479672,-0.329932,-0.864552,-0.838569,-0.127637,-0.252706,-0.132274,-0.157145,2
1,B,2,2,Healthy,,,612.015315,258.122523,583.551435,220.881555,...,0.403113,0.371748,0.4036,0.186117,0.516979,-0.224873,-0.306135,-0.231927,-0.16219,3
2,B,2,2,Healthy,,,54.977129,271.567823,69.468928,289.697152,...,0.011678,0.368805,0.442829,0.658614,-0.956635,-0.654911,-0.322824,-0.249662,2.336911,4
3,B,2,2,Healthy,,,199.395062,268.792593,193.914743,269.641581,...,0.689609,0.695857,0.535811,0.878447,0.687421,0.963423,-0.332691,-0.247825,-0.341341,3
4,B,2,2,Healthy,,,531.072041,276.144082,524.886555,279.036043,...,0.706931,0.885813,0.575168,0.52139,1.579863,1.387026,-0.328504,-0.256229,-0.371318,3


## Split remaining Plate 4 data into testing and training data

In [8]:
# Seed passed to the splitter (0, matching the seed used earlier in the notebook)
random_state = 0

# Fraction of rows assigned to the test set; the remainder is used for training
test_ratio = 0.30

# Stratify on the cell type column while splitting the remaining plate 4 rows
training_data, testing_data = train_test_split(
    plate_4_df,
    stratify=plate_4_df[["Metadata_cell_type"]],
    test_size=test_ratio,
    random_state=random_state,
)

# Report the size of each split and preview the test set
print("The testing data contains", len(testing_data), "single-cells.")
print("The training data contains", len(training_data), "single-cells.")
testing_data.head()

The testing data contains 3715 single-cells.
The training data contains 8668 single-cells.


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_cell_type,Metadata_heart_failure_type,Metadata_treatment,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,...,Nuclei_Texture_InverseDifferenceMoment_ER_3_01_256,Nuclei_Texture_InverseDifferenceMoment_ER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_02_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256,Nuclei_Texture_SumVariance_ER_3_01_256,Nuclei_Texture_SumVariance_Mitochondria_3_03_256,Nuclei_Texture_SumVariance_PM_3_01_256,Metadata_Neighbors_Adjacent
6431,D,4,4,Failing,Dilated_Cardiomyopathy,,286.257326,387.606227,265.588166,405.621719,...,0.582602,0.901329,1.263444,1.415663,1.116278,1.071751,-0.332432,-0.262243,-0.35832,3
686,B,4,2,Healthy,,,827.263011,666.224907,828.928409,653.580655,...,0.378779,0.282226,0.165549,0.186472,0.24107,0.249344,-0.267775,-0.220241,-0.312694,7
785,B,4,2,Healthy,,,865.050691,445.642396,866.28232,445.781581,...,0.893356,0.892557,1.325691,1.495751,0.326689,0.693117,-0.338708,-0.260871,-0.242862,0
1599,B,6,2,Healthy,,,1065.337744,143.458217,1043.660951,156.99001,...,-4.011894,-3.4809,-3.427293,-1.23425,-1.791532,-1.636565,6.247968,0.992534,8.10253,3
16075,G,8,4,Failing,Dilated_Cardiomyopathy,,795.085824,797.297975,803.164513,766.772965,...,1.128164,1.130697,1.275203,1.361499,1.530022,1.295005,-0.351945,-0.255933,-0.373874,2


### Save training and test data as CSVs

In [9]:
# Write each split to its own CSV file in the output folder, without the row index
for csv_path, split_df in (
    (f"{output_dir}/training_data.csv", training_data),
    (f"{output_dir}/testing_data.csv", testing_data),
):
    split_df.to_csv(csv_path, index=False)