# Split Plate 4 data into training, testing, and holdout data

In [1]:
import pathlib
import pandas as pd
import numpy as np
import random

from sklearn.model_selection import train_test_split

## Set paths and variables

In [2]:
# Seed Python's built-in `random` module so the random selections made later
# in the notebook are reproducible
random.seed(0)

# Feature selected single-cell profiles for plate 4
# (`resolve(strict=True)` raises if the file does not exist)
norm_data_file = pathlib.Path(
    "../3.process_cfret_features/data/single_cell_profiles/localhost231120090001_sc_feature_selected.parquet"
)
path_to_norm_data = norm_data_file.resolve(strict=True)

# Annotated single-cell profiles for plate 4 (same strict existence check)
annot_data_file = pathlib.Path(
    "../3.process_cfret_features/data/single_cell_profiles/localhost231120090001_sc_annotated.parquet"
)
path_to_annot_data = annot_data_file.resolve(strict=True)

# Directory that receives the split data sets; created if it is not there yet
output_dir = pathlib.Path("./data")
output_dir.mkdir(exist_ok=True)

## Load in feature selected data and annotated data

We want to include the number of adjacent neighbors as both a metadata column and a feature column.

To do this, we load the annotated data, rename the "Cells_Neighbors_NumberOfNeighbors_Adjacent" column to "Metadata_Neighbors_Adjacent", and join it onto the feature selected data frame.

In [3]:
# Metadata columns used as the join keys between the two data frames
merge_keys = ["Metadata_Well", "Metadata_Site", "Metadata_Nuclei_Number_Object_Number"]

# Load in the plate 4 feature selected dataset
plate_4_df = pd.read_parquet(path_to_norm_data)

# Load only the join keys and the neighbors feature from the annotated dataset,
# giving the neighbors feature a "Metadata_" prefix so it is carried as metadata
neighbors_df = pd.read_parquet(
    path_to_annot_data,
    columns=[*merge_keys, "Cells_Neighbors_NumberOfNeighbors_Adjacent"],
).rename(
    columns={"Cells_Neighbors_NumberOfNeighbors_Adjacent": "Metadata_Neighbors_Adjacent"}
)

# Add the new neighbors metadata column onto the feature selected data frame
plate_4_df = plate_4_df.merge(neighbors_df, on=merge_keys, how="inner")

print(plate_4_df.shape)
plate_4_df.head()

(17389, 651)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_cell_type,Metadata_heart_failure_type,Metadata_treatment,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,...,Nuclei_Texture_InverseDifferenceMoment_ER_3_01_256,Nuclei_Texture_InverseDifferenceMoment_ER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_02_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256,Nuclei_Texture_SumVariance_ER_3_01_256,Nuclei_Texture_SumVariance_Mitochondria_3_03_256,Nuclei_Texture_SumVariance_PM_3_01_256,Metadata_Neighbors_Adjacent
0,B,2,2,Healthy,,,199.395062,268.792593,193.914743,269.641581,...,0.71238,0.722155,0.561365,0.900009,0.706696,0.978721,-0.340578,-0.261722,-0.366704,3.0
1,B,2,2,Healthy,,,303.540675,350.732143,314.842955,368.939934,...,0.526016,0.641473,0.755986,0.989572,1.459667,1.473404,-0.327215,-0.269073,-0.389804,3.0
2,B,2,2,Healthy,,,700.388506,371.8,708.112277,380.711178,...,1.172259,1.299435,1.117026,1.195194,1.438034,1.421291,-0.358009,-0.273749,-0.396355,7.0
3,B,2,2,Healthy,,,855.869318,481.075758,877.161009,515.307396,...,0.699704,0.157867,1.070312,0.920828,0.949585,0.564866,-0.281425,-0.263644,-0.300786,4.0
4,B,2,2,Healthy,,,586.624514,509.892023,590.453967,504.113587,...,0.585848,0.644228,1.088454,1.072759,1.19044,0.997095,-0.346256,-0.275028,-0.371898,3.0


## Split out holdout data first into two different CSVs

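1. Hold out all wells from the DMSO treated healthy heart #7 and all wells from one randomly selected failing heart (saved as `holdout1_data.csv`)
2. Hold out one randomly selected well from each heart in the remaining data, both failing and healthy (saved as `holdout2_data.csv`)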

### Add DMSO treated heart 7 cells to holdout df

In [4]:
# Boolean masks selecting the cells from heart #7 and the DMSO treated cells
is_heart_7 = plate_4_df["Metadata_heart_number"] == 7
is_dmso_treated = plate_4_df["Metadata_treatment"] == "DMSO"

# Start the holdout data frame with every DMSO treated heart #7 row
holdout_df = plate_4_df.loc[is_heart_7 & is_dmso_treated]

# Check shape and output
print("The shape of the holdout data frame after removing DMSO heart 7 cells is", holdout_df.shape)
holdout_df.head()

The shape of the holdout data frame after removing DMSO heart 7 cells is (1022, 651)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_cell_type,Metadata_heart_failure_type,Metadata_treatment,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,...,Nuclei_Texture_InverseDifferenceMoment_ER_3_01_256,Nuclei_Texture_InverseDifferenceMoment_ER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_02_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256,Nuclei_Texture_SumVariance_ER_3_01_256,Nuclei_Texture_SumVariance_Mitochondria_3_03_256,Nuclei_Texture_SumVariance_PM_3_01_256,Metadata_Neighbors_Adjacent
2699,B,10,7,Healthy,,DMSO,229.82593,170.498199,240.328291,190.038185,...,0.654395,0.572676,0.567056,0.342193,0.580054,0.098546,-0.343161,-0.252958,-0.311639,1.0
2700,B,10,7,Healthy,,DMSO,363.829897,251.636082,315.23257,310.768169,...,0.93251,1.227744,0.196991,0.38597,-0.862651,0.016159,-0.355115,-0.245769,-0.179325,4.0
2701,B,10,7,Healthy,,DMSO,492.457094,363.500425,491.26267,355.074065,...,1.069509,1.003322,1.181124,1.043083,0.114975,-0.352435,-0.350317,-0.268247,-0.18098,3.0
2702,B,10,7,Healthy,,DMSO,622.594258,404.72823,622.301929,403.471015,...,0.739463,1.010848,0.647891,0.837965,0.553191,1.056193,-0.351172,-0.252004,-0.360287,2.0
2703,B,10,7,Healthy,,DMSO,315.622909,508.262548,345.078772,515.164993,...,0.342385,0.147108,-0.083125,-0.055249,-0.777251,-0.397188,-0.32462,-0.22336,-0.156486,1.0


### Add all rows from one random failing heart to holdout df 

In [5]:
# Re-seed in this cell so the same failing heart is selected even when only
# this cell is rerun
random.seed(0)

# Unique heart numbers among the cells labelled as failing
is_failing = plate_4_df["Metadata_cell_type"] == "Failing"
failing_heart_numbers = plate_4_df.loc[is_failing, "Metadata_heart_number"].unique()

# Select a random heart from the failing hearts
random_heart_number = random.choice(failing_heart_numbers)

# All rows from the selected failing heart
random_failing_heart_rows = plate_4_df.loc[
    is_failing & (plate_4_df["Metadata_heart_number"] == random_heart_number)
]

# Rebuild the holdout data frame from its two components instead of appending
# to the existing `holdout_df`: appending would duplicate the failing heart
# rows (and corrupt the saved CSV) every time this cell is rerun
dmso_heart_7_rows = plate_4_df.loc[
    (plate_4_df["Metadata_heart_number"] == 7)
    & (plate_4_df["Metadata_treatment"] == "DMSO")
]
holdout_df = pd.concat([dmso_heart_7_rows, random_failing_heart_rows], ignore_index=True)

# Save holdout_df as "holdout1_data" as CSV file
holdout_df.to_csv(f"{output_dir}/holdout1_data.csv", index=False)

# Check shape and output
print("There were", random_failing_heart_rows.shape[0], "rows from heart number", random_heart_number)
print("The shape of the holdout data frame after removing one random failing heart is", holdout_df.shape)
holdout_df.head()

There were 2132 rows from heart number 29
The shape of the holdout data frame after removing one random failing heart is (3154, 651)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_cell_type,Metadata_heart_failure_type,Metadata_treatment,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,...,Nuclei_Texture_InverseDifferenceMoment_ER_3_01_256,Nuclei_Texture_InverseDifferenceMoment_ER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_02_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256,Nuclei_Texture_SumVariance_ER_3_01_256,Nuclei_Texture_SumVariance_Mitochondria_3_03_256,Nuclei_Texture_SumVariance_PM_3_01_256,Metadata_Neighbors_Adjacent
0,B,10,7,Healthy,,DMSO,229.82593,170.498199,240.328291,190.038185,...,0.654395,0.572676,0.567056,0.342193,0.580054,0.098546,-0.343161,-0.252958,-0.311639,1.0
1,B,10,7,Healthy,,DMSO,363.829897,251.636082,315.23257,310.768169,...,0.93251,1.227744,0.196991,0.38597,-0.862651,0.016159,-0.355115,-0.245769,-0.179325,4.0
2,B,10,7,Healthy,,DMSO,492.457094,363.500425,491.26267,355.074065,...,1.069509,1.003322,1.181124,1.043083,0.114975,-0.352435,-0.350317,-0.268247,-0.18098,3.0
3,B,10,7,Healthy,,DMSO,622.594258,404.72823,622.301929,403.471015,...,0.739463,1.010848,0.647891,0.837965,0.553191,1.056193,-0.351172,-0.252004,-0.360287,2.0
4,B,10,7,Healthy,,DMSO,315.622909,508.262548,345.078772,515.164993,...,0.342385,0.147108,-0.083125,-0.055249,-0.777251,-0.397188,-0.32462,-0.22336,-0.156486,1.0


### Generate random well per heart number and add to holdout data frame

In [6]:
# Re-seed in this cell so the same wells are selected even when only this cell
# is rerun
random.seed(0)

# Rows that already belong to the first holdout set: every row of the randomly
# selected failing heart and every DMSO treated row of heart #7
is_random_failing_heart = (
    plate_4_df["Metadata_heart_number"] == random_heart_number
) & (plate_4_df["Metadata_cell_type"] == "Failing")
is_dmso_heart_7 = (plate_4_df["Metadata_heart_number"] == 7) & (
    plate_4_df["Metadata_treatment"] == "DMSO"
)

# Data left after removing the first holdout set; random wells are drawn from it
plate_4_df_filtered = plate_4_df[~(is_random_failing_heart | is_dmso_heart_7)]

# Generate random well per heart number to add to holdout_df
# (unique wells are sorted first so the draw does not depend on row order)
random_wells = (
    plate_4_df_filtered.groupby('Metadata_heart_number')['Metadata_Well']
    .apply(lambda x: random.choice(sorted(x.unique())))
    .reset_index(name='Random_Metadata_Well')
)

# Keep only the rows whose (heart number, well) PAIR was drawn above.
# Two independent `isin` checks would also match a row whose well was drawn for
# a different heart, so the pair is matched with an inner merge instead.
# `random_wells` has one row per heart number, hence `many_to_one`.
matched_rows = plate_4_df_filtered.merge(
    random_wells.rename(columns={"Random_Metadata_Well": "Metadata_Well"}),
    on=["Metadata_heart_number", "Metadata_Well"],
    how="inner",
    validate="many_to_one",
)

# Prior to adding data into holdout_df to remove all holdout data at once, save random well data as "holdout2_data"
matched_rows.to_csv(f"{output_dir}/holdout2_data.csv", index=False)

# Add matching rows to the holdout data frame
holdout_df = pd.concat([holdout_df, matched_rows], ignore_index=True)

# Check shape and output
print("There were", matched_rows.shape[0], "rows matching the random wells per heart")
print("The shape of the holdout data frame after removing a random well per heart is", holdout_df.shape)
holdout_df.head()

There were 1495 rows matching the random wells per heart
The shape of the holdout data frame after removing a random well per heart is (4649, 651)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_cell_type,Metadata_heart_failure_type,Metadata_treatment,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,...,Nuclei_Texture_InverseDifferenceMoment_ER_3_01_256,Nuclei_Texture_InverseDifferenceMoment_ER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_02_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256,Nuclei_Texture_SumVariance_ER_3_01_256,Nuclei_Texture_SumVariance_Mitochondria_3_03_256,Nuclei_Texture_SumVariance_PM_3_01_256,Metadata_Neighbors_Adjacent
0,B,10,7,Healthy,,DMSO,229.82593,170.498199,240.328291,190.038185,...,0.654395,0.572676,0.567056,0.342193,0.580054,0.098546,-0.343161,-0.252958,-0.311639,1.0
1,B,10,7,Healthy,,DMSO,363.829897,251.636082,315.23257,310.768169,...,0.93251,1.227744,0.196991,0.38597,-0.862651,0.016159,-0.355115,-0.245769,-0.179325,4.0
2,B,10,7,Healthy,,DMSO,492.457094,363.500425,491.26267,355.074065,...,1.069509,1.003322,1.181124,1.043083,0.114975,-0.352435,-0.350317,-0.268247,-0.18098,3.0
3,B,10,7,Healthy,,DMSO,622.594258,404.72823,622.301929,403.471015,...,0.739463,1.010848,0.647891,0.837965,0.553191,1.056193,-0.351172,-0.252004,-0.360287,2.0
4,B,10,7,Healthy,,DMSO,315.622909,508.262548,345.078772,515.164993,...,0.342385,0.147108,-0.083125,-0.055249,-0.777251,-0.397188,-0.32462,-0.22336,-0.156486,1.0


## Remove all holdout data from the plate_4_df prior to splitting

In [7]:
# The holdout rows are re-identified with the same conditions that were used to
# build the holdout sets (original author's note: filtering with the holdout
# data frame itself was not working)

# Every row of the randomly selected failing heart
in_random_failing_heart = (
    plate_4_df["Metadata_heart_number"] == random_heart_number
) & (plate_4_df["Metadata_cell_type"] == "Failing")

# Every DMSO treated row of heart #7
in_dmso_heart_7 = (plate_4_df["Metadata_heart_number"] == 7) & (
    plate_4_df["Metadata_treatment"] == "DMSO"
)

# Rows whose heart number and well both appear in the random well selection
in_random_well = plate_4_df["Metadata_heart_number"].isin(
    random_wells["Metadata_heart_number"]
) & plate_4_df["Metadata_Well"].isin(random_wells["Random_Metadata_Well"])

# Keep only the rows that fall in none of the holdout groups
plate_4_df = plate_4_df[~in_random_failing_heart & ~in_dmso_heart_7 & ~in_random_well]

print(plate_4_df.shape)
plate_4_df.head()

(12740, 651)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_cell_type,Metadata_heart_failure_type,Metadata_treatment,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,...,Nuclei_Texture_InverseDifferenceMoment_ER_3_01_256,Nuclei_Texture_InverseDifferenceMoment_ER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_02_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256,Nuclei_Texture_SumVariance_ER_3_01_256,Nuclei_Texture_SumVariance_Mitochondria_3_03_256,Nuclei_Texture_SumVariance_PM_3_01_256,Metadata_Neighbors_Adjacent
0,B,2,2,Healthy,,,199.395062,268.792593,193.914743,269.641581,...,0.71238,0.722155,0.561365,0.900009,0.706696,0.978721,-0.340578,-0.261722,-0.366704,3.0
1,B,2,2,Healthy,,,303.540675,350.732143,314.842955,368.939934,...,0.526016,0.641473,0.755986,0.989572,1.459667,1.473404,-0.327215,-0.269073,-0.389804,3.0
2,B,2,2,Healthy,,,700.388506,371.8,708.112277,380.711178,...,1.172259,1.299435,1.117026,1.195194,1.438034,1.421291,-0.358009,-0.273749,-0.396355,7.0
3,B,2,2,Healthy,,,855.869318,481.075758,877.161009,515.307396,...,0.699704,0.157867,1.070312,0.920828,0.949585,0.564866,-0.281425,-0.263644,-0.300786,4.0
4,B,2,2,Healthy,,,586.624514,509.892023,590.453967,504.113587,...,0.585848,0.644228,1.088454,1.072759,1.19044,0.997095,-0.346256,-0.275028,-0.371898,3.0


## Split remaining Plate 4 data into testing and training data

In [8]:
# Seed for the split (0, the same value used for `random.seed` in this notebook)
random_state = 0

# Fraction of the data assigned to the test set (30%, leaving 70% for training)
test_ratio = 0.30

# Labels the split is stratified on: the cell type of each single cell
stratify_labels = plate_4_df[["Metadata_cell_type"]]

# Split the remaining plate 4 data; the result is ordered as (train, test)
data_splits = train_test_split(
    plate_4_df,
    test_size=test_ratio,
    random_state=random_state,
    stratify=stratify_labels,
)
training_data, testing_data = data_splits

# View shapes and example output
print("The testing data contains", testing_data.shape[0], "single-cells.")
print("The training data contains", training_data.shape[0], "single-cells.")
testing_data.head()

The testing data contains 3822 single-cells.
The training data contains 8918 single-cells.


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_cell_type,Metadata_heart_failure_type,Metadata_treatment,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,...,Nuclei_Texture_InverseDifferenceMoment_ER_3_01_256,Nuclei_Texture_InverseDifferenceMoment_ER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_02_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256,Nuclei_Texture_SumVariance_ER_3_01_256,Nuclei_Texture_SumVariance_Mitochondria_3_03_256,Nuclei_Texture_SumVariance_PM_3_01_256,Metadata_Neighbors_Adjacent
2690,B,9,19,Failing,Dilated_Cardiomyopathy,,292.619666,653.812616,290.189173,642.672837,...,0.21532,0.067805,0.307272,0.574744,-0.643445,-0.052419,0.062801,-0.266395,-0.095832,2.0
646,B,3,19,Failing,Dilated_Cardiomyopathy,,829.819234,212.461264,831.782265,244.808494,...,0.752516,0.731207,-0.300196,0.535109,0.024433,0.571316,-0.327116,-0.236659,-0.307145,4.0
9176,E,2,2,Healthy,,,749.978581,164.147256,711.697554,173.105465,...,0.846673,1.012776,1.225051,1.100718,-0.298033,0.704561,-0.350721,-0.260515,-0.216415,3.0
5675,C,11,7,Healthy,,,819.10939,719.07454,805.142348,759.631587,...,0.211224,0.294718,0.877232,1.18482,0.558947,0.128171,-0.324772,-0.25901,-0.313125,1.0
1640,B,6,2,Healthy,,,131.662548,1037.732046,169.906603,1032.438015,...,0.59326,0.35909,0.728098,-0.426629,-0.233818,-0.557531,-0.286875,-0.118742,-0.053439,3.0


### Save training and test data as CSVs

In [9]:
# Write each split to its own CSV file in the output directory, without the index
for split_df, split_path in (
    (training_data, f"{output_dir}/training_data.csv"),
    (testing_data, f"{output_dir}/testing_data.csv"),
):
    split_df.to_csv(split_path, index=False)