# Split Plate 4 data into training, testing, and holdout data

In [1]:
import pathlib
import random

import pandas as pd
from sklearn.model_selection import train_test_split

## Set paths and variables

In [2]:
# Seed Python's `random` module (used below for the random heart and well picks)
random.seed(0)

# Feature selected single-cell profiles for plate 4
# (strict=True makes resolve() raise if the file does not exist)
path_to_norm_data = pathlib.Path(
    "../3.process_cfret_features/data/single_cell_profiles/localhost231120090001_sc_feature_selected.parquet"
)
path_to_norm_data = path_to_norm_data.resolve(strict=True)

# Annotated single-cell profiles for plate 4
path_to_annot_data = pathlib.Path(
    "../3.process_cfret_features/data/single_cell_profiles/localhost231120090001_sc_annotated.parquet"
)
path_to_annot_data = path_to_annot_data.resolve(strict=True)

# Directory that receives the split data (left untouched if it already exists)
output_dir = pathlib.Path("./data")
output_dir.mkdir(exist_ok=True)

## Load in feature selected data and annotated data

We want to include the number of adjacent neighbors as both a metadata and feature column.

To do this, we load the annotated data, rename the "Cells_Neighbors_NumberOfNeighbors_Adjacent" column to "Metadata_Neighbors_Adjacent", and join it onto the feature-selected data frame.

In [3]:
# Columns used to match rows between the two data frames
merge_keys = ["Metadata_Well", "Metadata_Site", "Metadata_Nuclei_Number_Object_Number"]

# Read the feature selected plate 4 profiles
plate_4_df = pd.read_parquet(path_to_norm_data)

# Read only the join keys and the adjacent-neighbors measurement from the annotated
# profiles, renaming the measurement so it carries the "Metadata_" prefix
neighbors_df = pd.read_parquet(
    path_to_annot_data,
    columns=[*merge_keys, "Cells_Neighbors_NumberOfNeighbors_Adjacent"],
).rename(
    columns={
        "Cells_Neighbors_NumberOfNeighbors_Adjacent": "Metadata_Neighbors_Adjacent"
    }
)

# Attach the new metadata column; the inner join keeps only rows whose keys are in both frames
plate_4_df = plate_4_df.merge(neighbors_df, on=merge_keys, how="inner")

print(plate_4_df.shape)
plate_4_df.head()

(16887, 658)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_cell_type,Metadata_heart_failure_type,Metadata_treatment,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,...,Nuclei_Texture_InverseDifferenceMoment_ER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_02_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256,Nuclei_Texture_SumVariance_ER_3_03_256,Nuclei_Texture_SumVariance_Hoechst_3_03_256,Nuclei_Texture_SumVariance_Mitochondria_3_03_256,Nuclei_Texture_SumVariance_PM_3_01_256,Metadata_Neighbors_Adjacent
0,B,2,2,Healthy,,,199.395062,268.792593,193.914743,269.641581,...,0.692428,0.536752,0.878746,0.686496,0.963962,-0.324604,-0.291686,-0.238556,-0.351116,3.0
1,B,2,2,Healthy,,,303.540675,350.732143,314.842955,368.939934,...,0.605508,0.741645,0.972737,1.456869,1.470718,-0.301586,-0.3054,-0.246969,-0.376565,3.0
2,B,2,2,Healthy,,,700.388506,371.8,708.112277,380.711178,...,1.31434,1.121741,1.188526,1.434736,1.417333,-0.343746,-0.35866,-0.25232,-0.383783,7.0
3,B,2,2,Healthy,,,855.869318,481.075758,877.161009,515.307396,...,0.084512,1.072561,0.900595,0.934998,0.540008,-0.271485,-0.305855,-0.240756,-0.278494,4.0
4,B,2,2,Healthy,,,586.624514,509.892023,590.453967,504.113587,...,0.608475,1.091661,1.060037,1.18142,0.982785,-0.330949,-0.263325,-0.253783,-0.356839,3.0


## Split out holdout data first into two different CSVs

1. Remove all wells from DMSO treated healthy heart #7 and remove all wells from one failing heart (random)
2. Remove one well from each heart (both failing and healthy)

### Add DMSO treated heart 7 cells to holdout df

In [4]:
# Rows where the heart number is 7 and the treatment is DMSO
is_heart_7_dmso = plate_4_df["Metadata_heart_number"].eq(7) & plate_4_df[
    "Metadata_treatment"
].eq("DMSO")

# Start the holdout data frame with those rows
holdout_df = plate_4_df.loc[is_heart_7_dmso]

# Report the size and preview the first rows
print(
    "The shape of the holdout data frame after removing DMSO heart 7 cells is",
    holdout_df.shape,
)
holdout_df.head()

The shape of the holdout data frame after removing DMSO heart 7 cells is (944, 658)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_cell_type,Metadata_heart_failure_type,Metadata_treatment,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,...,Nuclei_Texture_InverseDifferenceMoment_ER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_02_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256,Nuclei_Texture_SumVariance_ER_3_03_256,Nuclei_Texture_SumVariance_Hoechst_3_03_256,Nuclei_Texture_SumVariance_Mitochondria_3_03_256,Nuclei_Texture_SumVariance_PM_3_01_256,Metadata_Neighbors_Adjacent
2639,B,10,7,Healthy,,DMSO,469.048643,155.781674,440.719879,177.033188,...,-0.514377,0.028779,-0.211263,0.594406,0.225559,-0.255284,-0.107377,-0.195869,-0.210622,3.0
2640,B,10,7,Healthy,,DMSO,203.850902,252.441595,235.742035,269.541554,...,0.41334,1.07133,0.619291,0.750453,0.539063,-0.302181,-0.259822,-0.238917,-0.277602,5.0
2641,B,10,7,Healthy,,DMSO,763.702873,283.754937,750.886693,290.192565,...,1.08266,1.390237,1.274366,1.281527,1.219743,-0.343444,-0.335056,-0.248623,-0.374715,4.0
2642,B,10,7,Healthy,,DMSO,494.459596,325.438763,467.58829,329.575066,...,0.609152,1.685327,1.679408,1.891283,1.857298,-0.287941,-0.370003,-0.260403,-0.390028,7.0
2643,B,10,7,Healthy,,DMSO,590.450266,355.174956,599.530776,336.15348,...,0.938652,0.214267,0.764372,0.848238,1.186908,-0.331475,-0.318234,-0.203535,-0.340312,3.0


### Add all rows from one random failing heart to holdout df 

In [5]:
# Re-seed in this cell as well so the same heart is picked if this cell is rerun
random.seed(0)

# Heart numbers that have at least one row labelled "Failing"
is_failing = plate_4_df["Metadata_cell_type"] == "Failing"
failing_heart_numbers = plate_4_df.loc[is_failing, "Metadata_heart_number"].unique()

# Select a random heart from the failing heart numbers
random_heart_number = random.choice(failing_heart_numbers)

# All rows from the selected failing heart
random_failing_heart_rows = plate_4_df[
    (plate_4_df["Metadata_heart_number"] == random_heart_number) & is_failing
]

# Build the holdout data frame directly from plate_4_df instead of appending to the
# existing holdout_df, so rerunning this cell cannot add the failing heart rows twice
heart_7_dmso_rows = plate_4_df[
    (plate_4_df["Metadata_heart_number"] == 7)
    & (plate_4_df["Metadata_treatment"] == "DMSO")
]
holdout_df = pd.concat(
    [heart_7_dmso_rows, random_failing_heart_rows], ignore_index=True
)

# Save holdout_df as "holdout1_data" as CSV file
holdout_df.to_csv(f"{output_dir}/holdout1_data.csv", index=False)

# Check shape and output
print(
    "There were",
    random_failing_heart_rows.shape[0],
    "rows from heart number",
    random_heart_number,
)
print(
    "The shape of the holdout data frame after removing one random failing heart is",
    holdout_df.shape,
)
holdout_df.head()

There were 2045 rows from heart number 29
The shape of the holdout data frame after removing one random failing heart is (2989, 658)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_cell_type,Metadata_heart_failure_type,Metadata_treatment,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,...,Nuclei_Texture_InverseDifferenceMoment_ER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_02_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256,Nuclei_Texture_SumVariance_ER_3_03_256,Nuclei_Texture_SumVariance_Hoechst_3_03_256,Nuclei_Texture_SumVariance_Mitochondria_3_03_256,Nuclei_Texture_SumVariance_PM_3_01_256,Metadata_Neighbors_Adjacent
0,B,10,7,Healthy,,DMSO,469.048643,155.781674,440.719879,177.033188,...,-0.514377,0.028779,-0.211263,0.594406,0.225559,-0.255284,-0.107377,-0.195869,-0.210622,3.0
1,B,10,7,Healthy,,DMSO,203.850902,252.441595,235.742035,269.541554,...,0.41334,1.07133,0.619291,0.750453,0.539063,-0.302181,-0.259822,-0.238917,-0.277602,5.0
2,B,10,7,Healthy,,DMSO,763.702873,283.754937,750.886693,290.192565,...,1.08266,1.390237,1.274366,1.281527,1.219743,-0.343444,-0.335056,-0.248623,-0.374715,4.0
3,B,10,7,Healthy,,DMSO,494.459596,325.438763,467.58829,329.575066,...,0.609152,1.685327,1.679408,1.891283,1.857298,-0.287941,-0.370003,-0.260403,-0.390028,7.0
4,B,10,7,Healthy,,DMSO,590.450266,355.174956,599.530776,336.15348,...,0.938652,0.214267,0.764372,0.848238,1.186908,-0.331475,-0.318234,-0.203535,-0.340312,3.0


### Generate random well per heart number and add to holdout data frame

In [6]:
# Re-seed in this cell as well so the same wells are picked if this cell is rerun
random.seed(0)

# Masks for the two groups that are already held out
is_random_failing_heart = (
    plate_4_df["Metadata_heart_number"] == random_heart_number
) & (plate_4_df["Metadata_cell_type"] == "Failing")
is_heart_7_dmso = (plate_4_df["Metadata_heart_number"] == 7) & (
    plate_4_df["Metadata_treatment"] == "DMSO"
)

# Data left after removing those groups; the random wells are drawn from this
plate_4_df_filtered = plate_4_df[~(is_random_failing_heart | is_heart_7_dmso)]

# Generate one random well per heart number (sorted unique wells keep the pick reproducible)
random_wells = (
    plate_4_df_filtered.groupby("Metadata_heart_number")["Metadata_Well"]
    .apply(lambda x: random.choice(sorted(x.unique())))
    .reset_index(name="Random_Metadata_Well")
)

# Look up the well chosen for each row's own heart and keep the rows that sit in it.
# This matches on the (heart number, well) pair; two independent isin() checks would
# also accept a row whose well was chosen for a different heart.
chosen_well_by_heart = random_wells.set_index("Metadata_heart_number")[
    "Random_Metadata_Well"
]
matched_rows = plate_4_df_filtered[
    plate_4_df_filtered["Metadata_Well"]
    == plate_4_df_filtered["Metadata_heart_number"].map(chosen_well_by_heart)
]

# Prior to adding data into holdout_df to remove all holdout data at once, save random well data as "holdout2_data"
matched_rows.to_csv(f"{output_dir}/holdout2_data.csv", index=False)

# Rebuild the full holdout data frame (heart 7 DMSO rows, failing heart rows, random wells)
# from plate_4_df so rerunning this cell cannot append the matched rows twice
holdout_df = pd.concat(
    [plate_4_df[is_heart_7_dmso], plate_4_df[is_random_failing_heart], matched_rows],
    ignore_index=True,
)

# Check shape and output
print("There were", matched_rows.shape[0], "rows matching the random wells per heart")
print(
    "The shape of the holdout data frame after removing a random well per heart is",
    holdout_df.shape,
)
holdout_df.head()

There were 1452 rows matching the random wells per heart
The shape of the holdout data frame after removing a random well per heart is (4441, 658)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_cell_type,Metadata_heart_failure_type,Metadata_treatment,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,...,Nuclei_Texture_InverseDifferenceMoment_ER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_02_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256,Nuclei_Texture_SumVariance_ER_3_03_256,Nuclei_Texture_SumVariance_Hoechst_3_03_256,Nuclei_Texture_SumVariance_Mitochondria_3_03_256,Nuclei_Texture_SumVariance_PM_3_01_256,Metadata_Neighbors_Adjacent
0,B,10,7,Healthy,,DMSO,469.048643,155.781674,440.719879,177.033188,...,-0.514377,0.028779,-0.211263,0.594406,0.225559,-0.255284,-0.107377,-0.195869,-0.210622,3.0
1,B,10,7,Healthy,,DMSO,203.850902,252.441595,235.742035,269.541554,...,0.41334,1.07133,0.619291,0.750453,0.539063,-0.302181,-0.259822,-0.238917,-0.277602,5.0
2,B,10,7,Healthy,,DMSO,763.702873,283.754937,750.886693,290.192565,...,1.08266,1.390237,1.274366,1.281527,1.219743,-0.343444,-0.335056,-0.248623,-0.374715,4.0
3,B,10,7,Healthy,,DMSO,494.459596,325.438763,467.58829,329.575066,...,0.609152,1.685327,1.679408,1.891283,1.857298,-0.287941,-0.370003,-0.260403,-0.390028,7.0
4,B,10,7,Healthy,,DMSO,590.450266,355.174956,599.530776,336.15348,...,0.938652,0.214267,0.764372,0.848238,1.186908,-0.331475,-0.318234,-0.203535,-0.340312,3.0


## Remove all holdout data from the plate_4_df prior to splitting

In [7]:
# Remove all holdout rows by recomputing the selection masks from the column values
# (the original author notes that filtering with the holdout data frame itself did not work)
is_random_failing_heart = (
    plate_4_df["Metadata_heart_number"] == random_heart_number
) & (plate_4_df["Metadata_cell_type"] == "Failing")
is_heart_7_dmso = (plate_4_df["Metadata_heart_number"] == 7) & (
    plate_4_df["Metadata_treatment"] == "DMSO"
)

# A row is in a random holdout well when its well equals the well chosen for its own heart.
# Hearts without a chosen well map to NaN, which never compares equal.
chosen_well_by_heart = random_wells.set_index("Metadata_heart_number")[
    "Random_Metadata_Well"
]
is_random_well = plate_4_df["Metadata_Well"] == plate_4_df[
    "Metadata_heart_number"
].map(chosen_well_by_heart)

# NOTE: this rebinds plate_4_df, so the holdout cells above read the reduced frame if rerun
# afterwards; rerun from the loading cell first when repeating them
plate_4_df = plate_4_df[
    ~(is_random_failing_heart | is_heart_7_dmso | is_random_well)
]

print(plate_4_df.shape)
plate_4_df.head()

(12446, 658)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_cell_type,Metadata_heart_failure_type,Metadata_treatment,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,...,Nuclei_Texture_InverseDifferenceMoment_ER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_02_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256,Nuclei_Texture_SumVariance_ER_3_03_256,Nuclei_Texture_SumVariance_Hoechst_3_03_256,Nuclei_Texture_SumVariance_Mitochondria_3_03_256,Nuclei_Texture_SumVariance_PM_3_01_256,Metadata_Neighbors_Adjacent
0,B,2,2,Healthy,,,199.395062,268.792593,193.914743,269.641581,...,0.692428,0.536752,0.878746,0.686496,0.963962,-0.324604,-0.291686,-0.238556,-0.351116,3.0
1,B,2,2,Healthy,,,303.540675,350.732143,314.842955,368.939934,...,0.605508,0.741645,0.972737,1.456869,1.470718,-0.301586,-0.3054,-0.246969,-0.376565,3.0
2,B,2,2,Healthy,,,700.388506,371.8,708.112277,380.711178,...,1.31434,1.121741,1.188526,1.434736,1.417333,-0.343746,-0.35866,-0.25232,-0.383783,7.0
3,B,2,2,Healthy,,,855.869318,481.075758,877.161009,515.307396,...,0.084512,1.072561,0.900595,0.934998,0.540008,-0.271485,-0.305855,-0.240756,-0.278494,4.0
4,B,2,2,Healthy,,,586.624514,509.892023,590.453967,504.113587,...,0.608475,1.091661,1.060037,1.18142,0.982785,-0.330949,-0.263325,-0.253783,-0.356839,3.0


## Split remaining Plate 4 data into testing and training data

In [8]:
# Seed for the split (0, the same value used to seed `random` earlier in the notebook)
random_state = 0

# Fraction of the rows assigned to the test set (the remaining 70% is training data)
test_ratio = 0.30

# Labels to stratify the split on
cell_type_labels = plate_4_df[["Metadata_cell_type"]]

# Split the remaining plate 4 rows into training and test sets
training_data, testing_data = train_test_split(
    plate_4_df,
    test_size=test_ratio,
    random_state=random_state,
    stratify=cell_type_labels,
)

# Report the split sizes and preview the test set
print("The testing data contains", len(testing_data), "single-cells.")
print("The training data contains", len(training_data), "single-cells.")
testing_data.head()

The testing data contains 3734 single-cells.
The training data contains 8712 single-cells.


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_cell_type,Metadata_heart_failure_type,Metadata_treatment,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,...,Nuclei_Texture_InverseDifferenceMoment_ER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_02_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256,Nuclei_Texture_SumVariance_ER_3_03_256,Nuclei_Texture_SumVariance_Hoechst_3_03_256,Nuclei_Texture_SumVariance_Mitochondria_3_03_256,Nuclei_Texture_SumVariance_PM_3_01_256,Metadata_Neighbors_Adjacent
1406,B,6,2,Healthy,,,661.504837,692.083553,623.860935,706.172074,...,-0.808578,-0.04598,-0.661354,-0.494573,-0.817079,-0.162426,-0.297548,-0.120694,-0.165902,5.0
2381,B,9,19,Failing,Dilated_Cardiomyopathy,,484.401806,572.8924,479.353825,559.431012,...,0.306116,1.015502,0.932049,0.0943,0.085885,-0.296087,-0.327194,-0.231065,-0.299319,6.0
1494,B,6,2,Healthy,,,698.966887,455.592347,690.269841,428.446904,...,0.419599,0.441298,0.476775,0.719091,0.933476,-0.317684,-0.335782,-0.215828,-0.331348,1.0
5111,C,9,23,Failing,Dilated_Cardiomyopathy,,775.957947,546.447612,832.028083,674.183582,...,0.830056,0.475853,0.467789,1.846457,1.879433,-0.330273,-0.352741,-0.253425,-0.391279,1.0
10542,E,8,2,Healthy,,,451.976856,429.891032,439.935155,434.818685,...,0.282446,-0.752575,-0.944247,-0.33401,-0.132079,-0.27365,-0.1794,-0.058595,0.104457,5.0


### Save training and test data as CSVs

In [9]:
# Write the training split and then the testing split as CSV files, without the index column
for split_df, csv_path in (
    (training_data, f"{output_dir}/training_data.csv"),
    (testing_data, f"{output_dir}/testing_data.csv"),
):
    split_df.to_csv(csv_path, index=False)