## Import libraries

In [1]:
import pathlib

import pandas as pd
from sklearn.model_selection import train_test_split
from pycytominer import feature_select

## Load in normalized profiles and concat into one dataframe

In [2]:
# path to big drive where data is located (large)
drive_path = pathlib.Path("../../../../media/18tbdrive/")

# set path to folder in image profiling repo with the normalized profiles
# (strict=True makes this fail right away if the folder does not exist)
norm_path = (
    drive_path
    / "Github_Repositories/nuclear_speckle_image_profiling/4.preprocess_features/data/single_cell_profiles"
).resolve(strict=True)

# find all normalized parquet files; glob yields paths in arbitrary (filesystem-dependent)
# order, so sort them to keep the row order of the concatenated dataframe -- and therefore
# the seeded train/test split at the end of this notebook -- reproducible across machines
files = sorted(norm_path.glob("*_sc_normalized.parquet"))
if not files:
    # fail with a clear message instead of pandas' "No objects to concatenate"
    raise FileNotFoundError(
        f"No '*_sc_normalized.parquet' files found in {norm_path}"
    )

# load each normalized profile
dfs = [pd.read_parquet(file) for file in files]

# concatenate them into one data frame
combined_df = pd.concat(dfs, ignore_index=True)

# perform feature selection to drop any columns that have NaN (avoid downstream issues)
combined_df = feature_select(
    combined_df,
    operation="drop_na_columns",
    na_cutoff=0
)

# print df
print(combined_df.shape)
combined_df.head()

(240920, 574)


Unnamed: 0,Metadata_CellLine,Metadata_Condition,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Site,Metadata_Nuclei_Site_Count,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Nuclei_AreaShape_Area,...,Nuclei_Texture_Variance_A647_3_02_256,Nuclei_Texture_Variance_A647_3_03_256,Nuclei_Texture_Variance_DAPI_3_00_256,Nuclei_Texture_Variance_DAPI_3_01_256,Nuclei_Texture_Variance_DAPI_3_02_256,Nuclei_Texture_Variance_DAPI_3_03_256,Nuclei_Texture_Variance_GOLD_3_00_256,Nuclei_Texture_Variance_GOLD_3_01_256,Nuclei_Texture_Variance_GOLD_3_02_256,Nuclei_Texture_Variance_GOLD_3_03_256
0,786O,NTC,1,slide3,A1,M14,40,440.421603,78.968641,-0.455396,...,-0.674394,-0.672335,-0.126117,-0.116814,-0.125657,-0.125353,-0.360211,-0.36674,-0.371048,-0.369066
1,786O,NTC,1,slide3,A1,M14,40,419.490134,104.726914,-0.768349,...,-0.674394,-0.672335,0.039389,0.039364,0.029068,0.087892,-0.360211,-0.36674,-0.371048,-0.369066
2,786O,NTC,1,slide3,A1,M14,40,228.128582,1150.712626,1.681257,...,-0.217142,-0.214596,-0.429377,-0.42374,-0.429103,-0.429579,-0.20354,-0.207221,-0.21151,-0.208946
3,786O,NTC,1,slide3,A1,M14,40,193.636493,1293.612259,-0.515006,...,-0.338651,-0.350054,-0.345572,-0.339878,-0.341765,-0.3426,-0.187649,-0.190218,-0.191318,-0.194624
4,786O,NTC,1,slide3,A1,M14,40,238.685272,1344.190989,-0.358529,...,-0.358266,-0.357445,-0.293864,-0.280232,-0.294569,-0.284401,-0.226681,-0.229877,-0.232546,-0.232706


## Load in filtered single-cell data from first module

In [3]:
# location of the filtered single-cell profiles written by the first module
filtered_profiles_path = pathlib.Path(
    "../0.data_analysis_and_processing/filtered_single_cells/filtered_single_cell_profiles.parquet"
)

# read the filtered single-cell table used to decide which cells to keep
filtering_df = pd.read_parquet(filtered_profiles_path)

# show the dimensions and preview the first rows
print(filtering_df.shape)
filtering_df.head()

(206964, 9)


Unnamed: 0,Metadata_Condition,Metadata_Nuclei_Site_Count,Metadata_Site,Metadata_Nuclei_Location_Center_Y,Metadata_CellLine,Metadata_Nuclei_Location_Center_X,Metadata_Plate,Metadata_ImageNumber,Metadata_Well
0,NTC,40,M14,78.968641,786O,440.421603,slide3,1,A1
1,NTC,40,M14,1150.712626,786O,228.128582,slide3,1,A1
2,NTC,40,M14,1293.612259,786O,193.636493,slide3,1,A1
3,NTC,40,M14,1344.190989,786O,238.685272,slide3,1,A1
4,NTC,40,M14,1417.101739,786O,211.043142,slide3,1,A1


## Filter out any cells that do not match the filtered data tuples

In [4]:
# define the columns to match that are important for filtering
columns_to_match = [
    'Metadata_Plate', 'Metadata_Well', 'Metadata_Site',
    'Metadata_Nuclei_Location_Center_X', 'Metadata_Nuclei_Location_Center_Y'
]

# subset both dataframes to the matching columns (raises KeyError if any column is missing)
combined_df_filtered = combined_df[columns_to_match]
filtering_df_filtered = filtering_df[columns_to_match]

# convert rows to plain tuples; both sides are built the same way (itertuples) so the
# values being compared have the same scalar types, and the slow row-wise apply is avoided
# NOTE(review): matching relies on exact equality of the float X/Y coordinates -- this
# assumes both tables store identical coordinate values; confirm if the counts ever diverge
filtering_tuples = set(filtering_df_filtered.itertuples(index=False, name=None))
combined_df_tuples = pd.Series(
    list(combined_df_filtered.itertuples(index=False, name=None)),
    index=combined_df.index,
    dtype=object,
)

# create a boolean mask for rows in combined_df that match any rows in filtering_df
mask = combined_df_tuples.isin(filtering_tuples)

# keep only the matching rows and reset the index
# (non-inplace so we never mutate a slice of combined_df)
filtered_combined_df = combined_df[mask].reset_index(drop=True)

print(filtered_combined_df.shape)
filtered_combined_df.head()

(206964, 574)


Unnamed: 0,Metadata_CellLine,Metadata_Condition,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Site,Metadata_Nuclei_Site_Count,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Nuclei_AreaShape_Area,...,Nuclei_Texture_Variance_A647_3_02_256,Nuclei_Texture_Variance_A647_3_03_256,Nuclei_Texture_Variance_DAPI_3_00_256,Nuclei_Texture_Variance_DAPI_3_01_256,Nuclei_Texture_Variance_DAPI_3_02_256,Nuclei_Texture_Variance_DAPI_3_03_256,Nuclei_Texture_Variance_GOLD_3_00_256,Nuclei_Texture_Variance_GOLD_3_01_256,Nuclei_Texture_Variance_GOLD_3_02_256,Nuclei_Texture_Variance_GOLD_3_03_256
0,786O,NTC,1,slide3,A1,M14,40,440.421603,78.968641,-0.455396,...,-0.674394,-0.672335,-0.126117,-0.116814,-0.125657,-0.125353,-0.360211,-0.36674,-0.371048,-0.369066
1,786O,NTC,1,slide3,A1,M14,40,228.128582,1150.712626,1.681257,...,-0.217142,-0.214596,-0.429377,-0.42374,-0.429103,-0.429579,-0.20354,-0.207221,-0.21151,-0.208946
2,786O,NTC,1,slide3,A1,M14,40,193.636493,1293.612259,-0.515006,...,-0.338651,-0.350054,-0.345572,-0.339878,-0.341765,-0.3426,-0.187649,-0.190218,-0.191318,-0.194624
3,786O,NTC,1,slide3,A1,M14,40,238.685272,1344.190989,-0.358529,...,-0.358266,-0.357445,-0.293864,-0.280232,-0.294569,-0.284401,-0.226681,-0.229877,-0.232546,-0.232706
4,786O,NTC,1,slide3,A1,M14,40,211.043142,1417.101739,-0.235583,...,-0.58375,-0.581345,-0.504008,-0.498415,-0.50386,-0.500438,-0.345018,-0.351947,-0.355387,-0.353693


## Hold out all untreated single-cells from the `293T` cell line

In [5]:
# directory that receives the training, testing, and holdout datasets
data_dir = pathlib.Path("./data")
data_dir.mkdir(exist_ok=True)

# select every cell whose cell line is 293T; no filter on Metadata_Condition is applied here
# NOTE(review): the section header says "untreated" cells -- confirm whether a condition
# filter is intended or whether every 293T cell should be held out
is_293t = filtered_combined_df['Metadata_CellLine'] == '293T'
holdout_data = filtered_combined_df.loc[is_293t]

# write the holdout set to CSV without the index column
holdout_csv = f"{data_dir}/holdout_data_293T.csv"
holdout_data.to_csv(holdout_csv, index=False)

# report the holdout dimensions as a sanity check
print(holdout_data.shape)

(144955, 574)


## Split data 70% training and 30% testing

In [6]:
# every cell that is not from the 293T cell line is eligible for training/testing
not_holdout = filtered_combined_df['Metadata_CellLine'] != '293T'
remaining_data = filtered_combined_df.loc[not_holdout]

# shuffled 70/30 split with a fixed seed so the split can be reproduced
split_options = dict(test_size=0.3, random_state=0, shuffle=True)
train_data, test_data = train_test_split(remaining_data, **split_options)

# write each split to its CSV file without the index column
for split_frame, csv_path in (
    (train_data, f"{data_dir}/training_data.csv"),
    (test_data, f"{data_dir}/testing_data.csv"),
):
    split_frame.to_csv(csv_path, index=False)

# report the dimensions of both splits as a sanity check
print(f"Training data shape: {train_data.shape}")
print(f"Testing data shape: {test_data.shape}")

Training data shape: (43406, 574)
Testing data shape: (18603, 574)
