# Feature select a subset of cells from all JUMP plates and filter it to the features used for the Mitocheck model

## Import libraries

In [1]:
import pyarrow.parquet as pq
import pandas as pd

import random
import pathlib

from pycytominer import feature_select

## Set paths and variables

In [2]:
# path to the directory holding the normalized single-cell plate data
norm_dir = pathlib.Path(
    "../../../Way Science Lab Dropbox/JUMP Processed Data/normalized_sc_plate_data"
)

# directory for every file this notebook writes; created up front so later saves succeed
save_dir = pathlib.Path("./data")
save_dir.mkdir(parents=True, exist_ok=True)

# path to save normalized concat file
concat_save_path = save_dir / "concat_norm_data_subset.parquet"

# URL path to Mitocheck labeled data
mitocheck_labeled_data_url = "https://github.com/WayScience/phenotypic_profiling_model/raw/main/0.download_data/data/labeled_data__ic.csv.gz"

# set feature selection operations (added drop_na_columns due to issue when trying to create UMAP)
feature_select_ops = [
    "variance_threshold",
    "correlation_threshold",
    "blocklist",
    "drop_na_columns",
]

# set save path for feature selected concat file
fs_save_path = save_dir / "concat_fs_norm_data_subset.parquet"

# path to save filtered subset data with Mitocheck features
filter_save_path = save_dir / "concat_mitocheck_data_subset.parquet"

# Number of rows to randomly sample from each plate for the concatenated subset
n_samples = 1000

# Count only parquet files so the reported number matches the files the sampling step reads
print(
    "There are this many plates in this dataset:",
    len(list(norm_dir.glob("*.parquet"))),
)

There are this many plates in this dataset: 51


## Concatenate a random subset of rows from every plate into one dataframe and save it as a parquet file

In [3]:
# Build the concatenated subset only when a saved copy does not already exist
if not concat_save_path.exists():
    # Collect one sampled frame per plate and concatenate once at the end;
    # concatenating inside the loop would re-copy the growing frame for every plate
    sampled_frames = []
    total_rows = 0

    # Iterate through each Parquet file in the directory
    for file_path in norm_dir.glob("*.parquet"):
        print("Sampling from plate:", file_path.stem.split("_")[0])
        # Open the Parquet file
        parquet_file = pq.ParquetFile(file_path)

        # Get the number of row groups in the file
        num_row_groups = parquet_file.num_row_groups

        # Randomly select a single row group so only that part of the plate is read.
        # NOTE(review): this draw is unseeded, so the chosen row group differs between
        # runs even though the row sampling below uses a fixed random_state.
        random_row_group = random.randint(0, num_row_groups - 1)

        # Read every column of the selected row group into pandas
        df = parquet_file.read_row_group(
            random_row_group, columns=parquet_file.schema.names, use_threads=True
        ).to_pandas()

        # Randomly sample rows from the selected row group. The size is clamped because
        # DataFrame.sample raises ValueError when n exceeds the number of rows available.
        sampled_rows = df.sample(n=min(n_samples, len(df)), random_state=0)

        print(f"Sampled data frame has this many rows: {sampled_rows.shape[0]}")

        # Keep the sampled rows and track the running total for the progress message
        sampled_frames.append(sampled_rows)
        total_rows += sampled_rows.shape[0]

        print(f"Concat data frame now has {total_rows} rows")

    # pd.concat rejects an empty list, so fall back to an empty frame when no plates were found
    if sampled_frames:
        concatenated_df = pd.concat(sampled_frames, ignore_index=True)
    else:
        concatenated_df = pd.DataFrame()

    # Save the concatenated DataFrame as a Parquet file
    concatenated_df.to_parquet(concat_save_path, index=False)

else:
    # read in the parquet file to use
    concatenated_df = pd.read_parquet(concat_save_path)
    print("Concatenated file already exists and has been loaded in!")


Concatenated file already exists and has been loaded in!


## Check to see what the concat file looks like and if it looks correct

In [4]:
# Print the (rows, columns) shape, then display the first two rows as a quick sanity check
print(concatenated_df.shape)
concatenated_df.head(2)

(51000, 5813)


Unnamed: 0,Metadata_broad_sample,Metadata_solvent,Metadata_ImageNumber,Metadata_Plate,Metadata_Site,Metadata_Well,Metadata_TableNumber,Metadata_ObjectNumber_cytoplasm,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,...,Nuclei_Texture_Variance_RNA_3_01_256,Nuclei_Texture_Variance_RNA_3_02_256,Nuclei_Texture_Variance_RNA_3_03_256,Nuclei_Texture_Variance_RNA_5_00_256,Nuclei_Texture_Variance_RNA_5_01_256,Nuclei_Texture_Variance_RNA_5_02_256,Nuclei_Texture_Variance_RNA_5_03_256,Metadata_gene,Metadata_target_sequence,Metadata_negcon_control_type
0,DMSO,DMSO,1540,BR00117054,1,H04,228619917084871791794985360693111447882,58,58.0,58.0,...,-0.6395,-0.666478,-0.674946,-0.659082,-0.677891,-0.669005,-0.659034,,,
1,DMSO,DMSO,1548,BR00117054,9,H04,228391807631129898267164739161499174121,90,90.0,90.0,...,-0.734896,-0.752684,-0.734502,-0.724447,-0.716345,-0.734481,-0.760012,,,


## Perform feature selection on concat data and check output

In [5]:
# Run feature selection only when the feature selected file is not already on disk
if not fs_save_path.exists():
    # operations to apply, plus where and in which format to write the result
    fs_kwargs = {
        "operation": feature_select_ops,
        "output_file": fs_save_path,
        "output_type": "parquet",
    }
    feature_select(concatenated_df, **fs_kwargs)

# read the feature selected data back from disk to confirm the output looks as expected
fs_df = pd.read_parquet(fs_save_path)

print(fs_df.shape)
fs_df.head(2)

(51000, 1756)


Unnamed: 0,Metadata_broad_sample,Metadata_solvent,Metadata_ImageNumber,Metadata_Plate,Metadata_Site,Metadata_Well,Metadata_TableNumber,Metadata_ObjectNumber_cytoplasm,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,...,Nuclei_Texture_SumVariance_ER_10_02_256,Nuclei_Texture_SumVariance_ER_10_03_256,Nuclei_Texture_SumVariance_HighZBF_3_03_256,Nuclei_Texture_SumVariance_Mito_10_01_256,Nuclei_Texture_SumVariance_Mito_10_03_256,Nuclei_Texture_SumVariance_RNA_10_01_256,Nuclei_Texture_SumVariance_RNA_10_03_256,Metadata_gene,Metadata_target_sequence,Metadata_negcon_control_type
0,DMSO,DMSO,1540,BR00117054,1,H04,228619917084871791794985360693111447882,58,58.0,58.0,...,-0.577284,-0.732747,-0.315438,-0.075827,-0.234435,-0.637203,-0.664829,,,
1,DMSO,DMSO,1548,BR00117054,9,H04,228391807631129898267164739161499174121,90,90.0,90.0,...,-0.75727,-0.797579,0.080938,-0.34924,-0.446364,-0.620931,-0.664754,,,


## Use Mitocheck labelled dataset to filter the subset of data to nuclei features

In [6]:
# load in labelled data as Data Frame
label_df = pd.read_csv(mitocheck_labeled_data_url, compression="gzip")

# Extract metadata columns from concat df
metadata_cols = [col for col in concatenated_df.columns if col.startswith("Metadata")]

# Extract CellProfiler features from Mitocheck data (columns prefixed "CP__") and rename
# them to the "Nuclei_" prefix used by the JUMP columns. Only the leading "CP_" is swapped
# (count=1) so a later "CP_" inside a feature name is left untouched.
feature_cols = [
    col.replace("CP_", "Nuclei", 1)
    for col in label_df.columns
    if col.startswith("CP__")
]

# Find columns that are in the Mitocheck data but not the JUMP data
diff_columns = pd.Index(feature_cols).difference(concatenated_df.columns).tolist()

# Remove features not seen in JUMP from Mitocheck feature list
# (set lookup keeps this linear instead of scanning a list for every feature)
jump_columns = set(concatenated_df.columns)
feature_cols = [col for col in feature_cols if col in jump_columns]

# Add metadata with nuclei features from Mitocheck
filtered_df = concatenated_df[metadata_cols + feature_cols]

# Save the filtered DataFrame as a Parquet file
filtered_df.to_parquet(filter_save_path, index=False)

print(filtered_df.shape)
filtered_df.head(2)

(51000, 172)


Unnamed: 0,Metadata_broad_sample,Metadata_solvent,Metadata_ImageNumber,Metadata_Plate,Metadata_Site,Metadata_Well,Metadata_TableNumber,Metadata_ObjectNumber_cytoplasm,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,...,Nuclei_Texture_SumEntropy_DNA_3_02_256,Nuclei_Texture_SumEntropy_DNA_3_03_256,Nuclei_Texture_SumVariance_DNA_3_00_256,Nuclei_Texture_SumVariance_DNA_3_01_256,Nuclei_Texture_SumVariance_DNA_3_02_256,Nuclei_Texture_SumVariance_DNA_3_03_256,Nuclei_Texture_Variance_DNA_3_00_256,Nuclei_Texture_Variance_DNA_3_01_256,Nuclei_Texture_Variance_DNA_3_02_256,Nuclei_Texture_Variance_DNA_3_03_256
0,DMSO,DMSO,1540,BR00117054,1,H04,228619917084871791794985360693111447882,58,58.0,58.0,...,0.113166,0.005064,-0.306598,-0.278894,-0.214727,-0.266604,-0.284124,-0.277031,-0.254744,-0.281526
1,DMSO,DMSO,1548,BR00117054,9,H04,228391807631129898267164739161499174121,90,90.0,90.0,...,-0.747137,-0.689093,-0.40747,-0.398943,-0.405557,-0.400385,-0.433045,-0.425488,-0.429724,-0.435415


In [10]:
# Number of Mitocheck-derived nuclei features that are also present in the JUMP data
len(feature_cols)

149

In [9]:
# Mitocheck features (renamed with the Nuclei prefix) that are missing from the JUMP data
diff_columns

['Nuclei_AreaShape_ConvexArea',
 'Nuclei_Neighbors_AngleBetweenNeighbors_Adjacent',
 'Nuclei_Neighbors_FirstClosestDistance_Adjacent',
 'Nuclei_Neighbors_FirstClosestObjectNumber_Adjacent',
 'Nuclei_Neighbors_NumberOfNeighbors_Adjacent',
 'Nuclei_Neighbors_PercentTouching_Adjacent',
 'Nuclei_Neighbors_SecondClosestDistance_Adjacent',
 'Nuclei_Neighbors_SecondClosestObjectNumber_Adjacent']