# Generate UMAP coordinates for each plate

## Import libraries

In [1]:
import glob
import pathlib
import pandas as pd
import umap

from pycytominer import feature_select
from pycytominer.cyto_utils import infer_cp_features

## Set constants

In [2]:
# Set constants
umap_random_seed = 0
umap_n_components = 2

# Set data type for the UMAP embedding generation
data_type = "cleaned"

# Set output dir
output_dir = pathlib.Path("results")
output_dir.mkdir(parents=True, exist_ok=True)

# Adjust output dir if data_type is "cleaned"
if data_type == "cleaned":
    output_dir = output_dir / "qc_profiles_results"
    output_dir.mkdir(parents=True, exist_ok=True)

print(f"Output directory: {output_dir}")  # Debugging line

Output directory: results/qc_profiles_results


## Create list of paths to feature selected data per plate

In [3]:
# Set input paths
data_dir = pathlib.Path("../../../3.processing_features/data/single_cell_profiles/")

# Adjust path if data_type is "cleaned"
if data_type == "cleaned":
    data_dir = data_dir / "cleaned_sc_profiles"

print(f"Final data_dir: {data_dir.resolve()}")  # Show full path for debugging

# Select only the feature selected files
file_suffix = "*sc_feature_selected.parquet"

# Obtain file paths for all feature selected plates
fs_files = glob.glob(f"{data_dir}/{file_suffix}")
fs_files

Final data_dir: /media/18tbdrive/1.Github_Repositories/nf1_schwann_cell_painting_data/3.processing_features/data/single_cell_profiles/cleaned_sc_profiles


['../../../3.processing_features/data/single_cell_profiles/cleaned_sc_profiles/Plate_5_sc_feature_selected.parquet',
 '../../../3.processing_features/data/single_cell_profiles/cleaned_sc_profiles/Plate_3_sc_feature_selected.parquet',
 '../../../3.processing_features/data/single_cell_profiles/cleaned_sc_profiles/Plate_3_prime_sc_feature_selected.parquet',
 '../../../3.processing_features/data/single_cell_profiles/cleaned_sc_profiles/Plate_6_sc_feature_selected.parquet']

### Set dictionary for all plates to be processed independently

In [4]:
# Load feature data into a dictionary, keyed on plate name
cp_dfs = {x.split("/")[-1]: pd.read_parquet(x) for x in fs_files}

# Print out useful information about each dataset
print(cp_dfs.keys())
[cp_dfs[x].shape for x in cp_dfs]

dict_keys(['Plate_5_sc_feature_selected.parquet', 'Plate_3_sc_feature_selected.parquet', 'Plate_3_prime_sc_feature_selected.parquet', 'Plate_6_sc_feature_selected.parquet'])


[(5348, 1159), (10206, 1155), (5126, 1143), (6862, 1155)]

### Create list of specific files for concat UMAP for plates used in modelling

In [5]:
# Select file paths for plates 5, 3, and 3 prime only
selected_plates = ["Plate_5", "Plate_3", "Plate_3_prime"]

# Filter and concatenate the selected plates
selected_dfs = []
for file_path in fs_files:
    plate_name = pathlib.Path(file_path).stem.replace("_sc_feature_selected", "")

    # Only read in selected plates
    if plate_name in selected_plates:
        df = pd.read_parquet(file_path)

        selected_dfs.append(df)

### Get specific features used in the model

In [6]:
# Get the column names of all DataFrames in selected_dfs
column_sets = [set(df.columns) for df in selected_dfs]

# Find the common column names across all DataFrames which are used in the model
common_columns = list(set.intersection(*column_sets))

# Exclude columns that start with "Metadata" to print the number of features
model_columns = [col for col in common_columns if not col.startswith("Metadata")]

# Print length of only features
len(model_columns)

894

## Generate UMAP coordinates for each plate

**Note:** Only metadata that is common between plates are included in final data frame.

In [7]:
desired_columns = [
    "Metadata_Plate",
    "Metadata_Well",
    "Metadata_Site",
    "Metadata_number_of_singlecells",
    "Metadata_genotype",
]

# Fit UMAP features per dataset and save
for plate in cp_dfs:
    plate_name = pathlib.Path(plate).stem
    print("UMAP embeddings being generated for", plate_name)

    # Make sure to reinitialize UMAP instance per plate
    umap_fit = umap.UMAP(
        random_state=umap_random_seed, n_components=umap_n_components, n_jobs=1
    )

    # Make sure NA columns have been removed
    cp_df = cp_dfs[plate]
    cp_df = feature_select(cp_df, operation="drop_na_columns", na_cutoff=0)

    # Make sure that the Plate_3_prime has correct name in Metadata_Plate column
    if plate_name.replace("_sc_feature_selected", "") == "Plate_3_prime":
        cp_df["Metadata_Plate"] = "Plate_3_prime"

    # Remove rows with genotype HET for Plate_6
    if plate_name.replace("_sc_feature_selected", "") == "Plate_6":
        cp_df = cp_df[cp_df["Metadata_genotype"] != "HET"]

    # Process cp_df to separate features and metadata
    cp_features = infer_cp_features(cp_df)
    meta_features = infer_cp_features(cp_df, metadata=True)
    filtered_meta_features = [
        feature for feature in meta_features if feature in desired_columns
    ]

    # Fit UMAP and convert to pandas DataFrame
    embeddings = pd.DataFrame(
        umap_fit.fit_transform(cp_df.loc[:, cp_features]),
        columns=[f"UMAP{x}" for x in range(0, umap_n_components)],
    )
    print(embeddings.shape)

    # Combine with metadata
    cp_umap_with_metadata_df = pd.concat(
        [cp_df.loc[:, filtered_meta_features].reset_index(drop=True), embeddings],
        axis=1,
    )

    # randomize the rows of the dataframe to plot the order of the data evenly
    cp_umap_with_metadata_df = cp_umap_with_metadata_df.sample(frac=1, random_state=0)

    # Generate output file and save
    if data_type == "cleaned":
        output_umap_file = pathlib.Path(output_dir, f"UMAP_{plate_name}_qc.tsv")
    else:
        output_umap_file = pathlib.Path(output_dir, f"UMAP_{plate_name}.tsv")

    cp_umap_with_metadata_df.to_csv(output_umap_file, index=False, sep="\t")

UMAP embeddings being generated for Plate_5_sc_feature_selected




(5348, 2)
UMAP embeddings being generated for Plate_3_sc_feature_selected




(10206, 2)
UMAP embeddings being generated for Plate_3_prime_sc_feature_selected




(5126, 2)
UMAP embeddings being generated for Plate_6_sc_feature_selected




(4398, 2)


In [8]:
# Print an example output file
print(cp_umap_with_metadata_df.shape)
cp_umap_with_metadata_df.head()

(4398, 7)


Unnamed: 0,Metadata_Well,Metadata_Site,Metadata_number_of_singlecells,Metadata_genotype,Metadata_Plate,UMAP0,UMAP1
2285,E7,32,194,WT,Plate_6,5.242774,-0.250396
3440,G3,13,123,Null,Plate_6,1.146479,-0.421433
2188,E6,15,144,WT,Plate_6,3.111833,-0.214076
3844,G6,29,133,Null,Plate_6,2.279099,-0.107815
1732,E3,35,161,WT,Plate_6,5.523514,-0.903521


## Create UMAP embeddings with the plates used to the train the model combined

### Save all plate data features together as parquet file

In [9]:
# Filter each DataFrame in selected_dfs to include only common columns
selected_dfs_filtered = [df.loc[:, common_columns] for df in selected_dfs]

# Concatenate the filtered dataframes along the rows
concatenated_df = pd.concat(selected_dfs_filtered, ignore_index=True)

# Ensure column consistency in the concatenated dataframe
concatenated_df = concatenated_df[sorted(concatenated_df.columns)]

# Save the concatenated dataframe to a file
if data_type == "cleaned":
    output_concatenated_file = pathlib.Path(
        output_dir, "concatenated_norm_fs_plates_5_3_3prime_qc.parquet"
    )
else:
    output_concatenated_file = pathlib.Path(
        output_dir, "concatenated_norm_fs_plates_5_3_3prime.parquet"
    )
concatenated_df.to_parquet(output_concatenated_file, index=False)

print(concatenated_df.shape)
concatenated_df.head()

(20680, 911)


Unnamed: 0,Cells_AreaShape_Area,Cells_AreaShape_Extent,Cells_AreaShape_FormFactor,Cells_AreaShape_MedianRadius,Cells_AreaShape_Solidity,Cells_AreaShape_Zernike_0_0,Cells_AreaShape_Zernike_1_1,Cells_AreaShape_Zernike_2_0,Cells_AreaShape_Zernike_3_1,Cells_AreaShape_Zernike_4_0,...,Nuclei_Texture_InfoMeas1_RFP_3_02_256,Nuclei_Texture_InfoMeas1_RFP_3_03_256,Nuclei_Texture_InfoMeas2_RFP_3_00_256,Nuclei_Texture_InfoMeas2_RFP_3_01_256,Nuclei_Texture_InfoMeas2_RFP_3_02_256,Nuclei_Texture_InfoMeas2_RFP_3_03_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_00_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_01_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_02_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_03_256
0,0.83911,1.095179,0.202953,0.9258,0.822639,-0.19466,-1.039367,0.311138,-0.543013,1.586707,...,0.796527,0.377723,0.772318,-0.150806,-0.36422,0.135443,0.27685,-1.330407,-1.362612,-0.988199
1,0.373822,0.552097,0.274394,0.669018,0.884253,0.780477,1.688835,0.551297,1.748409,-1.229588,...,-0.992558,-0.489027,0.764757,0.963091,1.007428,0.92227,-1.359737,-0.763721,-0.527832,-1.206265
2,0.627252,-1.95456,-1.368154,-0.847872,-0.837032,-2.007563,-1.64645,-1.98142,-1.644735,-0.406568,...,0.659886,0.34863,0.970132,0.353297,-0.046799,0.268787,0.404264,-1.484791,-1.732719,-1.561797
3,0.112822,-0.318846,-0.367908,-0.212065,0.921082,-0.600674,-0.602911,-0.386151,-0.395013,0.617826,...,-0.138292,0.126055,0.563291,0.812344,0.347518,0.159528,2.029471,2.650493,1.198457,1.298047
4,1.007967,-0.274683,-0.589269,0.314984,-0.346598,-0.818625,-1.233029,-0.157081,-1.167032,1.618552,...,1.433545,1.298655,0.457869,-0.901281,-2.24515,-2.123569,1.768716,0.692973,0.029321,0.259599


In [10]:
desired_columns = [
    "Metadata_Plate",
    "Metadata_Well",
    "Metadata_Site",
    "Metadata_number_of_singlecells",
    "Metadata_genotype",
]

# Make sure to reinitialize UMAP instance
umap_fit = umap.UMAP(
    random_state=umap_random_seed, n_components=umap_n_components, n_jobs=1
)

# Process cp_df to separate features and metadata
cp_features = infer_cp_features(concatenated_df)
meta_features = infer_cp_features(concatenated_df, metadata=True)
filtered_meta_features = [
    feature for feature in meta_features if feature in desired_columns
]

# Fit UMAP and convert to pandas DataFrame
embeddings = pd.DataFrame(
    umap_fit.fit_transform(concatenated_df.loc[:, cp_features]),
    columns=[f"UMAP{x}" for x in range(0, umap_n_components)],
)
print(embeddings.shape)

# Combine with metadata
cp_umap_with_metadata_df = pd.concat(
    [concatenated_df.loc[:, filtered_meta_features], embeddings], axis=1
)

# randomize the rows of the dataframe to plot the order of the data evenly
cp_umap_with_metadata_df = cp_umap_with_metadata_df.sample(frac=1, random_state=0)

# Generate output file and save
if data_type == "cleaned":
    output_umap_file = pathlib.Path(
        output_dir, "UMAP_concat_model_plates_sc_feature_selected_qc.tsv"
    )
else:
    output_umap_file = pathlib.Path(
        output_dir, "UMAP_concat_model_plates_sc_feature_selected.tsv"
    )
cp_umap_with_metadata_df.to_csv(output_umap_file, index=False, sep="\t")



(20680, 2)


## Generate Plate 6 UMAP embeddings using the model features specifically

In [11]:
# Load in Plate 6 normalized data to then filter down the features with the model_columns
plate_6_norm_df = pd.read_parquet(
    pathlib.Path(data_dir, "Plate_6_sc_normalized.parquet")
)

# Drop rows where Metadata_genotype is HET
plate_6_norm_df = plate_6_norm_df[plate_6_norm_df["Metadata_genotype"] != "HET"]

# Filter the plate_6 data for the columns in model_columns
plate_6_filtered_features = plate_6_norm_df[model_columns]

# Add the metadata columns back
metadata_columns = [
    col for col in plate_6_norm_df.columns if col.startswith("Metadata_")
]
plate_6_filtered_df = pd.concat(
    [plate_6_norm_df[metadata_columns], plate_6_filtered_features], axis=1
)

# Drop rows with NaN values in the feature columns
plate_6_filtered_df = plate_6_filtered_df.dropna(
    subset=[
        col for col in plate_6_filtered_df.columns if not col.startswith("Metadata_")
    ]
)
assert plate_6_filtered_df.isna().sum().sum() == 0, "NaN detected"

# Change Metadata_Plate for all rows to Plate_6_filtered to avoid issues downstream loading in plates
plate_6_filtered_df["Metadata_Plate"] = "Plate_6_filtered"

# Confirm that index is reset to avoid any NaN issues
plate_6_filtered_df = plate_6_filtered_df.reset_index(drop=True)

print(len(plate_6_filtered_features.columns))

# Display the filtered dataframe
print(plate_6_filtered_df.shape)
plate_6_filtered_df.head()

894
(4395, 913)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_Well,Metadata_Site,Metadata_number_of_singlecells,Metadata_gene_name,Metadata_genotype,Metadata_Institution,Metadata_seed_density,Metadata_ImageNumber,...,Nuclei_RadialDistribution_ZernikePhase_GFP_7_3,Cytoplasm_RadialDistribution_ZernikeMagnitude_RFP_3_1,Cytoplasm_RadialDistribution_ZernikePhase_GFP_3_1,Cells_RadialDistribution_ZernikeMagnitude_DAPI_9_1,Cells_Texture_Correlation_DAPI_3_01_256,Nuclei_RadialDistribution_ZernikeMagnitude_RFP_6_2,Nuclei_RadialDistribution_ZernikeMagnitude_RFP_8_6,Cells_RadialDistribution_ZernikePhase_GFP_4_2,Cells_RadialDistribution_ZernikeMagnitude_DAPI_3_1,Cells_Intensity_MassDisplacement_RFP
0,B,2,B2,2,129,NF1,WT,iNFixion,1000,88,...,-0.311199,0.673097,-1.393914,1.114965,-0.177138,-0.21264,0.229569,1.218213,0.996901,-0.550001
1,B,2,B2,31,129,NF1,WT,iNFixion,1000,90,...,-1.410748,-0.918724,1.705195,0.338879,-0.712879,2.286915,-0.282266,0.453598,0.128231,-0.326586
2,B,2,B2,5,129,NF1,WT,iNFixion,1000,98,...,0.0223,-0.06434,1.632494,-0.265251,0.036548,2.621233,1.85634,0.485518,0.938862,0.214164
3,B,2,B2,9,129,NF1,WT,iNFixion,1000,102,...,-1.433564,-0.07949,-0.759758,-0.809677,-0.942134,-1.506302,-1.402954,-1.713181,3.070821,-0.389739
4,B,2,B2,10,129,NF1,WT,iNFixion,1000,67,...,-0.836776,-0.046211,-0.615619,-0.534795,-1.750992,1.586045,0.36705,0.927397,2.334543,-0.271176


In [12]:
desired_columns = [
    "Metadata_Plate",
    "Metadata_Well",
    "Metadata_Site",
    "Metadata_number_of_singlecells",
    "Metadata_genotype",
]

# Make sure to reinitialize UMAP instance
umap_fit = umap.UMAP(
    random_state=umap_random_seed, n_components=umap_n_components, n_jobs=1
)

# Process cp_df to separate features and metadata
cp_features = infer_cp_features(plate_6_filtered_df)
meta_features = infer_cp_features(plate_6_filtered_df, metadata=True)
filtered_meta_features = [
    feature for feature in meta_features if feature in desired_columns
]

# Fit UMAP and convert to pandas DataFrame
embeddings = pd.DataFrame(
    umap_fit.fit_transform(plate_6_filtered_df.loc[:, cp_features]),
    columns=[f"UMAP{x}" for x in range(0, umap_n_components)],
)
print(embeddings.shape)

# Combine with metadata
cp_umap_with_metadata_df = pd.concat(
    [plate_6_filtered_df.loc[:, filtered_meta_features], embeddings], axis=1
)

# randomize the rows of the dataframe to plot the order of the data evenly
cp_umap_with_metadata_df = cp_umap_with_metadata_df.sample(frac=1, random_state=0)

# Generate output file and save
if data_type == "cleaned":
    output_umap_file = pathlib.Path(
        output_dir, "UMAP_Plate_6_sc_only_model_features_qc.tsv"
    )
else:
    output_umap_file = pathlib.Path(
        output_dir, "UMAP_Plate_6_sc_only_model_features.tsv"
    )
cp_umap_with_metadata_df.to_csv(output_umap_file, index=False, sep="\t")



(4395, 2)
