# Generate UMAP coordinates for each plate

## Import libraries

In [1]:
import glob
import pathlib
import pandas as pd
import umap

from pycytominer import feature_select
from pycytominer.cyto_utils import infer_cp_features

## Set constants

In [2]:
# Set constants
umap_random_seed = 0
umap_n_components = 2

# Set data type for the UMAP embedding generation
data_type = "cleaned"

# Set output dir
output_dir = pathlib.Path("results")
output_dir.mkdir(parents=True, exist_ok=True)

# Adjust output dir if data_type is "cleaned"
if data_type == "cleaned":
    output_dir = output_dir / "qc_profiles_results"
    output_dir.mkdir(parents=True, exist_ok=True)

print(f"Output directory: {output_dir}")  # Debugging line

Output directory: results/qc_profiles_results


## Create list of paths to feature selected data per plate

In [3]:
# Set input paths
data_dir = pathlib.Path("../../../3.processing_features/data/single_cell_profiles/")

# Adjust path if data_type is "cleaned"
if data_type == "cleaned":
    data_dir = data_dir / "cleaned_sc_profiles"

print(f"Final data_dir: {data_dir.resolve()}")  # Show full path for debugging

# Select only the feature selected files
file_suffix = "*sc_feature_selected.parquet"

# Obtain file paths for all feature selected plates
fs_files = glob.glob(f"{data_dir}/{file_suffix}")
fs_files

Final data_dir: /media/18tbdrive/1.Github_Repositories/nf1_schwann_cell_painting_data/3.processing_features/data/single_cell_profiles/cleaned_sc_profiles


['../../../3.processing_features/data/single_cell_profiles/cleaned_sc_profiles/Plate_5_sc_feature_selected.parquet',
 '../../../3.processing_features/data/single_cell_profiles/cleaned_sc_profiles/Plate_3_sc_feature_selected.parquet',
 '../../../3.processing_features/data/single_cell_profiles/cleaned_sc_profiles/Plate_3_prime_sc_feature_selected.parquet',
 '../../../3.processing_features/data/single_cell_profiles/cleaned_sc_profiles/Plate_6_sc_feature_selected.parquet']

### Set dictionary for all plates to be processed independently

In [4]:
# Load feature data into a dictionary, keyed on plate name
cp_dfs = {x.split("/")[-1]: pd.read_parquet(x) for x in fs_files}

# Print out useful information about each dataset
print(cp_dfs.keys())
[cp_dfs[x].shape for x in cp_dfs]

dict_keys(['Plate_5_sc_feature_selected.parquet', 'Plate_3_sc_feature_selected.parquet', 'Plate_3_prime_sc_feature_selected.parquet', 'Plate_6_sc_feature_selected.parquet'])


[(5512, 1175), (10509, 1180), (5182, 1173), (6800, 1165)]

### Create list of specific files for concat UMAP for plates used in modelling

In [5]:
# Select file paths for plates 5, 3, and 3 prime only
selected_plates = ["Plate_5", "Plate_3", "Plate_3_prime"]

# Filter and concatenate the selected plates
selected_dfs = []
for file_path in fs_files:
    plate_name = pathlib.Path(file_path).stem.replace("_sc_feature_selected", "")

    # Only read in selected plates
    if plate_name in selected_plates:
        df = pd.read_parquet(file_path)

        selected_dfs.append(df)

### Get specific features used in the model

In [6]:
# Get the column names of all DataFrames in selected_dfs
column_sets = [set(df.columns) for df in selected_dfs]

# Find the common column names across all DataFrames which are used in the model
common_columns = list(set.intersection(*column_sets))

# Exclude columns that start with "Metadata" to print the number of features
model_columns = [col for col in common_columns if not col.startswith("Metadata")]

# Print length of only features
len(model_columns)

913

## Generate UMAP coordinates for each plate

**Note:** Only metadata that is common between plates are included in final data frame.

In [7]:
desired_columns = [
    "Metadata_Plate",
    "Metadata_Well",
    "Metadata_Site",
    "Metadata_number_of_singlecells",
    "Metadata_genotype",
]

# Fit UMAP features per dataset and save
for plate in cp_dfs:
    plate_name = pathlib.Path(plate).stem
    print("UMAP embeddings being generated for", plate_name)

    # Make sure to reinitialize UMAP instance per plate
    umap_fit = umap.UMAP(
        random_state=umap_random_seed, n_components=umap_n_components, n_jobs=1
    )

    # Make sure NA columns have been removed
    cp_df = cp_dfs[plate]
    cp_df = feature_select(cp_df, operation="drop_na_columns", na_cutoff=0)

    # Make sure that the Plate_3_prime has correct name in Metadata_Plate column
    if plate_name.replace("_sc_feature_selected", "") == "Plate_3_prime":
        cp_df["Metadata_Plate"] = "Plate_3_prime"

    # Remove rows with genotype HET for Plate_6
    if plate_name.replace("_sc_feature_selected", "") == "Plate_6":
        cp_df = cp_df[cp_df["Metadata_genotype"] != "HET"]

    # Process cp_df to separate features and metadata
    cp_features = infer_cp_features(cp_df)
    meta_features = infer_cp_features(cp_df, metadata=True)
    filtered_meta_features = [
        feature for feature in meta_features if feature in desired_columns
    ]

    # Fit UMAP and convert to pandas DataFrame
    embeddings = pd.DataFrame(
        umap_fit.fit_transform(cp_df.loc[:, cp_features]),
        columns=[f"UMAP{x}" for x in range(0, umap_n_components)],
    )
    print(embeddings.shape)

    # Combine with metadata
    cp_umap_with_metadata_df = pd.concat(
        [cp_df.loc[:, filtered_meta_features].reset_index(drop=True), embeddings],
        axis=1,
    )

    # randomize the rows of the dataframe to plot the order of the data evenly
    cp_umap_with_metadata_df = cp_umap_with_metadata_df.sample(frac=1, random_state=0)

    # Generate output file and save
    if data_type == "cleaned":
        output_umap_file = pathlib.Path(output_dir, f"UMAP_{plate_name}_qc.tsv")
    else:
        output_umap_file = pathlib.Path(output_dir, f"UMAP_{plate_name}.tsv")

    cp_umap_with_metadata_df.to_csv(output_umap_file, index=False, sep="\t")

UMAP embeddings being generated for Plate_5_sc_feature_selected




(5512, 2)
UMAP embeddings being generated for Plate_3_sc_feature_selected




(10509, 2)
UMAP embeddings being generated for Plate_3_prime_sc_feature_selected




(5182, 2)
UMAP embeddings being generated for Plate_6_sc_feature_selected




(4464, 2)


In [8]:
# Print an example output file
print(cp_umap_with_metadata_df.shape)
cp_umap_with_metadata_df.head()

(4464, 7)


Unnamed: 0,Metadata_Well,Metadata_Site,Metadata_number_of_singlecells,Metadata_genotype,Metadata_Plate,UMAP0,UMAP1
1912,E4,17,131,WT,Plate_6,2.723513,1.827488
2457,E8,29,215,WT,Plate_6,5.17984,0.43768
4401,G11,6,114,Null,Plate_6,4.286242,-0.28247
2941,E10,27,224,WT,Plate_6,2.907478,0.320848
3061,E10,9,224,WT,Plate_6,3.833103,-0.911714


## Create UMAP embeddings with the plates used to the train the model combined

### Save all plate data features together as parquet file

In [9]:
# Filter each DataFrame in selected_dfs to include only common columns
selected_dfs_filtered = [df.loc[:, common_columns] for df in selected_dfs]

# Concatenate the filtered dataframes along the rows
concatenated_df = pd.concat(selected_dfs_filtered, ignore_index=True)

# Ensure column consistency in the concatenated dataframe
concatenated_df = concatenated_df[sorted(concatenated_df.columns)]

# Save the concatenated dataframe to a file
if data_type == "cleaned":
    output_concatenated_file = pathlib.Path(
        output_dir, "concatenated_norm_fs_plates_5_3_3prime_qc.parquet"
    )
else:
    output_concatenated_file = pathlib.Path(
        output_dir, "concatenated_norm_fs_plates_5_3_3prime.parquet"
    )
concatenated_df.to_parquet(output_concatenated_file, index=False)

print(concatenated_df.shape)
concatenated_df.head()

(21203, 934)


Unnamed: 0,Cells_AreaShape_FormFactor,Cells_AreaShape_MedianRadius,Cells_AreaShape_Solidity,Cells_AreaShape_Zernike_1_1,Cells_AreaShape_Zernike_2_0,Cells_AreaShape_Zernike_3_1,Cells_AreaShape_Zernike_4_0,Cells_AreaShape_Zernike_4_2,Cells_AreaShape_Zernike_5_1,Cells_AreaShape_Zernike_5_3,...,Nuclei_Texture_InfoMeas2_RFP_3_02_256,Nuclei_Texture_InfoMeas2_RFP_3_03_256,Nuclei_Texture_InverseDifferenceMoment_DAPI_3_00_256,Nuclei_Texture_InverseDifferenceMoment_GFP_3_01_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_00_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_01_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_02_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_03_256,Nuclei_Texture_SumVariance_DAPI_3_03_256,Nuclei_Texture_SumVariance_GFP_3_03_256
0,0.133606,0.965237,0.784319,-1.050035,0.288675,-0.536259,1.597419,0.795327,-0.57085,0.178731,...,-0.416361,0.073586,0.836776,-0.59402,0.286562,-1.1564,-1.218846,-0.852895,-0.698835,-0.464884
1,-0.400903,-1.597026,-1.002418,-0.107761,-0.716132,0.272805,0.277528,-0.385383,0.905715,0.97828,...,1.087508,1.162216,-1.267003,1.359042,8.911923,9.474872,8.593259,9.768122,1.803852,0.081331
2,0.20128,0.709951,0.845565,1.64853,0.52633,1.742914,-1.205984,-0.712605,-0.368464,-1.208715,...,0.941565,0.848492,-0.7576,-0.992832,-1.192041,-0.648909,-0.453584,-1.048156,0.665859,-0.26613
3,-1.354661,-0.798102,-0.865438,-1.650523,-1.979982,-1.632091,-0.386732,-1.380526,-1.492452,-1.672204,...,-0.102116,0.20491,-0.041237,-0.259683,0.401676,-1.294658,-1.558131,-1.366508,-0.214248,-0.461305
4,-0.407155,-0.165999,0.882174,-0.61832,-0.401345,-0.38905,0.632973,0.604351,0.890562,-0.426591,...,0.288256,0.097307,0.426933,1.3878,1.869997,2.408665,1.128943,1.194263,-0.553479,-0.460541


In [10]:
desired_columns = [
    "Metadata_Plate",
    "Metadata_Well",
    "Metadata_Site",
    "Metadata_number_of_singlecells",
    "Metadata_genotype",
]

# Make sure to reinitialize UMAP instance
umap_fit = umap.UMAP(
    random_state=umap_random_seed, n_components=umap_n_components, n_jobs=1
)

# Process cp_df to separate features and metadata
cp_features = infer_cp_features(concatenated_df)
meta_features = infer_cp_features(concatenated_df, metadata=True)
filtered_meta_features = [
    feature for feature in meta_features if feature in desired_columns
]

# Fit UMAP and convert to pandas DataFrame
embeddings = pd.DataFrame(
    umap_fit.fit_transform(concatenated_df.loc[:, cp_features]),
    columns=[f"UMAP{x}" for x in range(0, umap_n_components)],
)
print(embeddings.shape)

# Combine with metadata
cp_umap_with_metadata_df = pd.concat(
    [concatenated_df.loc[:, filtered_meta_features], embeddings], axis=1
)

# randomize the rows of the dataframe to plot the order of the data evenly
cp_umap_with_metadata_df = cp_umap_with_metadata_df.sample(frac=1, random_state=0)

# Generate output file and save
if data_type == "cleaned":
    output_umap_file = pathlib.Path(
        output_dir, "UMAP_concat_model_plates_sc_feature_selected_qc.tsv"
    )
else:
    output_umap_file = pathlib.Path(
        output_dir, "UMAP_concat_model_plates_sc_feature_selected.tsv"
    )
cp_umap_with_metadata_df.to_csv(output_umap_file, index=False, sep="\t")



(21203, 2)


## Generate Plate 6 UMAP embeddings using the model features specifically

In [11]:
# Load in Plate 6 normalized data to then filter down the features with the model_columns
plate_6_norm_df = pd.read_parquet(
    pathlib.Path(data_dir, "Plate_6_sc_normalized.parquet")
)

# Drop rows where Metadata_genotype is HET
plate_6_norm_df = plate_6_norm_df[plate_6_norm_df["Metadata_genotype"] != "HET"]

# Filter the plate_6 data for the columns in model_columns
plate_6_filtered_features = plate_6_norm_df[model_columns]

# Add the metadata columns back
metadata_columns = [
    col for col in plate_6_norm_df.columns if col.startswith("Metadata_")
]
plate_6_filtered_df = pd.concat(
    [plate_6_norm_df[metadata_columns], plate_6_filtered_features], axis=1
)

# Drop rows with NaN values in the feature columns
plate_6_filtered_df = plate_6_filtered_df.dropna(
    subset=[
        col for col in plate_6_filtered_df.columns if not col.startswith("Metadata_")
    ]
)
assert plate_6_filtered_df.isna().sum().sum() == 0, "NaN detected"

# Change Metadata_Plate for all rows to Plate_6_filtered to avoid issues downstream loading in plates
plate_6_filtered_df["Metadata_Plate"] = "Plate_6_filtered"

# Confirm that index is reset to avoid any NaN issues
plate_6_filtered_df = plate_6_filtered_df.reset_index(drop=True)

print(len(plate_6_filtered_features.columns))

# Display the filtered dataframe
print(plate_6_filtered_df.shape)
plate_6_filtered_df.head()

913
(4461, 936)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_Well,Metadata_Site,Metadata_number_of_singlecells,Metadata_gene_name,Metadata_genotype,Metadata_Institution,Metadata_seed_density,Metadata_ImageNumber,...,Nuclei_Correlation_K_RFP_GFP,Cells_Neighbors_FirstClosestDistance_Adjacent,Nuclei_RadialDistribution_ZernikePhase_GFP_9_9,Cytoplasm_Correlation_RWC_CY5_DAPI,Cytoplasm_Correlation_K_RFP_CY5,Cytoplasm_RadialDistribution_ZernikeMagnitude_DAPI_8_4,Nuclei_RadialDistribution_ZernikeMagnitude_CY5_8_2,Nuclei_RadialDistribution_ZernikeMagnitude_DAPI_8_2,Nuclei_RadialDistribution_FracAtD_CY5_2of4,Cytoplasm_Correlation_RWC_DAPI_RFP
0,B,2,B2,2,129,NF1,WT,iNFixion,1000,88,...,-0.093534,1.229773,-0.841159,-0.046382,0.531873,0.392849,0.273369,1.997122,0.065734,0.156066
1,B,2,B2,5,129,NF1,WT,iNFixion,1000,98,...,-0.255972,-0.009669,1.690169,-0.233118,-0.914449,0.621309,-0.826012,1.599837,-0.144679,-0.436232
2,B,2,B2,10,129,NF1,WT,iNFixion,1000,67,...,-0.870993,-0.582862,-0.700461,-0.567569,-0.712755,-0.20536,-0.051229,-1.127109,-0.547623,-0.92826
3,B,2,B2,13,129,NF1,WT,iNFixion,1000,70,...,-1.094668,1.423012,-0.026835,-1.000887,-0.867653,0.255211,1.444931,0.53767,-1.078432,-1.473768
4,B,2,B2,17,129,NF1,WT,iNFixion,1000,74,...,1.798828,0.538671,0.406136,-0.066115,-0.785509,-0.307723,-0.203968,1.013351,-0.75668,0.991099


In [12]:
desired_columns = [
    "Metadata_Plate",
    "Metadata_Well",
    "Metadata_Site",
    "Metadata_number_of_singlecells",
    "Metadata_genotype",
]

# Make sure to reinitialize UMAP instance
umap_fit = umap.UMAP(
    random_state=umap_random_seed, n_components=umap_n_components, n_jobs=1
)

# Process cp_df to separate features and metadata
cp_features = infer_cp_features(plate_6_filtered_df)
meta_features = infer_cp_features(plate_6_filtered_df, metadata=True)
filtered_meta_features = [
    feature for feature in meta_features if feature in desired_columns
]

# Fit UMAP and convert to pandas DataFrame
embeddings = pd.DataFrame(
    umap_fit.fit_transform(plate_6_filtered_df.loc[:, cp_features]),
    columns=[f"UMAP{x}" for x in range(0, umap_n_components)],
)
print(embeddings.shape)

# Combine with metadata
cp_umap_with_metadata_df = pd.concat(
    [plate_6_filtered_df.loc[:, filtered_meta_features], embeddings], axis=1
)

# randomize the rows of the dataframe to plot the order of the data evenly
cp_umap_with_metadata_df = cp_umap_with_metadata_df.sample(frac=1, random_state=0)

# Generate output file and save
if data_type == "cleaned":
    output_umap_file = pathlib.Path(
        output_dir, "UMAP_Plate_6_sc_only_model_features_qc.tsv"
    )
else:
    output_umap_file = pathlib.Path(
        output_dir, "UMAP_Plate_6_sc_only_model_features.tsv"
    )
cp_umap_with_metadata_df.to_csv(output_umap_file, index=False, sep="\t")



(4461, 2)
