# Set up UMAP coordinates

## Import libraries

In [1]:
import pathlib
import pandas as pd
import umap

  from .autonotebook import tqdm as notebook_tqdm


## Set paths and variables

In [2]:
# Load in barcode_platemap to assign plates based on treatment type
barcode_platemap = pd.read_csv(
    pathlib.Path("../../reference_plate_data/barcode_platemap.csv")
)

# Create a dictionary mapping plate to treatment type
plate_map_dict = dict(
    zip(barcode_platemap["Assay_Plate_Barcode"], barcode_platemap["Plate_Map_Name"])
)

# load in feature selected data
fs_df = pd.read_parquet(pathlib.Path("./data/concat_fs_norm_data_subset.parquet"))

# load in mitocheck filtered data
filter_df = pd.read_parquet(pathlib.Path("./data/concat_mitocheck_data_subset.parquet"))

# load in predicted probabilites file
prob_path = pathlib.Path("./data/all_features_probabilities.parquet")
prob_df = pd.read_parquet(prob_path)

# Mitocheck labelled dataset to use to find nuclei features and concat
label_data_path = "https://github.com/WayScience/phenotypic_profiling_model/raw/main/0.download_data/data/labeled_data__ic.csv.gz"

# Model type that will be used for the add predicted probability column
model_type = "final"

# UMAP results output directory
UMAP_results_dir = pathlib.Path("./results")
UMAP_results_dir.mkdir(exist_ok=True)

# Dict for data frames to be used for UMAP embeddings
df_dict = {}

print("We are working with a total of", fs_df.shape[0], "single-cells")
print(
    "There are these many features that were selected as differential:", fs_df.shape[1]
)

We are working with a total of 51000 single-cells
There are these many features that were selected as differential: 1756


## Add treatment type as a column

Note: This will be either compound, crispr, or orf.

In [3]:
# Add the Metadata_treatment column to concatenated_df
filter_df["Metadata_treatment"] = filter_df["Metadata_Plate"].map(plate_map_dict)

# Split the values in the Metadata_treatment column by "_" and take the 1st index
filter_df["Metadata_treatment"] = filter_df["Metadata_treatment"].str.split("_").str[1]

# move relevant metadata to the front of the data frame (treatment will be first)
desired_columns = [
    "Metadata_treatment",
    "Metadata_Plate",
    "Metadata_Well",
    "Metadata_Site",
    "Metadata_ObjectNumber_cytoplasm",
]
filter_df = filter_df[
    desired_columns + [col for col in filter_df if col not in desired_columns]
]

# Check to make sure that this metadata has been added
print(filter_df.shape)
filter_df.head(2)

(51000, 173)


Unnamed: 0,Metadata_treatment,Metadata_Plate,Metadata_Well,Metadata_Site,Metadata_ObjectNumber_cytoplasm,Metadata_broad_sample,Metadata_solvent,Metadata_ImageNumber,Metadata_TableNumber,Metadata_Cytoplasm_Parent_Cells,...,Nuclei_Texture_SumEntropy_DNA_3_02_256,Nuclei_Texture_SumEntropy_DNA_3_03_256,Nuclei_Texture_SumVariance_DNA_3_00_256,Nuclei_Texture_SumVariance_DNA_3_01_256,Nuclei_Texture_SumVariance_DNA_3_02_256,Nuclei_Texture_SumVariance_DNA_3_03_256,Nuclei_Texture_Variance_DNA_3_00_256,Nuclei_Texture_Variance_DNA_3_01_256,Nuclei_Texture_Variance_DNA_3_02_256,Nuclei_Texture_Variance_DNA_3_03_256
0,compound,BR00117054,H04,1,58,DMSO,DMSO,1540,228619917084871791794985360693111447882,58.0,...,0.113166,0.005064,-0.306598,-0.278894,-0.214727,-0.266604,-0.284124,-0.277031,-0.254744,-0.281526
1,compound,BR00117054,H04,9,90,DMSO,DMSO,1548,228391807631129898267164739161499174121,90.0,...,-0.747137,-0.689093,-0.40747,-0.398943,-0.405557,-0.400385,-0.433045,-0.425488,-0.429724,-0.435415


In [4]:
# Loop through the columns and add "Metadata_" prefix to the nuclei center x,y columns
for column in prob_df.columns:
    if column.startswith("Nuclei"):
        prob_df.rename(columns={column: "Metadata_" + column}, inplace=True)

prob_df = prob_df.rename(columns={"Metadata_plate": "Metadata_Plate"})

print(prob_df.shape)
prob_df.head()

(41919720, 20)


Unnamed: 0,ADCCM,Anaphase,Apoptosis,Binuclear,Elongated,Grape,Hole,Interphase,Large,Metaphase,MetaphaseAlignment,OutOfFocus,Polylobed,Prometaphase,SmallIrregular,Metadata_model_type,Metadata_Well,Metadata_Plate,Metadata_ObjectNumber_cytoplasm,Metadata_Site
0,0.051008,0.211197,0.115538,0.00768385,0.059707,0.043649,0.04759277,0.01033536,0.036752,0.099821,0.023167,0.2601537,0.002784537,0.002658478,0.02795364,final,A01,BR00116996,1,1
1,0.008728,0.038776,0.003018,0.002421637,0.406965,0.036579,0.0104344,0.006927929,0.005184,0.366973,0.002281,0.1068267,0.001086595,0.0009656659,0.002832116,final,A01,BR00116996,2,1
2,0.002404,9e-06,5.1e-05,0.002444155,0.000181,0.000599,0.0002589783,0.001062534,0.003286,0.000299,0.002351,0.9720542,0.01353909,1.529966e-05,0.001444214,final,A01,BR00116996,3,1
3,0.002428,0.001167,0.989989,2.610488e-11,0.000184,0.00418,2.228079e-07,1.6437550000000002e-17,1.5e-05,0.001357,0.000678,3.249089e-07,9.46942e-09,2.895316e-07,5.325868e-09,final,A01,BR00116996,4,1
4,0.010993,0.047032,0.817058,2.187876e-05,0.012261,0.059017,0.001614873,5.1931e-08,0.000553,0.042322,0.005592,0.0022863,8.843109e-06,0.0007789092,0.0004601835,final,A01,BR00116996,5,1


In [5]:
# Only use final model
filtered_prob_df = prob_df[prob_df["Metadata_model_type"] == model_type]

# Add predicted class for each row to use for labelling
# set to -7 only for greg model, other models can be -5 if they do not include center x,y coords
filtered_prob_df["Metadata_Predicted_Class"] = filtered_prob_df.iloc[:, :-7].idxmax(
    axis=1
)

# Include a new column called Metadata_Phenotypic_Value as to be able to plot on UMAP
# Take the highest prob value and add as a value in the column to use for labelling
filtered_prob_df["Metadata_Phenotypic_Value"] = filtered_prob_df.iloc[:, :-7].max(
    axis=1
)

print(filtered_prob_df.shape)
filtered_prob_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_prob_df["Metadata_Predicted_Class"] = filtered_prob_df.iloc[:, :-7].idxmax(


(20959860, 22)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_prob_df["Metadata_Phenotypic_Value"] = filtered_prob_df.iloc[:, :-7].max(


Unnamed: 0,ADCCM,Anaphase,Apoptosis,Binuclear,Elongated,Grape,Hole,Interphase,Large,Metaphase,...,Polylobed,Prometaphase,SmallIrregular,Metadata_model_type,Metadata_Well,Metadata_Plate,Metadata_ObjectNumber_cytoplasm,Metadata_Site,Metadata_Predicted_Class,Metadata_Phenotypic_Value
0,0.051008,0.211197,0.115538,0.00768385,0.059707,0.043649,0.04759277,0.01033536,0.036752,0.099821,...,0.002784537,0.002658478,0.02795364,final,A01,BR00116996,1,1,OutOfFocus,0.260154
1,0.008728,0.038776,0.003018,0.002421637,0.406965,0.036579,0.0104344,0.006927929,0.005184,0.366973,...,0.001086595,0.0009656659,0.002832116,final,A01,BR00116996,2,1,Elongated,0.406965
2,0.002404,9e-06,5.1e-05,0.002444155,0.000181,0.000599,0.0002589783,0.001062534,0.003286,0.000299,...,0.01353909,1.529966e-05,0.001444214,final,A01,BR00116996,3,1,OutOfFocus,0.972054
3,0.002428,0.001167,0.989989,2.610488e-11,0.000184,0.00418,2.228079e-07,1.6437550000000002e-17,1.5e-05,0.001357,...,9.46942e-09,2.895316e-07,5.325868e-09,final,A01,BR00116996,4,1,Apoptosis,0.989989
4,0.010993,0.047032,0.817058,2.187876e-05,0.012261,0.059017,0.001614873,5.1931e-08,0.000553,0.042322,...,8.843109e-06,0.0007789092,0.0004601835,final,A01,BR00116996,5,1,Apoptosis,0.817058


In [6]:
# Specify the columns you want to use for merging morphology and probabilities
merge_columns = [
    "Metadata_Plate",
    "Metadata_Well",
    "Metadata_Site",
    "Metadata_ObjectNumber_cytoplasm",
]

# Select only the columns in filtered_prob_df that start with "Metadata"
filtered_prob_df_subset = filtered_prob_df.filter(like="Metadata")

# Merge the data frames on the specified columns
merged_prob_df = filter_df.merge(filtered_prob_df_subset, on=merge_columns, how="inner")

# Remove rows with NaN in feature columns
merged_prob_df = merged_prob_df.dropna(
    subset=(col for col in merged_prob_df.columns if not col.startswith("Metadata"))
)

# reset index
merged_prob_df.reset_index(inplace=True, drop=True)

# Add data frame to dictionary
df_dict["Only_JUMP_all_features"] = merged_prob_df

print(merged_prob_df.shape)
merged_prob_df.head(2)

(51000, 176)


Unnamed: 0,Metadata_treatment,Metadata_Plate,Metadata_Well,Metadata_Site,Metadata_ObjectNumber_cytoplasm,Metadata_broad_sample,Metadata_solvent,Metadata_ImageNumber,Metadata_TableNumber,Metadata_Cytoplasm_Parent_Cells,...,Nuclei_Texture_SumVariance_DNA_3_01_256,Nuclei_Texture_SumVariance_DNA_3_02_256,Nuclei_Texture_SumVariance_DNA_3_03_256,Nuclei_Texture_Variance_DNA_3_00_256,Nuclei_Texture_Variance_DNA_3_01_256,Nuclei_Texture_Variance_DNA_3_02_256,Nuclei_Texture_Variance_DNA_3_03_256,Metadata_model_type,Metadata_Predicted_Class,Metadata_Phenotypic_Value
0,compound,BR00117054,H04,1,58,DMSO,DMSO,1540,228619917084871791794985360693111447882,58.0,...,-0.278894,-0.214727,-0.266604,-0.284124,-0.277031,-0.254744,-0.281526,final,Elongated,0.282177
1,compound,BR00117054,H04,9,90,DMSO,DMSO,1548,228391807631129898267164739161499174121,90.0,...,-0.398943,-0.405557,-0.400385,-0.433045,-0.425488,-0.429724,-0.435415,final,Apoptosis,0.231956


## Combine data frames

### Remove all metadata and only include a metadata for data type

In [7]:
# Find all columns that start with Metadata
metadata_cols = [col for col in merged_prob_df.columns if col.startswith("Metadata")]

# Create a new DataFrame by selecting only "Metadata_Predicted_Class" and all other columns
jump_df = merged_prob_df[
    ["Metadata_Predicted_Class", "Metadata_Phenotypic_Value"]
    + [col for col in merged_prob_df.columns if col not in metadata_cols]
]

# Add a data name column to separate between datasets
jump_df.insert(0, "Metadata_data_name", "jump")

print(jump_df.shape)
jump_df.head(2)

(51000, 152)


Unnamed: 0,Metadata_data_name,Metadata_Predicted_Class,Metadata_Phenotypic_Value,Nuclei_AreaShape_Area,Nuclei_AreaShape_BoundingBoxArea,Nuclei_AreaShape_BoundingBoxMaximum_X,Nuclei_AreaShape_BoundingBoxMaximum_Y,Nuclei_AreaShape_BoundingBoxMinimum_X,Nuclei_AreaShape_BoundingBoxMinimum_Y,Nuclei_AreaShape_Center_X,...,Nuclei_Texture_SumEntropy_DNA_3_02_256,Nuclei_Texture_SumEntropy_DNA_3_03_256,Nuclei_Texture_SumVariance_DNA_3_00_256,Nuclei_Texture_SumVariance_DNA_3_01_256,Nuclei_Texture_SumVariance_DNA_3_02_256,Nuclei_Texture_SumVariance_DNA_3_03_256,Nuclei_Texture_Variance_DNA_3_00_256,Nuclei_Texture_Variance_DNA_3_01_256,Nuclei_Texture_Variance_DNA_3_02_256,Nuclei_Texture_Variance_DNA_3_03_256
0,jump,Elongated,0.282177,1.065755,0.724645,-1.28617,-0.884312,-1.27051,-0.930623,-1.277531,...,0.113166,0.005064,-0.306598,-0.278894,-0.214727,-0.266604,-0.284124,-0.277031,-0.254744,-0.281526
1,jump,Apoptosis,0.231956,-0.312907,0.112422,-0.997762,-0.49029,-0.99864,-0.493956,-0.999787,...,-0.747137,-0.689093,-0.40747,-0.398943,-0.405557,-0.400385,-0.433045,-0.425488,-0.429724,-0.435415


### Load in Mitocheck labeled data and update CellProfiler columns to match naming for JUMP

In [8]:
# Load in labeled mitocheck data
label_df = pd.read_csv(label_data_path)

# Extract feature columns from the JUMP data filtered by phenotypic profiling model features
feature_cols = [col for col in label_df.columns if col.startswith("CP__")]
metadata_cols = [col for col in label_df.columns if col.startswith("Metadata_")]

# Filter df with only CP features and the metadata columns
mito_cp_df = label_df[
    ["Mitocheck_Phenotypic_Class"]
    + ["Cell_UUID"]
    + ["Location_Center_X"]
    + ["Location_Center_Y"]
    + metadata_cols
    + feature_cols
]

# change prefix for columns to match JUMP
mito_cp_df.columns = mito_cp_df.columns.str.replace("CP__", "Nuclei_")

print(mito_cp_df.shape)
mito_cp_df.head()

(2916, 170)


Unnamed: 0,Mitocheck_Phenotypic_Class,Cell_UUID,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Frame,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,...,Nuclei_Texture_SumEntropy_DNA_3_02_256,Nuclei_Texture_SumEntropy_DNA_3_03_256,Nuclei_Texture_SumVariance_DNA_3_00_256,Nuclei_Texture_SumVariance_DNA_3_01_256,Nuclei_Texture_SumVariance_DNA_3_02_256,Nuclei_Texture_SumVariance_DNA_3_03_256,Nuclei_Texture_Variance_DNA_3_00_256,Nuclei_Texture_Variance_DNA_3_01_256,Nuclei_Texture_Variance_DNA_3_02_256,Nuclei_Texture_Variance_DNA_3_03_256
0,Large,21da27ab-873a-41f4-ab98-49170cae9a2d,397,618,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,0.243252,0.207835,-0.323359,-0.320501,-0.311945,-0.323494,-0.322818,-0.322017,-0.320285,-0.321772
1,Large,82f7949b-4ea2-45c8-8dd9-7854caf49077,359,584,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,0.46486,0.511547,-0.245543,-0.243937,-0.243026,-0.207762,-0.250865,-0.249915,-0.250428,-0.242154
2,Large,cec7234f-fe35-4411-aded-f8112bb31219,383,685,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,0.564208,0.484112,-0.216386,-0.209909,-0.178756,-0.228112,-0.21025,-0.218458,-0.203423,-0.221919
3,Large,43d9e7c9-c9ec-45ce-8820-048bfb896989,932,532,LT0013_38,42,75,1,LT0013_38_42,LT0013_38/LT0013_38_42_75.tif,...,0.091198,0.114826,-0.340022,-0.322504,-0.340637,-0.341112,-0.344534,-0.337374,-0.343764,-0.33779
4,Large,63ce6652-338e-4afd-9c77-dbc0e903bf92,477,130,LT0013_38,42,75,1,LT0013_38_42,LT0013_38/LT0013_38_42_75.tif,...,0.438306,0.417113,-0.281431,-0.26334,-0.25718,-0.267689,-0.282813,-0.27969,-0.273663,-0.275796


### Filter the mitocheck data

In [9]:
# Load in labeled mitocheck data
label_df = pd.read_csv(label_data_path)

# Extract feature columns from the JUMP data filtered by phenotypic profiling model features
feature_cols = [col for col in label_df.columns if col.startswith("CP__")]

# Filter df with only CP features and the metadata column
mito_cp_df = label_df[["Mitocheck_Phenotypic_Class"] + feature_cols]

# change prefix for columns to match JUMP
mito_cp_df.columns = mito_cp_df.columns.str.replace("CP__", "Nuclei_")

# add data name column to separate between datasets
mito_cp_df.insert(0, "Metadata_data_name", "mitocheck")

# add data name column to separate between datasets
mito_cp_df.insert(2, "Metadata_Phenotypic_Value", 1)

# rename phenotypic class column from mito to match JUMP
mito_cp_df.rename(
    columns={"Mitocheck_Phenotypic_Class": "Metadata_Predicted_Class"}, inplace=True
)

# Find columns that are in the Mitocheck data but not the JUMP data (nuclei features only)
diff_columns = (
    pd.Index([col for col in mito_cp_df.columns if col.startswith("Nuclei_")])
    .difference(merged_prob_df.columns)
    .tolist()
)

# drop features that are not seen in JUMP to avoid merging errors
mito_cp_df = mito_cp_df.drop(columns=diff_columns)

print(mito_cp_df.shape)
mito_cp_df.head(2)

(2916, 152)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mito_cp_df.rename(


Unnamed: 0,Metadata_data_name,Metadata_Predicted_Class,Metadata_Phenotypic_Value,Nuclei_AreaShape_Area,Nuclei_AreaShape_BoundingBoxArea,Nuclei_AreaShape_BoundingBoxMaximum_X,Nuclei_AreaShape_BoundingBoxMaximum_Y,Nuclei_AreaShape_BoundingBoxMinimum_X,Nuclei_AreaShape_BoundingBoxMinimum_Y,Nuclei_AreaShape_Center_X,...,Nuclei_Texture_SumEntropy_DNA_3_02_256,Nuclei_Texture_SumEntropy_DNA_3_03_256,Nuclei_Texture_SumVariance_DNA_3_00_256,Nuclei_Texture_SumVariance_DNA_3_01_256,Nuclei_Texture_SumVariance_DNA_3_02_256,Nuclei_Texture_SumVariance_DNA_3_03_256,Nuclei_Texture_Variance_DNA_3_00_256,Nuclei_Texture_Variance_DNA_3_01_256,Nuclei_Texture_Variance_DNA_3_02_256,Nuclei_Texture_Variance_DNA_3_03_256
0,mitocheck,Large,1,2.514724,2.329739,-0.703055,0.394466,-0.70853,0.332127,-0.705193,...,0.243252,0.207835,-0.323359,-0.320501,-0.311945,-0.323494,-0.322818,-0.322017,-0.320285,-0.321772
1,mitocheck,Large,1,3.493695,3.958694,-0.787151,0.280066,-0.818922,0.221209,-0.804212,...,0.46486,0.511547,-0.245543,-0.243937,-0.243026,-0.207762,-0.250865,-0.249915,-0.250428,-0.242154


## Concat mitocheck and jump data (all nuclei features)

In [10]:
# Concatenate the two DataFrames vertically
merged_mito_jump_df = pd.concat([mito_cp_df, jump_df], axis=0)

# Reset the index of the resulting DataFrame
merged_mito_jump_df.reset_index(drop=True, inplace=True)

# Add data frame to dictionary
df_dict["Mito_JUMP_all_features"] = merged_mito_jump_df

print(merged_mito_jump_df.shape)
merged_mito_jump_df.head(2)

(53916, 152)


Unnamed: 0,Metadata_data_name,Metadata_Predicted_Class,Metadata_Phenotypic_Value,Nuclei_AreaShape_Area,Nuclei_AreaShape_BoundingBoxArea,Nuclei_AreaShape_BoundingBoxMaximum_X,Nuclei_AreaShape_BoundingBoxMaximum_Y,Nuclei_AreaShape_BoundingBoxMinimum_X,Nuclei_AreaShape_BoundingBoxMinimum_Y,Nuclei_AreaShape_Center_X,...,Nuclei_Texture_SumEntropy_DNA_3_02_256,Nuclei_Texture_SumEntropy_DNA_3_03_256,Nuclei_Texture_SumVariance_DNA_3_00_256,Nuclei_Texture_SumVariance_DNA_3_01_256,Nuclei_Texture_SumVariance_DNA_3_02_256,Nuclei_Texture_SumVariance_DNA_3_03_256,Nuclei_Texture_Variance_DNA_3_00_256,Nuclei_Texture_Variance_DNA_3_01_256,Nuclei_Texture_Variance_DNA_3_02_256,Nuclei_Texture_Variance_DNA_3_03_256
0,mitocheck,Large,1.0,2.514724,2.329739,-0.703055,0.394466,-0.70853,0.332127,-0.705193,...,0.243252,0.207835,-0.323359,-0.320501,-0.311945,-0.323494,-0.322818,-0.322017,-0.320285,-0.321772
1,mitocheck,Large,1.0,3.493695,3.958694,-0.787151,0.280066,-0.818922,0.221209,-0.804212,...,0.46486,0.511547,-0.245543,-0.243937,-0.243026,-0.207762,-0.250865,-0.249915,-0.250428,-0.242154


## Only Zernike features

In [11]:
# Extract metadata columns
metadata_cols = [
    col for col in merged_mito_jump_df.columns if col.startswith("Metadata")
]
# Extract feature columns
feature_cols = [col for col in merged_mito_jump_df.columns if col.startswith("Nuclei")]

# Filter feature columns for "Zernike"
zernike_feature_cols = [col for col in feature_cols if "Zernike" in col]

# Create a new DataFrame with metadata and filtered features
mito_jump_zernike_df = merged_mito_jump_df[metadata_cols + zernike_feature_cols]

# Add df to dictionary
df_dict["Mito_JUMP_zernike_features"] = mito_jump_zernike_df

print(mito_jump_zernike_df.shape)
mito_jump_zernike_df.head(2)

(53916, 33)


Unnamed: 0,Metadata_data_name,Metadata_Predicted_Class,Metadata_Phenotypic_Value,Nuclei_AreaShape_Zernike_0_0,Nuclei_AreaShape_Zernike_1_1,Nuclei_AreaShape_Zernike_2_0,Nuclei_AreaShape_Zernike_2_2,Nuclei_AreaShape_Zernike_3_1,Nuclei_AreaShape_Zernike_3_3,Nuclei_AreaShape_Zernike_4_0,...,Nuclei_AreaShape_Zernike_8_0,Nuclei_AreaShape_Zernike_8_2,Nuclei_AreaShape_Zernike_8_4,Nuclei_AreaShape_Zernike_8_6,Nuclei_AreaShape_Zernike_8_8,Nuclei_AreaShape_Zernike_9_1,Nuclei_AreaShape_Zernike_9_3,Nuclei_AreaShape_Zernike_9_5,Nuclei_AreaShape_Zernike_9_7,Nuclei_AreaShape_Zernike_9_9
0,mitocheck,Large,1.0,-1.038763,-0.928531,1.180296,1.55731,-0.33483,-0.778987,-1.401342,...,-1.109334,-1.155584,0.569697,1.232027,-0.606715,-1.588903,-1.008395,0.002081,-0.78775,0.459925
1,mitocheck,Large,1.0,-0.402259,-0.853936,0.8452,0.921969,0.088275,0.395755,-0.066645,...,0.635363,0.48991,-0.425917,-0.366503,-0.563294,-0.185326,-0.312259,0.017453,0.577882,0.233967


## Only AreaShape features

In [12]:
# Extract metadata columns
metadata_cols = [
    col for col in merged_mito_jump_df.columns if col.startswith("Metadata")
]
# Extract feature columns
feature_cols = [col for col in merged_mito_jump_df.columns if col.startswith("Nuclei")]

# Filter feature columns for "Zernike"
areashape_feature_cols = [col for col in feature_cols if "AreaShape" in col]

# Create a new DataFrame with metadata and filtered features
mito_jump_areashape_df = merged_mito_jump_df[metadata_cols + areashape_feature_cols]

# Add df to dictionary
df_dict["Mito_JUMP_areashape_features"] = mito_jump_areashape_df

print(mito_jump_areashape_df.shape)
mito_jump_areashape_df.head(2)

(53916, 57)


Unnamed: 0,Metadata_data_name,Metadata_Predicted_Class,Metadata_Phenotypic_Value,Nuclei_AreaShape_Area,Nuclei_AreaShape_BoundingBoxArea,Nuclei_AreaShape_BoundingBoxMaximum_X,Nuclei_AreaShape_BoundingBoxMaximum_Y,Nuclei_AreaShape_BoundingBoxMinimum_X,Nuclei_AreaShape_BoundingBoxMinimum_Y,Nuclei_AreaShape_Center_X,...,Nuclei_AreaShape_Zernike_8_0,Nuclei_AreaShape_Zernike_8_2,Nuclei_AreaShape_Zernike_8_4,Nuclei_AreaShape_Zernike_8_6,Nuclei_AreaShape_Zernike_8_8,Nuclei_AreaShape_Zernike_9_1,Nuclei_AreaShape_Zernike_9_3,Nuclei_AreaShape_Zernike_9_5,Nuclei_AreaShape_Zernike_9_7,Nuclei_AreaShape_Zernike_9_9
0,mitocheck,Large,1.0,2.514724,2.329739,-0.703055,0.394466,-0.70853,0.332127,-0.705193,...,-1.109334,-1.155584,0.569697,1.232027,-0.606715,-1.588903,-1.008395,0.002081,-0.78775,0.459925
1,mitocheck,Large,1.0,3.493695,3.958694,-0.787151,0.280066,-0.818922,0.221209,-0.804212,...,0.635363,0.48991,-0.425917,-0.366503,-0.563294,-0.185326,-0.312259,0.017453,0.577882,0.233967


## Generate UMAP coordinates for each of the data splits

1. All Nuclei features from JUMP
2. All Nuclei features from Mitocheck + JUMP
3. Only Zernike features (AreaShape measurement) from Mitocheck + JUMP
4. Only AreaShape features from Mitocheck + JUMP

In [13]:
# Set constants
umap_random_seed = 0
umap_n_components = 2

for data_name, df in df_dict.items():
    print(
        "Creating embeddings for",
        data_name,
        "and including the probabilities from the",
        model_type,
        "_".join(prob_path.stem.split("_")[:2]),
        "model",
    )
    # Make sure to reinitialize UMAP instance per plate
    umap_fit = umap.UMAP(random_state=umap_random_seed, n_components=umap_n_components)

    # Process df to separate features and metadata
    metadata_columns = [col for col in df.columns if col.startswith("Metadata")]
    feature_columns = [col for col in df.columns if not col.startswith("Metadata")]

    # Fit UMAP and convert to pandas DataFrame
    embeddings = pd.DataFrame(
        umap_fit.fit_transform(df.loc[:, feature_columns]),
        columns=[f"UMAP{x}" for x in range(0, umap_n_components)],
    )

    # Combine with metadata
    umap_with_metadata_df = pd.concat([df.loc[:, metadata_columns], embeddings], axis=1)

    if not data_name == "Only_JUMP_all_features":
        # Only include relevant metadata and UMAP coords (merged only)
        umap_with_metadata_df = umap_with_metadata_df[
            [
                "Metadata_data_name",
                "Metadata_Predicted_Class",
                "Metadata_Phenotypic_Value",
                "UMAP0",
                "UMAP1",
            ]
        ]

    # Make folder per data split
    model_dir = pathlib.Path(f"./{UMAP_results_dir}/{data_name}")
    model_dir.mkdir(exist_ok=True)

    # Generate output file and save
    output_umap_file = pathlib.Path(
        f"./{model_dir}/{data_name}_{model_type}_{'_'.join(prob_path.stem.split('_')[:2])}_model.tsv"
    )
    umap_with_metadata_df.to_csv(output_umap_file, index=False, sep="\t")

print(umap_with_metadata_df.shape)
umap_with_metadata_df.head()

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


Creating embeddings for Only_JUMP_all_features and including the probabilities from the final all_features model
Creating embeddings for Mito_JUMP_all_features and including the probabilities from the final all_features model


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


Creating embeddings for Mito_JUMP_zernike_features and including the probabilities from the final all_features model


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


Creating embeddings for Mito_JUMP_areashape_features and including the probabilities from the final all_features model


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


(53916, 5)


Unnamed: 0,Metadata_data_name,Metadata_Predicted_Class,Metadata_Phenotypic_Value,UMAP0,UMAP1
0,mitocheck,Large,1.0,1.972792,6.335934
1,mitocheck,Large,1.0,3.21187,5.616934
2,mitocheck,Large,1.0,-0.424301,8.314137
3,mitocheck,Large,1.0,-0.076583,10.818566
4,mitocheck,Large,1.0,2.211961,6.890884
