In [1]:
import pathlib

import numpy as np
import pandas as pd
import umap

In [2]:
# Cell type label for this run; it is interpolated into both the input
# profile file name and the UMAP output file name in the next cell.
cell_type = "PBMC"

In [3]:
# --- input ---------------------------------------------------------------
# Aggregated profile for the selected cell type. Resolving with strict=True
# raises immediately if the file is missing, before anything is created.
bulk_profile_relative_path = pathlib.Path(
    f"../../data/{cell_type}_preprocessed_sc_norm_aggregated.parquet"
)
bulk_profile_file_path = bulk_profile_relative_path.resolve(strict=True)

# --- output --------------------------------------------------------------
# Make sure the results directory exists, then build the absolute path the
# UMAP embedding will be written to (the file itself need not exist yet).
results_dir = pathlib.Path("../results")
results_dir.mkdir(parents=True, exist_ok=True)
umap_output_relative_path = pathlib.Path(
    f"../results/{cell_type}_umap_bulk_profile.parquet"
)
umap_output_file_path = umap_output_relative_path.resolve()

# --- load ----------------------------------------------------------------
# Read the profile, report its (rows, columns) shape and preview the top rows.
bulk_profile = pd.read_parquet(bulk_profile_file_path)
print(bulk_profile.shape)
bulk_profile.head()

(154, 1201)


Unnamed: 0,Metadata_Well,oneb_Metadata_Treatment_Dose_Inhibitor_Dose,Cytoplasm_AreaShape_Compactness,Cytoplasm_AreaShape_FormFactor,Cytoplasm_AreaShape_MajorAxisLength,Cytoplasm_AreaShape_MinorAxisLength,Cytoplasm_AreaShape_Orientation,Cytoplasm_AreaShape_Zernike_0_0,Cytoplasm_AreaShape_Zernike_1_1,Cytoplasm_AreaShape_Zernike_2_0,...,Nuclei_Texture_InverseDifferenceMoment_CorrER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_CorrMito_3_02_256,Nuclei_Texture_InverseDifferenceMoment_CorrPM_3_00_256,Nuclei_Texture_InverseDifferenceMoment_CorrPM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_CorrPM_3_02_256,Nuclei_Texture_InverseDifferenceMoment_CorrPM_3_03_256,Nuclei_Texture_SumEntropy_CorrPM_3_01_256,Nuclei_Texture_SumVariance_CorrPM_3_03_256,Nuclei_Texture_Variance_CorrER_3_02_256,Nuclei_Texture_Variance_CorrMito_3_01_256
0,B02,LPS_0.010_ug_per_ml_DMSO_0.025_%,0.100173,-0.059734,0.218567,0.111938,0.00742,-0.100946,-0.030356,-0.070701,...,0.021386,-0.095924,-0.182695,-0.185317,-0.183084,-0.189434,0.217271,0.023909,-0.015452,-0.004886
1,B03,LPS_0.010_ug_per_ml_DMSO_0.025_%,0.137279,-0.097646,0.205644,0.108021,-0.002159,-0.141895,-0.059932,-0.091195,...,0.034647,0.079415,-0.10595,-0.112622,-0.108821,-0.114137,0.141156,0.022128,-0.017276,-0.006272
2,B04,LPS_Nigericin_100.000_ug_per_ml_1.000_uM_DMSO_...,0.071345,-0.053566,0.055404,0.013373,0.004443,-0.111708,-0.084402,-0.043409,...,-0.087337,-0.67167,-0.068129,-0.06252,-0.063204,-0.066542,0.074449,-0.020061,0.022286,0.039616
3,B05,LPS_Nigericin_100.000_ug_per_ml_1.000_uM_DMSO_...,0.110685,-0.084346,0.107954,0.071923,0.00415,-0.121376,-0.075382,-0.052805,...,-0.096255,-1.263923,-0.102173,-0.099375,-0.10133,-0.100625,0.11406,-0.007227,0.009458,0.059863
4,B06,DMSO_0.100_%_DMSO_0.025_%,-0.021771,0.018442,-0.048689,-0.07049,-0.005284,-0.008255,-0.012815,-0.017174,...,0.082642,0.292318,0.029805,0.022969,0.026496,0.024827,-0.028355,-0.00784,-0.037983,-0.014871


In [4]:
# Split the bulk profile into metadata and feature columns.
# A column is treated as metadata when its name contains "Metadata_";
# everything else is passed to UMAP as a feature.
metadata_columns = bulk_profile.columns[bulk_profile.columns.str.contains("Metadata_")]
metadata_df = bulk_profile[metadata_columns]
feature_df = bulk_profile.drop(metadata_columns, axis=1)

# UMAP
# set umap parameters
# random_state fixes the seed so the embedding is reproducible; as the
# warning emitted by this cell states, setting it overrides n_jobs to 1.
umap_params = umap.UMAP(
    n_components=2,
    spread=1.1,
    min_dist=0.8,
    init="random",
    metric="cosine",
    random_state=0,
)

# fit umap
umap_output = umap_params.fit_transform(feature_df)
# Give the embedding the same index as the rows it was computed from.
# pd.concat(axis=1) aligns on index labels, so a fresh 0..n-1 index here
# would misalign (and NaN-pad) the result whenever the profile read from
# parquet carries a non-default index.
umap_output_df = pd.DataFrame(
    umap_output,
    columns=["UMAP1", "UMAP2"],
    index=feature_df.index,
)
umap_output_df = pd.concat([metadata_df, umap_output_df], axis=1)

# save umap output
umap_output_df.to_parquet(umap_output_file_path)

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
