# Sample JUMP plate data to compute UMAP.
The UMAP embedding is computed from a sample of QC'd, feature-selected CellProfiler single-cell profiles.

In [1]:
import pathlib

import numpy as np
import pandas as pd
import umap

  import pkg_resources


# Inputs

In [2]:

# Repository root, relative to the directory this notebook is run from.
git_root_path = pathlib.Path("../../")

# Data drive; resolve(strict=True) raises right away if the path does not exist.
big_drive_path = pathlib.Path("/mnt/big_drive").resolve(strict=True)
feature_data_path = big_drive_path / "feature_selected_sc_qc_data"

# Plate map CSV. Its barcode / plate-map-name columns are renamed to the
# Metadata_* names that the later merge on "Metadata_Plate" relies on.
platemap_csv_path = (
    git_root_path / "reference_plate_data/barcode_platemap.csv"
).resolve(strict=True)
platemap_column_names = {
    "Assay_Plate_Barcode": "Metadata_Plate",
    "Plate_Map_Name": "Metadata_Treatment_Type",
}
plate_to_treat_typedf = pd.read_csv(platemap_csv_path).rename(
    columns=platemap_column_names
)

# Outputs

In [3]:

# Directory that receives the UMAP output; created (with parents) if absent.
umap_data_path = big_drive_path.joinpath("umap_data/feature_selected_sc_qc_data")
umap_data_path.mkdir(parents=True, exist_ok=True)

## Process Plate Mappings

In [4]:

# Keywords looked up (case-insensitively) inside each plate-map name. A plate
# whose name contains a keyword gets that keyword as its treatment type; names
# matching none of them are left unchanged. np.select takes the first matching
# condition, so list order is the tie-break.
replacements = ["crispr", "orf", "compound"]

treatment_type = plate_to_treat_typedf["Metadata_Treatment_Type"]

conditions = []
for keyword in replacements:
    # na=False: a missing plate-map name counts as "no match".
    conditions.append(treatment_type.str.contains(keyword, case=False, na=False))

plate_to_treat_typedf["Metadata_Treatment_Type"] = np.select(
    conditions, replacements, default=treatment_type
)

# Sample Single Cells
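For each plate, sample up to 250 cells per control type (`negcon` vs. `other`) and attach the anomaly results and treatment type.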

In [5]:

# Columns that jointly identify a single cell; used to join the anomaly
# results onto the feature profiles.
merge_cols = [
    "Metadata_Plate",
    "Metadata_Site",
    "Metadata_Well",
    "Metadata_ObjectNumber",
]

# Maximum number of cells kept per control-type group on each plate.
MAX_CELLS_PER_GROUP = 250

# Per-plate samples; concatenated into `umapdf` once the loop finishes.
sampled_plates = []

# Path.iterdir() yields entries in arbitrary, filesystem-dependent order.
# The fixed-seed sampling below (and the later shuffle) selects rows by
# position, so the listings are sorted to make the run reproducible.
for plate_path in sorted(feature_data_path.iterdir()):
    plate_name = plate_path.stem.split("_")[0]

    print(f"Sampling Plate {plate_name}")
    anomaly_path = (
        big_drive_path
        / f"sc_anomaly_data/feature_selected_sc_qc_data/{plate_name}_feature_selected_sc_qc"
    )

    # Anomaly results are stored as several parquet files per plate.
    anomalydf = pd.concat(
        [pd.read_parquet(path) for path in sorted(anomaly_path.iterdir())], axis=0
    )

    featdf = pd.read_parquet(
        big_drive_path
        / f"feature_selected_sc_qc_data/{plate_name}_feature_selected_sc_qc.parquet"
    )

    result_cols = anomalydf.columns[anomalydf.columns.str.contains("Result")].tolist()

    # Include the anomaly data
    scdf = pd.merge(
        left=anomalydf[result_cols + merge_cols],
        right=featdf,
        how="inner",
        on=merge_cols,
    )

    # Include the treatment type
    scdf = pd.merge(
        left=scdf, right=plate_to_treat_typedf, how="inner", on="Metadata_Plate"
    )

    # Collapse every control type except "negcon" into a single "other" class.
    scdf.loc[
        ~scdf["Metadata_control_type"].isin(["negcon"]), "Metadata_control_type"
    ] = "other"

    # Down-sample groups above the cap; keep smaller groups in full
    # (groupby.sample would raise if asked for more rows than a group has).
    group_sizes = scdf["Metadata_control_type"].value_counts()
    large_groups = group_sizes[group_sizes > MAX_CELLS_PER_GROUP].index
    small_groups = group_sizes[group_sizes <= MAX_CELLS_PER_GROUP].index
    sampled_large = (
        scdf[scdf["Metadata_control_type"].isin(large_groups)]
        .groupby("Metadata_control_type", group_keys=False)
        .sample(n=MAX_CELLS_PER_GROUP, random_state=0)
    )
    small = scdf[scdf["Metadata_control_type"].isin(small_groups)]
    scdf = pd.concat([sampled_large, small], axis=0)

    sampled_plates.append(scdf)

umapdf = pd.concat(sampled_plates, axis=0)

# Drop every column that has a missing value in any sampled cell.
umapdf = umapdf.dropna(axis=1, how="any")

print("Shape of plate data after sampling:", umapdf.shape)
print(umapdf["Metadata_control_type"].unique())

Sampling Plate BR00117054


Sampling Plate BR00117012


Sampling Plate BR00117019


Sampling Plate BR00117006


Sampling Plate BR00117025


Sampling Plate BR00116992


Sampling Plate BR00118048


Sampling Plate BR00117003


Sampling Plate BR00117005


Sampling Plate BR00117000


Sampling Plate BR00117053


Sampling Plate BR00117009


Sampling Plate BR00118042


Sampling Plate BR00118040


Sampling Plate BR00117055


Sampling Plate BR00117051


Sampling Plate BR00117017


Sampling Plate BR00117023


Sampling Plate BR00118049


Sampling Plate BR00117004


Sampling Plate BR00117011


Sampling Plate BR00118041


Sampling Plate BR00117052


Sampling Plate BR00116995


Sampling Plate BR00117002


Sampling Plate BR00118043


Sampling Plate BR00116993


Sampling Plate BR00117010


Sampling Plate BR00116996


Sampling Plate BR00116998


Sampling Plate BR00117008


Sampling Plate BR00117001


Sampling Plate BR00116991


Sampling Plate BR00116999


Sampling Plate BR00118045


Sampling Plate BR00117050


Sampling Plate BR00118046


Sampling Plate BR00117015


Sampling Plate BR00117026


Sampling Plate BR00116994


Sampling Plate BR00118039


Sampling Plate BR00118050


Sampling Plate BR00117024


Sampling Plate BR00116997


Sampling Plate BR00117016


Sampling Plate BR00117020


Sampling Plate BR00117022


Sampling Plate BR00117013


Sampling Plate BR00117021


Sampling Plate BR00118044


Sampling Plate BR00118047


Shape of plate data after sampling: (25500, 317)
['negcon' 'other']


# Compute UMAP Components
Fit a two-component UMAP on the feature columns, then keep only the metadata, result, and UMAP columns.

In [6]:


def compute_umap_components(umapdf: pd.DataFrame) -> pd.DataFrame:
    """Embed the feature columns of ``umapdf`` in two UMAP dimensions.

    Columns whose name contains "Metadata" or "Result" are treated as
    annotations: they are excluded from the fit and carried through to the
    output. Every other column is passed to UMAP as a feature
    (presumably all numeric -- TODO confirm against the input schema).

    Args:
        umapdf: One row per sampled cell, holding annotation and feature columns.

    Returns:
        A new frame with the rows shuffled (fixed seed) that contains only the
        annotation columns followed by ``umap_0`` and ``umap_1``. The input
        frame is not modified.
    """
    annotation_cols = [
        name for name in umapdf.columns if ("Metadata" in name) or ("Result" in name)
    ]

    # Shuffle rows with a fixed seed; .copy() gives an independent frame that
    # can safely receive the new coordinate columns.
    shuffled = umapdf.sample(frac=1, random_state=0).copy()

    embedding = umap.UMAP(n_components=2, random_state=0).fit_transform(
        shuffled.drop(columns=annotation_cols)
    )
    shuffled[["umap_0", "umap_1"]] = embedding[:, :2]

    output_cols = annotation_cols + ["umap_0", "umap_1"]
    return shuffled[output_cols]


# Rebinds `umapdf` to the embedded frame (annotation columns + umap_0/umap_1).
# NOTE: not idempotent -- re-running this cell without re-running the sampling
# cell would fit UMAP on the previous umap_0/umap_1 columns, since they are
# neither "Metadata" nor "Result" columns.
umapdf = compute_umap_components(umapdf=umapdf)

In [7]:

# Summarise the embedded frame: retained columns, dimensions, class balance.
final_columns = umapdf.columns.tolist()
control_type_counts = umapdf["Metadata_control_type"].value_counts()

print("\nColumns of final umap:", final_columns)
print(f"\nShape of final umap: {umapdf.shape}")
print(control_type_counts)


Columns of final umap: ['Result_inlier', 'Result_anomaly_score', 'Metadata_Plate', 'Metadata_Site', 'Metadata_Well', 'Metadata_ObjectNumber', 'Metadata_broad_sample', 'Metadata_ImageNumber', 'Metadata_TableNumber', 'Metadata_ObjectNumber_cytoplasm', 'Metadata_Cytoplasm_Parent_Cells', 'Metadata_Cytoplasm_Parent_Nuclei', 'Metadata_ObjectNumber_cells', 'Metadata_pert_type', 'Metadata_control_type', 'Metadata_Treatment_Type', 'umap_0', 'umap_1']

Shape of final umap: (25500, 18)
Metadata_control_type
other     12750
negcon    12750
Name: count, dtype: int64


# Save UMAP Data

In [8]:

# Persist the annotated embedding as a single parquet file.
umap_output_file = umap_data_path / "umap_feature_selected_sc_qc_data.parquet"
umapdf.to_parquet(umap_output_file)