In [1]:
import pathlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tqdm
import umap

In [2]:
def fit_umap_to_the_first_timepoint(
    df: pd.DataFrame,
    timepoint_column: str = "Metadata_Time",
    metadata_columns: list = None,
    feature_columns: list = None,
    umap_model: "umap.UMAP" = None,
) -> pd.DataFrame:
    """
    Fit a UMAP model on the earliest timepoint and embed every timepoint with it.

    Rows with a missing value in any feature column are excluded from both
    the fit and the returned embedding. The input dataframe is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe containing all feature, metadata, and timepoint columns.
    timepoint_column : str, optional
        The name of the column containing the timepoint information, by default "Metadata_Time".
        The minimum value of this column defines the first timepoint.
    metadata_columns : list, optional
        The names of the metadata columns to carry over into the output, by default None.
        If None, no metadata columns are attached to the embeddings.
    feature_columns : list, optional
        The names of the columns containing the feature information, by default None.
        If None, every column that is neither a metadata column nor the
        timepoint column is used. Any list-like (e.g. a pandas Index) is accepted.
    umap_model : umap.UMAP, optional
        The UMAP model to use, by default None. If None, a new UMAP model will be
        created with default parameters. A model that is passed in is (re)fitted
        in place by this function.

    Returns
    -------
    pd.DataFrame
        One row per input row with complete features: the embedding columns
        ``UMAP_0 ... UMAP_{k-1}`` (k = number of embedding dimensions returned
        by the model) followed by the metadata columns.

    Raises
    ------
    ValueError
        If the first timepoint has no rows with complete feature values.
    """
    # normalise the optional arguments; list() also accepts a pandas Index
    metadata_columns = [] if metadata_columns is None else list(metadata_columns)
    if feature_columns is None:
        non_feature_columns = set(metadata_columns) | {timepoint_column}
        feature_columns = [
            column for column in df.columns if column not in non_feature_columns
        ]
    else:
        feature_columns = list(feature_columns)
    if umap_model is None:
        umap_model = umap.UMAP()

    # positional boolean masks keep features and metadata aligned even when
    # the dataframe index contains duplicate labels
    complete_rows = df[feature_columns].notna().all(axis=1).to_numpy()

    # get the first timepoint and the rows that belong to it
    first_time = df[timepoint_column].min()
    first_timepoint_rows = (df[timepoint_column] == first_time).to_numpy()

    # get the complete feature rows of the first timepoint
    first_timepoint_features_df = df.loc[
        complete_rows & first_timepoint_rows, feature_columns
    ]
    if len(first_timepoint_features_df) == 0:
        raise ValueError(
            f"No rows with complete feature values found for the first timepoint "
            f"({timepoint_column} == {first_time!r})."
        )

    # fit the model to the first timepoint only
    umap_model.fit(first_timepoint_features_df)

    # apply the fitted model to every timepoint (first timepoint included)
    features_df = df.loc[complete_rows, feature_columns]
    umap_embeddings = np.asarray(umap_model.transform(features_df))

    # name the embedding columns after however many dimensions the model returned
    umap_df = pd.DataFrame(
        umap_embeddings,
        columns=[f"UMAP_{i}" for i in range(umap_embeddings.shape[1])],
    )

    # add the metadata of the embedded rows, re-indexed to match the embeddings
    metadata_df = df.loc[complete_rows, metadata_columns].reset_index(drop=True)
    return pd.concat([umap_df, metadata_df], axis=1)

In [3]:
# Resolve the profile path up front; strict=True makes resolve() raise if the file is missing.
CP_scDINO_profile_file_path = pathlib.Path(
    "../../data/CP_scDINO_features/combined_CP_scDINO_norm_fs.parquet"
).resolve(strict=True)

# Load the profiles into memory and preview the first rows.
df = pd.read_parquet(CP_scDINO_profile_file_path)
df.head()

Unnamed: 0,Metadata_plate,Metadata_Well,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Time,Metadata_Cells_Number_Object_Number,...,channel_DNA_cls_feature_91_scDINO,channel_DNA_cls_feature_92_scDINO,channel_DNA_cls_feature_93_scDINO,channel_DNA_cls_feature_94_scDINO,channel_DNA_cls_feature_95_scDINO,channel_DNA_cls_feature_96_scDINO,channel_DNA_cls_feature_97_scDINO,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO,channel_DNA_cls_feature_9_scDINO
0,1,C-02,180,Staurosporine,0.0,negative,1,1,0.0,101,...,0.018387,-0.178214,1.589703,0.313944,1.126927,-0.143103,0.241127,-0.293259,-0.283715,1.434163
1,1,C-02,180,Staurosporine,0.0,negative,1,1,0.0,111,...,-1.811176,-0.059895,-1.208776,0.10275,0.845704,0.08393,-1.990931,-0.030848,-1.033722,-0.942127
2,1,C-02,180,Staurosporine,0.0,negative,1,1,0.0,11,...,-0.731998,0.654253,-0.075728,0.810937,0.30094,-0.22878,1.782329,0.153739,-0.763335,0.725093
3,1,C-02,180,Staurosporine,0.0,negative,1,1,0.0,128,...,1.339222,1.203907,0.509754,-0.711263,0.067196,-0.149771,1.40565,0.063245,2.16211,3.187469
4,1,C-02,180,Staurosporine,0.0,negative,1,1,0.0,132,...,-0.41992,-0.29961,0.344723,-0.25113,-1.851114,0.669517,-0.439855,1.576201,0.747753,0.895601


In [4]:
# Partition the columns by naming convention (substring match on the column name).
metadata_columns = [column for column in df.columns if "Metadata" in column]
scDINO_columns = [column for column in df.columns if "scDINO" in column]

# Select from the column Index directly rather than dropping columns from the
# whole dataframe; the resulting Index objects keep the original column order.
CP_columns = df.columns[~df.columns.isin(metadata_columns + scDINO_columns)]
CP_scDINO_columns = df.columns[~df.columns.isin(metadata_columns)]

# Feature sets to embed, keyed by the name used in the output file names.
feature_set_dict = {
    "scDINO": scDINO_columns,
    "CP": CP_columns,
    "CP_scDINO": CP_scDINO_columns,
}

In [5]:
# Two-dimensional UMAP configuration shared by every feature set below.
# random_state is pinned so reruns use the same seed.
umap_model = umap.UMAP(
    n_components=2,
    n_neighbors=4,
    min_dist=0.5,
    spread=0.8,
    metric="euclidean",
    random_state=0,
)

In [6]:
# Embed each feature set (fit on the first timepoint, transform all rows)
# and write one parquet file per feature set.
for feature_set_name, feature_set in tqdm.tqdm(feature_set_dict.items()):
    umap_df = fit_umap_to_the_first_timepoint(
        df,
        umap_model=umap_model,
        feature_columns=feature_set,
        metadata_columns=metadata_columns,
        timepoint_column="Metadata_Time",
    )

    # the output file is named after the feature set
    umap_save_path = pathlib.Path(f"../results/UMAP/{feature_set_name}_umap.parquet")
    umap_save_path = umap_save_path.resolve()

    # make sure the output directory exists, then save the embeddings
    umap_save_path.parent.mkdir(exist_ok=True, parents=True)
    umap_df.to_parquet(umap_save_path, index=False)

  warn(
100%|██████████| 3/3 [02:00<00:00, 40.02s/it]
