In [1]:
import pathlib

import numpy as np
import pandas as pd
import toml
import umap
from tqdm import tqdm

In [2]:
# Parameters
# Cell type to process; the value is interpolated into the data, ANOVA-result
# and output file names used further down in this notebook.
# NOTE(review): the "# Parameters" marker suggests this cell is meant to be
# overridden by a notebook runner such as papermill — confirm.
cell_type = "PBMC"

In [3]:
# Parse the shared parameter file and pull out the list of treatment names
toml_path = pathlib.Path("../utils/params.toml")
params = toml.load(toml_path)
list_of_treatments = params["list_of_treatments"]["treatments"]

# report the number of treatments first, then the names themselves
print(len(list_of_treatments), list_of_treatments, sep="\n")

15
['DMSO_0.100_%_DMSO_0.025_%', 'Thapsigargin_1.000_uM_DMSO_0.025_%', 'Thapsigargin_10.000_uM_DMSO_0.025_%', 'H2O2_100.000_uM_DMSO_0.025_%', 'H2O2_100.000_nM_DMSO_0.025_%', 'Flagellin_0.100_ug_per_ml_DMSO_0.025_%', 'Flagellin_1.000_ug_per_ml_DMSO_0.025_%', 'LPS_0.010_ug_per_ml_DMSO_0.025_%', 'LPS_0.100_ug_per_ml_DMSO_0.025_%', 'LPS_1.000_ug_per_ml_DMSO_0.025_%', 'LPS_10.000_ug_per_ml_DMSO_0.025_%', 'LPS_100.000_ug_per_ml_DMSO_0.025_%', 'LPS_Nigericin_1.000_ug_per_ml_1.000_uM_DMSO_0.025_%', 'LPS_Nigericin_1.000_ug_per_ml_3.000_uM_DMSO_0.025_%', 'LPS_Nigericin_1.000_ug_per_ml_10.000_uM_DMSO_0.025_%']


In [4]:
# Build the path to the preprocessed parquet file for this cell type.
path = pathlib.Path(f"../../data/{cell_type}_preprocessed_sc_norm.parquet")
# strict=True makes resolve() raise if the file is missing, so a wrong
# cell_type fails here rather than inside the parquet reader.
path = path.resolve(strict=True)

# Load the whole file into memory
df = pd.read_parquet(path)

In [5]:
# Code snippet for metadata extraction by Jenna Tomkinson
# Columns whose name contains "Metadata" are treated as descriptive;
# every other column is treated as a data (feature) column.
is_metadata_column = df.columns.str.contains("Metadata")
df_metadata = df.columns[is_metadata_column].tolist()

# split the frame into its descriptive part and its value part
df_descriptive = df.loc[:, df_metadata]
df_values = df.drop(columns=df_metadata)

In [6]:
# Load the combined results table for this cell type (one row per
# group1/group2 comparison per feature, judging by the preview below).
anova_path = pathlib.Path(f"../results/{cell_type}_combined.parquet")

anova_results = pd.read_parquet(anova_path)
# NOTE(review): in the saved preview, `p-adj` holds negative values and
# `lower` is greater than `upper`, which suggests the column labels may be
# shifted in the upstream step that wrote this file — confirm.
anova_results.head()

Unnamed: 0,group1,group2,meandiff,lower,upper,p-adj,reject,features,p-adj_abs,pos_neg
0,apoptosis,healthy,-0.0071,0.0005,-0.0138,-0.0003,True,Cytoplasm_RadialDistribution_ZernikePhase_Corr...,0.0003,negative
1,apoptosis,pyroptosis,-0.0071,0.0004,-0.0139,-0.0004,True,Cytoplasm_RadialDistribution_ZernikePhase_Corr...,0.0004,negative
2,healthy,pyroptosis,-0.0001,0.995,-0.0032,0.003,False,Cytoplasm_RadialDistribution_ZernikePhase_Corr...,0.003,positive
0,apoptosis,healthy,-0.0329,-0.0,-0.0398,-0.0259,True,Cells_RadialDistribution_ZernikeMagnitude_Corr...,0.0259,negative
1,apoptosis,pyroptosis,0.0412,-0.0,0.0342,0.0482,True,Cells_RadialDistribution_ZernikeMagnitude_Corr...,0.0482,positive


In [7]:
# Label every row with its pairwise comparison, "<group1>_<group2>"
anova_results["group"] = anova_results["group1"] + "_" + anova_results["group2"]
print(anova_results.shape)

# Keep only rows with p-adj_abs < 0.05.
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the unfiltered table (pandas SettingWithCopyWarning).
# NOTE(review): the saved preview of this table shows negative `p-adj` values
# and `lower` > `upper`; if the columns are shifted upstream, `p-adj_abs` may
# not hold adjusted p-values — confirm before trusting this threshold.
anova_results = anova_results[anova_results["p-adj_abs"] < 0.05].copy()
print(anova_results.shape)

# change the group names to replace healthy with control
# (regex=False: "healthy" is a literal substring, not a pattern)
anova_results["group"] = anova_results["group"].str.replace(
    "healthy", "control", regex=False
)

# features that are significant in each of the three pairwise comparisons
# (these are the three circles of the venn diagram)
a_h = anova_results.loc[anova_results["group"] == "apoptosis_control", "features"]
a_p = anova_results.loc[anova_results["group"] == "apoptosis_pyroptosis", "features"]
h_p = anova_results.loc[anova_results["group"] == "control_pyroptosis", "features"]

# plain python lists of the feature names
a_h_list = a_h.tolist()
a_p_list = a_p.tolist()
h_p_list = h_p.tolist()

# pairwise unions (sorted, de-duplicated arrays); each one is "everything in
# the other two comparisons" when computing a comparison's unique features
a_h__a_p = np.union1d(a_h_list, a_p_list)
a_h__h_p = np.union1d(a_h_list, h_p_list)
a_p__h_p = np.union1d(a_p_list, h_p_list)

(3735, 11)
(2415, 11)


In [8]:
# Features significant in exactly one comparison: remove everything that
# appears in the union of the other two comparisons.
a_h_unique = np.setdiff1d(a_h_list, a_p__h_p)
a_p_unique = np.setdiff1d(a_p_list, a_h__h_p)
h_p_unique = np.setdiff1d(h_p_list, a_h__a_p)
for single_comparison_features in (a_h_unique, a_p_unique, h_p_unique):
    print(len(single_comparison_features))

# Features shared by exactly two comparisons: intersect the pair, then remove
# anything that is also present in the third comparison.
a_h__a_p_common = np.setdiff1d(np.intersect1d(a_h_list, a_p_list), h_p_list)
a_h__h_p_common = np.setdiff1d(np.intersect1d(a_h_list, h_p_list), a_p_list)
a_p__h_p_common = np.setdiff1d(np.intersect1d(a_p_list, h_p_list), a_h_list)
for two_comparison_features in (a_h__a_p_common, a_h__h_p_common, a_p__h_p_common):
    print(len(two_comparison_features))

# Features present in all three comparisons
a_h__a_p__h_p_common = np.intersect1d(
    np.intersect1d(a_h_list, a_p_list), h_p_list
)
print(len(a_h__a_p__h_p_common))

244
40
136
54
74
43
551


In [9]:
# Map each venn-diagram region name to its list of feature names.
# Insertion order is the order in which the regions are embedded later.
dict_of_feature_lists = {
    "a_h_unique": list(a_h_unique),
    "a_p_unique": list(a_p_unique),
    "h_p_unique": list(h_p_unique),
    "a_h__a_p_common": list(a_h__a_p_common),
    "a_h__h_p_common": list(a_h__h_p_common),
    "a_p__h_p_common": list(a_p__h_p_common),
    "a_h__a_p__h_p_common": list(a_h__a_p__h_p_common),
}

In [10]:
# UMAP estimator shared by every feature subset: 2-D output, cosine metric,
# random initialisation.
# random_state is not set (the `random_state=0` option is disabled), so the
# embeddings are not guaranteed to be identical from one run to the next.
# NOTE(review): presumably left unset to keep n_jobs=-1 parallelism, since
# umap-learn may fall back to a single thread when a seed is given — confirm.
umap_params = umap.UMAP(
    metric="cosine",
    init="random",
    n_components=2,
    min_dist=0.8,
    spread=1.1,
    n_jobs=-1,
)

In [11]:
# Fit one 2-D UMAP embedding per feature subset, tag every embedded row with
# its treatment label and the subset name, then stack all subsets into one
# long frame.
final_df_dict = {}

# Treatment labels as a plain array: the embedding rows come back in the same
# order as the rows of df_values, so the labels are attached by position.
# Assigning the Series itself would align on index labels against the fresh
# 0..n-1 index of results_df and mislabel rows (or produce NaN) whenever the
# loaded frame does not carry a default RangeIndex.
treatment_labels = df_descriptive[
    "oneb_Metadata_Treatment_Dose_Inhibitor_Dose"
].to_numpy()

for key, value in tqdm(dict_of_feature_lists.items()):
    print(key)
    print(len(value))
    # Use a dedicated name here: rebinding `df` would replace the full frame
    # loaded earlier and break a re-run of the metadata-split cell.
    feature_subset_df = df_values.loc[:, df_values.columns.isin(value)]
    if feature_subset_df.shape[1] == 0:
        # nothing to embed for this subset; skip it instead of failing
        # part-way through a multi-hour loop
        print(f"skipping {key}: no matching feature columns")
        continue
    umap_results = umap_params.fit_transform(feature_subset_df)
    results_df = pd.DataFrame(umap_results, columns=["UMAP1", "UMAP2"])
    results_df["Metadata_Treatment_Dose_Inhibitor_Dose"] = treatment_labels
    results_df["Dataset_comparison"] = key
    final_df_dict[key] = results_df

# one row per cell per feature subset
final_df = pd.concat(final_df_dict.values(), ignore_index=True)


  0%|          | 0/7 [00:00<?, ?it/s]

a_h_unique
244



 14%|█▍        | 1/7 [29:43<2:58:18, 1783.03s/it]

a_p_unique
40



 29%|██▊       | 2/7 [56:22<2:19:34, 1674.87s/it]

h_p_unique
136



 43%|████▎     | 3/7 [1:24:18<1:51:41, 1675.37s/it]

a_h__a_p_common
54



 57%|█████▋    | 4/7 [1:52:24<1:23:59, 1679.77s/it]

a_h__h_p_common
74



 71%|███████▏  | 5/7 [2:19:29<55:19, 1659.88s/it]  

a_p__h_p_common
43



 86%|████████▌ | 6/7 [2:46:19<27:23, 1643.09s/it]

a_h__a_p__h_p_common
551



100%|██████████| 7/7 [3:17:53<00:00, 1724.99s/it]


100%|██████████| 7/7 [3:17:53<00:00, 1696.20s/it]




In [12]:
# write out the results
# The stacked UMAP coordinates for all feature subsets go into a single
# parquet file named after the cell type.
out_path = pathlib.Path(f"../results/{cell_type}_combined_sub_UMAP_results.parquet")
final_df.to_parquet(out_path)
# NOTE(review): the saved output of this cell shows both the first and the
# last rows, which `.head()` alone would not display — the output looks stale
# relative to the code; re-run the cell to refresh it.
final_df.head()

Unnamed: 0,UMAP1,UMAP2,Metadata_Treatment_Dose_Inhibitor_Dose,Dataset_comparison
0,8.245439,14.024575,LPS_0.010_ug_per_ml_DMSO_0.025_%,a_h_unique
1,5.398923,13.592986,LPS_0.010_ug_per_ml_DMSO_0.025_%,a_h_unique
2,12.687638,9.254477,LPS_0.010_ug_per_ml_DMSO_0.025_%,a_h_unique
3,2.385462,13.058334,LPS_0.010_ug_per_ml_DMSO_0.025_%,a_h_unique
4,6.711093,-0.140633,LPS_0.010_ug_per_ml_DMSO_0.025_%,a_h_unique
...,...,...,...,...
39190769,4.069534,3.113695,DMSO_0.100_%_DMSO_1.000_%,a_h__a_p__h_p_common
39190770,0.355694,10.232911,DMSO_0.100_%_DMSO_1.000_%,a_h__a_p__h_p_common
39190771,-2.405004,5.493917,DMSO_0.100_%_DMSO_1.000_%,a_h__a_p__h_p_common
39190772,3.799183,10.736754,DMSO_0.100_%_DMSO_1.000_%,a_h__a_p__h_p_common
