In [None]:
import sys
repo_dir = '/home/labs/amit/noamsh/repos/MM_2023'
sys.path.append(repo_dir)

%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path
import os

from omegaconf import OmegaConf
import scanpy as sc
import numpy as np
import pandas as pd
import anndata as ad

from io_utils import generate_path_in_output_dir
from data_loading.utils import load_dataframe_from_file

In [None]:
# Load the project configuration from config.yaml located under repo_dir.
config_path = Path(repo_dir, 'config.yaml')
conf = OmegaConf.load(config_path)
# NOTE(review): update_results is not read by any cell visible in this notebook — confirm it is still needed.
update_results = False

from datetime import date
# Today's date as an ISO string (YYYY-MM-DD); used below to name the dated figures directory.
ts_iso = date.today().isoformat()

# Figure resolution: 100 dpi for inline display, 150 dpi for saved files.
sc.set_figure_params(dpi=100, dpi_save=150)
figures_dir = Path(conf.outputs.output_dir, "figures", ts_iso)
# NOTE(review): update_figures / figures_dir are not referenced by the plotting cells visible here — confirm.
update_figures = True

In [None]:
load_ts_iso = "2024-05-19"
data_version = "20240515"

## all cells

In [None]:
adata_for_clustering = ad.read_h5ad(generate_path_in_output_dir(conf, conf.outputs.inferred_missing_annotation_file_name,
                                                 with_version=data_version, with_date_timestamp=load_ts_iso))
adata_for_clustering

In [None]:
# Remove cells whose disease label is in drop_diseases before any further processing.
drop_diseases = ('In_vitro', 'Ex_vivo')
keep_disease = ~adata_for_clustering.obs[conf.annotation.Disease].isin(drop_diseases)
adata_for_clustering = adata_for_clustering[keep_disease].copy()

# Label every cell of the "PC" super-population as "PC" in the fine-grained annotation.
# A single .loc assignment is used instead of chained indexing (obs[col][mask] = ...),
# which may write to a temporary copy and is a no-op under pandas copy-on-write.
is_pc = adata_for_clustering.obs["super_Population"] == "PC"
populations = adata_for_clustering.obs["Populations"]
if isinstance(populations.dtype, pd.CategoricalDtype) and "PC" not in populations.cat.categories:
    # A categorical column rejects values that are not registered categories.
    adata_for_clustering.obs["Populations"] = populations.cat.add_categories("PC")
adata_for_clustering.obs.loc[is_pc, "Populations"] = "PC"
# adata_for_clustering.obs["Populations"][adata_for_clustering.obs["Populations"].apply(lambda x: x in ('UN', 'Malignant'))] = ""

# Drop populations that should not be shown; .copy() turns the subset into a real
# AnnData object so that later cells (which write to .obs / .obsm) do not modify a view.
drop_populations = ("Erythrocytes", 'UN', 'Malignant')
keep_population = ~adata_for_clustering.obs["Populations"].isin(drop_populations)
adata_for_clustering = adata_for_clustering[keep_population].copy()


In [None]:
adata_for_clustering.obs["Populations"].value_counts()

In [None]:
def update_disease_col(adata, pateint_disease_map, rename_disease_map,
                       patient_col='Hospital.Code', disease_col='Disease'):
    """Correct and rename the per-cell disease labels stored in ``adata.obs``.

    The disease column is overwritten in place in two steps:

    1. every cell whose patient code is a key of ``pateint_disease_map`` gets the
       mapped disease; all other cells keep their current label;
    2. every resulting label that is a key of ``rename_disease_map`` is replaced
       by the mapped name.

    Parameters
    ----------
    adata
        Object exposing an ``obs`` DataFrame that contains ``patient_col`` and
        ``disease_col``.
    pateint_disease_map : dict
        Patient code -> corrected disease label.
    rename_disease_map : dict
        Disease label -> replacement label, applied after the patient corrections.
    patient_col : str, optional
        Name of the obs column holding the patient code.
    disease_col : str, optional
        Name of the obs column holding the disease label (read and overwritten).

    Returns
    -------
    None
        ``adata.obs[disease_col]`` is replaced by an object-dtype column.
    """
    obs = adata.obs
    # Work on object-dtype copies so categorical columns accept labels that are
    # not among their current categories.
    patient_codes = obs[patient_col].astype(object)
    diseases = obs[disease_col].astype(object)

    # Vectorised lookup replaces the per-row iterrows() loop.
    has_override = patient_codes.isin(list(pateint_disease_map))
    overrides = patient_codes.map(pateint_disease_map)
    corrected = diseases.where(~has_override, overrides)

    adata.obs[disease_col] = corrected.map(
        lambda label: rename_disease_map.get(label, label)
    )

In [None]:
# Patient code -> corrected disease label for patients whose label in the data is wrong.
# (The original literal listed "HMC13" twice with the same value; the duplicate key is removed.)
wrong_labeling_patient_map = {
    "TLV07": "MGUS",
    "TLV14": "MGUS",
    "TLV18": "SMM",
    "TLV20": "NDMM",
    "HMC11": "SMM",
    "TLV22": "NDMM",
    "HMC12": "NDMM",
    "HMC13": "MGUS",
    "TLV25": "NDMM",
    "TLV26": "SMM",
    "TLV27": "PRMM",
    "HMC19": "NDMM",
}

# Disease labels that are renamed after the patient-level corrections are applied.
disease_map = {"PRMM": "RRMM", "NDAL": "AL"}

update_disease_col(adata_for_clustering, wrong_labeling_patient_map, disease_map)

In [None]:
Disease_patients_count = adata_for_clustering.obs.groupby('Disease')['Hospital.Code'].nunique()
Disease_patients_count[[ "Healthy", "MGUS", "SMM", "NDMM", "RRMM"]]

In [None]:
# report 
print(f"""
Hip Controls: {Disease_patients_count['Healthy']}
MM Patients: {Disease_patients_count[[ "MGUS", "SMM", "NDMM", "RRMM"]].sum()}
Plasma cells: {(adata_for_clustering.obs["super_Population"]=="PC").sum()}
Immune cells: {(adata_for_clustering.obs["super_Population"]=="CD45").sum() }
Total cells: {adata_for_clustering.n_obs}
""")

In [None]:
sc.tl.umap(adata_for_clustering, neighbors_key=None, min_dist=conf.umap_settings.umap_min_dist)

In [None]:
from matplotlib import rcParams
FIGSIZE = (10, 10)
rcParams["figure.figsize"] = FIGSIZE
# rcParams.update({'font.size': 16})

sc.pl.umap(adata_for_clustering, color=['Populations', 'super_Population'], size=1,
           ncols=1, legend_loc='on data',legend_fontsize='medium', frameon=False, palette="tab20")

### targets

In [None]:
mor_markers_df = load_dataframe_from_file(Path("/home/labs/amit/noamsh/data/mm_2023/MM targets and genes.xlsx"))
# DataFrame.drop returns a new frame; the original call discarded the result,
# so the "Unnamed: 0" column was never actually removed.
mor_markers_df = mor_markers_df.drop(columns=["Unnamed: 0"])

# Each spreadsheet column is one gene list; empty cells are dropped.
PC_markers = mor_markers_df['Known PC markers'].dropna().tolist()
MM_drivers = mor_markers_df['Known MM drivers'].dropna().tolist()
# Swap the spreadsheet entry "ITF4" for "IRF4" (appended at the end of the list).
MM_drivers.remove("ITF4")
MM_drivers.append("IRF4")
MM_targets = mor_markers_df['MM targets'].dropna().tolist()
Clinical_trials = mor_markers_df['Clinical trials'].dropna().tolist()
# Swap spreadsheet labels for the names used when plotting: CD73 -> NT5E, eIF2a -> EIF2A.
Clinical_trials.remove('CD73')
Clinical_trials.append("NT5E")
Clinical_trials.remove('eIF2a')
Clinical_trials.append("EIF2A")

In [None]:
PPT_PC_markers = ["CD38", "SDC1", "XBP1", "SSR4", "MZB1"]
PPT_MM_drivers = ["CCND1", "CCND2", "FRZB", "LAMP5", "ITGB7", "CDR1", "NSD2", "FGFR3", "SPP1"]
PPT_current_targets = ["CD38", "SDC1", "TNFRSF17", "GPRC5D", "SLAMF7"]
new_targets_PPT = ["TNFRSF13B", "FCRLA", "CCR10", "KCNN3", "BFSP2"]
new_targets_after_PPT = ["ITM2C", "FCGR2B", "FCRL2", "ITGA8", "BTLA", "IL5RA", "LIME1", "BST2", "SELPLG", "CD180", "SLAMF1", "ADORA2A", "PERP", "P2RX5", "RASGRP3" , "ORAI2" ]

In [None]:
FIGSIZE = (5, 5)
rcParams["figure.figsize"] = FIGSIZE

In [None]:
# sc.pl.umap(adata_for_clustering, color=PC_markers, ncols=3, frameon=False, colorbar_loc=None)
sc.pl.umap(adata_for_clustering, color=PPT_PC_markers, ncols=5, frameon=False, colorbar_loc=None, size=0.8)

In [None]:
# sc.pl.umap(adata_for_clustering, color=MM_drivers, ncols=3, frameon=False, colorbar_loc=None)
sc.pl.umap(adata_for_clustering, color=PPT_MM_drivers, ncols=5, frameon=False, colorbar_loc=None, size=0.8)

In [None]:
# sc.pl.umap(adata_for_clustering, color=MM_targets, ncols=3, frameon=False, colorbar_loc=None)
sc.pl.umap(adata_for_clustering, color=PPT_current_targets, ncols=5, frameon=False, colorbar_loc=None, size=1)


In [None]:
sc.pl.umap(adata_for_clustering, color=Clinical_trials, ncols=3, frameon=False, colorbar_loc=None)

In [None]:
targets_mor = pd.read_excel('/home/labs/amit/noamsh/data/mm_2023/targets/Genes_combined_mor.xls')
# Pass a plain list of gene names (blank spreadsheet cells dropped), as done for the
# other marker lists, instead of a raw Series that may contain NaN entries.
targets_mor_genes = targets_mor['Gene_name'].dropna().tolist()
sc.pl.umap(adata_for_clustering, color=targets_mor_genes, ncols=3, frameon=False, colorbar_loc=None)

In [None]:
# sc.pl.umap(adata_for_clustering, color=["BMP6", "DERL3", "FCRLA", "HLA-DOB", "KCNN3", "MOXD1", "TNFRSF13B", "AMPD1", "MYEOV", "TXNDC5", "BFSP2", "CCR10", "JSRP1"], ncols=3, frameon=False, colorbar_loc=None)
sc.pl.umap(adata_for_clustering, color=new_targets_PPT, ncols=4, frameon=False, colorbar_loc=None, vmax=4, size=1)

In [None]:
sc.pl.umap(adata_for_clustering, color=new_targets_PPT, ncols=3, frameon=False, colorbar_loc=None, vmax=4, size=1)

In [None]:
sc.pl.umap(adata_for_clustering, color=new_targets_after_PPT, ncols=3, frameon=False, colorbar_loc=None, vmax=4, size=1)

## only PC

In [None]:
# Both plasma-cell files live directly in the configured output directory and are
# keyed by the data version and the load timestamp chosen at the top of the notebook.
pc_output_dir = Path(conf.outputs.output_dir)
annotated_only_pc_path = pc_output_dir / f"adata_with_scvi_annot_pred_data_v_{data_version}_ts_{load_ts_iso}_only_pc_annotated.h5ad"
annotated_filtered_only_pc_path = pc_output_dir / f"adata_with_scvi_annot_pred_data_v_{data_version}_ts_{load_ts_iso}_only_pc_annotated_filtered.h5ad"

# adata_only_pc: the annotated PC object; non_noisy_malignant: its filtered counterpart.
adata_only_pc, non_noisy_malignant = (
    ad.read_h5ad(h5ad_path)
    for h5ad_path in (annotated_only_pc_path, annotated_filtered_only_pc_path)
)

In [None]:
# sc.pl.umap(non_noisy_malignant, color=PC_markers, ncols=3 , size=2, frameon=False, colorbar_loc=None)
sc.pl.umap(non_noisy_malignant, color=PPT_PC_markers, ncols=5 , size=5, frameon=False, colorbar_loc=None)

In [None]:
# sc.pl.umap(non_noisy_malignant, color=MM_drivers, ncols=3, size=2, frameon=False, colorbar_loc=None)
sc.pl.umap(non_noisy_malignant, color=PPT_MM_drivers, ncols=5 , size=5, frameon=False, colorbar_loc=None)

In [None]:
# sc.pl.umap(non_noisy_malignant, color=MM_targets, ncols=3, size=2, frameon=False, colorbar_loc=None)
sc.pl.umap(non_noisy_malignant, color=PPT_current_targets, ncols=5 , size=5, frameon=False, colorbar_loc=None)


In [None]:
sc.pl.umap(non_noisy_malignant, color=Clinical_trials, ncols=3, size=2, frameon=False, colorbar_loc=None)

In [None]:
# metabolism
# sc.pl.umap(non_noisy_malignant, color=["HIF1A", "LDHA", "LDHB", "SLC2A1", "SLC2A3", "ODC1", "SMS", "SMOX", "SAT1", "SRM"], ncols=3, palette="Paired", legend_loc='on data', size=10)

In [None]:
targets_mor = pd.read_excel('/home/labs/amit/noamsh/data/mm_2023/targets/Genes_combined_mor.xls')
# Pass a plain list of gene names (blank spreadsheet cells dropped), as done for the
# other marker lists, instead of a raw Series that may contain NaN entries.
targets_mor_genes = targets_mor['Gene_name'].dropna().tolist()
sc.pl.umap(non_noisy_malignant, color=targets_mor_genes, ncols=3, size=2, frameon=False, colorbar_loc=None)

In [None]:
# sc.pl.umap(non_noisy_malignant, color=["BMP6", "DERL3", "FCRLA", "HLA-DOB", "KCNN3", "MOXD1", "TNFRSF13B", "AMPD1", "MYEOV", "TXNDC5", "BFSP2", "CCR10", "JSRP1"], ncols=3, size=2, frameon=False, colorbar_loc=None)
sc.pl.umap(non_noisy_malignant, color=new_targets_PPT, ncols=3, size=4, frameon=False, color_map='viridis', vmax=4, colorbar_loc=None) # colorbar_loc=None


In [None]:
sc.pl.umap(non_noisy_malignant, color=new_targets_after_PPT, ncols=3, size=4, frameon=False, color_map='viridis', vmax=5, colorbar_loc=None) # colorbar_loc=None


In [None]:

mye_genes = ["MAF", 'MAFA','CYBB', 'CXCL12', 'C1QB', 'TLR4','S100A10','S100A11','S100A4']
sc.pl.umap(non_noisy_malignant, color=mye_genes, ncols=3, vmax=4 , size=2, frameon=False, color_map='viridis', colorbar_loc=None) # colorbar_loc=None


In [None]:
sc.pl.umap(non_noisy_malignant, color='BCL2', ncols=3, vmax=4 , size=2, frameon=False, color_map='viridis', colorbar_loc=None) # colorbar_loc=None


### exploration

In [None]:
non_noisy_malignant

In [None]:
# Rename the 'non Healthy' annotation to 'Malignant'.
non_noisy_malignant.obs['pc_annotation'] = non_noisy_malignant.obs['pc_annotation'].apply(lambda x: x if x != 'non Healthy' else 'Malignant')
# Register the new label only once: add_categories raises if the category already
# exists, which made this cell fail when it was executed a second time.
if "Healthy like" not in non_noisy_malignant.obs['pc_annotation'].cat.categories:
    non_noisy_malignant.obs['pc_annotation'] = non_noisy_malignant.obs['pc_annotation'].cat.add_categories("Healthy like")
# Cells of leiden cluster '8' are relabelled "Healthy like". A single .loc assignment
# replaces the chained obs[col][mask] = ... form, which may write to a temporary copy.
non_noisy_malignant.obs.loc[non_noisy_malignant.obs['leiden'] == '8', 'pc_annotation'] = "Healthy like"

In [None]:
sc.pl.umap(non_noisy_malignant, color='pc_annotation', ncols=1, palette="brc", frameon=False, legend_loc='on data', title="only PC")

In [None]:
# Patient code -> corrected disease label for patients whose label in the data is wrong.
# (The original literal listed "HMC13" twice with the same value; the duplicate key is removed.)
wrong_labeling_patient_map = {
    "TLV07": "MGUS",
    "TLV14": "MGUS",
    "TLV18": "SMM",
    "TLV20": "NDMM",
    "HMC11": "SMM",
    "TLV22": "NDMM",
    "HMC12": "NDMM",
    "HMC13": "MGUS",
    "TLV25": "NDMM",
    "TLV26": "SMM",
    "TLV27": "PRMM",
    "HMC19": "NDMM",
}

# Disease labels that are renamed after the patient-level corrections are applied.
disease_map = {"PRMM": "RRMM", "NDAL": "AL"}

update_disease_col(non_noisy_malignant, wrong_labeling_patient_map, disease_map)


In [None]:

# For every disease label, add an obs column named after it that holds the label
# for cells with that disease and NaN for all other cells (one UMAP panel per disease).
Diseases = non_noisy_malignant.obs['Disease'].unique()
for d in Diseases:
    # Named is_disease rather than disease_map so the rename dict of that name is not shadowed.
    is_disease = non_noisy_malignant.obs['Disease'] == d
    # Build the column in one step; the original created a float NaN column and then
    # wrote strings into it through chained indexing, which may not reach the frame.
    non_noisy_malignant.obs[d] = non_noisy_malignant.obs['Disease'].astype(object).where(is_disease)

In [None]:

# show_Diseases = list(Diseases)
# show_Diseases.remove("MM_Unknown") 
# show_Diseases.remove("EMD")
# show_Diseases.remove("AL")
# show_Diseases
show_Diseases = ['Healthy', 'MGUS', 'SMM', 'NDMM', 'RRMM']

In [None]:
sc.pl.umap(non_noisy_malignant, color=show_Diseases, ncols=5, frameon=False, legend_loc=None, palette="gist_gray", size=1.5)

In [None]:
sc.pl.umap(non_noisy_malignant,
               color=["number_of_diffrent_patients_in_nighborhood"],
           ncols=2 ,color_map="magma", frameon=False)

In [None]:
new_potential_targets = ["UBA52", "RACK1", "CST3", "TIMP1", "FTH1", "FTL"]
sc.pl.umap(non_noisy_malignant,  color=new_potential_targets,
           ncols=3 ,color_map="magma", palette='Paired')

In [None]:
# patients_with_no_coverage = ['01-001', '01-006', '01-011', '028-0601-005', '028-0608-008', '03-001', '04-002', '04-003', '04-004', '05-001', '08-001', '09-001', '10-005', '11-005', '12-001', '14-001', 'ASF1', 'BEL04', 'P19', 'P20', 'P24', 'CSA-01-02', 'CSA-01-04', 'HMC02', 'HMC06', 'HMC10', 'HMC18', 'HMC20', 'KydarNDMM01', 'KydarNDMM03', 'TLV01', 'TLV12', 'TLV13', 'TLV18', 'TLV19', 'TLV27', 'TLV34', 'TLV41', 'TLV48', 'TLV50-DN', 'TLV53', 'P03', 'P16', 'P35', 'P45', 'P46', 'P54', 'P71', 'P103', 'P208', 'P225', 'P258', 'P300', 'P309', 'P653', 'P659', 'P683', 'P792', 'P818', 'P822', 'P85', 'P942', 'TLV75', 'TLV77', 'TLV85', 'TLV86', 'RMC002']
patients_with_no_coverage = ['01-001', '028-0608-008', '04-002', '04-003', '08-001', 'BEL04', 'HMC20', 'KydarNDMM01', 'TLV13', 'TLV34', 'TLV48', 'P03', 'P46', 'P54', 'P103', 'P309', 'P683', 'TLV85']


def _has_no_coverage(hospital_code):
    """Return True when the patient code is listed in patients_with_no_coverage."""
    return hospital_code in patients_with_no_coverage


# Per-cell flag: does the cell come from one of the listed patients?
no_coverage_flag = non_noisy_malignant.obs["Hospital.Code"].apply(_has_no_coverage)
non_noisy_malignant.obs["patients_with_no_coverage"] = no_coverage_flag
sc.pl.umap(
    non_noisy_malignant,
    color=["patients_with_no_coverage"],
    ncols=3,
    color_map="magma",
    palette='Paired',
)

In [None]:
patients_with_low_coverage = ['01-001', '01-004', '01-011', '028-0603-001', '028-0606-001', '028-0608-008', '04-002', '04-003', '04-005', '05-002', '08-001', '09-001', '09-002', '10-005', '11-003', '11-005', 'ASF1', 'BEL04', 'P19', 'P20', 'CSA-01-02', 'CSA-01-04', 'HMC05', 'HMC06', 'HMC10', 'HMC11', 'HMC18', 'HMC20', 'KydarNDMM01', 'KydarNDMM03', 'TLV01', 'TLV12', 'TLV13', 'TLV18', 'TLV19', 'TLV25', 'TLV27', 'TLV34', 'TLV48', 'TLV51', 'P03', 'P16', 'P35', 'P45', 'P46', 'P48', 'P54', 'P103', 'P225', 'P258', 'P309', 'P659', 'P683', 'P792', 'P85', 'TLV53', 'TLV77', 'TLV85']


def _has_low_coverage(hospital_code):
    """Return True when the patient code is listed in patients_with_low_coverage."""
    return hospital_code in patients_with_low_coverage


# Per-cell flag: does the cell come from one of the listed patients?
low_coverage_flag = non_noisy_malignant.obs["Hospital.Code"].apply(_has_low_coverage)
non_noisy_malignant.obs["patients_with_low_coverage"] = low_coverage_flag
sc.pl.umap(
    non_noisy_malignant,
    color=["patients_with_low_coverage"],
    ncols=3,
    color_map="magma",
    palette='Paired',
)