In [None]:
import sys
# Make the project's own packages (io_utils, data_loading, ...) importable.
repo_dir = '/home/labs/amit/noamsh/repos/MM_2023'
# Guarded so that re-running this cell does not keep appending the same entry.
if repo_dir not in sys.path:
    sys.path.append(repo_dir)

%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path
import os

from omegaconf import OmegaConf
import scanpy as sc
import numpy as np
import pandas as pd
import anndata as ad
import pyreadr

from matplotlib import rcParams
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from io_utils import generate_path_in_output_dir
from data_loading.utils import load_dataframe_from_file, get_updated_disease_col
from clinical_predictions.clinical_data_loading import load_and_process_clinical_data

In [None]:
from datetime import date
import warnings

# Project configuration is read from the YAML file at the repository root.
config_path = Path(repo_dir) / 'config.yaml'
conf = OmegaConf.load(config_path)

# Persistence switches (not read anywhere in the visible part of this notebook).
update_results = False
update_figures = True

# Figures go to a per-day sub-folder named after today's ISO date.
ts_iso = date.today().isoformat()
figures_dir = Path(conf.outputs.output_dir) / "figures" / ts_iso
sc.set_figure_params(dpi=100, dpi_save=150)

# Hide FutureWarning messages for the rest of the session.
warnings.simplefilter('ignore', FutureWarning)

In [None]:
# Which saved outputs to load: the date stamp in the output file names and the
# data version they were produced from. Earlier pairs, kept for reference:
#   load_ts_iso = "2024-05-21", data_version = "20240519"
#   load_ts_iso = "2024-06-20", data_version = "20240619"
load_ts_iso, data_version = "2024-08-18", "20240813"


## all cells

In [None]:
# Resolve the annotated-AnnData file from the config plus the selected data
# version / date stamp, then load it.
inferred_annotation_path = generate_path_in_output_dir(
    conf,
    conf.outputs.inferred_missing_annotation_file_name,
    with_version=data_version,
    with_date_timestamp=load_ts_iso,
)
adata_for_clustering = ad.read_h5ad(inferred_annotation_path)
adata_for_clustering

In [None]:
# Attach the hospital CRF (clinical) table to every cell, keyed by
# (patient code, biopsy sequence number).
hospital_crf_path = Path('/home/labs/amit/noamsh/data/mm_2023/clinical_prediction/Anonymized_CRF_BP_01082024.xlsx')
clinical_disease = load_and_process_clinical_data(
    hospital_crf_path,
    code_lower_case=False,
    get_hospital_stage=True,
    get_post_treatment=False,
    get_treatment_history=True,
    get_combination_exposure=False,
    get_pfs_data=False,
)

# Integer key on the cell side (presumably to match 'Biopsy sequence No.' in the CRF - verify).
adata_for_clustering.obs['Biopsy.Sequence'] = adata_for_clustering.obs['Biopsy.Sequence'].astype(int)

# The merge result has a fresh integer index and is later re-aligned to the cells by
# position, so it must contain exactly one row per cell in the original order.
# `validate='many_to_one'` fails loudly if a (Code, biopsy number) pair is duplicated
# in the clinical table instead of silently multiplying cells.
merged = adata_for_clustering.obs.merge(
    clinical_disease,
    how='left',
    left_on=['Hospital.Code', 'Biopsy.Sequence'],
    right_on=['Code', 'Biopsy sequence No.'],
    validate='many_to_one',
)

In [None]:
# Recompute the per-cell disease label from the original "Disease" column and the
# hospital-reported stage.
updated_disease_col = get_updated_disease_col(
    merged,
    "Disease",
    "Disease Stage Hospital",
    update_non_naive_NDMM=True,
    remove_PRMM=False,
)

# `merged` no longer carries the cell barcodes as index; put them back before
# writing the column into obs.
updated_disease_col.index = adata_for_clustering.obs_names
adata_for_clustering.obs['Disease'] = updated_disease_col

# Number of distinct patients per disease label (printed in the report cell).
Disease_patients_count = (
    adata_for_clustering.obs
    .groupby('Disease')['Hospital.Code']
    .nunique()
)

In [None]:
# Collapse all plasma cells into a single "PC" population label.
# Uses a single .loc assignment: the chained form obs[col][mask] = value is not
# guaranteed to write through to obs (and never does under pandas Copy-on-Write).
is_plasma_cell = adata_for_clustering.obs["super_Population"] == "PC"
populations = adata_for_clustering.obs["Populations"]
if isinstance(populations.dtype, pd.CategoricalDtype) and "PC" not in populations.cat.categories:
    # A categorical column rejects values outside its categories.
    adata_for_clustering.obs["Populations"] = populations.cat.add_categories("PC")
adata_for_clustering.obs.loc[is_plasma_cell, "Populations"] = "PC"

# Drop populations that should not appear in the overview plots.
excluded_populations = ["Erythrocytes", "UN", "Malignant"]
keep_cell = ~adata_for_clustering.obs["Populations"].isin(excluded_populations)
# .copy() turns the AnnData view into a standalone object, so later writes (e.g. plot
# colours stored in .uns) do not trigger an implicit copy.
adata_for_clustering = adata_for_clustering[keep_cell].copy()
adata_for_clustering.obs["Populations"].value_counts()

In [None]:
# Per-cell sample identifier: "<Hospital.Code>_<Biopsy.Sequence>".
biopsy_number_text = merged['Biopsy.Sequence'].astype(str)
sid = merged['Hospital.Code'] + "_" + biopsy_number_text

In [None]:
# Cohort summary report.
n_sequenced_samples = len(sid.unique())
# NOTE(review): this counts every row of the clinical table, not only rows that
# matched a sequenced sample - confirm this is the intended number.
n_clinical_rows = len(clinical_disease)

super_population = adata_for_clustering.obs["super_Population"]
n_plasma_cells = (super_population == "PC").sum()
n_immune_cells = (super_population == "CD45").sum()

print(f"sequenced BM samples: {n_sequenced_samples}")
print(f"samples with clinical data: {n_clinical_rows}")

print("\npatients by disease stage")
print(Disease_patients_count.sort_values(ascending=False))

print("\ncells sequenced")
print(f"""
Plasma cells: {n_plasma_cells}
Immune cells: {n_immune_cells}
Total cells: {adata_for_clustering.n_obs}
""")

In [None]:
# Overview UMAPs, one panel per annotation level, with labels drawn on the embedding.
rcParams["figure.figsize"] = FIGSIZE = (6, 6)
# rcParams.update({'font.size': 16})

sc.pl.umap(
    adata_for_clustering,
    color=['Populations', 'super_Population'],
    ncols=1,
    size=1,
    palette="tab20",
    legend_loc='on data',
    legend_fontsize='small',
    frameon=False,
)

### targets

In [None]:
# Curated gene lists, one spreadsheet column per list.
mor_markers_df = load_dataframe_from_file(Path("/home/labs/amit/noamsh/data/mm_2023/MM targets and genes.xlsx"))
# DataFrame.drop returns a new frame; the result has to be assigned back, otherwise
# the leftover "Unnamed: 0" column is never actually removed.
mor_markers_df = mor_markers_df.drop(columns=["Unnamed: 0"])

PC_markers = mor_markers_df['Known PC markers'].dropna().tolist()

MM_drivers = mor_markers_df['Known MM drivers'].dropna().tolist()
# The spreadsheet spells IRF4 as "ITF4"; swap in the correct symbol.
MM_drivers.remove("ITF4")
MM_drivers.append("IRF4")

MM_targets = mor_markers_df['MM targets'].dropna().tolist()

Clinical_trials = mor_markers_df['Clinical trials'].dropna().tolist()
# Replace spreadsheet names by the symbols used for plotting
# (presumably the names present in var_names - verify).
Clinical_trials.remove('CD73')
Clinical_trials.append("NT5E")
Clinical_trials.remove('eIF2a')
Clinical_trials.append("EIF2A")

In [None]:
# Short, hand-picked gene panels (the "PPT" subsets plotted below instead of the
# full spreadsheet lists).
PPT_PC_markers = [
    "CD38", "SDC1", "XBP1", "SSR4", "MZB1",
]
PPT_MM_drivers = [
    "CCND1", "CCND2", "FRZB", "LAMP5", "ITGB7",
    "CDR1", "NSD2", "FGFR3", "SPP1",
]
PPT_current_targets = [
    "CD38", "SDC1", "TNFRSF17", "GPRC5D", "SLAMF7",
]
new_targets_PPT = [
    "TNFRSF13B", "FCRLA", "CCR10", "KCNN3",
]

In [None]:
# Smaller panels for the multi-gene UMAP grids that follow.
rcParams["figure.figsize"] = FIGSIZE = (5, 5)

In [None]:
# Known plasma-cell markers on the all-cells embedding (slide subset).
# Full spreadsheet list instead:
# sc.pl.umap(adata_for_clustering, color=PC_markers, ncols=3, frameon=False, colorbar_loc=None)
sc.pl.umap(
    adata_for_clustering,
    color=PPT_PC_markers,
    ncols=5,
    size=0.8,
    frameon=False,
    colorbar_loc=None,
)

In [None]:
# Known MM driver genes on the all-cells embedding (slide subset).
# Full spreadsheet list instead:
# sc.pl.umap(adata_for_clustering, color=MM_drivers, ncols=3, frameon=False, colorbar_loc=None)
sc.pl.umap(
    adata_for_clustering,
    color=PPT_MM_drivers,
    ncols=5,
    size=0.8,
    frameon=False,
    colorbar_loc=None,
)

In [None]:
# Current therapeutic targets on the all-cells embedding (slide subset).
# Full spreadsheet list instead:
# sc.pl.umap(adata_for_clustering, color=MM_targets, ncols=3, frameon=False, colorbar_loc=None)
sc.pl.umap(
    adata_for_clustering,
    color=PPT_current_targets,
    ncols=5,
    size=1,
    frameon=False,
    colorbar_loc=None,
)

In [None]:
# Genes from the "Clinical trials" spreadsheet column on the all-cells embedding.
sc.pl.umap(
    adata_for_clustering,
    color=Clinical_trials,
    ncols=3,
    frameon=False,
    colorbar_loc=None,
)

In [None]:
# Genes listed in the 'Gene_name' column of the combined-targets spreadsheet,
# on the all-cells embedding.
targets_mor = pd.read_excel('/home/labs/amit/noamsh/data/mm_2023/targets/Genes_combined_mor.xls')
sc.pl.umap(
    adata_for_clustering,
    color=targets_mor['Gene_name'],
    ncols=3,
    frameon=False,
    colorbar_loc=None,
)

In [None]:
# Proposed new targets on the all-cells embedding; colour scale capped at 4.
# Longer candidate list instead:
# sc.pl.umap(adata_for_clustering, color=["BMP6", "DERL3", "FCRLA", "HLA-DOB", "KCNN3", "MOXD1", "TNFRSF13B", "AMPD1", "MYEOV", "TXNDC5", "BFSP2", "CCR10", "JSRP1"], ncols=3, frameon=False, colorbar_loc=None)
sc.pl.umap(
    adata_for_clustering,
    color=new_targets_PPT,
    ncols=4,
    size=1,
    vmax=4,
    frameon=False,
    colorbar_loc=None,
)

In [None]:
# BCL2 on the all-cells embedding; colour scale capped at 4.
sc.pl.umap(
    adata_for_clustering,
    color=['BCL2'],
    ncols=4,
    size=1,
    vmax=4,
    frameon=False,
    colorbar_loc=None,
)

## only PC

In [None]:
# Both PC-only files share one name stem; the "_filtered" file is loaded as
# `non_noisy_malignant`.
only_pc_stem = f"adata_with_scvi_annot_pred_data_v_{data_version}_ts_{load_ts_iso}_only_pc_annotated"
annotated_only_pc_path = Path(conf.outputs.output_dir, f"{only_pc_stem}.h5ad")
annotated_filtered_only_pc_path = Path(conf.outputs.output_dir, f"{only_pc_stem}_filtered.h5ad")

adata_only_pc = ad.read_h5ad(annotated_only_pc_path)
non_noisy_malignant = ad.read_h5ad(annotated_filtered_only_pc_path)

In [None]:
# Known plasma-cell markers (full spreadsheet list) on the PC-only embedding.
sc.pl.umap(
    non_noisy_malignant,
    color=PC_markers,
    ncols=3,
    size=2,
    frameon=False,
    colorbar_loc=None,
)
# Slide subset instead:
# sc.pl.umap(non_noisy_malignant, color=PPT_PC_markers, ncols=5 , size=5, frameon=False, colorbar_loc=None)

In [None]:
# Known MM driver genes (full spreadsheet list) on the PC-only embedding.
sc.pl.umap(
    non_noisy_malignant,
    color=MM_drivers,
    ncols=3,
    size=2,
    frameon=False,
    colorbar_loc=None,
)
# Slide subset instead:
# sc.pl.umap(non_noisy_malignant, color=PPT_MM_drivers, ncols=5 , size=5, frameon=False, colorbar_loc=None)

In [None]:
# MM targets (full spreadsheet list) on the PC-only embedding.
sc.pl.umap(
    non_noisy_malignant,
    color=MM_targets,
    ncols=3,
    size=2,
    frameon=False,
    colorbar_loc=None,
)
# Slide subset instead:
# sc.pl.umap(non_noisy_malignant, color=PPT_current_targets, ncols=5 , size=5, frameon=False, colorbar_loc=None)


In [None]:
# Genes from the "Clinical trials" spreadsheet column on the PC-only embedding.
sc.pl.umap(
    non_noisy_malignant,
    color=Clinical_trials,
    ncols=3,
    size=2,
    frameon=False,
    colorbar_loc=None,
)

In [None]:
# metabolism
# sc.pl.umap(non_noisy_malignant, color=["HIF1A", "LDHA", "LDHB", "SLC2A1", "SLC2A3", "ODC1", "SMS", "SMOX", "SAT1", "SRM"], ncols=3, palette="Paired", legend_loc='on data', size=10)

In [None]:
# Genes listed in the 'Gene_name' column of the combined-targets spreadsheet, on the
# PC-only embedding. The spreadsheet is read again here, so this cell does not
# depend on the all-cells section having been run.
targets_mor = pd.read_excel('/home/labs/amit/noamsh/data/mm_2023/targets/Genes_combined_mor.xls')
sc.pl.umap(
    non_noisy_malignant,
    color=targets_mor['Gene_name'],
    ncols=3,
    size=2,
    frameon=False,
    colorbar_loc=None,
)

In [None]:
# Proposed new targets on the PC-only embedding; colour scale capped at 4.
# Longer candidate list instead:
# sc.pl.umap(non_noisy_malignant, color=["BMP6", "DERL3", "FCRLA", "HLA-DOB", "KCNN3", "MOXD1", "TNFRSF13B", "AMPD1", "MYEOV", "TXNDC5", "BFSP2", "CCR10", "JSRP1"], ncols=3, size=2, frameon=False, colorbar_loc=None)
sc.pl.umap(
    non_noisy_malignant,
    color=new_targets_PPT,
    ncols=4,
    size=4,
    vmax=4,
    color_map='viridis',
    frameon=False,
    colorbar_loc=None,
)


In [None]:

# Gene panel stored as `mye_genes`, shown on the PC-only embedding; colour scale capped at 4.
mye_genes = [
    "MAF", 'MAFA', 'CYBB',
    'CXCL12', 'C1QB', 'TLR4',
    'S100A10', 'S100A11', 'S100A4',
]
sc.pl.umap(
    non_noisy_malignant,
    color=mye_genes,
    ncols=3,
    size=2,
    vmax=4,
    color_map='viridis',
    frameon=False,
    colorbar_loc=None,
)


In [None]:
# BCL2 on the PC-only embedding; colour scale capped at 4.
sc.pl.umap(
    non_noisy_malignant,
    color='BCL2',
    ncols=3,
    size=2,
    vmax=4,
    color_map='viridis',
    frameon=False,
    colorbar_loc=None,
)


### exploration

In [None]:
# Display the AnnData summary of the filtered PC-only object loaded above.
non_noisy_malignant

In [None]:
# PC annotation on the PC-only embedding.
# "brc" is not a matplotlib colormap name; it only works as a palette when the string
# is iterated character by character (b, r, c), which is not reliably supported across
# scanpy versions. Passing the same three single-letter colours (blue, red, cyan) as an
# explicit list is a documented palette form and gives the same colour sequence.
sc.pl.umap(
    non_noisy_malignant,
    color='pc_annotation',
    ncols=1,
    palette=["b", "r", "c"],
    frameon=False,
    legend_loc='on data',
    title="only PC",
)

In [None]:
# Per-sample plasma-cell totals (one row per patient / biopsy / disease combination).
cells = non_noisy_malignant.obs[['Hospital.Code', 'Biopsy.Sequence', 'pc_annotation', 'Disease']]
# Exclude biopsy number 4.
# NOTE(review): this comparison only filters anything if 'Biopsy.Sequence' is numeric in
# this object; if it is stored as text, `!= 4` is always True - confirm the dtype.
cells = cells[cells['Biopsy.Sequence'] != 4]

total_sample_cells = (
    cells
    .groupby(['Hospital.Code', 'Biopsy.Sequence', 'Disease'])['pc_annotation']
    .count()
    # Drop zero-count combinations.
    .pipe(lambda counts: counts[counts != 0])
    .reset_index()
    .rename(columns={'pc_annotation': "total_pc"})
)

In [None]:
# cell_counts = cells.groupby(['Hospital.Code','Biopsy.Sequence','Disease'])['pc_annotation'].value_counts(normalize=norm_counts)
# cell_counts[cell_counts!=0].reset_index()

In [None]:
# Per-sample composition of PC annotations, shown as one box plot per annotation.
norm_counts = True   # True: fraction of the sample's PCs, False: raw counts
low_pc_thesh = 50    # samples with this many PCs or fewer are left out

sample_keys = ['Hospital.Code', 'Biopsy.Sequence', 'Disease']
# Name of the value column, which depends on the `normalize` flag.
value_column = 'proportion' if norm_counts else 'count'

sample_cell_counts = (
    cells
    .groupby(sample_keys)['pc_annotation']
    .value_counts(normalize=norm_counts)
    .pipe(lambda counts: counts[counts != 0])
    .reset_index()
    .merge(total_sample_cells, on=sample_keys, how='left')
    .pipe(lambda frame: frame[frame['total_pc'] > low_pc_thesh])
)

for cell_population_to_plot in ['Healthy', 'Healthy_Like', 'Malignant']:
    population_rows = sample_cell_counts[sample_cell_counts['pc_annotation'] == cell_population_to_plot]
    f = px.box(
        population_rows,
        x='Biopsy.Sequence',
        y=value_column,
        color='Disease',
        points="all",
        hover_data=['Hospital.Code', 'total_pc'],
        title=f"{value_column} of {cell_population_to_plot} in samples",
    )
    f.show()

In [None]:
# Healthy_Like vs. Malignant fraction per sample, one point per sample.
proportion_columns = ['Hospital.Code', 'Biopsy.Sequence', 'Disease', 'proportion']
is_healthy_like = sample_cell_counts['pc_annotation'] == 'Healthy_Like'
is_malignant = sample_cell_counts['pc_annotation'] == 'Malignant'

hl_porportions = sample_cell_counts[is_healthy_like][proportion_columns]
mal_porportions = sample_cell_counts[is_malignant][proportion_columns]

# Outer join keeps samples that have only one of the two annotations; the missing
# side is then treated as a fraction of 0.
merged_proportions = mal_porportions.merge(
    hl_porportions,
    how='outer',
    on=['Hospital.Code', 'Biopsy.Sequence', 'Disease'],
    suffixes=('_mal', '_hl'),
)
merged_proportions[['proportion_hl', 'proportion_mal']] = (
    merged_proportions[['proportion_hl', 'proportion_mal']].fillna(0)
)

px.scatter(
    merged_proportions,
    x='proportion_hl',
    y='proportion_mal',
    color='Disease',
    hover_data=['Hospital.Code', 'Biopsy.Sequence'],
)

### problems
Healthy donors:

- hipb13 has only 6 PCs, 2 of which are malignant.
- hip16 and hip17 have more than 20% malignant cells (out of 246 and 365 cells, respectively).
- Controls with very few PCs (fewer than 15: ctrl052, control_sa, control_meit, tlv91, contril_keren, ctrl020) sometimes have Healthy-like cells instead of Healthy.



In [None]:
# Manual disease-label corrections per patient code. Only used by the commented-out
# `update_disease_col` call below.
# The original literal listed "HMC13" twice (same value); the duplicate key is removed.
wrong_labeling_patient_map = {
    "TLV07": "MGUS",
    "TLV14": "MGUS",
    "TLV18": "SMM",
    "TLV20": "NDMM",
    "HMC11": "SMM",
    "TLV22": "NDMM",
    "HMC12": "NDMM",
    "HMC13": "MGUS",
    "TLV25": "NDMM",
    "TLV26": "SMM",
    "TLV27": "PRMM",
    "HMC19": "NDMM",
}

# disease_map = {"PRMM": "RRMM", "NDAL": "AL"}

# update_disease_col(non_noisy_malignant, wrong_labeling_patient_map, disease_map)


In [None]:

# Add one obs column per disease label: the label itself on that disease's cells and
# NaN everywhere else, so each disease can be highlighted in its own UMAP panel.
Diseases = non_noisy_malignant.obs['Disease'].unique()

# Work on a plain object Series so the new columns hold only the label or NaN.
disease_labels = non_noisy_malignant.obs['Disease'].astype(object)
# Missing labels are skipped: they would otherwise create a column named NaN.
for d in disease_labels.dropna().unique():
    # Single assignment of a finished column. The previous chained form
    # obs[d][mask] = d is not guaranteed to write through to obs (and never does
    # under pandas Copy-on-Write), and it upcast a float column to strings.
    non_noisy_malignant.obs[d] = disease_labels.where(disease_labels == d)

In [None]:

# Disease labels to plot, in display order. Earlier variant that derived the list
# from the data, kept for reference:
# show_Diseases = list(Diseases)
# show_Diseases.remove("MM_Unknown") 
# show_Diseases.remove("EMD")
# show_Diseases.remove("AL")
# show_Diseases
show_Diseases = [
    'Healthy',
    'MGUS',
    'SMM',
    'NDMM',
    'non_naive_NDMM',
    'RRMM',
]

In [None]:
# One panel per disease label, using the per-disease obs columns; no legend.
sc.pl.umap(
    non_noisy_malignant,
    color=show_Diseases,
    ncols=3,
    size=1,
    palette="gist_gray",
    legend_loc=None,
    frameon=False,
)

In [None]:
# Stored per-cell value "number_of_diffrent_patients_in_nighborhood" on the PC-only embedding.
sc.pl.umap(
    non_noisy_malignant,
    color=["number_of_diffrent_patients_in_nighborhood"],
    ncols=2,
    color_map="magma",
    frameon=False,
)

In [None]:
# Further candidate genes on the PC-only embedding.
new_potential_targets = [
    "UBA52", "RACK1", "CST3",
    "TIMP1", "FTH1", "FTL",
]
sc.pl.umap(
    non_noisy_malignant,
    color=new_potential_targets,
    ncols=3,
    color_map="magma",
    palette='Paired',
)

In [None]:
# Flag cells of patients listed as having no coverage and show them on the embedding.
# Earlier, longer list kept for reference:
# patients_with_no_coverage = ['01-001', '01-006', '01-011', '028-0601-005', '028-0608-008', '03-001', '04-002', '04-003', '04-004', '05-001', '08-001', '09-001', '10-005', '11-005', '12-001', '14-001', 'ASF1', 'BEL04', 'P19', 'P20', 'P24', 'CSA-01-02', 'CSA-01-04', 'HMC02', 'HMC06', 'HMC10', 'HMC18', 'HMC20', 'KydarNDMM01', 'KydarNDMM03', 'TLV01', 'TLV12', 'TLV13', 'TLV18', 'TLV19', 'TLV27', 'TLV34', 'TLV41', 'TLV48', 'TLV50-DN', 'TLV53', 'P03', 'P16', 'P35', 'P45', 'P46', 'P54', 'P71', 'P103', 'P208', 'P225', 'P258', 'P300', 'P309', 'P653', 'P659', 'P683', 'P792', 'P818', 'P822', 'P85', 'P942', 'TLV75', 'TLV77', 'TLV85', 'TLV86', 'RMC002']
patients_with_no_coverage = ['01-001', '028-0608-008', '04-002', '04-003', '08-001', 'BEL04', 'HMC20', 'KydarNDMM01', 'TLV13', 'TLV34', 'TLV48', 'P03', 'P46', 'P54', 'P103', 'P309', 'P683', 'TLV85']

non_noisy_malignant.obs["patients_with_no_coverage"] = (
    non_noisy_malignant.obs["Hospital.Code"]
    .apply(lambda hospital_code: hospital_code in patients_with_no_coverage)
)
sc.pl.umap(
    non_noisy_malignant,
    color=["patients_with_no_coverage"],
    ncols=3,
    color_map="magma",
    palette='Paired',
)

In [None]:
# Flag cells of patients listed as having low coverage and show them on the embedding.
patients_with_low_coverage = ['01-001', '01-004', '01-011', '028-0603-001', '028-0606-001', '028-0608-008', '04-002', '04-003', '04-005', '05-002', '08-001', '09-001', '09-002', '10-005', '11-003', '11-005', 'ASF1', 'BEL04', 'P19', 'P20', 'CSA-01-02', 'CSA-01-04', 'HMC05', 'HMC06', 'HMC10', 'HMC11', 'HMC18', 'HMC20', 'KydarNDMM01', 'KydarNDMM03', 'TLV01', 'TLV12', 'TLV13', 'TLV18', 'TLV19', 'TLV25', 'TLV27', 'TLV34', 'TLV48', 'TLV51', 'P03', 'P16', 'P35', 'P45', 'P46', 'P48', 'P54', 'P103', 'P225', 'P258', 'P309', 'P659', 'P683', 'P792', 'P85', 'TLV53', 'TLV77', 'TLV85']

non_noisy_malignant.obs["patients_with_low_coverage"] = (
    non_noisy_malignant.obs["Hospital.Code"]
    .apply(lambda hospital_code: hospital_code in patients_with_low_coverage)
)
sc.pl.umap(
    non_noisy_malignant,
    color=["patients_with_low_coverage"],
    ncols=3,
    color_map="magma",
    palette='Paired',
)

### Cell-level archetype visualization

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


# Choose which archetype-score CSV to load: the sample-level table or the other one.
sample_level = False

# Sample-level alternative without drivers:
# nmf_path = "/home/labs/amit/annaku/repos/MM_2024_AK/Shuang_scripts/outputs/arch_sample_v5_without_drivers.csv"
nmf_path = (
    "/home/labs/amit/annaku/repos/MM_2024_AK/Shuang_scripts/outputs/arch_sample_v5_with_drivers.csv"
    if sample_level
    else "/home/labs/amit/annaku/repos/MM_2024_AK/Shuang_scripts/outputs/arch_sample_v4_without_drivers.csv"
)

# Load the older (Shuang's) archetype assignments; they are the reference used below
# to rename the newly computed archetypes.
path_sh = '/home/labs/amit/shuangyi/Project_MM3/Atlas/scvi_diff/z_v4_cl_clus.Rds'
result = pyreadr.read_r(path_sh)
# The table is stored under the key None in the pyreadr result.
df_sh = result[None]
# Lower-case the patient ids so they match the lower-cased ids built elsewhere in
# this notebook. Note that this modifies the frame held in `result` in place.
df_sh['PID'] = df_sh['PID'].str.lower()

def _plot_archetype_confusion(arch_df, reference_df, title):
    """Plot reference clusters against the current archetype labels.

    Rows of `arch_df` and `reference_df` are paired on their shared "PID" column
    (inner join). The matrix has the reference `clus_new` on the rows and
    `architype` on the columns.

    The label set is fixed to 1..8 so the matrix is always 8x8 and lines up with
    the eight display labels, even when an archetype does not occur among the
    paired rows.

    Returns the confusion matrix and the display object.
    """
    archetype_labels = list(range(1, 9))
    paired = arch_df.merge(reference_df, how='inner', on="PID")
    matrix = confusion_matrix(
        paired['clus_new'].astype(int),
        paired['architype'].astype(int),
        labels=archetype_labels,
    )
    display = ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=archetype_labels)
    display.plot()
    display.ax_.set_title(title)
    return matrix, display


# Both branches start from the same CSV with lower-cased row identifiers.
arch_score_df = pd.read_csv(nmf_path)
arch_score_df['index'] = arch_score_df['index'].str.lower()

if not sample_level:
    arch_score_df = arch_score_df.set_index('index').drop(columns=['Unnamed: 0', 'Row.names'])

    # Cross-tabulate the new cluster ids against the reference ones; `arch_map`
    # below is maintained by hand from this print-out.
    print("make sure that the following manual cluster map match the print")
    print(pd.concat([arch_score_df['Cluster'], df_sh.set_index("PID")['clus_new']], axis=1).value_counts().sort_index())
    arch_map = {'1': '2', '2': '8', '3': '7', '4':'6', '5':'5', '6':'1', '7':'4', '8':'3'}

    arch_score_df = (
        arch_score_df
        # Rename the score columns to the reference numbering.
        .rename(columns=arch_map)
        # The archetype label itself is taken from the reference table (clus_new).
        .drop(columns='Cluster')
        .merge(df_sh[["PID", "clus_new"]], how='inner', left_index=True, right_on='PID')
        .set_index('PID')
        .rename(columns={'clus_new': 'architype'})
    )
    arch_score_df['architype'] = arch_score_df['architype'].astype(str)
else:
    arch_score_df['SID'] = arch_score_df['index']
    arch_score_df = arch_score_df.set_index('SID').drop(columns=['Unnamed: 0', 'Row.names', 'index'])
    arch_score_df = arch_score_df.rename(columns={'Cluster': 'architype'})

    print("make sure that the following manual cluster map match the print")
    # PID is the SID without its last two characters
    # (presumably a "_<biopsy number>" suffix - verify).
    arch_score_df['PID'] = [str(sample_id)[:-2] for sample_id in arch_score_df.index]
    cm, disp = _plot_archetype_confusion(arch_score_df, df_sh, 'before renameing')

    # arch_map = {'1': '4', '2': '2', '3': '3', '4': '5', '5': '6', '6': '8',  '7': '1', '8': '7'} # for version with no MM drivers
    arch_map = {'1': '8', '2': '3', '3': '5', '4': '4', '5': '7', '6': '1', '7': '6', '8': '2'}

    # Apply the manual mapping to both the score columns and the labels.
    arch_score_df = arch_score_df.rename(columns=arch_map)
    arch_score_df['architype'] = arch_score_df['architype'].apply(lambda x: arch_map[str(x)])

    cm, disp = _plot_archetype_confusion(arch_score_df, df_sh, 'after renameing')

In [None]:
# Inspect the archetype score table built in the previous cell.
arch_score_df

In [None]:
# Build the lower-cased join keys: "z.<Method>_malignant_<Hospital.Code>" (PID) and
# "z.<Method>_healthy_like_<Hospital.Code>" (PID_HL).
method_prefix = 'z.' + non_noisy_malignant.obs['Method'].astype(str)
hospital_code_text = non_noisy_malignant.obs['Hospital.Code'].astype(str)

non_noisy_malignant.obs['PID'] = (method_prefix + '_malignant_' + hospital_code_text).str.lower()
non_noisy_malignant.obs['PID_HL'] = (method_prefix + '_healthy_like_' + hospital_code_text).str.lower()

In [None]:
# Attach archetype scores and labels to every cell through its PID.
# NOTE: re-running this cell on an already merged obs produces suffixed duplicate
# columns; reload `non_noisy_malignant` before running it again.
obs_before_merge = non_noisy_malignant.obs
# validate='many_to_one' fails with a clear error if a PID occurs more than once in
# `arch_score_df`, instead of silently multiplying cells.
if not sample_level:
    obs_with_arch = obs_before_merge.merge(arch_score_df, how='left', left_on='PID', right_index=True,
                                           validate='many_to_one')
else:
    obs_with_arch = obs_before_merge.merge(arch_score_df, how='left', on='PID',
                                           validate='many_to_one')
# A column-on-column merge returns a fresh integer index, which would replace the
# cell barcodes when assigned to .obs. The validated left join keeps one row per
# cell in the original order, so the barcodes can be restored by position.
obs_with_arch.index = obs_before_merge.index
non_noisy_malignant.obs = obs_with_arch
non_noisy_malignant.obs['architype'] = non_noisy_malignant.obs['architype'].astype('category')


In [None]:
# Archetype score columns plus the categorical archetype label on the PC-only embedding.
# NOTE(review): range(2, 9) leaves out score column '1' - confirm this is intentional.
arch_score_columns = [str(arch_id) for arch_id in range(2, 9)]
sc.pl.umap(
    non_noisy_malignant,
    color=arch_score_columns + ['architype'],
    ncols=3,
    color_map="viridis",
    palette='tab10',
)