# Thymus human spatial atlas
# Multi-resolution annotation derived from the high-resolution annotation

In [None]:
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import scanpy as sc
import scvi
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # show the output of every expression in a cell, not only the last one
pd.set_option('display.max_column',None) # display all the columns in pandas
pd.options.display.max_rows = 100 # show at most 100 rows when displaying a frame
import os
from datetime import date 
today = str(date.today()) # today's date as a string

from matplotlib import rcParams
from matplotlib import font_manager
import matplotlib.pyplot as plt
# NOTE(review): fonttype 42 is presumably chosen so that PDF text is embedded as TrueType and stays editable — confirm
rcParams['pdf.fonttype'] = 42
sc.settings.set_figure_params(dpi = 150, color_map = 'RdPu', dpi_save = 150, vector_friendly = True, format = 'pdf')
# Register the Arial font file with matplotlib and print the path matplotlib resolves "Arial" to
font_manager.fontManager.addfont("...software/Arial.ttf")
print(font_manager.findfont("Arial"))
plt.rcParams["font.sans-serif"] = ["Arial"]
# Same call as above except dpi_save = 300; as the later call, this is the one left in effect
sc.settings.set_figure_params(dpi = 150, color_map = 'RdPu', dpi_save = 300, vector_friendly = True, format = 'pdf')

## Load full thymus dataset

In [None]:
# Work from the cleaned-data directory and load revision 2 of the full object
os.chdir('.../clean/')
adata = sc.read_h5ad('adata_full_rev_2_clean.h5ad')

In [None]:
# List the predicted level-4 category names.
# (Square brackets on `list` would build a generic alias instead of a list.)
list(adata.obs['pred_cell_type_level_4'].cat.categories)

## Load Lena's annotation

In [None]:
# Annotation table read from HTSA_CITE_anno.csv; its first column becomes the index
df_obs_cite = pd.read_csv(
    '.../clean/HTSA_CITE_anno.csv',
    index_col=0,
)

In [None]:
# Inspect which columns the annotation table provides
df_obs_cite.columns

In [None]:
# Columns to copy from df_obs_cite into adata.obs, with the dtype each is cast to
new_anno = {
    'RNA.weight': float,
    'ADTdsb.weight': float,
    'low_ADT': str,
    'spt_CD4': float,
    'spt_CD8': float,
    'anno_CITE_new': str,
}
for column, dtype in new_anno.items():
    # Assign first, cast second: the cast is applied to the column as stored in adata.obs.
    # NOTE(review): cells absent from df_obs_cite presumably end up as the string 'nan'
    # in the str columns — confirm this is intended.
    adata.obs[column] = df_obs_cite[column]
    adata.obs[column] = adata.obs[column].astype(dtype)

In [None]:
# UMAP coloured by each of the columns copied from the CITE annotation table
sc.pl.umap(
    adata,
    color=[
        'RNA.weight',
        'ADTdsb.weight',
        'low_ADT',
        'spt_CD4',
        'spt_CD8',
        'anno_CITE_new',
    ],
    legend_loc='on data',
    legend_fontsize=3,
)

In [None]:
# Identity mapping {label: {label}} over the predicted level-4 categories
granular_ctype_dict = {
    label: {label}
    for label in adata.obs['pred_cell_type_level_4'].cat.categories
}
granular_ctype_dict

In [None]:
# Copy the predicted level-4 labels into a separate 'explore' column
adata.obs['cell_type_level_4_explore'] = adata.obs['pred_cell_type_level_4']

## multilevel annotations



In [None]:
# generate highres but clean annotations
#
# Keys are the clean level-4 names, values are the predicted labels they absorb.
# All labels that are not kept at level 4 are gathered under ONE
# 'see_lv4_explore' key: a dict literal keeps only the last value of a repeated
# key, so listing that key once per label would silently drop all but the last.
dict_anno_v4_clean = {
    'B-Prolif': {'B-Prolif'},
    'B-memory': {'B-memory'},
    'B-naive': {'B-naive'},
    'B-plasma': {'B-plasma'},
    'B_pre_pro': {'B_pro_pre'},  # renamed
    'CMP': {'CMP'},
    'DC1': {'DC1'},
    'DC1-Prolif': {'DC1-Prolif'},
    'DC2': {'DC2'},
    'DC2-Prolif': {'DC2-Prolif'},
    'EC-Art': {'EC-Art'},
    'EC-Art-ELN': {'EC-Art-ELN'},
    'EC-Cap': {'EC-Cap'},
    'EC-Cap-Prolif': {'EC-Cap-Prolif'},
    'EC-Lymphatic': {'EC-Lymphatic'},
    'EC-Ven': {'EC-Ven'},
    'EC-Ven-ELN': {'EC-Ven-ELN'},
    'GMP': {'GMP'},
    'InterloFb': {'InterloFb'},
    'InterloFb-COL9A3': {'InterloFb-COL9A3'},
    'Macrophage-APOC2': {'Macrophage-APOC2'},
    'Macrophage-LYVE1': {'Macrophage-LYVE1'},
    'Macrophage-SPIC1': {'Macrophage-SPIC1'},
    'Mast': {'Mast'},
    'Mesothelium': {'Mesothelium'},
    'Monocyte_CCR2': {'Monocyte_CCR2'},
    'Monocyte_IL1B': {'Monocyte_IL1B'},
    'Myelocyte': {'Myelocyte'},
    'Neutrophil': {'Neutrophil'},
    'Pericyte_general': {'Pericyte'},  # renamed
    'Pericyte_CCL19': {'Pericyte_CCL19'},
    'Pericyte_COL1A1': {'Pericyte_COL1A1'},
    'PeriloFb': {'PeriloFb'},
    'PeriloFb-Prolif': {'PeriloFb-Prolif'},
    'ProlifPericyte': {'ProlifPericyte'},
    'Promonocyte': {'Promonocyte'},
    'RBC': {'RBC'},
    'SMC': {'SMC'},
    'Schwann': {'Schwann'},
    'TEC-cilliated': {'TEC-cilliated'},
    'TEC-myo': {'TEC-myo'},
    'TEC-neuro': {'TEC-neuro'},
    'T_CD4': {'T_CD4'},
    'T_CD8': {'T_CD8'},
    'T_CD8_memory': {'T_CD8_memory'},
    'T_CD8αα(I)': {'T_CD8αα(I)'},
    'T_CD8αα(II)': {'T_CD8αα(II)'},
    'T_CD8αα(entry)': {'T_CD8αα(entry)'},
    'T_DN(P)': {'T_DN(P)'},
    'T_DN(Q)': {'T_DN(Q)'},
    'T_DN(early)': {'T_DN(early)'},
    'T_DP(P)': {'T_DP(P)'},
    'T_DP(Q)': {'T_DP(Q)'},
    'T_DP(Q)-early': {'T_DP(Q)-early'},
    'T_ETP': {'T_ETP'},
    'T_NK': {'T_NK'},
    'T_Treg(agonist)': {'T_Treg(agonist)'},
    'T_Treg-diff_1': {'T_Treg-diff_1'},
    'T_Treg-diff_2': {'T_Treg-diff_2'},
    'T_Treg-intermediate': {'T_Treg-intermediate'},
    'T_Treg_CD8': {'T_Treg_CD8'},
    'T_Treg_mature': {'T_Treg_mature'},
    'T_Treg_recirc': {'T_Treg_recirc'},
    'T_αβT(entry)': {'T_αβT(entry)'},
    'T_γδT': {'T_γδT'},
    'aDC1': {'aDC1'},
    'aDC2': {'aDC2'},
    'aDC3': {'aDC3'},
    'cTECI': {'cTECI'},
    'cTECII': {'cTECII'},
    'cTECIII': {'cTECIII'},
    'large_pre_B': {'large_pre_B'},
    'late_pro_B': {'late_pro_B'},
    'mTECI': {'mTECI'},
    'mTECI-trans': {'mTECI-trans'},
    'mTECII': {'mTECII'},
    'mTECIII': {'mTECIII'},
    'mcTEC': {'mcTEC'},
    'mcTEC-Prolif': {'mcTEC-Prolif'},
    'medFB-MHCIIh': {'medFB-MHCIIh'},
    'medFb': {'medFb'},
    'medFb-RGS5': {'medFb-RGS5'},
    'nmSchwann': {'nmSchwann'},
    'pDC': {'pDC'},
    'pDC-Prolif': {'pDC-Prolif'},
    'pro_B': {'pro_B'},
    'small_pre_B': {'small_pre_B'},

    # Labels not kept at level 4; consult 'cell_type_level_4_explore' for these
    'see_lv4_explore': {
        'TEC-tuft',
        'T_CD8-Prolif',
        'T_DN(Q)-intermediate',
        'T_DN(Q)-stress_1',
        'T_DN(Q)-stress_2',
        'T_DP(Q)-CD99',
        'T_DP(Q)-HSPH1',
        'T_DP(Q)-late_vdj',
        'T_NK_dev',
        'T_NK_fetal',  # NK-fetal was later changed to ambiguous and moved to "explore" as these are not NK cells
        'T_SP8or4',
        'T_SP-HSP',
        'T_cycling',
        'T_innate_type_1',
        'T_innate_type_3',
        'fetFB-CCL21',
        'fetFB-NKX2-5',
        'fetFB-RSPO2',
    },
}

adata.obs['cell_type_level_4'] = 'see_lv4_explore' # generate a new lv4 annotation
# Cast the predicted labels to plain objects before matching them against the sets
adata.obs['pred_cell_type_level_4'] = adata.obs['pred_cell_type_level_4'].astype('object')

# Report predicted labels that the mapping does not mention at all; they keep the
# 'see_lv4_explore' default, which would otherwise go unnoticed.
listed_labels = set().union(*dict_anno_v4_clean.values())
unlisted_labels = sorted(set(adata.obs['pred_cell_type_level_4'].dropna().unique()) - listed_labels)
if unlisted_labels:
    print('labels not listed in dict_anno_v4_clean (left as see_lv4_explore):', unlisted_labels)

for key, values in dict_anno_v4_clean.items():
    adata.obs.loc[adata.obs['pred_cell_type_level_4'].isin(values), 'cell_type_level_4'] = key
adata.obs['cell_type_level_4'] = adata.obs['cell_type_level_4'].astype('category')
sc.set_figure_params(figsize=[10,10])
sc.pl.umap(adata, color=['cell_type_level_4'], legend_loc='on data',legend_fontsize = 10)
adata.obs['cell_type_level_4'].value_counts(dropna=False)

In [None]:
# lv 4: identity mapping {label: {label}} over the clean level-4 categories
granular_ctype_dict = {
    label: {label}
    for label in adata.obs['cell_type_level_4'].cat.categories
}

    

In [None]:
# map level 3: level-3 name -> set of level-4 labels it groups
dict_anno_v3 = {
    'see_lv4_explore': {'see_lv4_explore'},

    'T_DN(early)': {'T_DN(early)', 'T_ETP'},
    'T_DN(Q)': {'T_DN(Q)'},
    'T_DN(P)': {'T_DN(P)'},

    'T_DP(Q)': {'T_DP(Q)', 'T_DP(Q)-early'},
    'T_DP(P)': {'T_DP(P)'},
    'T_CD4': {'T_CD4'},
    'T_CD8': {'T_CD8', 'T_CD8_memory'},
    'T_CD8αα': {'T_CD8αα(I)', 'T_CD8αα(II)', 'T_CD8αα(entry)'},

    # NK-fetal was later changed to ambiguous and moved to "explore" as these are not NK cells
    'T_NK': {'T_NK', 'T_NK_fetal'},

    'T_Treg(agonist)': {'T_Treg(agonist)', 'T_Treg-diff_1', 'T_Treg-diff_2', 'T_Treg-intermediate'},
    'T_Treg_CD8': {'T_Treg_CD8'},
    'T_Treg_mature': {'T_Treg_mature'},
    'T_Treg_recirc': {'T_Treg_recirc'},
    'T_γδT': {'T_γδT'},
    'T_αβT(entry)': {'T_αβT(entry)'},

    'RBC': {'RBC'},

    'B_dev': {'B_pre_pro', 'pro_B', 'late_pro_B', 'large_pre_B', 'small_pre_B'},
    'B_mature': {'B-naive', 'B-memory', 'B-Prolif'},
    'B-plasma': {'B-plasma'},

    'Myeloid_progenitor': {'CMP', 'GMP'},

    'Mono': {'Promonocyte', 'Monocyte_CCR2', 'Monocyte_IL1B'},

    'Neut': {'Myelocyte', 'Neutrophil'},

    'Mast': {'Mast'},

    'DC1': {'DC1', 'DC1-Prolif'},
    'aDC1': {'aDC1'},
    'DC2': {'DC2', 'DC2-Prolif'},
    'aDC2': {'aDC2'},
    'aDC3': {'aDC3'},

    'pDC': {'pDC', 'pDC-Prolif'},

    'Macrophage-APOC2': {'Macrophage-APOC2'},
    'Macrophage-LYVE1': {'Macrophage-LYVE1'},
    'Macrophage-SPIC1': {'Macrophage-SPIC1'},

    'cTECI': {'cTECI'},
    'cTECII': {'cTECII'},
    'cTECIII': {'cTECIII'},
    'mTECI': {'mTECI'},
    'mTECI-trans': {'mTECI-trans'},
    'mTECII': {'mTECII'},
    'mTECIII': {'mTECIII'},

    'mcTEC': {'mcTEC', 'mcTEC-Prolif'},
    'TEC-cilliated': {'TEC-cilliated'},
    'TEC-myo': {'TEC-myo'},
    'TEC-neuro': {'TEC-neuro'},

    'EC-Art': {'EC-Art', 'EC-Art-ELN'},
    'EC-Cap': {'EC-Cap', 'EC-Cap-Prolif'},
    'EC-Ven': {'EC-Ven', 'EC-Ven-ELN'},
    'EC-Lymphatic': {'EC-Lymphatic'},
    'SMC': {'SMC'},

    'Pericyte': {'Pericyte_general', 'Pericyte_CCL19', 'Pericyte_COL1A1', 'ProlifPericyte'},

    'PeriloFb': {'PeriloFb', 'PeriloFb-Prolif'},
    'InterloFb': {'InterloFb', 'InterloFb-COL9A3'},
    'medFB-MHCIIh': {'medFB-MHCIIh'},
    'medFb': {'medFb', 'medFb-RGS5'},
    'fetFB_special': {'fetFB-CCL21', 'fetFB-NKX2-5', 'fetFB-RSPO2'},

    'Mesothelium': {'Mesothelium'},
    'Schwann': {'Schwann', 'nmSchwann'},

    'T_innate': {'T_innate_type_1', 'T_innate_type_3'},
}

# Flatten the grouping into {level-4 label: level-3 name}; level-4 labels that
# appear in no group are labelled 'unassigned'.
lv4_to_lv3 = {
    member: group
    for group, members in dict_anno_v3.items()
    for member in members
}
adata.obs['cell_type_level_3'] = (
    adata.obs['cell_type_level_4']
    .astype(object)
    .map(lv4_to_lv3)
    .fillna('unassigned')
    .astype('category')
)

sc.set_figure_params(figsize=[10, 10])
sc.pl.umap(adata, color=['cell_type_level_3'], legend_loc='on data', legend_fontsize=10)
adata.obs['cell_type_level_3'].value_counts(dropna=False)

In [None]:
# map level 2: level-2 name -> set of level-4 labels it groups
dict_anno_v2 = {
    'see_lv4_explore': {'see_lv4_explore'},

    'T_DN(early)': {'T_DN(early)', 'T_ETP'},
    'T_DN(Q)': {'T_DN(Q)'},
    'T_DN(P)': {'T_DN(P)'},

    'T_DP(Q)': {'T_DP(Q)', 'T_DP(Q)-early'},
    'T_DP(P)': {'T_DP(P)'},
    'T_CD4': {'T_CD4'},
    'T_CD8': {'T_CD8', 'T_CD8_memory'},
    'T_αβT(entry)': {'T_αβT(entry)'},

    'T_CD8αα': {'T_CD8αα(I)', 'T_CD8αα(II)', 'T_CD8αα(entry)'},

    'T_NK': {'T_NK', 'T_NK_fetal'},

    'T_Treg(agonist)': {'T_Treg(agonist)', 'T_Treg-diff_1', 'T_Treg-diff_2', 'T_Treg-intermediate'},
    'T_Treg': {'T_Treg_mature', 'T_Treg_recirc', 'T_Treg_CD8'},
    'T_γδT': {'T_γδT'},

    'B_dev': {'B_pre_pro', 'pro_B', 'late_pro_B', 'large_pre_B', 'small_pre_B'},
    'B_mature': {'B-naive', 'B-memory', 'B-Prolif'},
    'B-plasma': {'B-plasma'},

    'RBC': {'RBC'},

    'Myeloid_progenitor': {'CMP', 'GMP'},

    'Mono': {'Promonocyte', 'Monocyte_CCR2', 'Monocyte_IL1B'},
    'Neut': {'Myelocyte', 'Neutrophil'},
    'Mast': {'Mast'},

    'DC': {'DC1', 'DC1-Prolif', 'DC2', 'DC2-Prolif'},
    'aDC': {'aDC1', 'aDC2', 'aDC3'},
    'pDC': {'pDC', 'pDC-Prolif'},

    'Macrophage': {'Macrophage-APOC2', 'Macrophage-LYVE1', 'Macrophage-SPIC1'},

    'cTEC': {'cTECI', 'cTECII', 'cTECIII'},
    'mTEC': {'mTECI', 'mTECI-trans', 'mTECII', 'mTECIII'},

    'mcTEC': {'mcTEC', 'mcTEC-Prolif'},
    'mTEC-mimetic': {'TEC-cilliated', 'TEC-myo', 'TEC-neuro'},

    'EC-Art': {'EC-Art', 'EC-Art-ELN'},
    'EC-Cap': {'EC-Cap', 'EC-Cap-Prolif'},
    'EC-Ven': {'EC-Ven', 'EC-Ven-ELN'},
    'EC-Lymphatic': {'EC-Lymphatic'},
    'SMC': {'SMC'},

    'Pericyte': {'Pericyte_general', 'Pericyte_CCL19', 'Pericyte_COL1A1', 'ProlifPericyte'},

    'PeriloFb': {'PeriloFb', 'PeriloFb-Prolif'},
    'InterloFb': {'InterloFb', 'InterloFb-COL9A3'},
    'medFb': {'medFb', 'medFb-RGS5', 'medFB-MHCIIh'},
    'fetFB_special': {'fetFB-CCL21', 'fetFB-NKX2-5', 'fetFB-RSPO2'},

    'Mesothelium': {'Mesothelium'},
    'Schwann': {'Schwann', 'nmSchwann'},

    'T_innate': {'T_innate_type_1', 'T_innate_type_3'},
}

# Flatten the grouping into {level-4 label: level-2 name}; level-4 labels that
# appear in no group are labelled 'unassigned'.
lv4_to_lv2 = {
    member: group
    for group, members in dict_anno_v2.items()
    for member in members
}
adata.obs['cell_type_level_2'] = (
    adata.obs['cell_type_level_4']
    .astype(object)
    .map(lv4_to_lv2)
    .fillna('unassigned')
    .astype('category')
)

sc.set_figure_params(figsize=[10, 10])
sc.pl.umap(adata, color=['cell_type_level_2'], legend_loc='on data', legend_fontsize=10)
adata.obs['cell_type_level_2'].value_counts(dropna=False)

In [None]:
# map level 1: level-1 name -> set of level-4 labels it groups
dict_anno_v1 = {
    'see_lv4_explore': {'see_lv4_explore'},

    'T_DN(early)': {'T_DN(early)', 'T_ETP'},
    'T_DN': {'T_DN(Q)', 'T_DN(P)'},

    'T_DP': {'T_DP(Q)', 'T_DP(P)', 'T_DP(Q)-early'},
    'T_CD4': {'T_CD4'},
    'T_CD8': {'T_CD8', 'T_CD8_memory'},
    'T_αβT(entry)': {'T_αβT(entry)'},

    'T_NK': {'T_NK', 'T_NK_fetal'},

    'T_Treg': {
        'T_Treg(agonist)', 'T_Treg-diff_1', 'T_Treg-diff_2', 'T_Treg-intermediate',
        'T_Treg_mature', 'T_Treg_recirc', 'T_Treg_CD8',
    },

    'B': {
        'B_pre_pro', 'pro_B', 'late_pro_B', 'large_pre_B', 'small_pre_B',
        'B-naive', 'B-memory', 'B-Prolif', 'B-plasma',
    },

    'RBC': {'RBC'},

    'Myeloid_progenitor': {'CMP', 'GMP'},

    'Mono': {'Promonocyte', 'Monocyte_CCR2', 'Monocyte_IL1B'},
    'Neut': {'Myelocyte', 'Neutrophil'},
    'Mast': {'Mast'},

    'DC': {
        'DC1', 'DC1-Prolif', 'DC2', 'DC2-Prolif',
        'aDC1', 'aDC2', 'aDC3',
        'pDC', 'pDC-Prolif',
    },

    'Macrophage': {'Macrophage-APOC2', 'Macrophage-LYVE1', 'Macrophage-SPIC1'},

    'TEC': {
        'cTECI', 'cTECII', 'cTECIII',
        'mTECI', 'mTECI-trans', 'mTECII', 'mTECIII',
        'mcTEC', 'mcTEC-Prolif',
    },
    'TEC-mimetic': {'TEC-cilliated', 'TEC-myo', 'TEC-neuro'},

    'EC': {
        'EC-Art', 'EC-Art-ELN', 'EC-Cap', 'EC-Cap-Prolif',
        'EC-Ven', 'EC-Ven-ELN', 'EC-Lymphatic',
    },

    'Vascular': {'Pericyte_general', 'Pericyte_CCL19', 'Pericyte_COL1A1', 'ProlifPericyte', 'SMC'},

    'Fb': {
        'PeriloFb', 'PeriloFb-Prolif', 'InterloFb', 'InterloFb-COL9A3',
        'medFb', 'medFb-RGS5', 'medFB-MHCIIh',
        'fetFB-CCL21', 'fetFB-NKX2-5', 'fetFB-RSPO2',
    },

    'Mesothelium': {'Mesothelium'},
    'Schwann': {'Schwann', 'nmSchwann'},

    'T_innate': {
        'T_innate_type_1', 'T_innate_type_3', 'T_γδT',
        'T_CD8αα(I)', 'T_CD8αα(II)', 'T_CD8αα(entry)',
    },
}

# Flatten the grouping into {level-4 label: level-1 name}; level-4 labels that
# appear in no group are labelled 'unassigned'.
lv4_to_lv1 = {
    member: group
    for group, members in dict_anno_v1.items()
    for member in members
}
adata.obs['cell_type_level_1'] = (
    adata.obs['cell_type_level_4']
    .astype(object)
    .map(lv4_to_lv1)
    .fillna('unassigned')
    .astype('category')
)

sc.set_figure_params(figsize=[10, 10])
sc.pl.umap(adata, color=['cell_type_level_1'], legend_loc='on data', legend_fontsize=10)
adata.obs['cell_type_level_1'].value_counts(dropna=False)

In [None]:
# map level 0: level-0 name -> set of level-4 labels it groups
dict_anno_v0 = {
    'see_lv4_explore': {'see_lv4_explore'},

    'T_DN': {'T_DN(Q)', 'T_DN(P)', 'T_DN(early)', 'T_ETP'},
    'T_DP': {'T_DP(Q)', 'T_DP(P)', 'T_DP(Q)-early'},
    'T_SP': {
        'T_CD4', 'T_CD8', 'T_CD8_memory', 'T_αβT(entry)',
        'T_Treg(agonist)', 'T_Treg-diff_1', 'T_Treg-diff_2', 'T_Treg-intermediate',
        'T_Treg_mature', 'T_Treg_recirc', 'T_Treg_CD8',
        'T_NK', 'T_NK_fetal', 'T_innate_type_1', 'T_innate_type_3', 'T_γδT',
        'T_CD8αα(I)', 'T_CD8αα(II)', 'T_CD8αα(entry)',
    },

    'B': {
        'B_pre_pro', 'pro_B', 'late_pro_B', 'large_pre_B', 'small_pre_B',
        'B-naive', 'B-memory', 'B-Prolif', 'B-plasma',
    },
    'RBC': {'RBC'},

    'Myeloid': {
        'CMP', 'GMP', 'Promonocyte', 'Monocyte_CCR2', 'Monocyte_IL1B', 'Myelocyte', 'Neutrophil',
        'Mast', 'DC1', 'DC1-Prolif', 'DC2', 'DC2-Prolif', 'aDC1', 'aDC2', 'aDC3', 'pDC', 'pDC-Prolif',
        'Macrophage-APOC2', 'Macrophage-LYVE1', 'Macrophage-SPIC1',
    },

    'Epithelial': {
        'cTECI', 'cTECII', 'cTECIII',
        'mTECI', 'mTECI-trans', 'mTECII', 'mTECIII',
        'mcTEC', 'mcTEC-Prolif',
        'TEC-cilliated', 'TEC-myo', 'TEC-neuro',
    },

    'Stroma': {
        'EC-Art', 'EC-Art-ELN', 'EC-Cap', 'EC-Cap-Prolif', 'EC-Ven', 'EC-Ven-ELN', 'EC-Lymphatic',
        'Pericyte_general', 'Pericyte_CCL19', 'Pericyte_COL1A1', 'ProlifPericyte', 'SMC',
        'PeriloFb', 'PeriloFb-Prolif', 'InterloFb', 'InterloFb-COL9A3',
        'medFb', 'medFb-RGS5', 'medFB-MHCIIh',
        'fetFB-CCL21', 'fetFB-NKX2-5', 'fetFB-RSPO2', 'Mesothelium',
    },

    'Schwann': {'Schwann', 'nmSchwann'},
}

# Flatten the grouping into {level-4 label: level-0 name}; level-4 labels that
# appear in no group are labelled 'unassigned'.
lv4_to_lv0 = {
    member: group
    for group, members in dict_anno_v0.items()
    for member in members
}
adata.obs['cell_type_level_0'] = (
    adata.obs['cell_type_level_4']
    .astype(object)
    .map(lv4_to_lv0)
    .fillna('unassigned')
    .astype('category')
)

sc.set_figure_params(figsize=[10, 10])
sc.pl.umap(adata, color=['cell_type_level_0'], legend_loc='on data', legend_fontsize=10)
adata.obs['cell_type_level_0'].value_counts(dropna=False)

In [None]:
# further cleanup

In [None]:
# Store the new CITE annotation under the final column name 'anno_CITE'
adata.obs['anno_CITE'] = adata.obs['anno_CITE_new']

In [None]:
# Remove obs columns that are no longer needed (raises KeyError if one is absent)
for obsolete_column in (
    'anno_CITE_new',
    'anno_CITE_4v5',
    'cell_type_level_0_old',
    'annotation_level_4',
):
    del adata.obs[obsolete_column]

In [None]:
# Display the object summary after the column cleanup
adata

In [None]:
# Save the annotated object as revision 3
os.chdir('.../clean/')
adata.write_h5ad('adata_full_rev_3_clean.h5ad')

In [None]:
# Reload revision 3 from disk and display its summary
os.chdir('.../clean/')
adata = sc.read_h5ad('adata_full_rev_3_clean.h5ad')
adata

In [None]:
# Overwrite the 'explore' column with the predicted level-4 labels of the reloaded object
adata.obs['cell_type_level_4_explore'] = adata.obs['pred_cell_type_level_4']
adata

In [None]:
# UMAP coloured by the 'explore' labels
sc.set_figure_params(figsize=[10,10])
sc.pl.umap(adata, color=['cell_type_level_4_explore'], legend_loc='on data',legend_fontsize = 10)
# Note: the counts shown here are for level 0, not for the 'explore' column plotted above
adata.obs['cell_type_level_0'].value_counts(dropna=False)

In [None]:
# Overwrite revision 3 so that it includes the refreshed 'explore' column
os.chdir('.../clean/')
adata.write_h5ad('adata_full_rev_3_clean.h5ad')

# add CITEseq protein data

In [None]:
os.chdir('.../clean/')
# The filename was an empty string, which cannot be read.
# NOTE(review): revision 3 is assumed to be the intended input because it is the
# file written just before this section and this section writes revision 4 — confirm.
adata = sc.read_h5ad(filename='adata_full_rev_3_clean.h5ad')
adata

In [None]:
# Load the CITE-seq object and display its summary
adata_cite = sc.read_h5ad('.../HTSA_CITE_anno.h5ad')
adata_cite

In [None]:
# Step 1: prefix every variable name of the CITE-seq object with "cite_".
# Running this cell twice on the same object prefixes the names twice.
adata_cite.var.index = "cite_" + adata_cite.var.index

# Step 2: cell barcodes present in both objects
common_cells = adata.obs_names.intersection(adata_cite.obs_names)
n_common_cells = len(common_cells)
print('found ', n_common_cells, ' common cells')

In [None]:
# Step 3: Create a placeholder for protein data
# One row per cell of adata, one column per CITE-seq variable, initialised with NaN
protein_data_placeholder = pd.DataFrame(np.nan, 
                                        index=adata.obs_names, 
                                        columns=adata_cite.var_names)

# Step 4: Transfer protein data for common cells
# Rows of cells that are not in common_cells keep their NaN values
protein_data_placeholder.loc[common_cells] = adata_cite[common_cells].to_df()

# Update adata's obsm with the protein data
adata.obsm["protein_expression"] = protein_data_placeholder

In [None]:
# Show the protein table restricted to cells whose study is 'HTSA_Ghent'
adata.obsm["protein_expression"][adata.obs['study']=='HTSA_Ghent']

In [None]:
# Display the object summary after adding the protein table
adata

In [None]:
# Save the object with the protein table as revision 4
os.chdir('.../clean/')
adata.write_h5ad('adata_full_rev_4_clean.h5ad')

# show proportions

In [None]:
import os
import scanpy as sc
# Load revision 4 for the proportion plots
os.chdir('.../clean/')
adata = sc.read_h5ad('adata_full_rev_4_clean.h5ad')

In [None]:
# Display the summary of the loaded object
adata

In [None]:
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns

def get_cluster_proportions(adata,
                            cluster_key="cluster_final",
                            sample_key="replicate",
                            drop_values=None):
    """
    Compute, for every sample, the percentage of its cells found in each cluster.

    Input
    =====
    adata : AnnData object (only `adata.obs` is read; nothing is copied or modified)
    cluster_key : key of `adata.obs` storing cluster info
    sample_key : key of `adata.obs` storing sample/replicate info
    drop_values : list/iterable of possible values of `sample_key` that you don't want

    Returns
    =======
    pd.DataFrame with samples as the index and clusters as the columns and 0-100 floats
    as values. Each row sums to 100, except rows of samples without any cell,
    which are all 0.

    Raises
    ======
    KeyError if a value in `drop_values` is not a sample in the result
    """
    # Cells per (cluster, sample) pair. `observed=False` keeps the empty
    # combinations of categorical columns.
    sizes = adata.obs.groupby([cluster_key, sample_key], observed=False).size()

    # Divide by the per-sample total. `transform` keeps the original index, so no
    # group key is added to it (which is what breaks `apply(...).reset_index()`
    # on recent pandas versions).
    sample_totals = sizes.groupby(level=1).transform("sum")
    props = (100 * sizes / sample_totals).unstack(level=0)

    # Missing combinations and samples with zero cells become 0 instead of NaN
    props = props.fillna(0)

    if drop_values is not None:
        props = props.drop(index=list(drop_values))
    return props

In [None]:
# scanpy figure defaults for this section: 7x3 figure size, font size 10, 100 dpi
sc.set_figure_params(figsize=[7,3],fontsize=10,dpi=100)

def plot_cluster_proportions(cluster_props, 
                             cluster_palette='colorblind',
                             xlabel_rotation=0): 
    """
    Draw a stacked bar chart with one bar per row of `cluster_props`.

    Input
    =====
    cluster_props : pd.DataFrame, e.g. the output of `get_cluster_proportions`
    cluster_palette : name of a seaborn palette, a sequence of colours, or None.
        The colours are blended into a colormap; None uses the default colours
        of the pandas plot.
    xlabel_rotation : accepted for backward compatibility but not applied;
        the tick labels keep the rotation chosen by the pandas bar plot.

    Returns
    =======
    The matplotlib Figure holding the plot
    """
    fig, ax = plt.subplots(dpi=300)
    fig.patch.set_facecolor("white")

    cmap = None
    if cluster_palette is not None:
        # blend_palette expects a sequence of colours. A palette *name* has to be
        # resolved first, otherwise the characters of the string are used as colours.
        if isinstance(cluster_palette, str):
            colors = sns.color_palette(cluster_palette)
        else:
            colors = list(cluster_palette)
        cmap = sns.palettes.blend_palette(
            colors,
            n_colors=len(colors),
            as_cmap=True)

    cluster_props.plot(
        kind="bar",
        stacked=True,
        ax=ax,
        legend=None,
        colormap=cmap
    )

    ax.legend(bbox_to_anchor=(1.01, 1), frameon=False, title="Cluster")
    sns.despine(fig, ax)
    # An unnamed index gives an empty axis label instead of an AttributeError
    index_name = cluster_props.index.name
    ax.set_xlabel("" if index_name is None else str(index_name).capitalize())
    ax.set_ylabel("Proportion")
    fig.tight_layout()
    ax.grid(False)
    return fig
    

In [None]:
# Percentages with 'cell_type_level_0' values as rows and 'study_group' values as columns
props = get_cluster_proportions(
    adata,
    cluster_key="study_group",
    sample_key="cell_type_level_0",
)

In [None]:
sc.set_figure_params(figsize=[20, 8], fontsize=20, dpi=50)

# Stacked bars with the default pandas colours
fig = plot_cluster_proportions(props, cluster_palette=None)

# Save relative to the project root
os.chdir('.../')
fig.savefig('figures/study_groups_cell_type_level_0.pdf')

In [None]:
# Work from the project root so that the relative 'figures/' path resolves
os.chdir('.../')

sc.set_figure_params(figsize=[10, 4], fontsize=5, dpi=50)
anno = 'cell_type_level_4'

# Number of cells per label, drawn as a bar chart on a logarithmic y-axis
cell_type_counts = adata.obs[anno].value_counts()
ax = cell_type_counts.plot.bar()
ax.grid(which='both', linestyle='--', linewidth=0.5)
ax.set_yscale('log')

# Axis labels and title
ax.set_xlabel(anno.replace("_", " "))
ax.set_ylabel('Counts (log scale)')
ax.set_title('Counts of Cell Types (Log Scale)')

ax.figure.savefig(f'figures/{anno}_counts.pdf')
plt.show()
               
    

In [None]:
import os
import seaborn as sns
import matplotlib.pyplot as plt

# Work from the project root so that the relative 'figures/' path resolves
os.chdir('.../')

anno = 'cell_type_level_0'

# Cell counts per label as a two-column frame, largest count first
cell_type_counts = adata.obs[anno].value_counts().reset_index()
cell_type_counts.columns = [anno, 'counts']
cell_type_counts = cell_type_counts.sort_values('counts', ascending=False)

# The figure width scales with the number of bars
num_categories = len(cell_type_counts)
bar_width = 0.2  # width reserved for each bar
space_between_bars = 0.01  # gap reserved between two bars
desired_plot_width_per_category = bar_width + space_between_bars
total_plot_width = num_categories * desired_plot_width_per_category

# Lower bound for the width; with 0 the bound has no effect
min_width = 0
figure_width = max(total_plot_width, min_width)

sns.set_theme(style="whitegrid")
plt.figure(figsize=[figure_width, 6])

plt.rcParams['pdf.fonttype'] = 42
plt.rcParams['ps.fonttype'] = 42

# Bars in the sorted order, on a logarithmic y-axis
ax = sns.barplot(
    data=cell_type_counts,
    x=anno,
    y='counts',
    order=cell_type_counts[anno],
    rasterized=True,
)
ax.set_yscale('log')

ax.set(
    xlabel=anno.replace("_", " "),
    ylabel='Counts (log scale)',
    title='Counts of Cell Types (Log Scale)',
)

# Vertical tick labels, centred on the bars
plt.xticks(rotation=90, ha='center')

plt.grid(which='both', linestyle='--', linewidth=0.5)

plt.savefig(f'figures/{anno}_counts.pdf', bbox_inches='tight')

plt.show()

In [None]:
# Check the proportions for fetal vs infant and paediatric samples:
# percentage of cells contributed by each donor within each age group
sc.set_figure_params(figsize=[2, 3], dpi=100, fontsize=5)

props = get_cluster_proportions(
    adata,
    cluster_key="donor",
    sample_key="age_group",
)

# Stacked bars with the default pandas colours
fig = plot_cluster_proportions(props, cluster_palette=None)
fig.savefig('figures/age_group_donor.pdf')


# generate final umap 

In [None]:
# Load revision 4 as the input for the final embedding
os.chdir('.../clean/')
adata = sc.read_h5ad('adata_full_rev_4_clean.h5ad')

In [None]:
# NOTE(review): the directory change presumably makes the local scvi_wrapper module importable — confirm
os.chdir('...software/ImageSpot/')
import scvi_wrapper as sv
# Run the project's scVI wrapper on the full object; the result is kept as adata_new.
# NOTE(review): the meaning of each argument is defined by scvi_wrapper_basic, which is
# not visible here — check its documentation before changing any value.
adata_new = sv.scvi_wrapper_basic(
    adata,
    max_epochs=400,
    remove_vdjgenes=True,
    hvg=5000,
    batch_key='sample',
    categorical_covariate_keys=['chemistry_simple','age_group','sex','donor'],
    continuous_covariate_keys = None,
    cluster_leiden=False,
    layer_use=None,
    remove_cite=False,
) 

In [None]:
os.chdir('.../clean/')
# The write is commented out, so this cell only reloads a previously saved scVI result
# adata_new.write_h5ad(filename='adata_full_rev_4_clean_scvi.h5ad')
adata_new = sc.read_h5ad(filename='adata_full_rev_4_clean_scvi.h5ad')

In [None]:
sc.set_figure_params(figsize=[10,10])
# Distinct level-4 labels present in the scVI object
np.unique(adata_new.obs['cell_type_level_4'])

In [None]:
# UMAP of the scVI object coloured by level 0, using the 'tab10' palette
sc.pl.umap(adata_new, color=['cell_type_level_0'], legend_loc='on data',legend_fontsize = 10,palette='tab10')
# adata.obs[''].value_counts(dropna=False)

In [None]:
import scanpy as sc

# `adata_new` must hold the representation used below in .obsm["X_scVI"]

# Parameter values to try
n_neighbors_list = [30]  # Different values for the neighborhood size
min_dist_list = [0.1, 0.2, 0.3]  # Different values for the min_dist parameter in UMAP

for n_neighbors in n_neighbors_list:
    # The neighborhood graph depends only on n_neighbors, so it is computed once
    # per value instead of once per (n_neighbors, min_dist) pair.
    sc.pp.neighbors(adata_new, n_neighbors=n_neighbors, use_rep="X_scVI")

    for min_dist in min_dist_list:
        # Run UMAP with the current set of parameters
        sc.tl.umap(adata_new, min_dist=min_dist)

        # Plot the UMAP
        sc.pl.umap(adata_new, color='cell_type_level_1',
                   title=f'UMAP: n_neighbors={n_neighbors}, min_dist={min_dist}')
            
        