# Prepare anndata and genesets to run Spectra

This notebook modifies cell annotation of anndata to match the genesets.

**Dataset**: MIS-C dataset

**Type of cells**: All

**Inputs**: Filtered anndata object, cell_annotation column and spectra gene sets (currently using `44_24-03-27_spectra_dict_lucy.json`)

**Output**: Anndata and gene sets with modified cell_annotation

In [1]:
import json
import os
import warnings

import anndata
import numpy as np
import pandas as pd
import rpy2
import scanpy as sc
import seaborn as sns
import sklearn
from anndata import AnnData
from matplotlib import pyplot as plt
from mpl_toolkits.axes_grid1.axes_divider import make_axes_locatable
from rpy2.robjects import pandas2ri
from scipy.sparse import csr_matrix
from sklearn.decomposition import PCA

In [2]:
import Spectra

In [3]:
# Load the IPython extension shipped with rpy2.
%load_ext rpy2.ipython
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [4]:
# Directory that holds the AnnData objects read and written by this notebook.
# Defaults to the original cluster location; set the MISC_ANNDATA_DIR
# environment variable to run the notebook elsewhere without editing this cell.
input_dir = os.environ.get(
    'MISC_ANNDATA_DIR',
    '/data/niecr/cheongj/misc/results_seurat/anndata_obj',
)

# 1. Load Data

In [21]:
# Input AnnData for this notebook (batch-corrected and filtered, per its filename).
rna_ad_path = os.path.join(
    input_dir,
    'batch_corrected_misc_merged_RNA_lv2_filter_dbl_removed_UNK_removed_soupdbl_removed.h5ad',
)
rna_ad = sc.read_h5ad(rna_ad_path)
rna_ad  # display the object summary

AnnData object with n_obs × n_vars = 149259 × 27424
    obs: 'og_barcode', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'ribo_pct', 'mito_pct', 'doublet', 'doublet_score', 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'nCount_ATAC', 'nFeature_ATAC', 'orig.barcode', 'RNA_snn_res.2', 'seurat_clusters', 'nucleosome_signal', 'nucleosome_percentile', 'TSS.enrichment', 'TSS.percentile', 'pANN_0.25_0.09_2745', 'DF.classifications_0.25_0.09_2745', 'DF.classifications_0.25_0.09_2627', 'pANN_0.25_0.09_623', 'DF.classifications_0.25_0.09_623', 'DF.classifications_0.25_0.09_808', 'pANN_0.25_0.09_701', 'DF.classifications_0.25_0.09_701', 'pANN_0.25_0.09_1124', 'DF.classifications_0.25_0.09_1124', 'pANN_0.25_0.09_1500', 'DF.classifications_0.25_0.09_1500', 'pANN_0.25_0.09_1478', 'DF.classifications_0.25_0.09_1478', 'pANN_0.25_

In [22]:
# Distinct labels in the existing annotation column; these are the names that
# have to be reconciled with the gene-set keys.
rna_ad.obs.loc[:, 'celltype_v1'].unique().tolist()

['CD4T',
 'B',
 'NaiveT',
 'CD14M',
 'HSPC',
 'CD8T',
 'pDC',
 'CD16M',
 'NK',
 'DC',
 'PC']

In [11]:
# Load the Spectra gene-set dictionary. The context manager closes the file
# handle as soon as parsing is done instead of leaving it open for the rest of
# the kernel session. `json` is imported in the import cell at the top.
with open('44_24-03-27_spectra_dict_lucy.json') as f:
    gene_set = json.load(f)
# Show the top-level keys of the loaded dictionary.
gene_set.keys()

dict_keys(['HSPC', 'CD14', 'CD16', 'cDC', 'pDC', 'CD4', 'CD8', 'NK', 'B', 'Plasma', 'global'])

# 2. Match cell annotations

Rename the cell types in the anndata to match the cell-type keys in the gene sets.

Here, we would like to combine all the T cell subtypes into a single group: `T` cells.

Merge the `CD4` and `CD8` gene sets into a single `T` entry to get `T` cell factors, then delete the gene-set entries for cell types that are no longer present in the anndata (`CD4` and `CD8`). The `global` entry does not need to be removed.

In [13]:
# Build the 'T' entry by merging the 'CD4' and 'CD8' entries.
_cd4_sets = gene_set['CD4']
_cd8_sets = gene_set['CD8']

# dict.update() keeps only the 'CD8' value when both entries use the same
# gene-set name, silently discarding the 'CD4' one. Surface that case so the
# loss is a conscious decision rather than an accident.
_shared_names = sorted(set(_cd4_sets) & set(_cd8_sets))
if _shared_names:
    warnings.warn(
        "Gene-set names present in both 'CD4' and 'CD8'; the 'CD8' version "
        f"overrides the 'CD4' one in 'T': {_shared_names}"
    )

combined_dict = _cd4_sets.copy()   # shallow copy so gene_set['CD4'] is not mutated
combined_dict.update(_cd8_sets)    # 'CD8' entries win on name collisions
gene_set['T'] = combined_dict
gene_set.keys()

dict_keys(['HSPC', 'CD14', 'CD16', 'cDC', 'pDC', 'CD4', 'CD8', 'NK', 'B', 'Plasma', 'global', 'T'])

In [16]:
# Drop the gene-set keys that have no counterpart in the anndata labels.
# pop(..., None) keeps the cell safe to re-run once the keys are already gone.
for _obsolete_key in ('CD4', 'CD8'):
    gene_set.pop(_obsolete_key, None)
gene_set.keys()

dict_keys(['HSPC', 'CD14', 'CD16', 'cDC', 'pDC', 'NK', 'B', 'Plasma', 'global', 'T'])

In [24]:
# Translation table from 'celltype_v1' labels to gene-set keys, written as
# (old label, new label) pairs. The three T-cell labels all collapse to 'T';
# labels not listed here are left as they are by the replace step.
rename_mapping = dict([
    ('CD16M', 'CD16'),
    ('CD4T', 'T'),
    ('CD14M', 'CD14'),
    ('CD8T', 'T'),
    ('PC', 'Plasma'),
    ('DC', 'cDC'),
    ('NaiveT', 'T'),
])

In [25]:
# Derive a new 'celltype_spectra' column by renaming the 'celltype_v1' labels
# with rename_mapping. 'celltype_v1' itself is left untouched.
# NOTE(review): if 'celltype_v1' is a categorical column, recent pandas
# versions deprecate replace() calls that change the categories - confirm the
# behaviour with the installed pandas version.
rna_ad.obs['celltype_spectra'] = rna_ad.obs['celltype_v1'].replace(rename_mapping)

# The purpose of this section is that every annotation label has a matching
# gene-set key, so fail here with a clear message rather than downstream.
_labels_without_gene_sets = sorted(
    set(rna_ad.obs['celltype_spectra'].dropna().unique()) - set(gene_set)
)
assert not _labels_without_gene_sets, (
    f"Cell types without a gene-set key: {_labels_without_gene_sets}; "
    f"available keys: {sorted(gene_set)}"
)

rna_ad.obs['celltype_spectra'].unique().tolist()

['T', 'B', 'CD14', 'HSPC', 'pDC', 'CD16', 'NK', 'cDC', 'Plasma']

# 3. Save the anndata and json file

In [25]:
# Save the genesets as a new json file
# Write the edited gene-set dictionary to a new JSON file (4-space indent),
# leaving the original input JSON untouched.
with open('misc_spectra_dict_resh_1.json', mode='w') as f:
    json.dump(gene_set, fp=f, indent=4)

In [26]:
# Save the annotated object into the same directory the input was read from.
rna_ad.write_h5ad(
    os.path.join(input_dir, '03_spectra_annotated_rna_anndata.h5ad')
)