## Notebook for the Healthy reference counts preparation

**Developed by**: Anna Maguza  
**Institute of Computational Biology - Computational Health Centre - Hemlholtz Munich**  
**19 June 2023**  

#### Load required packages

In [5]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as an
from pybiomart import Server

#### Setup Cells

In [2]:
%matplotlib inline

In [3]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

  from .autonotebook import tqdm as notebook_tqdm


scanpy==1.9.3 anndata==0.8.0 umap==0.5.3 numpy==1.23.5 scipy==1.10.1 pandas==1.3.5 scikit-learn==1.2.2 statsmodels==0.13.5 pynndescent==0.5.8


In [4]:
def X_is_raw(adata):
    return np.array_equal(adata.X.sum(axis=0).astype(int), adata.X.sum(axis=0))

### Data Upload

In [8]:
Wang_adata = sc.read('/Users/anna.maguza/Desktop/GCA_social_network/data/raw_anndata/Wang/Wang_2022_raw_anndata.h5ad')
X_is_raw(Wang_adata)

True

### Ensembla gene id extraction

In [6]:
# Connect to BioMart server
server = Server(host='http://www.ensembl.org')


In [9]:
# Make a list of gene names
gene_names = Wang_adata.var_names.tolist()

In [12]:
server.marts

{'ENSEMBL_MART_ENSEMBL': <biomart.Mart name='ENSEMBL_MART_ENSEMBL', display_name='Ensembl Genes 109', database_name='ensembl_mart_109'>,
 'ENSEMBL_MART_MOUSE': <biomart.Mart name='ENSEMBL_MART_MOUSE', display_name='Mouse strains 109', database_name='mouse_mart_109'>,
 'ENSEMBL_MART_SEQUENCE': <biomart.Mart name='ENSEMBL_MART_SEQUENCE', display_name='Sequence', database_name='sequence_mart_109'>,
 'ENSEMBL_MART_ONTOLOGY': <biomart.Mart name='ENSEMBL_MART_ONTOLOGY', display_name='Ontology', database_name='ontology_mart_109'>,
 'ENSEMBL_MART_GENOMIC': <biomart.Mart name='ENSEMBL_MART_GENOMIC', display_name='Genomic features 109', database_name='genomic_features_mart_109'>,
 'ENSEMBL_MART_SNP': <biomart.Mart name='ENSEMBL_MART_SNP', display_name='Ensembl Variation 109', database_name='snp_mart_109'>,
 'ENSEMBL_MART_FUNCGEN': <biomart.Mart name='ENSEMBL_MART_FUNCGEN', display_name='Ensembl Regulation 109', database_name='regulation_mart_109'>}

In [29]:
# Choose the database
dataset = (server.marts['ENSEMBL_MART_ENSEMBL']
                  .datasets['hsapiens_gene_ensembl'])

In [30]:
# Query the BioMart server to convert gene names to Ensembl IDs
query_result = dataset.query(attributes=['external_gene_name', 'ensembl_gene_id'])

In [31]:
# Keep only the rows where the gene name is in our list of genes
query_result = query_result[query_result['Gene name'].isin(gene_names)]

In [32]:
# Make 'Gene name' column as index
query_result = query_result.set_index('Gene name')

In [34]:
# Merge Wang_adata.var with query_result by index. For genes absent in query_result, NaN will be added
Wang_adata.var = Wang_adata.var.merge(query_result, left_index=True, right_index=True, how='left')

ValueError: Length of passed value for var_names is 21342, but this AnnData has shape: (14537, 19525)