# Notebook to integrate AnnData objects from 'CellxGene' and 'HuBMAP'

**Developed by**: Srivalli Kolla

**Created on** : 08 July, 2024

**Last modified** : 08 July, 2024

**Würzburg Institute for Systems Immunology & Julius-Maximilian-Universität Würzburg**

# Import packages

In [None]:
import anndata as ad
import scanpy as sc
import bbknn
import scib
import time
import os 
import glob

# Setting up environment

In [None]:
# Scanpy logging at verbosity level 3, plus a printout of package versions
# so the run environment is recorded alongside the results.
sc.settings.verbosity = 3
sc.logging.print_versions()

# Figure defaults shared by every plot in this notebook.
sc.settings.set_figure_params(
    dpi = 300,
    dpi_save = 300,
    color_map = 'magma',
    vector_friendly = True,
    format = 'svg',
)

# Run date as dd_mm_yyyy; embedded in the output file names further down.
timestamp = time.strftime("%d_%m_%Y")

# Data loading and Integration

## Data loading

In [None]:
# Load the CellxGene AnnData file from disk. The bare name on the last line
# makes the notebook display the object's summary.
adata_cellxgene = sc.read_h5ad('../cellxgene/data/cg_heart_all.h5ad')
adata_cellxgene

##### Steps
1. Define paths for the single-cell and single-nucleus HuBMAP data
2. Load the HuBMAP data
3. Create a for loop which adds a new obs column 'suspension_type'
4. Create another obs column 'cell_id' if it is not present
5. Concatenate the data separately for single cell and single nuclei

In [None]:
# Directories searched for HuBMAP .h5ad files: single-cell (sc) and
# single-nucleus (sn) data are kept in separate folders.
sc_path = '../hubmap/sc/'
sn_path = '../hubmap/sn/'

def load_and_annotate_datasets(path, suspension_type):
    """Load every ``.h5ad`` file in a directory and tag it with a suspension type.

    Parameters
    ----------
    path : str
        Directory searched (non-recursively) for files matching ``*.h5ad``.
    suspension_type : str
        Value written to ``obs['suspension_type']`` of every loaded object.

    Returns
    -------
    list
        One AnnData object per matching file, in sorted file-path order.
        The list is empty when the directory holds no ``.h5ad`` files.
    """
    file_pattern = os.path.join(path, '*.h5ad')
    # glob yields matches in arbitrary, filesystem-dependent order; sorting
    # makes the order of the returned datasets (and therefore of any later
    # concatenation) reproducible across runs and machines.
    hubmap_files = sorted(glob.glob(file_pattern))
    datasets = []
    for file in hubmap_files:
        adata = ad.read_h5ad(file)
        adata.obs['suspension_type'] = suspension_type

        # Make sure each object has a 'cell_id' obs column, filled with None
        # when the file did not provide one (presumably so the column is
        # shared by all objects when they are concatenated - confirm).
        if 'cell_id' not in adata.obs.columns:
            adata.obs['cell_id'] = None

        datasets.append(adata)
    return datasets

# Single-cell data: load every file, then stack the objects along obs.
# merge='same' keeps only annotations that are identical in all inputs.
sc_datasets = load_and_annotate_datasets(sc_path, 'cell')
sc_combined = ad.concat(sc_datasets, merge = 'same')

# Single-nucleus data: same procedure.
sn_datasets = load_and_annotate_datasets(sn_path, 'nucleus')
sn_combined = ad.concat(sn_datasets, merge = 'same')

# Print both summaries, one after the other.
print(sc_combined, sn_combined, sep = '\n')

In [None]:
# Stack the single-cell and single-nucleus HuBMAP objects into one AnnData;
# merge='same' keeps only annotations that are identical in both inputs.
adata_hubmap_combined = ad.concat([sc_combined, sn_combined], merge = 'same')
adata_hubmap_combined

In [None]:
# Copy the var index (var_names) into an explicit 'feature_id' column, then
# display the var table.
adata_hubmap_combined.var['feature_id'] = adata_hubmap_combined.var_names
adata_hubmap_combined.var

In [None]:
# Use the 'feature_id' column as the var index of the CellxGene data;
# drop=False keeps 'feature_id' available as a regular column as well.
adata_cellxgene.var.set_index('feature_id', inplace= True, drop= False)
adata_cellxgene.var

In [None]:
# Display the CellxGene object summary for inspection.
adata_cellxgene

In [None]:
# Display the combined HuBMAP object summary for inspection.
adata_hubmap_combined

## Data Preparation for integration

#### Steps

1. Define mappings for columns that hold the same information under different names
2. Rename the obs and var columns of the HuBMAP data to match those of CellxGene
3. Define a function that makes obs and var column names unique
4. Make the obs and var column names unique

In [None]:
# HuBMAP obs column name -> name to use after renaming.
obs_rename_mapping = {
    'cell_id': 'cell_type' 
}

# HuBMAP var column name -> name to use after renaming.
# 'feature_id' maps to itself, so that entry leaves the column unchanged.
var_rename_mapping = {
    'feature_id': 'feature_id',
    'hugo_symbol': 'feature_name'
}

# Rename in place. Columns not listed in the mappings are left as they are.
# NOTE(review): if the HuBMAP obs already has a 'cell_type' column, this
# rename produces a duplicate column name - confirm this is expected.
adata_hubmap_combined.obs.rename(columns= obs_rename_mapping, inplace= True)
adata_hubmap_combined.var.rename(columns= var_rename_mapping, inplace= True)

def make_unique(column_names):
    """Make the entries of a list of names unique by appending '_dup'.

    The list is modified in place and also returned. The first occurrence of
    a name is kept as is; each later clash gets '_dup' appended repeatedly
    until the result has not been used before.
    """
    taken = set()
    for position, original in enumerate(column_names):
        candidate = original
        # Keep extending the name until it no longer collides with an
        # earlier (possibly already suffixed) entry.
        while candidate in taken:
            candidate = candidate + '_dup'
        column_names[position] = candidate
        taken.add(candidate)
    return column_names


# De-duplicate the column names of the obs and var tables of both datasets.
for frame in (
    adata_cellxgene.obs,
    adata_hubmap_combined.obs,
    adata_cellxgene.var,
    adata_hubmap_combined.var,
):
    frame.columns = make_unique(list(frame.columns))

# Print both summaries, one after the other.
print(adata_cellxgene, adata_hubmap_combined, sep = '\n')

In [None]:
# Checkpoint: write the combined HuBMAP object to disk; the file name
# carries the run date held in `timestamp`.
adata_hubmap_combined.write_h5ad(f'../hubmap/integrated_hubmap-{timestamp}.h5ad')

In [None]:
# Reload the HuBMAP checkpoint from disk.
# NOTE(review): the date in this file name is hard-coded, while the write
# step builds its name from `timestamp`; on any other run date the two will
# not refer to the same file - confirm this is intended.
adata_hubmap_combined = sc.read_h5ad('../hubmap/integrated_hubmap-08_07_2024.h5ad')
adata_hubmap_combined

In [None]:
# Make the obs names (the obs index) unique in place, then display the object.
adata_hubmap_combined.obs_names_make_unique()
adata_hubmap_combined

## Data integration

In [None]:
# Concatenate CellxGene and HuBMAP into one object.
#   label='database' with keys=[...] records each cell's source dataset in
#       obs['database'] ('cellxgene' or 'hubmap').
#   join='outer' keeps the union of the two inputs rather than their
#       intersection.
#   merge='same' keeps only annotations that are identical in both inputs.
datasets = [adata_cellxgene, adata_hubmap_combined]
adata_combined = ad.concat(datasets, label='database', keys=['cellxgene', 'hubmap'], merge = 'same', join= 'outer')
adata_combined

In [None]:
# Checkpoint: write the integrated object to the current working directory;
# the file name ends with the run date held in `timestamp`.
adata_combined.write_h5ad(f'integrated_cg_hm{timestamp}.h5ad')

In [None]:
# Reload the integrated checkpoint.
# NOTE(review): both the directory ('../data_integration/') and the date are
# hard-coded here, whereas the write step uses the current working directory
# and `timestamp`; this only finds that file when the notebook runs from
# 'data_integration' on the matching date - confirm.
adata_combined = sc.read_h5ad('../data_integration/integrated_cg_hm08_07_2024.h5ad')


In [None]:
# Display the integrated object summary for inspection.
adata_combined

# Data preprocessing

## Filtration

In [None]:
# Quality filtering, applied in place: drop cells with fewer than 200
# detected genes, then drop genes detected in fewer than 3 cells.
sc.pp.filter_cells(adata_combined, min_genes=200)
sc.pp.filter_genes(adata_combined, min_cells=3)

## Normalization

In [None]:
# Scale every cell to a total of 10,000 and then apply log(1 + x), in place.
# NOTE(review): assumes X holds raw counts in both source datasets - confirm.
sc.pp.normalize_total(adata_combined, target_sum=1e4)
sc.pp.log1p(adata_combined)

## Highly variable genes selection

In [None]:
# Flag the 10,000 most variable genes; the flag is read back from
# var['highly_variable'] on the next line.
sc.pp.highly_variable_genes(adata_combined, n_top_genes=10000)

# Subsetting an AnnData returns a view onto the parent object. The following
# cells modify the data in place (scaling, PCA), so take an explicit copy:
# this avoids the implicit view-to-copy conversion and lets the full-size
# parent object be released.
adata_combined = adata_combined[:, adata_combined.var['highly_variable']].copy()
adata_combined

## Dimensionality reduction

In [None]:
# Scale each gene and clip values above 10, then run PCA with the ARPACK
# solver. Both steps operate on adata_combined in place.
sc.pp.scale(adata_combined, max_value=10)
sc.tl.pca(adata_combined, svd_solver='arpack')

# Data visualization

In [None]:
# Neighbourhood graph and UMAP embedding of the data before batch correction.
sc.pp.neighbors(adata_combined)
sc.tl.umap(adata_combined)

# One panel per obs column; the figure is also saved to disk via `save`.
sc.pl.umap(
    adata_combined,
    color = ['cell_type', 'database','suspension_type'],
    title = 'Before batch correction',
    frameon = False,
    save = f': Before batch correction:{timestamp}.svg',
)

## Batch correction

In [None]:
# Batch correction with BBKNN, using obs['database'] as the batch label.
# copy=True returns a new object and leaves adata_combined untouched.
# Despite its name, `filtered_data` is not filtered in this cell; it is the
# BBKNN output.
filtered_data = bbknn.bbknn(adata_combined, batch_key='database',approx = True, copy = True)
filtered_data

In [None]:
# Recompute the UMAP embedding on the BBKNN output.
sc.tl.umap(filtered_data)

# One panel per obs column; the figure is also saved to disk via `save`.
sc.pl.umap(
    filtered_data,
    color = [
        'sample_id',
        'cell_type',
        'database',
        'disease',
        'development_stage',
        'tissue',
    ],
    title = 'After batch correction',
    save = f': After batch correction:{timestamp}.svg',
)

# Data Storage

In [None]:
# Save the batch-corrected object. The path must be an f-string so that
# `timestamp` is interpolated; without the prefix the file would literally be
# named 'integrated_cg_hubmap_filtered{timestamp}.h5ad'.
filtered_data.write(f'./data/integrated_cg_hubmap_filtered{timestamp}.h5ad')