## Notebook for preparing the AnnData object used for dataset integration

- **Developed by**: Anna Maguza
- **Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**
- 31st May 2023

### Import Packages

In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import os
import anndata as ad

In [2]:
def X_is_raw(adata):
    """Return True when every entry of ``adata.X`` is integer-valued (looks like raw counts).

    The check is done on the individual matrix entries rather than on column
    sums: non-integer values can add up to an integer (e.g. 0.5 + 0.5), so a
    sum-based check can report normalised data as raw.

    Parameters
    ----------
    adata
        Object exposing the expression matrix as ``.X`` (dense array or
        SciPy sparse matrix).

    Returns
    -------
    bool
        True if all entries are whole numbers (an empty matrix counts as True).
    """
    from scipy import sparse  # local import: SciPy is not imported at the top of the notebook

    X = adata.X
    # For sparse input only the explicitly stored entries need checking;
    # implicit zeros are integers by definition.
    values = X.tocsr().data if sparse.issparse(X) else np.asarray(X)
    return bool(np.all(np.mod(values, 1) == 0))

### Datasets Loading

In [3]:
# Read the healthy epithelial dataset (per the file name).
# The path variable is called `input_healthy` so the builtin `input()` is not shadowed.
input_healthy = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/input_files/all_cells/Healthy_epithelial_cells_all_genes.h5ad'
Healthy_adata = sc.read_h5ad(input_healthy)

In [4]:
# Read the cancer dataset; per the file name it carries labels predicted with scBalance
input_cancer = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/Labels_transfer/scBalance/Joanito_predicted_labels_with_scBalance_7000.h5ad'
Cancer_adata_predicted = sc.read_h5ad(input_cancer)

In [5]:
# Check whether Healthy_adata.X passes the X_is_raw test
X_is_raw(Healthy_adata)

True

In [6]:
# Same check for the cancer object as loaded from disk
X_is_raw(Cancer_adata_predicted)

False

In [7]:
# Display the matrix stored in the .raw slot of the cancer object
Cancer_adata_predicted.raw.X

<35714x33287 sparse matrix of type '<class 'numpy.float32'>'
	with 66145963 stored elements in Compressed Sparse Row format>

In [8]:
# Build a standalone AnnData from the .raw slot; X of the loaded object did not pass X_is_raw
Cancer_adata = Cancer_adata_predicted.raw.to_adata()

In [9]:
# Confirm that the object rebuilt from .raw passes the X_is_raw test
X_is_raw(Cancer_adata)

True

In [12]:
# Harmonise the cell-state label column so both objects use 'Unified_Cell_States':
# 'Predicted Label' in the cancer obs, 'Unified Cell States' in the healthy obs.
# Note: DataFrame.rename ignores missing keys by default, so an absent source column is not reported.
Cancer_adata.obs.rename(columns={'Predicted Label': 'Unified_Cell_States'}, inplace=True)
Healthy_adata.obs.rename(columns={'Unified Cell States': 'Unified_Cell_States'}, inplace=True)

### Datasets concatenation

In [16]:
# Stack the healthy and cancer objects into a single AnnData. The 'Sample origin'
# batch key is filled with 'Healthy gut' / 'Colorectal cancer', and cell names are
# left untouched (index_unique = None).
# NOTE(review): the recorded output shows a warning from this call; `AnnData.concatenate`
# appears to be deprecated in favour of `anndata.concat` - confirm before migrating,
# since the two may not merge obs/var identically.
adata = Healthy_adata.concatenate(
    Cancer_adata,
    batch_key = 'Sample origin',
    batch_categories = ['Healthy gut', 'Colorectal cancer'],
    index_unique = None,
)

  warn(
  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


### Identify Highly Variable Genes

In [19]:
# Keep a copy of the current adata (X and var) in the .raw slot
adata.raw = adata

In [20]:
# Keep a copy of the current X matrix in its own layer; HVG selection reads from it
adata.layers['counts'] = adata.X.copy()

# Settings for highly-variable-gene selection: Seurat v3 flavour, 5000 genes,
# computed on the 'counts' layer with 'Library_Preparation_Protocol' as batch key;
# subset = True restricts adata to the selected genes.
hvg_settings = dict(
    flavor = "seurat_v3",
    n_top_genes = 5000,
    layer = "counts",
    batch_key = "Library_Preparation_Protocol",
    subset = True,
    span = 1,
)
sc.pp.highly_variable_genes(adata, **hvg_settings)

In [25]:
# Write the combined object to disk as an .h5ad file
output_path = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/Datasets integration/Epithelial_healthy_cancer/All_cells_5000_HVGs.h5ad'
adata.write_h5ad(output_path)

### Draft functions

In [None]:
# Strip the '-Cancer' / '-Healthy' tags from the cell names of Cancer_adata_predicted.
# The cleaned names are assigned straight to obs_names, with no temporary 'Cell_ID2'
# column: set_index() already removes the column it promotes, so deleting that
# column afterwards raises a KeyError.
cleaned_cell_ids = Cancer_adata_predicted.obs_names
for origin_tag in ('-Cancer', '-Healthy'):
    # regex=False: treat the tag as literal text regardless of the pandas version default
    cleaned_cell_ids = cleaned_cell_ids.str.replace(origin_tag, '', regex=False)

Cancer_adata_predicted.obs_names = cleaned_cell_ids