## Preparation of the AnnData object used for dataset integration

- **Developed by**: Anna Maguza
- **Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**
- 31st May 2023

### Import Packages

In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import os
import anndata as ad

In [2]:
def X_is_raw(adata):
    """Return True when every value stored in ``adata.X`` is a whole number.

    Raw count matrices contain only integer values (possibly stored as floats),
    whereas normalised or log-transformed matrices contain fractional entries.
    Each stored value is tested individually; testing column sums instead can
    misreport a matrix as raw when fractional entries in a column happen to add
    up to an integer (e.g. 0.5 + 0.5).

    Parameters
    ----------
    adata
        Object exposing the expression matrix as ``.X``: either a NumPy array
        or a sparse matrix that keeps its stored entries in ``.data``
        (e.g. CSR / CSC / COO).

    Returns
    -------
    bool
        ``True`` if all entries are integer-valued (an empty matrix counts as
        raw), ``False`` otherwise, including when NaN is present.
    """
    matrix = adata.X
    if isinstance(matrix, np.ndarray):
        values = matrix
    else:
        # Sparse containers: implicit zeros are integers, so only the
        # explicitly stored entries need to be inspected.
        values = np.asarray(getattr(matrix, "data", matrix))
    # A value is a whole number exactly when its remainder modulo 1 is zero.
    return bool(np.all(np.mod(values, 1) == 0))

### Load datasets

In [3]:
# Path to the healthy epithelial dataset (all genes). The variable is named
# explicitly so the built-in `input()` is not shadowed in the kernel namespace.
healthy_input_path = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/input_files/Epithelial_cells/Healthy_epithelial_cells_all_genes.h5ad'
Healthy_adata = sc.read_h5ad(healthy_input_path)

In [4]:
# Path to the cancer epithelial dataset; the file name indicates it carries
# predicted labels and was saved after a 7000-HVG selection.
input_cancer = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/output/Epithelial/Joanito_predicted_labels_with_scBalance_7000HVGs.h5ad'
Cancer_adata_predicted = sc.read_h5ad(input_cancer)

In [5]:
# Check whether the healthy dataset's X passes the X_is_raw integer check
X_is_raw(Healthy_adata)

True

In [6]:
# Same check on the cancer object as loaded; a False result means X is not raw counts
X_is_raw(Cancer_adata_predicted)

False

In [7]:
# Inspect the matrix kept in `.raw` of the cancer object (shape, dtype and storage format)
Cancer_adata_predicted.raw.X

<35714x33287 sparse matrix of type '<class 'numpy.float32'>'
	with 66145963 stored elements in Compressed Sparse Row format>

In [8]:
# Build a standalone AnnData object from the `.raw` slot of the predicted-label object
Cancer_adata = Cancer_adata_predicted.raw.to_adata()

In [9]:
# Confirm that the object rebuilt from `.raw` passes the X_is_raw integer check
X_is_raw(Cancer_adata)

True

In [10]:
# Rename the obs column 'Predicted Label' to 'Unified Cell States'
# NOTE(review): with default arguments `rename` leaves the frame unchanged if
# 'Predicted Label' is absent — confirm the column exists in Cancer_adata.obs.
Cancer_adata.obs.rename(columns={'Predicted Label': 'Unified Cell States'}, inplace=True)

In [11]:
# Donors to be labelled 'Female'; any other donor whose 'Sex' is the placeholder
# string 'nan' is labelled 'Male'.
female_donors = [
    'Wang_Donor_2', 'N7', 'N8', 'N10', 'N13', 'N14', 'N18', 'N19', 'N20',
    'N21', 'N23', 'N24', 'N44', 'N50', 'N106', 'N110', 'N111', 'N539',
]

# Both masks are evaluated once, before any value is overwritten.
sex_missing = Healthy_adata.obs['Sex'] == 'nan'
from_female_donor = Healthy_adata.obs['Donor_ID'].isin(female_donors)

# Fill missing entries for the listed donors first ...
Healthy_adata.obs.loc[from_female_donor & sex_missing, 'Sex'] = 'Female'

# ... then every remaining missing entry becomes 'Male'.
Healthy_adata.obs.loc[sex_missing & ~from_female_donor, 'Sex'] = 'Male'

### Datasets concatenation

In [16]:
# Stack the healthy object and the cancer object into a single AnnData.
# Cell indices are left as they are (index_unique=None) and the new obs column
# 'Sample origin' records which of the two inputs each cell came from.
# NOTE(review): the warning printed under this cell is presumably anndata's
# deprecation notice for `AnnData.concatenate` — confirm, and check how obs/var
# are merged before migrating to `ad.concat`.
adata = Healthy_adata.concatenate(
    Cancer_adata,
    batch_key = 'Sample origin',
    batch_categories = ['Healthy gut', 'Colorectal cancer'],
    index_unique = None,
)

  warn(
  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


In [18]:
# Display the cell-level annotations of the concatenated object
adata.obs

Unnamed: 0,Sample_ID,Cell Type,Study_name,Donor_ID,Diagnosis,Age,Region code,Fraction,Sex,Library_Preparation_Protocol,...,KRAS,BRAF,TP53,APC,PIK3CA,LymphNode,Normal,Tumor,CMS,Sample origin
H158108_N1-GTTAAGCAGAGGTAGA,H158108_N1,Epithelial,"Kong, 2023",158108,Healthy adult,,,,Male,10x 3' v2,...,,,,,,,,,,Healthy gut
H180844_N1-AACTCAGTCAAGATCC,H180844_N1,Epithelial,"Kong, 2023",180844,Healthy adult,,,,Male,10x 3' v2,...,,,,,,,,,,Healthy gut
H180844_N4-CGCTTCAGTAGGCATG,H180844_N4,Epithelial,"Kong, 2023",180844,Healthy adult,,,,Male,10x 3' v2,...,,,,,,,,,,Healthy gut
GACGCGTTCCTCAACC-1-WTDAtest7844018,A33-CAE-0-SC-45N-1,Epithelial,"Elmentaite, 2021",A33 (414C),Healthy adult,20-25,CAE,SC-45N,Male,10x 3' v1,...,,,,,,,,,,Healthy gut
N10_LP_A-TGTGATCTTCGTTT,N10_LP_A,Epithelial,"Kong, 2023",N10,Healthy adult,,,,Female,10x 3' v1,...,,,,,,,,,,Healthy gut
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
KUL5_EXT127_GACTGCGAGTAGCGGT-1,EXT127,Epithelial,"Joanito, 2022",SC044,"Colorectal cancer, Stage III",,,,Female,10x 5' v1,...,wt,wt,wt,wt,,0.0,2.0,3.0,CMS1,Colorectal cancer
KUL5_EXT127_GTGCATAGTTTGACAC-1,EXT127,Epithelial,"Joanito, 2022",SC044,"Colorectal cancer, Stage III",,,,Female,10x 5' v1,...,wt,wt,wt,wt,,0.0,2.0,3.0,CMS1,Colorectal cancer
KUL5_EXT127_TATCAGGGTGTGAAAT-1,EXT127,Epithelial,"Joanito, 2022",SC044,"Colorectal cancer, Stage III",,,,Female,10x 5' v1,...,wt,wt,wt,wt,,0.0,2.0,3.0,CMS1,Colorectal cancer
KUL5_EXT127_TCACAAGAGATCCCGC-1,EXT127,Epithelial,"Joanito, 2022",SC044,"Colorectal cancer, Stage III",,,,Female,10x 5' v1,...,wt,wt,wt,wt,,0.0,2.0,3.0,CMS1,Colorectal cancer


### Identify Highly Variable Genes

In [17]:
# Number of cells per library preparation protocol; this column is the
# `batch_key` used for the highly-variable-gene selection further down
adata.obs['Library_Preparation_Protocol'].value_counts()

10x 3' v1    65477
10x 3' v2    52863
10x 5' v1    48529
10x 3' v3    25040
Name: Library_Preparation_Protocol, dtype: int64

In [18]:
# Store the current state of `adata` in `.raw` (done before the HVG cell,
# which subsets the genes)
adata.raw = adata

In [19]:
# Keep a copy of X in a dedicated 'counts' layer, which the HVG call reads from
adata.layers['counts'] = adata.X.copy()

# Select the top 7000 highly variable genes (seurat_v3 flavour), using the
# library preparation protocol as batch key; subset = True keeps only the
# selected genes in `adata`
sc.pp.highly_variable_genes(
    adata,
    flavor = "seurat_v3",
    n_top_genes = 7000,
    layer = "counts",
    batch_key = "Library_Preparation_Protocol",
    subset = True,
    span = 1
)

In [20]:
# Write the concatenated, HVG-subset object to disk as an .h5ad file
adata.write_h5ad('/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/input_files/Datasets_integration/All_cells_7000_HVGs.h5ad')

### Draft functions

In [None]:
# Remove the '-Cancer' and '-Healthy' markers from the cell identifiers of the
# cancer object by rewriting `obs_names` directly.
# Routing the index through a temporary 'Cell_ID2' column and `set_index` is
# unnecessary, and a trailing `del obs['Cell_ID2']` fails with KeyError because
# `set_index` already removes that column from the frame.
# `regex=False` makes the replacement a literal substring match on every
# pandas version.
# NOTE(review): stripping the markers could produce duplicate cell names —
# verify uniqueness if both markers occur for the same barcode.
Cancer_adata_predicted.obs_names = (
    Cancer_adata_predicted.obs_names
    .str.replace('-Cancer', '', regex=False)
    .str.replace('-Healthy', '', regex=False)
)