# Downloading requirements

In [1]:
import tarfile
import urllib.request
import tempfile
import anndata as ad
import scanpy as sc

import pandas as pd
import numpy as np
import seaborn as sb
from scipy import io, sparse
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib import colors

import pickle
import copy
import gzip

import logging

import rpy2.robjects as ro
from rpy2.robjects import numpy2ri, pandas2ri
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.packages import importr

numpy2ri.activate()
pandas2ri.activate()
%load_ext rpy2.ipython

In [2]:
from lab_scripts.data.preprocessing.common import gex_normalization
from data import download_data

from lab_script.utils import utils
utils.change_

#import importlib
#importlib.reload(gex_normalization)

In [3]:
%%R
old_paths <- .libPaths()[-1]
new_paths <- c("~/R/nips", old_paths)
.libPaths(new_paths)

suppressMessages(library("dplyr"))
suppressMessages(library("Seurat"))
suppressMessages(library("anndata"))
suppressMessages(library("SingleCellExperiment"))
suppressMessages(library("scran"))
suppressMessages(library("Matrix"))

# Initial data load

In [20]:
download_data.download_raw()

Download s3://nips2021/data/raw to data/raw...
download: s3://nips2021/data/raw/adt/.gitignore to data/raw/adt/.gitignore
download: s3://nips2021/data/raw/gex_adt/.gitignore to data/raw/gex_adt/.gitignore
download: s3://nips2021/data/raw/atac/.gitignore to data/raw/atac/.gitignore
download: s3://nips2021/data/raw/gex/.gitignore to data/raw/gex/.gitignore
download: s3://nips2021/data/raw/gex_atac/.gitignore to data/raw/gex_atac/.gitignore
download: s3://nips2021/data/raw/gex_adt/totalVI_10x_adt.h5ad to data/raw/gex_adt/totalVI_10x_adt.h5ad
download: s3://nips2021/data/raw/gex_adt/totalVI_10x_gex.h5ad to data/raw/gex_adt/totalVI_10x_gex.h5ad
download: s3://nips2021/data/raw/gex_adt/azimuth_adt.h5ad to data/raw/gex_adt/azimuth_adt.h5ad
download: s3://nips2021/data/raw/gex_adt/azimuth_gex.h5ad to data/raw/gex_adt/azimuth_gex.h5ad



# I. GEX modality

In [22]:
adata_RNA = ad.read_h5ad('/data/raw/gex_adt/azimuth_gex.h5ad')

## 1. QC metrics

In [4]:
# Percentage of mitochondrial counts

is_mito = adata_RNA.var_names.str.startswith("MT-")
total_mito_genes = np.sum(adata_RNA[:, is_mito].X, axis=1).A1
total_all_genes = np.sum(adata_RNA.X, axis=1).A1
mito_genes_percent = (total_mito_genes / total_all_genes) * 100.0
adata_RNA.obs['pct_counts_mt'] = mito_genes_percent
adata_RNA.obs['pct_counts_mt'].max()

14.998124

In [5]:
# UMI counts per cell
# asarray convert matrix (n_cells, 1) to array (n_cells,)

adata_RNA.obs['n_counts'] = np.asarray(np.sum(adata_RNA.X, axis = 1)).reshape(-1)
adata_RNA.obs['n_counts'].min()

811.0

In [6]:
# number of genes per cell

total_detected_genes = (adata_RNA.X > 0).sum(axis=1)
total_detected_genes = np.asarray(total_detected_genes).reshape(-1)
adata_RNA.obs['n_genes'] = total_detected_genes
adata_RNA.obs['n_genes'].min()

501

In [7]:
# Filter cells according to identified QC thresholds:

print('Total number of cells: {:d}'.format(adata_RNA.n_obs))

sc.pp.filter_cells(adata_RNA, min_counts = 1500)

print('Number of cells after min count filter: {:d}'.format(adata_RNA.n_obs))

sc.pp.filter_cells(adata_RNA, max_counts = 40000)
print('Number of cells after max count filter: {:d}'.format(adata_RNA.n_obs))

adata_RNA_filtered = adata_RNA[adata_RNA.obs['pct_counts_mt'] < 20]
print('Number of cells after MT filter: {:d}'.format(adata_RNA.n_obs))

sc.pp.filter_cells(adata_RNA, min_genes = 700)
print('Number of cells after gene filter: {:d}'.format(adata_RNA.n_obs))

Total number of cells: 161764
Number of cells after min count filter: 160326
Number of cells after max count filter: 160273
Number of cells after MT filter: 160273
Number of cells after gene filter: 159964


In [8]:
# Filter genes

print('Total number of genes: {:d}'.format(adata_RNA.n_vars))
sc.pp.filter_genes(adata_RNA, min_cells=20)
print('Number of genes after cell filter: {:d}'.format(adata_RNA.n_vars))
adata_RNA.write('./adata_RNA_filtered.h5ad')

Total number of genes: 33538
Number of genes after cell filter: 21358


## 2. Normalization (size factors)

In [None]:
batches = sorted(adata_RNA.obs['batch'].value_counts().index.tolist())
del adata_RNA

for i in range(len(batches)):
        print(i)
        adata_RNA = ad.read_h5ad('./adata_RNA_filtered.h5ad')
        batch = adata_RNA[adata_RNA.obs['batch'] == batches[i]].copy()
        del adata_RNA
        batch_processed = gex_normalization.standard_normalization(batch, {})
        batch_processed.write_h5ad(par['./batch_processed{0}'.format(i+1)], compression = "gzip")
        del batch_processed

0
         Falling back to preprocessing with `sc.pp.pca` and default params.


In [None]:
# Keep the count data in a counts layer
adata_RNA.layers["counts"] = adata_RNA.X.copy()

# Normalize & Log-transform 
adata_RNA.X /= adata_RNA.obs['size_factors'].values[:,None]
adata_RNA.X = sc.pp.log1p(adata_RNA.X)
adata_RNA.write('./adata_RNA_final.h5ad')

# II. ADT

In [4]:
# Inner load

adata_ADT = ad.read_h5ad('./data/raw/gex_adt/azimuth_adt.h5ad')

In [5]:
protein_names = adata_ADT.var.index.tolist()
len(protein_names)

228

In [6]:
# Download protein names from example dataset

file = open('/home/alina/Desktop/neurips/notebooks/protein_names_explore', 'rb')
protein_names_explore = pickle.load(file)
file.close()
len(protein_names_explore)

134

In [7]:
outersect = set(protein_names_explore) - set(protein_names)
len(outersect)

34

Azimuth ADT dataset contains 228 proteins.
<br>Example ADT dataset contains 134 proteins.
<br>34 proteins from example dataset are not in azimuth dataset.

# 1. QC metrics

## Isotypic proteins counts per cell

Isotypic proteins are controls which do not target any human proteins and their expression should be considered background. 
<br>Here 4 rat proteins are observed. It is needed to relocate their matrix to another layes & remove them from adata.X and adata.var.

In [8]:
# Find rat proteins

for i in range(len(protein_names)):
    if protein_names[i][0:2] == 'Ra':
        print(protein_names[i], '   Index:', i)
    else:
        continue

Rat-IgG1-1    Index: 1
Rat-IgG2b    Index: 19
Rat-IgG1-2    Index: 37
Rag-IgG2c    Index: 54


In [9]:
# Create df from sparse matrix
# Extract only columns with isoproteins from df
# Add columns to adata.obsm layer

isotype_df = pd.DataFrame.sparse.from_spmatrix(adata_ADT.X)
isotype_df.index = adata_ADT.obs.index
isotype_df.columns = adata_ADT.var.index

isotype_proteins = ['Rat-IgG1-1', 'Rat-IgG2b', 'Rat-IgG1-2', 'Rag-IgG2c']
isotype_df = isotype_df[isotype_proteins]
adata_ADT.obsm['isotype_controls'] = isotype_df

In [10]:
# Filter out isoproteins from adata.var

adata_ADT.var_filtered = adata_ADT.var.drop(index=isotype_proteins)

In [11]:
adata_ADT

AnnData object with n_obs × n_vars = 161764 × 228
    obs: 'nCount_ADT', 'nFeature_ADT', 'orig.ident', 'lane', 'batch', 'time', 'celltype.l1', 'celltype.l2', 'celltype.l3', 'Phase', 'seq_batch', 'cell_type'
    var: 'names', 'feature_types'
    uns: 'dataset_id', 'organism'
    obsm: 'isotype_controls'

In [12]:
# Filter out isoproteins from adata.X

all_cols = np.arange(adata_ADT.X.shape[1])
cols_to_remove = [1, 19, 37, 54]
cols_to_keep = list(set(all_cols) - set(cols_to_remove))
adata_ADT.X_filtered = adata_ADT.X[:, cols_to_keep]

In [13]:
adata_ADT_new =  ad.AnnData(
    X = adata_ADT.X_filtered,
    var = adata_ADT.var_filtered,
    obs = adata_ADT.obs,
    uns = adata_ADT.uns,
    obsm = adata_ADT.obsm)

In [14]:
# Sum of counts in 4 isotypic proteins per cell
# Range of isotypic counts under filtration in explore dataset is (1, 100) for 6 isoproteins
# Here we have 4 proteins, so the proportion of 100 is ~67
# Thresholds should be (1, 67)

adata_ADT_new.obs['iso_count'] = np.asarray(np.sum(adata_ADT_new.obsm['isotype_controls'], axis = 1)).reshape(-1)
(adata_ADT_new.obs['iso_count'].min(), adata_ADT_new.obs['iso_count'].max())

(0.0, 2568.0)

## Total counts per cell

In [17]:
# ADT counts per cell
# Same thresholds as in example dataset: (1100, 24000)

adata_ADT_new.obs['total_counts'] = np.asarray(np.sum(adata_ADT_new.X, axis = 1)).reshape(-1)
(adata_ADT_new.obs['total_counts'].min(), adata_ADT_new.obs['total_counts'].max())

(953.0, 49287.0)

## Proteins per cell

In [21]:
# Number of proteins per cell; Total number = 228
# total_number*0.6 is a threshold in example dataset
# Here it is equal to 134 proteins, but is seems not enough strict (see EDA),
# so 170 is taken as thresholds

adata_ADT_new.obs['n_antibodies_by_counts'] = np.asarray((adata_ADT_new.X > 0).sum(axis=1)).reshape(-1)
(adata_ADT_new.obs['n_antibodies_by_counts'].min(), adata_ADT_new.obs['n_antibodies_by_counts'].max())

(103, 224)

## Filtering

In [24]:
# Filter cells according to identified QC thresholds:

print('Total number of cells: {:d}'.format(adata_ADT_new.n_obs))

adata_ADT_new = adata_ADT_new[adata_ADT_new.obs['iso_count'] > 1]
print('Number of cells after min isocount filter: {:d}'.format(adata_ADT_new.n_obs))

adata_ADT_new = adata_ADT_new[adata_ADT_new.obs['iso_count'] < 67]
print('Number of cells after max isocount filter: {:d}'.format(adata_ADT_new.n_obs))

sc.pp.filter_cells(adata_ADT_new, min_counts = 1100)
print('Number of cells after min count filter: {:d}'.format(adata_ADT_new.n_obs))

sc.pp.filter_cells(adata_ADT_new, max_counts = 24000)
print('Number of cells after max count filter: {:d}'.format(adata_ADT_new.n_obs))

sc.pp.filter_cells(adata_ADT_new, min_genes = 170)  # 60% of n_genes
print('Number of cells after protein number filter: {:d}'.format(adata_ADT_new.n_obs))

Trying to set attribute `.obs` of view, copying.


Total number of cells: 161764
Number of cells after min isocount filter: 161450
Number of cells after max isocount filter: 160582
Number of cells after min count filter: 160377
Number of cells after max count filter: 159998
Number of cells after protein number filter: 158187


In [25]:
# Filter genes

print('Total number of genes: {:d}'.format(adata_ADT_new.n_vars))
sc.pp.filter_genes(adata_ADT_new, min_cells=40)

print('Number of genes after cell filter: {:d}'.format(adata_ADT_new.n_vars))
adata_ADT.write('./adata_ADT_filtered.h5ad')

Total number of genes: 224
Number of genes after cell filter: 224


## 2. Normalization

In [26]:
# Keep the count data in a counts layer

adata_ADT_new.layers["counts"] = adata_ADT_new.X.copy()

In [27]:
# Reimplemented in Python

def CLR_transform(sparse):
    array = sparse.toarray()
    logn1 = np.log(array+1)
    mean = np.nanmean(logn1, axis = 1)
    exponent = np.exp(mean)
    ratio = (array/exponent[:,None]) + 1
    T_clr = np.log(ratio)
    transformed = csr_matrix(T_clr)
    return transformed

In [29]:
adata_ADT_new

AnnData object with n_obs × n_vars = 158187 × 224
    obs: 'nCount_ADT', 'nFeature_ADT', 'orig.ident', 'lane', 'batch', 'time', 'celltype.l1', 'celltype.l2', 'celltype.l3', 'Phase', 'seq_batch', 'cell_type', 'iso_count', 'total_counts', 'n_antibodies_by_counts', 'n_counts', 'n_genes'
    var: 'names', 'feature_types', 'n_cells'
    uns: 'dataset_id', 'organism'
    obsm: 'isotype_controls'
    layers: 'counts'

In [30]:
batches = sorted(adata_ADT_new.obs['batch'].value_counts().index.tolist())
var = adata_ADT_new.var
uns = adata_ADT_new.uns
print(batches)

['P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8']


In [31]:
for i in range(len(batches)):
    if i==0:
        batch = adata_ADT_new[adata_ADT_new.obs['batch'] == batches[i]].copy()
        del adata_ADT_new
        batch.X = CLR_transform(batch.X)
        adata_ADT_final = batch
    else:
        adata_ADT_new = ad.read_h5ad('./adata_ADT_filtered.h5ad')
        batch = adata_ADT_new[adata_ADT_new.obs['batch'] == batches[i]].copy()
        del adata_ADT_new
        batch.X = CLR_transform(batch.X)
        adata_ADT_final = ad.concat([adata_ADT_final, batch], axis=0)

In [32]:
adata_ADT_final.var = var
adata_ADT_final.uns = uns
adata_ADT_final

AnnData object with n_obs × n_vars = 161375 × 224
    obs: 'nCount_ADT', 'nFeature_ADT', 'orig.ident', 'lane', 'batch', 'time', 'celltype.l1', 'celltype.l2', 'celltype.l3', 'Phase', 'seq_batch', 'cell_type'
    var: 'names', 'feature_types', 'n_cells'
    uns: 'dataset_id', 'organism'
    obsm: 'isotype_controls'

In [35]:
adata_ADT_final.write_h5ad("adata_ADT_final.h5ad")