# Loading requirements (Python imports and R libraries)

In [1]:
import tarfile
import urllib.request
import tempfile
import anndata as ad
import scanpy as sc

import pandas as pd
import numpy as np
import seaborn as sb
from scipy import io, sparse
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib import colors

import pickle
import copy
import gzip

import rpy2.rinterface_lib.callbacks
import logging
from rpy2.robjects import pandas2ri
%load_ext rpy2.ipython

In [2]:
%%R
# Library search path: the project library "~/R/nips" comes first, followed by
# every currently configured path except the first one (`[-1]` drops element 1).
old_paths <- .libPaths()[-1]
new_paths <- c("~/R/nips", old_paths)
.libPaths(new_paths)

# Attach the required packages in this fixed order, hiding start-up messages.
for (pkg in c("dplyr", "Seurat", "anndata", "SingleCellExperiment", "scran", "Matrix")) {
    suppressMessages(library(pkg, character.only = TRUE))
}

# Initial data load

In [3]:
## VIASH START
# Default parameters used when the notebook is run interactively.
# NOTE(review): the VIASH START/END markers presumably delimit a section that
# the viash tooling substitutes at build time - keep them intact.
par = dict(
    # Identifier stored in the output files' `uns["dataset_id"]`.
    id="azimuth_ref",
    # GEO download URL of the count archive.
    input_count="https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE164378&format=file",
    # GEO download URL of the gzip-compressed metadata CSV.
    input_meta="https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE164378&format=file&file=GSE164378%5Fsc%2Emeta%2Edata%5F3P%2Ecsv%2Egz",
    organism="human",
    # Destination files for the RNA and the second (ADT) modality.
    output_rna="output_rna.h5ad",
    output_mod2="output_mod2.h5ad",
)
## VIASH END

In [4]:
###############################################################################
###                     DOWNLOAD AND READ DATA.                             ###
###############################################################################
# Each download goes into its own named temporary file. The file objects are
# kept in `tar_temp` / `meta_temp` because later steps reopen them by `.name`.

# Count archive
url = par['input_count']
print("Downloading file from", url)
tar_temp = tempfile.NamedTemporaryFile()
urllib.request.urlretrieve(url, filename=tar_temp.name)

# Per-cell metadata table
url = par['input_meta']
print("Downloading meta data from", url)
meta_temp = tempfile.NamedTemporaryFile()
urllib.request.urlretrieve(url, filename=meta_temp.name)

###############################################################################
###                      EXTRACT AND CREATE H5ADs                           ###
###############################################################################
# For every sample prefix, read the `-matrix.mtx.gz`, `-barcodes.tsv.gz` and
# `-features.tsv.gz` members of the downloaded archive and assemble them into
# an AnnData object (observations = barcodes, variables = features).

print("Extracting and create h5ads")
samples = ['GSM5008737_RNA_3P', 'GSM5008738_ADT_3P'] # first sample is rna, second is protein data
adatas = []

# The `with` statement closes the archive on exit, so no explicit close() call
# is required.
with tarfile.open(tar_temp.name, 'r') as tar:
    for sample in samples:
        print("Processing sample " + sample)
        with gzip.open(tar.extractfile(sample + '-matrix.mtx.gz'), 'rb') as mm:
            print('Loading matrix')
            # The matrix is transposed after reading so that its rows line up
            # with the barcodes (obs) and its columns with the features (var).
            X = io.mmread(mm).T.tocsr()
        # Barcodes: single column, used as the observation index.
        obs = pd.read_csv(
            tar.extractfile(sample + '-barcodes.tsv.gz'),
            compression='gzip',
            header=None,
            sep='\t',
            index_col=0
        )
        obs.index.name = None
        # Features: only the first column is kept and used as the name/index.
        var = pd.read_csv(
            tar.extractfile(sample + '-features.tsv.gz'),
            compression='gzip',
            header=None,
            sep='\t'
        ).iloc[:, :1]
        var.columns = ['names']
        var.index = var['names'].values
        adata = ad.AnnData(X=X, obs=obs, var=var)

        # Duplicate feature names get a numeric suffix.
        adata.var_names_make_unique()
        adatas.append(adata)

# Unpacking (rather than indexing) fails loudly if the number of loaded samples
# ever stops matching the two expected modalities.
adata_RNA, adata_ADT = adatas

###############################################################################
###                            POST PROCESS                                 ###
###############################################################################
# Join the downloaded per-cell metadata onto both modalities and fill in the
# `obs`, `var` and `uns` fields of the output objects.
print("Reading metadata")
meta = pd.read_csv(meta_temp.name, index_col = 0, compression = "gzip")
# Each modality keeps every metadata column except those whose name ends with
# the other modality's suffix.
meta_adt = meta.loc[:,~meta.columns.str.endswith('RNA')]
meta_rna = meta.loc[:,~meta.columns.str.endswith('ADT')]

print("Setting additional output fields")
# set obs
# 'Batch' becomes 'seq_batch' and 'donor' becomes 'batch'; 'cell_type' is a
# copy of the 'celltype.l2' annotation.
adata_RNA.obs = adata_RNA.obs.join(meta_rna).rename(columns = {'Batch':'seq_batch', 'donor':'batch'})
adata_RNA.obs['cell_type'] = adata_RNA.obs['celltype.l2']

adata_ADT.obs = adata_ADT.obs.join(meta_adt).rename(columns = {'Batch':'seq_batch', 'donor':'batch'})
adata_ADT.obs['cell_type'] = adata_ADT.obs['celltype.l2']

#  set var
adata_RNA.var['feature_types'] = "GEX"
adata_ADT.var['feature_types'] = "ADT"

# set uns
# Each object receives its own copy of the dict: assigning the same dict to
# both would make any later write to one object's `.uns` appear in the other.
uns = { "dataset_id" : par["id"], "organism" : par["organism"] }
adata_RNA.uns = dict(uns)
adata_ADT.uns = dict(uns)


###############################################################################
###                             SAVE OUTPUT                                 ###
###############################################################################
# Write both modalities as gzip-compressed h5ad files to the configured paths.
print("Saving output")
adata_RNA.write_h5ad(filename=par['output_rna'], compression="gzip")
adata_ADT.write_h5ad(filename=par['output_mod2'], compression="gzip")

Downloading file from https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE164378&format=file


KeyboardInterrupt: 

# I. GEX modality

## 1. QC metrics

In [5]:
# Percentage of mitochondrial counts

# Genes whose name starts with "MT-" are counted as mitochondrial.
is_mito = adata_RNA.var_names.str.startswith("MT-")

# Per-cell sums over the mitochondrial genes and over all genes; `.A1`
# flattens the returned column matrix into a 1-D array.
total_mito_genes = adata_RNA[:, is_mito].X.sum(axis=1).A1
total_all_genes = adata_RNA.X.sum(axis=1).A1

mito_genes_percent = 100.0 * (total_mito_genes / total_all_genes)
adata_RNA.obs['pct_counts_mt'] = mito_genes_percent

# Displayed value: the largest mitochondrial percentage over all cells.
adata_RNA.obs['pct_counts_mt'].max()

14.998124

In [6]:
# UMI counts per cell
# The row sum is an (n_cells, 1) matrix; asarray + ravel turn it into a flat
# (n_cells,) array.

adata_RNA.obs['n_counts'] = np.asarray(adata_RNA.X.sum(axis=1)).ravel()

# Displayed value: the smallest per-cell count total.
adata_RNA.obs['n_counts'].min()

811.0

In [7]:
# number of genes per cell

# A gene counts as detected in a cell when its entry is strictly positive.
total_detected_genes = np.asarray((adata_RNA.X > 0).sum(axis=1)).ravel()
adata_RNA.obs['n_genes'] = total_detected_genes

# Displayed value: the smallest number of detected genes in any cell.
adata_RNA.obs['n_genes'].min()

501

In [9]:
# Filter cells according to identified QC thresholds:
#   - at least 1500 and at most 40000 counts per cell
#   - less than 20 % mitochondrial counts
#   - at least 700 detected genes
# The object is filtered step by step and the remaining cell count is printed
# after each step.

print('Total number of cells: {:d}'.format(adata_RNA.n_obs))

sc.pp.filter_cells(adata_RNA, min_counts = 1500)

print('Number of cells after min count filter: {:d}'.format(adata_RNA.n_obs))

sc.pp.filter_cells(adata_RNA, max_counts = 40000)
print('Number of cells after max count filter: {:d}'.format(adata_RNA.n_obs))

# The mask result is assigned back to adata_RNA so that the filter actually
# takes effect and the printed count reflects it (previously the subset was
# stored under a separate, unused name). `.copy()` turns the view returned by
# the indexing into an independent object for the in-place filter below.
adata_RNA = adata_RNA[adata_RNA.obs['pct_counts_mt'] < 20].copy()
print('Number of cells after MT filter: {:d}'.format(adata_RNA.n_obs))

sc.pp.filter_cells(adata_RNA, min_genes = 700)
print('Number of cells after gene filter: {:d}'.format(adata_RNA.n_obs))

Total number of cells: 161764
Number of cells after min count filter: 160326
Number of cells after max count filter: 160273
Number of cells after MT filter: 160273
Number of cells after gene filter: 159964


In [10]:
# Filter genes
# Genes detected in fewer than 20 cells are removed, then the filtered object
# is written to disk as a checkpoint.

print('Total number of genes: {:d}'.format(adata_RNA.n_vars))

sc.pp.filter_genes(adata_RNA, min_cells=20)

print('Number of genes after cell filter: {:d}'.format(adata_RNA.n_vars))

adata_RNA.write('./adata_RNA_filtered.h5ad')

Total number of genes: 33538
Number of genes after cell filter: 21358


## 2. Normalization (size factors)

In [5]:
# Perform a clustering for scran normalization in clusters

# All preprocessing here happens on a copy; adata_RNA itself is not modified.
adata_pp = adata_RNA.copy()

# Scale every cell to 1e6 total counts, then log-transform.
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)

# NOTE: the explicit PCA step is disabled here: sc.pp.pca(adata_pp, n_comps=50)
sc.pp.neighbors(adata_pp)

# Cluster labels are stored in adata_pp.obs['groups'].
sc.tl.louvain(adata_pp, resolution=0.5, key_added='groups')

In [None]:
# Preprocess variables for scran normalization

# Cluster label of every cell, taken from the temporary clustering object,
# which is deleted right afterwards.
input_groups = adata_pp.obs['groups']
del adata_pp

# Transposed count matrix (features x cells), written to disk in Matrix Market
# format so that the R cell can read it.
data_mat = adata_RNA.X.T
io.mmwrite(
    "data_mat.mtx",
    data_mat,
    comment='',
    field=None,
    precision=None,
    symmetry=None,
)

In [None]:
%%R -i input_groups -o size_factors

# Load the count matrix written by the Python side and compute size factors,
# passing the imported cluster labels as `clusters`. The result is exported
# back to Python through `-o size_factors`.
data_mat <- readMM("./data_mat.mtx")
size_factors <- calculateSumFactors(
    data_mat,
    clusters = input_groups,
    min.mean = 0.1
)

In [None]:
# Attach the size factors exported by the preceding %%R cell (`-o size_factors`)
# to the cells; the normalization below reads them from `obs`.
size_factor_values = np.asarray(size_factors, dtype=float).ravel()
if size_factor_values.shape[0] != adata_RNA.n_obs:
    raise ValueError(
        "Expected one size factor per cell ({:d}), got {:d}".format(
            adata_RNA.n_obs, size_factor_values.shape[0]
        )
    )
adata_RNA.obs['size_factors'] = size_factor_values

# Keep the count data in a counts layer
adata_RNA.layers["counts"] = adata_RNA.X.copy()

# Normalize & Log-transform
# Row scaling is done by multiplying with a diagonal matrix so that X stays
# sparse; dividing a sparse matrix by a dense column vector yields a dense
# result instead.
adata_RNA.X = sparse.diags(1.0 / adata_RNA.obs['size_factors'].values) @ sparse.csr_matrix(adata_RNA.X)

# log1p is applied to the AnnData object (in place on X) and the result is then
# copied into its own layer, so that "log_norm" is a real, independent matrix.
sc.pp.log1p(adata_RNA)
adata_RNA.layers["log_norm"] = adata_RNA.X.copy()
adata_RNA.write('./adata_RNA_final.h5ad')

# II. ADT

## 1. QC metrics

In [11]:
# ADT counts per cell
# Row sums of the ADT matrix, flattened from (n_cells, 1) to (n_cells,).

adata_ADT.obs['total_counts'] = np.asarray(adata_ADT.X.sum(axis=1)).ravel()

# Displayed value: the smallest per-cell ADT count total.
adata_ADT.obs['total_counts'].min()

961.0

In [13]:
# Number of proteins per cell
# Total number = 228
# A protein counts as detected in a cell when its entry is strictly positive;
# the result is stored under the 'n_genes' column.

adata_ADT.obs['n_genes'] = np.asarray((adata_ADT.X > 0).sum(axis=1)).ravel()

# Displayed value: the smallest number of detected proteins in any cell.
adata_ADT.obs['n_genes'].min()

105

In [28]:
# Filter cells according to identified QC thresholds:
#   - at least 1100 and at most 24000 ADT counts per cell
#   - at least 60 % of all proteins detected
# The remaining cell count is printed after each step.

print('Total number of cells: {:d}'.format(adata_ADT.n_obs))

sc.pp.filter_cells(adata_ADT, min_counts = 1100)
print('Number of cells after min count filter: {:d}'.format(adata_ADT.n_obs))

sc.pp.filter_cells(adata_ADT, max_counts = 24000)
print('Number of cells after max count filter: {:d}'.format(adata_ADT.n_obs))

# 60% of n_genes, rounded to a whole number (kept as a float by round(..., 0))
sc.pp.filter_cells(adata_ADT, min_genes = round(adata_ADT.n_vars * 0.6, 0))
print('Number of cells after protein number filter: {:d}'.format(adata_ADT.n_obs))

Total number of cells: 161764
Number of cells after min count filter: 161541
Number of cells after max count filter: 160899
Number of cells after protein number filter: 160786


In [30]:
# Filter genes
# Features (proteins) detected in fewer than 20 cells are removed, then the
# filtered object is written to disk as a checkpoint.

print('Total number of genes: {:d}'.format(adata_ADT.n_vars))

sc.pp.filter_genes(adata_ADT, min_cells=20)

print('Number of genes after cell filter: {:d}'.format(adata_ADT.n_vars))

adata_ADT.write('./adata_ADT_filtered.h5ad')

Total number of genes: 228
Number of genes after cell filter: 228


## 2. Normalization

In [33]:
# Keep the count data in a counts layer: a copy of the current adata_ADT.X is
# stored under layers["counts"], so it is unaffected when X is overwritten by
# the normalization step.

adata_ADT.layers["counts"] = adata_ADT.X.copy()

In [34]:
def CLR_transform(sparse):
    '''
    Row-wise centred log transform of a sparse count matrix.

    Every entry x is replaced by log(x + 1) minus the mean of log(x + 1) over
    its row. Intended as the CLR transform used in CITEseq (still to be
    confirmed against Seurat's code).
    https://doi.org/10.1038/nmeth.4380
    source: https://github.com/theislab/scanpy/pull/1117

    Parameters
    ----------
    sparse : scipy sparse matrix
        Count matrix. Note that inside this function the parameter name
        hides the `scipy.sparse` module.

    Returns
    -------
    pandas.DataFrame
        Transformed values, same shape as the input.
    '''
    counts_df = pd.DataFrame.sparse.from_spmatrix(sparse)
    log_counts = np.log(counts_df + 1)
    # Subtract each row's own mean from all entries of that row.
    row_means = log_counts.mean(axis=1)
    centred = log_counts.sub(row_means, axis=0)
    return centred

In [36]:
# Perform centre log ratio (CLR) transformation
# Store transformed data in adata.X

# The transform reads from the "counts" layer (the copy of X saved before
# normalization) rather than from X itself, so re-running this cell does not
# transform already-transformed values.
clr_values = CLR_transform(adata_ADT.layers["counts"])

# CLR_transform returns a pandas DataFrame; convert it explicitly to a plain
# float array instead of relying on AnnData to coerce a DataFrame assigned to X.
adata_ADT.X = np.asarray(clr_values, dtype=float)
adata_ADT.write("adata_ADT_final.h5ad")