In [None]:
import sys
import os
import os.path
import numpy as np
import pandas as pd
import scanpy as sc
import scanpy.external as sce
from scipy.sparse import issparse
import anndata
from anndata import AnnData
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display, HTML
from collections import defaultdict, OrderedDict
import gzip
import csv
import re
import io
import logging
import warnings
import subprocess
from multiprocessing import Pool
from preprocessing import *

# Logging: timestamped messages; this notebook's logger emits INFO and above.
FORMAT = '%(asctime)-15s %(message)s'
logging.basicConfig(format=FORMAT)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Silence every Python warning for the rest of the session.
warnings.simplefilter('ignore')

# Display defaults: cap printed DataFrame rows, verbose scanpy output,
# then report library versions and set the figure style.
pd.options.display.max_rows = 50
sc.settings.verbosity = 3
sc.logging.print_header()
sc.settings.set_figure_params(dpi=120, color_map="cividis")

In [None]:
# Accession used to build download/output file names, and the species label
# recorded for this dataset.
dataset, species = "GSE155622", "Mouse"

In [None]:
# Resolve the dataset's build directory relative to the current working
# directory, then list what has been downloaded for it.
dirname = os.getcwd()
source = os.path.join(dirname, f'../../build/datasets/{dataset}')
source = os.path.abspath(source)
fns = os.listdir(os.path.join(source, 'downloads'))
print(source)
fns

In [None]:
# Print the sorted metadata column names of every sample so they can be
# compared across files. The file name is derived from `dataset` (as in the
# loader below) rather than repeating the accession as a literal.
for i in ["1", "2", "3", "4"]:
    metadata_path = os.path.join(
        source, 'downloads', f'{dataset}_raw_UMI_counts_{i}_metadata.txt.gz'
    )
    dfm = pd.read_csv(metadata_path, index_col=0, sep='\t')
    print(i + " " + repr(sorted(dfm.columns)))

In [None]:
# Maps author-supplied obs column names to the standard names used in this
# notebook. Declared per standard name; expanded into a flat
# {author_name: standard_name} lookup.
column_map = {
    author_name: standard_name
    for standard_name, author_names in (
        ("condition", ("Conditions", "condition", "modify.ident")),
        ("cell_type", ("Celltype", "celltype")),
        ("concat_batch", ("concat_batch",)),
    )
    for author_name in author_names
}

In [None]:
# build anndata for all samples:
def load_tsvs(index):
    """Load one sample's UMI count table and metadata into an AnnData.

    Reads ``{dataset}_raw_UMI_counts_{index}.txt.gz`` and the matching
    ``{dataset}_raw_UMI_counts_{index}_metadata.txt.gz`` from
    ``<source>/downloads``. The count table is transposed after loading, so
    the file's columns become observations and its rows become variables.

    Parameters
    ----------
    index : str
        Sample suffix used in the file names; also stored for every
        observation in ``obs['concat_batch']``.

    Returns
    -------
    AnnData
        Counts as a float32 CSR matrix, with the metadata table as ``obs``.
    """
    # Imported locally so the function does not depend on the name being
    # provided by `from preprocessing import *`.
    from scipy.sparse import csr_matrix

    print(f'loading {index} of 4')
    downloads = os.path.join(source, 'downloads')
    fn = f"{dataset}_raw_UMI_counts_{index}.txt.gz"
    mfn = f"{dataset}_raw_UMI_counts_{index}_metadata.txt.gz"

    df = pd.read_csv(os.path.join(downloads, fn), index_col=0, sep='\t').T
    counts = csr_matrix(df, dtype=np.float32)
    obsn = list(df.index)
    varn = list(df.columns)
    del df  # release the dense table before the metadata is read
    mdf = pd.read_csv(os.path.join(downloads, mfn), index_col=0, sep='\t')
    print(f'loaded {index} of 4')

    # AnnData takes obs_names from the index of the frame assigned to .obs,
    # so metadata rows in a different order than the count columns would
    # silently mislabel cells. Reorder when both tables hold the same names;
    # otherwise keep the positional assignment and report the mismatch.
    if not mdf.index.equals(pd.Index(obsn)):
        if mdf.index.is_unique and set(mdf.index) == set(obsn):
            mdf = mdf.loc[obsn]
        else:
            logger.warning(
                "sample %s: metadata index does not match count matrix "
                "cell names; metadata is assigned by position", index
            )

    adata = AnnData(counts)
    adata.obs_names = obsn
    adata.var_names = varn
    adata.obs = mdf
    adata.obs['concat_batch'] = index  # scalar is broadcast to every cell
    print(f'adata made for {index} of 4')
    return adata

# The context manager shuts the worker processes down once all samples are
# loaded; map() already returns a list in input order.
with Pool(4) as p:
    adatas = p.map(load_tsvs, ["1", "2", "3", "4"])

In [None]:
# Standardise obs column names in every sample:
#   * columns already carrying their standard name are left untouched;
#   * columns listed in column_map under another name are copied to the
#     standard name;
#   * every column that is not kept as-is is preserved as "author.<name>"
#     and the original column is removed.
for a in adatas:
    columns = list(a.obs.columns)  # snapshot: the frame is modified below
    for c in columns:
        target = column_map.get(c)
        if target == c:
            continue
        if target is not None:
            a.obs[target] = a.obs[c]
        a.obs[f"author.{c}"] = a.obs[c]
        del a.obs[c]

In [None]:
# Per sample, list the obs columns that were not moved under "author.".
for a in adatas:
    kept = sorted(col for col in a.obs.columns if not col.startswith('author.'))
    print(kept)

In [None]:
# Merge all samples into one AnnData, keeping the union of variables.
# NOTE(review): batch_key reuses the 'concat_batch' column name that is
# written per sample during loading; presumably concatenate replaces those
# values with its own batch labels — confirm which labelling is intended.
adata = (
    adatas[0]
    .copy()
    .concatenate(adatas[1:], join="outer", batch_key="concat_batch")
)
adata

In [None]:
# process dataset
# Helpers from `preprocessing` whose return values are not used; they are
# applied to `adata` strictly in this order.
cleanup_steps = (
    ensure_unique_idx,
    enforce_sparsity,
    fix_nan_x,
    filter_nan_var_names,
    del_raw,
    calc_n_genes,
    filter_low_quality_cells,
)
for step in cleanup_steps:
    step(adata)

# find_doublets returns a new AnnData together with the doublet counts.
adata, doublet_counts = find_doublets(adata, batch_key='concat_batch')
print(doublet_counts)
normalize_total(adata, target_sum=1e4)
adata

In [None]:
# Preview the first rows of the observation annotations (adata.obs).
adata.obs.head()

In [None]:
# Display the per-variable annotation table (adata.var).
adata.var

In [None]:
# Save the processed dataset next to its downloads as a gzip-compressed
# .h5ad file, echoing the destination path first.
fname = os.path.join(source, f"{dataset}.h5ad")
print(fname)
adata.write_h5ad(fname, compression="gzip")