# Add fetal gut data to Fetal Immune atlas

This notebook loads and preprocesses the fetal gut samples, following the code in `Pan_fetal_immune/legacy_code/PAN.A01.v01.Import_data.ipynb` as closely as possible, and then appends them to both the raw-count and the normalised versions of the fetal immune atlas.

### Import modules

In [1]:
# importing scjp
%load_ext autoreload
%autoreload 2
import os, sys
cwd = '/nfs/team205/ed6/data/Fetal_immune/Pan_fetal/JP_archive/a18_Pan_fetal'
sys.path.append(cwd)
# import scjp

In [2]:
# useful imports
import numpy as np
import scipy as scipy
import scanpy as sc
import scanpy.external as sce
import pandas as pd
import pickle as pkl
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict, Counter
import networkx as nx
import igraph, re, glob
from bbknn import bbknn
from geosketch import gs
import scrublet as scr
import joblib as jl
from datetime import datetime
def timestamp():
    """Return the current local time as a 10-digit ``YYMMDDHHMM`` string."""
    now = datetime.now()
    return f"{now:%y%m%d%H%M}"

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_Gram=True, verbose=0,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes

In [3]:
# setting scanpy
sc.settings.verbosity = 3
sc.settings.set_figure_params(dpi=100, color_map='OrRd')
sc.logging.print_version_and_date()

Running Scanpy 1.6.0, on 2021-02-20 18:03.


In [4]:
# other environmental imports
%load_ext rpy2.ipython

In [5]:
# # printing version
# version = 'PAN.A01.v01.'
# Name = 'PAN.A01.v01.Import_data.ipynb'
# print('Version: %s'%(version))

# scjp.save_html('PAN.A01.v01.Import_data.ipynb')

## Load full atlas dataset

This was generated in `Pan_fetal_immune/legacy_code/PAN.A01.v01.Import_data.ipynb`. 

In [6]:
# merged_raw = sc.read_h5ad('/nfs/team205/ed6/data/Fetal_immune/Pan_fetal/working_data/PAN.A01.v01.entire_data_normalised_log.h5ad')

Variable names are not unique. To make them unique, call `.var_names_make_unique`.


In [7]:
# merged_raw

AnnData object with n_obs × n_vars = 849542 × 33694
    obs: 'Sample', 'n_counts', 'n_genes', 'donor', 'organ', 'sort', 'method', 'file', 'mito', 'doublet_scores', 'predicted_doublets', 'name'
    var: 'GeneName', 'GeneID'

## Load fetal gut data

In [5]:
# Directory holding one cellranger output folder per gut sample.
gut_data_dir = '/nfs/team205/ed6/data/Fetal_immune/fetal_gut/'
# Keep entries whose name starts with "cellranger", skipping anything ending in ".h5ad".
# NOTE: os.listdir returns entries in arbitrary order, so the sample order is not guaranteed
# to be the same between runs.
file_list = [x for x in os.listdir(gut_data_dir) if x.startswith("cellranger") and not x.endswith(".h5ad")]
# Drop the first three "_"-separated fields and everything from "_GRCh" onwards, e.g.
# "cellranger202_count_30328_Human_colon_16S7985394_GRCh38-1_2_0" -> "Human_colon_16S7985394".
# These short names are later looked up in the `file` column of the metadata table.
file_name_list = ["_".join(x.split("_")[3:]).split("_GRCh")[0] for x in file_list]

Read metadata table from gdrive

In [6]:
gut_meta = pd.read_csv("/home/jovyan/mount/gdrive/Pan_fetal/meta_share/metadata_fetal_gut_RE.csv")

In [7]:
gut_meta.index = gut_meta["file"]

In [8]:
smp_list = gut_meta.loc[file_name_list]["sample"].values.tolist()

In [9]:
def convert_obs_name(obs_name):
    """Split a dash-separated sample name into ``(fetus, organ, sort, method)``.

    The first three "-"-separated fields are returned as fetus, organ and sort.
    The method is '5GEX' when that substring occurs anywhere in `obs_name`,
    otherwise '3GEX'.

    Raises IndexError when `obs_name` has fewer than three "-"-separated fields.
    """
    fields = obs_name.split("-")
    fetus, organ, sort = fields[0], fields[1], fields[2]
    if '5GEX' in obs_name:
        method = '5GEX'
    else:
        method = '3GEX'
    return (fetus, organ, sort, method)

In [10]:
def read_files(filename, sample, raw_file_path = gut_data_dir,
               min_n_count = 2000, min_n_gene = 500, max_ne_gene = 7000,
               max_n_gene = None):
    """Load one cellranger output folder, QC-filter its cells, score doublets and save.

    Parameters
    ----------
    filename : str
        Name of the cellranger output folder inside `raw_file_path`. It is also used
        as the prefix of every cell name, stored in `obs['file']`, and used in the
        output file name.
    sample : str
        Dash-separated sample name, parsed with `convert_obs_name` into donor, organ,
        sort and method; stored as-is in `obs['Sample']`.
    raw_file_path : str
        Directory containing the cellranger folders.
    min_n_count, min_n_gene : number
        Exclusive lower bounds on total counts and on detected genes per cell.
    max_ne_gene : number
        Exclusive upper bound on detected genes per cell. The parameter name is a
        misspelling that is kept so existing calls continue to work.
    max_n_gene : number, optional
        Correctly spelled alias of `max_ne_gene`; takes precedence when given.

    Returns
    -------
    None
        The filtered AnnData object is written to disk with `sc.write`.
    """
    # The body used to read `max_n_gene`, which was never a parameter, so the function
    # only worked when a global of that name happened to exist in the kernel.
    if max_n_gene is None:
        max_n_gene = max_ne_gene

    # Try the known cellranger layouts in order and keep the first that loads.
    # `Exception` (rather than a bare except) lets KeyboardInterrupt/SystemExit through.
    adata = None
    load_error = None
    for subdir in ('raw/', '', 'filtered_gene_bc_matrices/GRCh38/'):
        path = '%s/%s/%s'%(raw_file_path,filename,subdir)
        try:
            adata = sc.read(path+'matrix.mtx',cache=True).transpose()
        except Exception as err:
            load_error = err
            continue
        break
    if adata is None:
        # Every layout failed: surface the error from the last attempt.
        raise load_error

    # Parse the annotation tables once; genes.tsv columns are (gene ID, gene name).
    genes = np.genfromtxt(path + 'genes.tsv', dtype=str)
    barcodes = np.genfromtxt(path + 'barcodes.tsv', dtype=str)

    adata.var_names = genes[:, 1]
    # Remove only a literal trailing "-1"; str.strip("-1") would strip any run of the
    # characters "-" and "1" from both ends of the barcode.
    adata.obs_names = [filename + "-" + (x[:-2] if x.endswith("-1") else x) for x in barcodes]
    adata.var['GeneName'] = genes[:, 1]
    adata.var['GeneID'] = genes[:, 0]
    adata.obs['Sample'] = sample

    # calculate n_counts / n_genes per cell (both flattened to 1-D)
    adata.obs['n_counts'] = np.sum(adata.X, axis=1).A1
    adata.obs['n_genes'] = np.sum(adata.X>0,axis=1).A1

    # filter cells
    print("Filtering cells...")
    keep = (
        (adata.obs['n_counts'] > min_n_count)
        & (adata.obs['n_genes'] > min_n_gene)
        & (adata.obs['n_genes'] < max_n_gene)
    ).values
    adata = adata[keep].copy()

    # Order genes by gene ID; copy so the result is a real object, not a view
    # (assigning to `.obs` of a view triggers an implicit copy and a warning).
    adata = adata[:,np.argsort(adata.var.GeneID)].copy()

    # The sample name is constant for the whole file, so parse it once.
    donor, organ, sort_label, method = convert_obs_name(sample)
    adata.obs['donor'] = donor
    adata.obs['organ'] = organ
    adata.obs['sort'] = sort_label
    adata.obs['method'] = method
    adata.obs['file'] = filename

    # Fraction of each cell's counts that comes from genes whose name starts with "MT-".
    mito_genes = adata.var_names.str.startswith('MT-')
    adata.obs['mito'] = (np.sum(adata.X[:, mito_genes],axis=1).A1) / (np.sum(adata.X,axis=1).A1)

    print("Computing doublets...")
    scrub = scr.Scrublet(adata.X)
    if adata.shape[0] < 30:
        # With fewer than 30 cells, cap the number of principal components at n_cells - 1.
        doublet_scores, predicted_doublets = scrub.scrub_doublets(verbose=False, n_prin_comps=adata.shape[0] - 1)
    else:
        doublet_scores, predicted_doublets = scrub.scrub_doublets(verbose=False)
    adata.obs['doublet_scores'] = doublet_scores
    adata.obs['predicted_doublets'] = predicted_doublets

    # NOTE(review): the output directory is hard-coded and does not follow `raw_file_path`.
    sc.write('/nfs/team205/ed6/data/Fetal_immune/fetal_gut/%s_filtered'%(filename),adata)

In [153]:
# Run the per-sample import (filtering, doublet scoring, writing `<file>_filtered`) for every gut sample.
# NOTE(review): keying the dict by sample name keeps only the last file for any sample name
# that occurs more than once in `smp_list` — confirm sample names are unique.
for k,v in dict(zip(smp_list, file_list)).items():
    read_files(v,k)

... reading from cache file cache/nfs-team205-ed6-data-Fetal_immune-fetal_gut-cellranger202_count_30328_Human_colon_16S7985394_GRCh38-1_2_0-raw-matrix.h5ad


  if not is_categorical(df_full[k]):
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


Filtering cells...


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


Computing doublets...


  if is_string_dtype(df[key]) and not is_categorical(df[key])
... storing 'Sample' as categorical
... storing 'donor' as categorical
... storing 'organ' as categorical
... storing 'sort' as categorical
... storing 'method' as categorical
... storing 'file' as categorical
... storing 'GeneName' as categorical


... reading from cache file cache/nfs-team205-ed6-data-Fetal_immune-fetal_gut-cellranger210_count_31512_Human_colon_16S8159190_GRCh38-1_2_0-raw-matrix.h5ad


  if not is_categorical(df_full[k]):
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


Filtering cells...


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


Computing doublets...


  if is_string_dtype(df[key]) and not is_categorical(df[key])
... storing 'Sample' as categorical
... storing 'donor' as categorical
... storing 'organ' as categorical
... storing 'sort' as categorical
... storing 'method' as categorical
... storing 'file' as categorical
... storing 'GeneName' as categorical


... reading from cache file cache/nfs-team205-ed6-data-Fetal_immune-fetal_gut-cellranger202_count_30328_Human_colon_16S7985390_GRCh38-1_2_0-raw-matrix.h5ad


  if not is_categorical(df_full[k]):
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


Filtering cells...
Computing doublets...


  if is_string_dtype(df[key]) and not is_categorical(df[key])
... storing 'Sample' as categorical
... storing 'donor' as categorical
... storing 'organ' as categorical
... storing 'sort' as categorical
... storing 'method' as categorical
... storing 'file' as categorical
... storing 'GeneName' as categorical


... reading from cache file cache/nfs-team205-ed6-data-Fetal_immune-fetal_gut-cellranger210_count_31512_Human_colon_16S8159184_GRCh38-1_2_0-raw-matrix.h5ad
Filtering cells...


  if not is_categorical(df_full[k]):
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


Computing doublets...


  if is_string_dtype(df[key]) and not is_categorical(df[key])
... storing 'Sample' as categorical
... storing 'donor' as categorical
... storing 'organ' as categorical
... storing 'sort' as categorical
... storing 'method' as categorical
... storing 'file' as categorical
... storing 'GeneName' as categorical


... reading from cache file cache/nfs-team205-ed6-data-Fetal_immune-fetal_gut-cellranger210_count_31512_Human_colon_16S8159186_GRCh38-1_2_0-raw-matrix.h5ad


  if not is_categorical(df_full[k]):
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


Filtering cells...


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


Computing doublets...


  if is_string_dtype(df[key]) and not is_categorical(df[key])
... storing 'Sample' as categorical
... storing 'donor' as categorical
... storing 'organ' as categorical
... storing 'sort' as categorical
... storing 'method' as categorical
... storing 'file' as categorical
... storing 'GeneName' as categorical


... reading from cache file cache/nfs-team205-ed6-data-Fetal_immune-fetal_gut-cellranger210_count_31512_Human_colon_16S8159182_GRCh38-1_2_0-raw-matrix.h5ad
Filtering cells...


  if not is_categorical(df_full[k]):
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


Computing doublets...


  if is_string_dtype(df[key]) and not is_categorical(df[key])
... storing 'Sample' as categorical
... storing 'donor' as categorical
... storing 'organ' as categorical
... storing 'sort' as categorical
... storing 'method' as categorical
... storing 'file' as categorical
... storing 'GeneName' as categorical


... reading from cache file cache/nfs-team205-ed6-data-Fetal_immune-fetal_gut-cellranger202_count_31131_FCA_gut8015061_GRCh38-1_2_0-raw-matrix.h5ad


  if not is_categorical(df_full[k]):
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
  if is_string_dtype(df[key]) and not is_categorical(df[key])
... storing 'Sample' as categorical
... storing 'donor' as categorical
... storing 'organ' as categorical
... storing 'sort' as categorical
... storing 'method' as categorical
... storing 'file' as categorical


Filtering cells...
Computing doublets...


... storing 'GeneName' as categorical


... reading from cache file cache/nfs-team205-ed6-data-Fetal_immune-fetal_gut-cellranger202_count_30328_Human_colon_16S7985392_GRCh38-1_2_0-raw-matrix.h5ad
Filtering cells...


  if not is_categorical(df_full[k]):
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


Computing doublets...


  if is_string_dtype(df[key]) and not is_categorical(df[key])
... storing 'Sample' as categorical
... storing 'donor' as categorical
... storing 'organ' as categorical
... storing 'sort' as categorical
... storing 'method' as categorical
... storing 'file' as categorical
... storing 'GeneName' as categorical


... reading from cache file cache/nfs-team205-ed6-data-Fetal_immune-fetal_gut-cellranger202_count_31131_FCA_gut8015059_GRCh38-1_2_0-raw-matrix.h5ad


  if not is_categorical(df_full[k]):
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
  if is_string_dtype(df[key]) and not is_categorical(df[key])
... storing 'Sample' as categorical
... storing 'donor' as categorical
... storing 'organ' as categorical
... storing 'sort' as categorical
... storing 'method' as categorical
... storing 'file' as categorical


Filtering cells...
Computing doublets...


... storing 'GeneName' as categorical


... reading from cache file cache/nfs-team205-ed6-data-Fetal_immune-fetal_gut-cellranger202_count_30328_Human_colon_16S7985389_GRCh38-1_2_0-raw-matrix.h5ad


  if not is_categorical(df_full[k]):
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


Filtering cells...


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


Computing doublets...


  if is_string_dtype(df[key]) and not is_categorical(df[key])
... storing 'Sample' as categorical
... storing 'donor' as categorical
... storing 'organ' as categorical
... storing 'sort' as categorical
... storing 'method' as categorical
... storing 'file' as categorical
... storing 'GeneName' as categorical


... reading from cache file cache/nfs-team205-ed6-data-Fetal_immune-fetal_gut-cellranger202_count_31131_FCA_gut8015058_GRCh38-1_2_0-raw-matrix.h5ad


  if not is_categorical(df_full[k]):
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
  if is_string_dtype(df[key]) and not is_categorical(df[key])
... storing 'Sample' as categorical
... storing 'donor' as categorical
... storing 'organ' as categorical
... storing 'sort' as categorical
... storing 'method' as categorical
... storing 'file' as categorical
... storing 'predicted_doublets' as categorical


Filtering cells...
Computing doublets...


... storing 'GeneName' as categorical


... reading from cache file cache/nfs-team205-ed6-data-Fetal_immune-fetal_gut-cellranger210_count_31512_Human_colon_16S8159185_GRCh38-1_2_0-raw-matrix.h5ad


  if not is_categorical(df_full[k]):
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


Filtering cells...
Computing doublets...


  if is_string_dtype(df[key]) and not is_categorical(df[key])
... storing 'Sample' as categorical
... storing 'donor' as categorical
... storing 'organ' as categorical
... storing 'sort' as categorical
... storing 'method' as categorical
... storing 'file' as categorical
... storing 'GeneName' as categorical


... reading from cache file cache/nfs-team205-ed6-data-Fetal_immune-fetal_gut-cellranger210_count_31512_Human_colon_16S8159189_GRCh38-1_2_0-raw-matrix.h5ad


  if not is_categorical(df_full[k]):
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


Filtering cells...


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


Computing doublets...


  if is_string_dtype(df[key]) and not is_categorical(df[key])
... storing 'Sample' as categorical
... storing 'donor' as categorical
... storing 'organ' as categorical
... storing 'sort' as categorical
... storing 'method' as categorical
... storing 'file' as categorical
... storing 'GeneName' as categorical


... writing an h5ad cache file to speedup reading next time
Filtering cells...


  if not is_categorical(df_full[k]):
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


Computing doublets...


  if is_string_dtype(df[key]) and not is_categorical(df[key])
... storing 'Sample' as categorical
... storing 'donor' as categorical
... storing 'organ' as categorical
... storing 'sort' as categorical
... storing 'method' as categorical
... storing 'file' as categorical
... storing 'GeneName' as categorical


... writing an h5ad cache file to speedup reading next time
Filtering cells...


  if not is_categorical(df_full[k]):
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


Computing doublets...


  if is_string_dtype(df[key]) and not is_categorical(df[key])
... storing 'Sample' as categorical
... storing 'donor' as categorical
... storing 'organ' as categorical
... storing 'sort' as categorical
... storing 'method' as categorical
... storing 'file' as categorical
... storing 'GeneName' as categorical


... writing an h5ad cache file to speedup reading next time


  if not is_categorical(df_full[k]):
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


Filtering cells...


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


Computing doublets...


  if is_string_dtype(df[key]) and not is_categorical(df[key])
... storing 'Sample' as categorical
... storing 'donor' as categorical
... storing 'organ' as categorical
... storing 'sort' as categorical
... storing 'method' as categorical
... storing 'file' as categorical
... storing 'GeneName' as categorical


... writing an h5ad cache file to speedup reading next time


  if not is_categorical(df_full[k]):
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
  if is_string_dtype(df[key]) and not is_categorical(df[key])
... storing 'Sample' as categorical
... storing 'donor' as categorical
... storing 'organ' as categorical
... storing 'sort' as categorical
... storing 'method' as categorical
... storing 'file' as categorical


Filtering cells...
Computing doublets...


... storing 'GeneName' as categorical


... writing an h5ad cache file to speedup reading next time


  if not is_categorical(df_full[k]):
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


Filtering cells...
Computing doublets...


  if is_string_dtype(df[key]) and not is_categorical(df[key])
... storing 'Sample' as categorical
... storing 'donor' as categorical
... storing 'organ' as categorical
... storing 'sort' as categorical
... storing 'method' as categorical
... storing 'file' as categorical
... storing 'GeneName' as categorical


... writing an h5ad cache file to speedup reading next time


  if not is_categorical(df_full[k]):
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


Filtering cells...


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


Computing doublets...


  if is_string_dtype(df[key]) and not is_categorical(df[key])
... storing 'Sample' as categorical
... storing 'donor' as categorical
... storing 'organ' as categorical
... storing 'sort' as categorical
... storing 'method' as categorical
... storing 'file' as categorical
... storing 'GeneName' as categorical


... reading from cache file cache/nfs-team205-ed6-data-Fetal_immune-fetal_gut-cellranger202_count_31131_FCA_gut8015057_GRCh38-1_2_0-raw-matrix.h5ad


  if not is_categorical(df_full[k]):
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
  if is_string_dtype(df[key]) and not is_categorical(df[key])
... storing 'Sample' as categorical
... storing 'donor' as categorical
... storing 'organ' as categorical
... storing 'sort' as categorical
... storing 'method' as categorical
... storing 'file' as categorical


Filtering cells...
Computing doublets...


... storing 'GeneName' as categorical


... writing an h5ad cache file to speedup reading next time


  if not is_categorical(df_full[k]):
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


Filtering cells...


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


Computing doublets...


  if is_string_dtype(df[key]) and not is_categorical(df[key])
... storing 'Sample' as categorical
... storing 'donor' as categorical
... storing 'organ' as categorical
... storing 'sort' as categorical
... storing 'method' as categorical
... storing 'file' as categorical
... storing 'GeneName' as categorical


## Merge dataset

Code borrowed from `scjp` module

In [11]:
def merge_matrix(ad,obskeys = None,use_raw = False,keep_only_mutual=False):
    '''Stack several AnnData objects row-wise into a single AnnData.

    ad: dictionary {name: anndata}; objects are stacked in dictionary order
    obskeys: obs columns to carry over; when empty/None they are collected from the inputs
    use_raw: if True, stack `.raw.X` and take `var` from `.raw.var`
    keep_only_mutual: when collecting obs columns, keep only those present in every input;
        otherwise columns missing from an input are filled with 'n/a'

    Side effects: every input gets an obs column 'name' holding its dictionary key, and
    (unless keep_only_mutual) any missing obs column is added to the input itself.
    `var` is taken from the first input only.
    Prints the number of merged cells and each obs column as it is assigned.'''
    from scipy.sparse import vstack

    sample_keys = list(ad.keys())
    n_samples = len(sample_keys)

    # Tag each input with its dictionary key before the obs columns are collected,
    # so that 'name' is part of the merged annotation.
    for key in sample_keys:
        ad[key].obs['name'] = key

    if not obskeys:
        # Count in how many inputs each obs column occurs (first-seen order is kept).
        column_counts = Counter(
            col for key in sample_keys for col in ad[key].obs.columns
        )
        obskeys = []
        for col, n_seen in column_counts.items():
            if n_seen != n_samples:
                if keep_only_mutual:
                    continue
                # Column is missing somewhere: pad those inputs with a placeholder.
                for key in sample_keys:
                    if col not in ad[key].obs.columns:
                        ad[key].obs[col] = 'n/a'
            obskeys.append(col)

    # Concatenate cell names and the selected obs columns as plain Python lists.
    merged_names = []
    merged_obs = defaultdict(list)
    for key in sample_keys:
        merged_names += list(ad[key].obs_names)
        sample_obs = ad[key].obs
        for col in obskeys:
            merged_obs[col] += list(sample_obs[col])

    # Pick the layer to stack, then build the merged object.
    if use_raw == True:
        layers = [ad[key].raw for key in sample_keys]
    else:
        layers = [ad[key] for key in sample_keys]
    stacked = vstack([layer.X for layer in layers]) # stack data
    adata = sc.AnnData(stacked, var = layers[0].var)

    adata.obs_names = merged_names
    print(len(adata))
    for col, values in merged_obs.items():
        print(col)
        adata.obs[col] = values
    return adata

In [12]:
# Reload the per-sample filtered objects written by `read_files`, keyed by sample name.
ad = {}

for sample,filename in dict(zip(smp_list, file_list)).items():
    ad[sample] = sc.read('/nfs/team205/ed6/data/Fetal_immune/fetal_gut/%s_filtered'%(filename))

Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Vari

In [13]:
merged_gut = merge_matrix(ad)

Variable names are not unique. To make them unique, call `.var_names_make_unique`.


81363
Sample
n_counts
n_genes
donor
organ
sort
method
file
mito
doublet_scores
predicted_doublets
name


In [14]:
merged_gut.X.max()

12481.0

### Add to raw data of pan fetal atlas

In [15]:
merged_raw = sc.read_h5ad('/nfs/team205/ed6/data/Fetal_immune/PAN.A01.v01.entire_data_raw_count.h5ad')

Variable names are not unique. To make them unique, call `.var_names_make_unique`.


In [16]:
merged_raw

AnnData object with n_obs × n_vars = 849542 × 33694
    obs: 'Sample', 'n_counts', 'n_genes', 'donor', 'organ', 'sort', 'method', 'file', 'mito', 'doublet_scores', 'predicted_doublets', 'name'
    var: 'GeneName', 'GeneID'

In [17]:
merged_gut.obs['organ'] = "GU"

In [26]:
# del merged_gut.uns["log1p"]

In [19]:
merged_raw_new = merge_matrix({'all':merged_raw,'gut':merged_gut})

Variable names are not unique. To make them unique, call `.var_names_make_unique`.


930905
Sample
n_counts
n_genes
donor
organ
sort
method
file
mito
doublet_scores
predicted_doublets
name


In [20]:
merged_raw_new.obs.predicted_doublets

FCAImmP7179363-AAACCTGAGCAGATCG                                                   False
FCAImmP7179363-AAACCTGAGCTGATAA                                                   False
FCAImmP7179363-AAACCTGAGGGTTTCT                                                   False
FCAImmP7179363-AAACCTGGTAGAGGAA                                                   False
FCAImmP7179363-AAACCTGGTCTTTCAT                                                   False
                                                                                  ...  
cellranger202_count_30328_Human_colon_16S7985391_GRCh38-1_2_0-TTTGGTTTCTTGTTTG    False
cellranger202_count_30328_Human_colon_16S7985391_GRCh38-1_2_0-TTTGTCAAGCCAGTAG    False
cellranger202_count_30328_Human_colon_16S7985391_GRCh38-1_2_0-TTTGTCAGTCCTCTTG    False
cellranger202_count_30328_Human_colon_16S7985391_GRCh38-1_2_0-TTTGTCAGTTTCGCTC    False
cellranger202_count_30328_Human_colon_16S7985391_GRCh38-1_2_0-TTTGTCATCTCCAGGG    False
Name: predicted_doublets, Length

In [21]:
# Composite labels built by string concatenation: 'batch' = organ + method, 'bbk' = donor + method.
# NOTE(review): 'bbk' is presumably the batch key used for bbknn later on — confirm.
merged_raw_new.obs['batch'] = [x+y for x,y in zip(merged_raw_new.obs['organ'],merged_raw_new.obs['method'])]
merged_raw_new.obs['bbk'] = [x+y for x,y in zip(merged_raw_new.obs['donor'],merged_raw_new.obs['method'])]

In [22]:
merged_raw_new.write_h5ad('/nfs/team205/ed6/data/Fetal_immune/PAN.A01.v01.entire_data_raw_count.wGut.h5ad')

... storing 'Sample' as categorical
... storing 'donor' as categorical
... storing 'organ' as categorical
... storing 'sort' as categorical
... storing 'method' as categorical
... storing 'file' as categorical
... storing 'predicted_doublets' as categorical
... storing 'name' as categorical
... storing 'batch' as categorical
... storing 'bbk' as categorical


## Preprocessing

In [23]:
# Normalise each cell by its total count, in place on `merged_gut`
# (the raw-count merge above has already been written to disk).
# NOTE(review): 10e4 evaluates to 100,000, not 10,000 — presumably chosen to match the
# legacy atlas preprocessing; confirm before changing.
sc.pp.normalize_per_cell(merged_gut, counts_per_cell_after=10e4)

normalizing by total count per cell


  if not is_categorical(df_full[k]):
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


    finished (0:00:03): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)


In [24]:
sc.pp.log1p(merged_gut)

### Add to normalised, log-transformed pan fetal dataset

In [None]:
merged_raw = sc.read_h5ad('/nfs/team205/ed6/data/Fetal_immune/Pan_fetal/working_data/PAN.A01.v01.entire_data_normalised_log.h5ad')

In [25]:
merged_gut.obs['organ'] = "GU"

In [26]:
del merged_gut.uns["log1p"]

In [27]:
merged_raw_new = merge_matrix({'all':merged_raw,'gut':merged_gut})

Variable names are not unique. To make them unique, call `.var_names_make_unique`.


913795
Sample
n_counts
n_genes
donor
organ
sort
method
file
mito
doublet_scores
predicted_doublets
name


In [36]:
merged_raw_new.obs.predicted_doublets

False    899543
True      14247
nan           5
Name: predicted_doublets, dtype: int64

In [37]:
# Same composite labels as for the raw-count merge: 'batch' = organ + method, 'bbk' = donor + method.
# NOTE(review): `merged_raw_new` now refers to the normalised merge, not the raw-count one.
merged_raw_new.obs['batch'] = [x+y for x,y in zip(merged_raw_new.obs['organ'],merged_raw_new.obs['method'])]
merged_raw_new.obs['bbk'] = [x+y for x,y in zip(merged_raw_new.obs['donor'],merged_raw_new.obs['method'])]

In [43]:
merged_raw_new.write_h5ad('/nfs/team205/ed6/data/Fetal_immune/PAN.A01.v01.entire_data_normalised_log.wGut.h5ad')

## Checking the contents of data

In [48]:
# For every organ, count how many cells each donor contributes (organ -> Counter of donors).
org_dict = {}
for org in set(merged_raw_new.obs['organ']):
    org_dict[org] = Counter(merged_raw_new.obs['donor'][merged_raw_new.obs['organ']==org])

In [49]:
df = pd.DataFrame(org_dict)

In [50]:
df = df.fillna(0)
df = df.astype(int)

In [51]:
df.sort_index()

Unnamed: 0,SP,TH,BM,GU,LI,KI,TH(pharyn),YS,SK
F19,0,0,0,0,1942,0,0,0,5605
F21,9957,8651,6274,0,10093,0,0,0,0
F22,663,3167,0,0,17382,0,0,76,0
F23,10258,5838,0,0,7978,0,0,0,0
F29,11317,11892,15583,0,10866,0,0,0,0
F30,12375,12969,17440,0,19830,0,0,0,0
F32,0,0,0,0,9315,0,4926,765,11347
F33,0,0,0,0,26402,0,0,0,16300
F34,0,0,0,0,21781,0,0,0,15777
F35,0,0,0,0,7634,6557,0,1575,8449
