## Prepare reference data for cell2location training

In [23]:
import os,sys
import scanpy as sc
import pandas as pd
import numpy as np
import anndata

## Root folder holding the input data; the output folder is created beneath it
data_dir="/nfs/team205/ed6/data/Fetal_immune/"
## Date stamp embedded in the name of the count matrix file that is loaded
timestamp="20210429"

Make folder to save outputs

## Folder receiving the cell2location reference files.
## makedirs(exist_ok=True) replaces the check-then-mkdir pattern: it does not fail
## if the folder already exists (or appears in between) and creates missing parents.
outdir = data_dir + "c2l_scRNA_references/"
os.makedirs(outdir, exist_ok=True)

### Load reference data

## Raw count matrix for the whole dataset
adata = sc.read_h5ad(data_dir + 'PAN.A01.v01.entire_data_raw_count.{t}.h5ad'.format(t=timestamp))
adata.var_names_make_unique()

## Filter maternal contaminants
mat_barcodes = pd.read_csv("~/Pan_fetal_immune/metadata/souporcell_results/maternal_barcodes.csv", index_col=0)
## Keep the part of each barcode before the first "-1" (presumably to match adata.obs_names - confirm).
## A plain list is assigned on purpose: a freshly built pd.Series carries a 0..n-1 index
## and would be aligned against the index read from the csv, shifting barcodes / inserting NaN.
mat_barcodes["x"] = [x.split("-1")[0] for x in mat_barcodes["x"]]

## Drop every cell whose barcode is listed as maternal
adata = adata[~adata.obs_names.isin(mat_barcodes["x"])]

adata

### Load annotations

In [25]:
## Read annotation groupings
import json
with open('../../metadata/anno_groups.json', 'r') as json_file:
    anno_groups_dict = json.load(json_file)

## Per-cell annotation table; the file name is built from `timestamp` so it always
## refers to the same data release as the count matrix
anno_obs = pd.read_csv(data_dir + "PAN.A01.v01.entire_data_normalised_log.{t}.full_obs.annotated.clean.csv".format(t=timestamp), index_col=0)
## Keep only the cells present in the annotation table ...
adata = adata[adata.obs_names.isin(anno_obs.index)].copy()
## ... and replace obs by the annotation rows, ordered like the cells that were kept
adata.obs = anno_obs.loc[adata.obs_names].copy()

ValueError: To copy an AnnData object in backed mode, pass a filename: `.copy(filename='myfilename.h5ad')`. To load the object into memory, use `.to_memory()`.

In [9]:
## A copy of the annotation rows is already stored in adata.obs, so drop the standalone table
del anno_obs

In [48]:
def make_c2l_reference(
    ref_adata,
    annotation_obs = 'anno_lvl_2_final_clean',
    technical_obs = ["method", "donor"], ## Covariates to regress out
    library_obs = ['Sample.lanes'], ## Covariate for 10x library
    subset_organ=None,
    min_age = None,
    exclude_clusters = [ ## Clusters to exclude
        'DOUBLET_IMMUNE_FIBROBLAST',
         'LOW_Q_INCONSISTENT',
         'DOUBLET_LYMPHOID_MACROPHAGE',
         'DOUBLETS_FIBRO_ERY',
         'DOUBLET_ENDOTHELIUM_ERYTHROCYTE',
         'DOUBLET_ERY_B',
         'PLACENTAL_CONTAMINANTS',
         'DOUBLET'],
    split_by_organ = None, ## for which clusters should the annotation be split by organ? e.g. just stroma
    min_cells = 10 ## Minimum number of cells to keep a cluster
    ):
    """Subset and relabel an annotated dataset into a reference object.

    The input object is never modified; a new object is returned.

    Parameters
    ----------
    ref_adata
        Object exposing an ``obs`` DataFrame, boolean-mask indexing and ``.copy()``
        (an AnnData in this notebook). ``obs`` must contain ``annotation_obs``,
        ``"organ"``, ``"age"`` and every column named in ``technical_obs`` /
        ``library_obs``.
    annotation_obs
        Name of the ``obs`` column holding the cluster labels.
    technical_obs, library_obs
        Names of ``obs`` columns to keep in the output (list or tuple).
    subset_organ
        If truthy, keep only rows whose ``"organ"`` equals this value.
    min_age
        If truthy, keep only rows with ``"age" >= min_age``. Falsy values
        (``None``, ``0``) disable the filter.
    exclude_clusters
        If truthy, rows whose label is in this collection are removed.
    split_by_organ
        If truthy, labels in this collection are rewritten as ``<label>_<organ>``.
    min_cells
        Labels with fewer rows than this (counted after the steps above) are removed.

    Returns
    -------
    A copy of the filtered object whose ``obs`` holds only
    ``technical_obs + library_obs + [annotation_obs, "organ", "age"]``.
    """
    obs = ref_adata.obs

    ## Combine the organ / age / excluded-cluster filters into one mask so the
    ## data is subset a single time
    keep = np.ones(obs.shape[0], dtype=bool)
    if subset_organ:
        keep &= (obs["organ"] == subset_organ).to_numpy(dtype=bool)
    if min_age:
        keep &= (obs["age"] >= min_age).to_numpy(dtype=bool)
    if exclude_clusters:
        keep &= ~obs[annotation_obs].isin(list(exclude_clusters)).to_numpy(dtype=bool)
    ref_adata = ref_adata[keep]

    ## Split selected clusters by organ
    if split_by_organ:
        ## Take ownership of the data before editing obs, so that neither the
        ## caller's object nor a view of it is written to
        ref_adata = ref_adata.copy()
        ## object dtype: new label values can be stored even if the column is categorical
        labels = ref_adata.obs[annotation_obs].astype(object)
        organs = ref_adata.obs["organ"].astype(object)
        to_split = labels.isin(list(split_by_organ)).to_numpy(dtype=bool)
        new_labels = labels.to_numpy(dtype=object, copy=True)
        ## Positional assignment, no index alignment involved
        new_labels[to_split] = (labels[to_split] + "_" + organs[to_split]).to_numpy()
        ref_adata.obs[annotation_obs] = new_labels

    ## Remove clusters containing less than n cells
    clus_counts = ref_adata.obs[annotation_obs].value_counts()
    keep_clus = clus_counts.index[clus_counts >= min_cells]
    big_enough = ref_adata.obs[annotation_obs].isin(keep_clus).to_numpy(dtype=bool)
    ref_adata = ref_adata[big_enough].copy()

    ## Clean obs (list() builds fresh lists: defaults are not mutated, tuples are accepted)
    keep_cols = list(technical_obs) + list(library_obs) + [annotation_obs, "organ", "age"]
    ref_adata.obs = ref_adata.obs[keep_cols].copy()

    return ref_adata

def _c2l_reference_basename(params):
    """Return the output file name (no folder) encoding the options set in `params`."""
    parts = ["PAN.A01.v01.c2l_reference."]
    if params.get("subset_organ"):
        parts.append("subset{o}.".format(o=params["subset_organ"]))
    if params.get("split_by_organ"):
        parts.append("organ_split_stroma.")
    if params.get("min_age"):
        parts.append("minAge{a}.".format(a=params["min_age"]))
    ## When the key is absent make_c2l_reference applies its default exclusion
    ## list, so the tag is added in that case as well
    if params.get("exclude_clusters", True):
        parts.append("exclude_lowQ.")
    parts.append("h5ad")
    return "".join(parts)


def save_c2l_reference(params):
    """Build a reference from the global `adata` and write it as .h5ad into `outdir`.

    `params` is a dict of keyword arguments forwarded to `make_c2l_reference`.
    The keys "subset_organ", "split_by_organ", "min_age" and "exclude_clusters"
    also determine the file name; any of them may be omitted, in which case the
    default of `make_c2l_reference` is used.
    """
    outfile = outdir + _c2l_reference_basename(params)

    ref_adata = make_c2l_reference(adata, **params)
    ref_adata.write_h5ad(outfile)

In [65]:
## Cluster labels passed as `exclude_clusters` when the references are built
lowQ_clusters = (
    'DOUBLET_IMMUNE_FIBROBLAST',
    'LOW_Q_INCONSISTENT',
    'DOUBLET_LYMPHOID_MACROPHAGE',
    'DOUBLETS_FIBRO_ERY',
    'DOUBLET_ENDOTHELIUM_ERYTHROCYTE',
    'DOUBLET_ERY_B',
    'PLACENTAL_CONTAMINANTS',
    'DOUBLET',
)

In [None]:
## Save one reference for every combination of organ subset (4 options),
## minimum age (3 options) and stroma splitting (off / on)
for org in (None, "TH", "SP", "LI"):
    for age in (None, 10, 15):
        for sbo in (None, anno_groups_dict["STROMA"]):
            params = dict(
                subset_organ=org,
                split_by_organ=sbo,
                min_age=age,
                exclude_clusters=lowQ_clusters,
            )
            save_c2l_reference(params)