## Fetal Immune Atlas - save labels in anndata

In this notebook I load the raw merged anndata file and add the labels unified in `20201230_UniformCellLabels.ipynb`, fixing some inconsistencies on cell names between organs.

In [1]:
import os,sys
import numpy as np 
import pandas as pd
import scanpy as sc
import scanpy.external as sce
import anndata
from bbknn import bbknn
import scipy

## For geosketch integration
from geosketch import gs, uniform, srs, kmeanspp
from scanorama import transform
from scipy.sparse import csr_matrix, find
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize
# from subprocess import Popen
# import sys
from time import time
from datetime import datetime



### Load merged dataset 

In [2]:
merged_raw = sc.read_h5ad('/nfs/team205/ed6/data/Fetal_immune/PAN.A01.v01.entire_data_normalised_log.wGut.h5ad')

Variable names are not unique. To make them unique, call `.var_names_make_unique`.


In [3]:
merged_raw.obs['batch'] = [x+y for x,y in zip(merged_raw.obs['organ'],merged_raw.obs['method'])]
merged_raw.obs['bbk'] = [x+y for x,y in zip(merged_raw.obs['donor'],merged_raw.obs['method'])]

Change barcodes for gut cells

In [4]:
def _rename_gut_cells(x):
    if "FCA" not in x:
        x = x.split("_")[8].split('-')[1] + "-1"  + "_" + "_".join(x.split("_")[3:6])
    else: 
        x = x.split("_")[7].split('-')[1] + "-1" + "_" + "_".join(x.split("_")[3:5]) 
    return(x)

In [5]:
obs_names = merged_raw.obs_names.values
gut_ixs = np.where(merged_raw.obs.organ=="GU")[0]
for i in gut_ixs:
    obs_names[i] = _rename_gut_cells(obs_names[i])

In [6]:
merged_raw.obs_names = obs_names

### Add cell type labels

Made uniform in `notebooks/20201230_UniformCellLabels.ipynb`

In [201]:
annot_dir = '/home/jovyan/mount/gdrive/Pan_fetal/annotations/'
annot_df = pd.read_csv(annot_dir + "uniform_labels_full.csv", index_col=0)

In [202]:
### WARNING!! There are duplicated indices in the skin dataset, here I am solving the brute force way
annot_df.loc[annot_df.index[annot_df.index.duplicated(keep=False)],'uniform_label'] = "NEUTROPHIL"
annot_df.loc[annot_df.index[annot_df.index.duplicated(keep=False)],'cell.labels'] = "Neutrophile"

annot_df = annot_df[~annot_df.index.duplicated()]

Fix names to make them uniform w dataset

In [203]:
def _translate_obs_names(x, organ):
    if organ in ["sp", 'bm']:
        if "FCA" in x:
            obs_name = x.split("_")[3]+ '-' +x.split("_")[5]
        else:
            obs_name = x
    elif organ in ["ki"]:
        if "FCA" in x:
            obs_name = x.split("_")[0] + "-" + x.split("_")[-1].split('-')[0]
        else:
            obs_name = x
    elif organ in ["li", 'ys']:
        obs_name = x.split("_")[3]+'-'+x.split("_")[4]
    elif organ in ["sk"]:
        obs_name = x.split("-")[2]+'-'+x.split("-")[0]
    elif organ in ["gu"]:
        obs_name = "-".join(x.split("-")[:2]) + "_" + x.split("-")[2]
    else:
        obs_name = x
    return(obs_name)

In [204]:
# annot_df.index = annot_df["old_name"]
annot_df.index = ["GEX".join(x.split("prime")) for x in annot_df.index]
new_name = [_translate_obs_names(annot_df.index[i],annot_df.organ[i]) for i in range(annot_df.shape[0])]

In [205]:
annot_df["old_name"] = annot_df.index
annot_df.index = new_name

In [206]:
## Subset to cells in the adata
annot_df = annot_df.loc[merged_raw.obs_names[merged_raw.obs_names.isin(annot_df.index)]]

In [207]:
new_anno = pd.concat([merged_raw.obs, annot_df[['uniform_label', 'uniform_label_expanded_merged', 'uniform_label_lvl0']]], 1)

In [208]:
merged_raw.obs = new_anno[~new_anno.Sample.isna()].loc[merged_raw.obs_names]

## Add correct metadata

In [263]:
metadata = pd.read_csv("/home/jovyan/mount/gdrive/Pan_fetal/annotations/manifest_clean_110121.csv", index_col=0)

In [264]:
metadata['method'] = [x.split("prime")[0]+"GEX" if "prime" in x else x for x in metadata["Sequencing"]]

## Rename columns as they are in obs
metadata['donor'] = metadata['SAMPLE.NAME']

In [266]:
clean_metadata = metadata[["Sample.lanes", "Sort_id","age", "method", "donor", "sex", "Processing_method"]]
clean_metadata["file"] = clean_metadata['Sample.lanes']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
new_obs = merged_raw.obs.reset_index().merge(clean_metadata, on=['file', "donor", "method"], how='left', indicator=True)
new_obs = new_obs.set_index("index")

## Check that the merge has worked properly
if not new_obs.shape[0] == merged_raw.obs.shape[0]:
    print("--- WARNING!! The new obs has more rows than the old obs ---")

if np.any(new_obs._merge=="right_only"):
    print("--- WARNING!! Some values are unique to metadata ---")

if not new_obs.index.is_unique:
    print("--- WARNING!! Duplicate indices ---")
    


In [303]:
new_obs = new_obs.drop(["sort", "_merge"],1)

In [308]:
new_obs.to_csv("/nfs/team205/ed6/data/Fetal_immune/PAN.A01.v01.entire_data_normalised_log.wGut.full_obs.csv")