# Integration with adult Pan-immune data - Myeloid

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys,os
import scvi
import anndata
import numpy as np
import pandas as pd
import scanpy as sc
import scipy

In [3]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
## import utils
cwd = '.'
sys.path.append(cwd)

import map_query_utils

In [5]:
## r2py setup
import anndata2ri
import rpy2.rinterface_lib.callbacks
import logging
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

anndata2ri.activate()

In [6]:
%load_ext rpy2.ipython


In [7]:
%%R
library(tidyverse)
library(reshape2)
library(patchwork)

remove_x_axis <- function(){
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.title.x = element_blank())  
}

remove_y_axis <- function(){
  theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(), axis.title.y = element_blank())  
}

In [8]:
def _plot_sorted_violin(merged_adata_bcells, groupby, y_value):
    n_cells_anno = merged_adata_bcells.obs[[groupby,y_value]].value_counts(groupby)
    keep_anno = n_cells_anno.index[n_cells_anno > 30]

    mean_sim = merged_adata_bcells.obs[[groupby,y_value]].groupby(groupby).mean()
    anno_order = mean_sim.loc[keep_anno].sort_values(y_value, ascending=False).index.tolist()

    merged_adata_bcells.var_names_make_unique()
    plt.rcParams["figure.figsize"] = [14,6]
    sc.pl.violin(merged_adata_bcells[merged_adata_bcells.obs[groupby].isin(keep_anno)], 
                                     y_value, groupby=groupby, rotation=90, 
                                     order=anno_order)

## Prepare data for mapping 

### Load full PIP dataset 

Provided by Cecilia, and re-saved by me to have EnsemblIDs as `var_names`

In [9]:
pi_adata = sc.read_h5ad('/nfs/team205/ed6/data/Fetal_immune/panimmune_query.h5ad')

### Subset to lymphoid cells

In [10]:
mye_pi_adata = pi_adata[~pi_adata.obs["anno_mye"].isna()]

In [12]:
mye_pi_adata.write_h5ad("/nfs/team205/ed6/data/Fetal_immune/panimmune_MYELOID_query.h5ad")
mye_pi_adata.write_h5ad("/home/jovyan/mount/gdrive/Pan_fetal/data4gpu_node/panimmune_MYELOID_query.h5ad")

### Save MYELOID scVI model with EnsemblIDs

In [13]:
split = "MYELOID"
scvi_outs_dir = '/home/jovyan/mount/gdrive/Pan_fetal/data4gpu_node/'

## Read old varnames
model_dir='scvi_' + split + '_model/'
var_names_model = pd.read_csv(scvi_outs_dir + model_dir + "var_names.csv", header=None)[0].values

## Make new model folder for model with EnsemblIDs
new_model_dir = model_dir.strip("/") +"_ENSID/"
bash_call = 'cp -r {o} {n}'.format(o=scvi_outs_dir + model_dir, n=scvi_outs_dir+new_model_dir)
os.system(bash_call)

## Save var_names
adata_ref_var = pd.read_csv(scvi_outs_dir + 'PAN.A01.v01.entire_data_normalised_log.{t}.{s}.var.csv'.format(t=timestamp, s=split), index_col=0)
adata_ref_var.iloc[var_names_model]['GeneID'].to_csv(scvi_outs_dir + new_model_dir + "var_names.csv", header=None, index=False)

NameError: name 'timestamp' is not defined

### Train on scVI models

On GPU node, running the call:

We merge datasets and rerun embeddings running

In [None]:
python ./merge_query_2_reference.py --- --- 

### Load output

In [83]:
query_h5ad_file = '/home/jovyan/mount/gdrive/Pan_fetal/data4gpu_node/panimmune_MYELOID_query.mapped2MYELOID.h5ad'
split = "MYELOID"
ref_data_dir = '/home/jovyan/mount/gdrive/Pan_fetal/data4gpu_node/'
timestamp = '20210429'

In [None]:
merged_adata_bcells = map_query_utils._merge_query_and_reference(query_h5ad_file, split, ref_data_dir=ref_data_dir)

Variable names are not unique. To make them unique, call `.var_names_make_unique`.
