In [1]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.font_manager
import seaborn as sns
import anndata
import scanpy as sc
import re
import decoupler as dc
import sc_toolbox
import random
import seaborn.objects as so

In [2]:
# Project directories on the cluster filesystem. Every path ends with "/";
# newdata_dir is later combined with file names by plain string concatenation.
data_dir = "/well/immune-rep/users/vbw431/Projects/Peppa/data/"
newdata_dir = "/well/immune-rep/users/vbw431/Projects/Peppa/new_analysis/data/"
plot_dir = "/well/immune-rep/users/vbw431/Projects/Peppa/new_out/final_plots/"

# Make the python_utils directory importable. The membership test keeps the
# cell idempotent: re-running it does not add the same entry to sys.path again.
python_utils_dir = "/well/immune-rep/users/vbw431/python_utils/"
if python_utils_dir not in sys.path:
    sys.path.append(python_utils_dir)

In [3]:
import anndata2ri
import logging

import rpy2.rinterface_lib.callbacks as rcb
import rpy2.robjects as ro

# Only let rpy2 callback messages of level ERROR or above through.
rcb.logger.setLevel(logging.ERROR)
# Activate rpy2's pandas conversion and anndata2ri's conversion so that Python
# objects can be handed to R cells (see the `%%R -i ...` cell further down).
ro.pandas2ri.activate()
anndata2ri.activate()
# Load the rpy2 IPython extension; the %%R cells in this notebook rely on it.
%load_ext rpy2.ipython



In [4]:
%%R

## Put the project R library in front of the default library paths, then attach packages
.libPaths(c("/well/immune-rep/users/vbw431/python/scvi_new_ivy/r_modules", .libPaths()))
library(Seurat)
library(tidyverse)

## Directories referenced from the R cells
cur.dir <- "/well/immune-rep/users/vbw431/Projects/Peppa/data/DIMITRA_FASTQ/"
work.dir <- "/well/immune-rep/users/vbw431/Projects/Peppa/"
out.dir <- "/well/immune-rep/users/vbw431/Projects/Peppa/out/"
references <- "/well/immune-rep/users/vbw431/reference/reference/refdata-gex-GRCh38-2020-A/"
com.out <- "/well/immune-rep/users/vbw431/Projects/Peppa/reference_combat/"

## plotting: set the default ggplot2 theme for every plot made afterwards
library(ggplot2)
theme_set(
  theme_bw(base_size = 18) +
    theme(
      strip.text = element_text(colour = 'black', face = "bold", size = 12),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      panel.border = element_rect(size = 0.7),
      axis.ticks.length = unit(.10, "cm"),
      axis.ticks = element_line(size = 0.7),
      strip.background = element_blank()
    )
)




    an issue that caused a segfault when used with rpy2:
    https://github.com/rstudio/reticulate/pull/1188
    Make sure that you use a version of that package that includes
    the fix.
    

In [5]:
## Load the combined AnnData object
adata = sc.read_h5ad("/well/immune-rep/users/vbw431/Projects/Peppa/out/peppa_azi_combat.h5ad")

## Per-label tables read from newdata_dir: cluster assignments plus the scVI and
## UMAP embeddings. Each dict is keyed by label; the first CSV column becomes the index.
labels = ["NK", "CD8", "CD4", "Bcells", "Myeloid_Plt"]
cluster_list = {}
scvi_list = {}
umap_list = {}
for label in labels:
    print("reading in " + label)
    cluster_dir = newdata_dir + label + "_clustering_final/"
    embedding_dir = newdata_dir + label + "_embeddings/"
    cluster_list[label] = pd.read_csv(
        os.path.join(cluster_dir, f"Peppa_{label}_cluster_assignment.csv"), index_col=0
    )
    scvi_list[label] = pd.read_csv(
        os.path.join(embedding_dir, f"Peppa_{label}_scvi.csv"), index_col=0
    )
    umap_list[label] = pd.read_csv(
        os.path.join(embedding_dir, f"Peppa_{label}_umap.csv"), index_col=0
    )

## Build one AnnData per label by subsetting the combined object
adata_list = {}

for label in labels:
    clusters = cluster_list[label]
    # Restrict to the cells listed in this label's cluster table
    subset = adata[clusters.index].copy()
    # Embedding rows are selected in the order of the subset's cell names
    subset.obsm["X_scVI"] = scvi_list[label].loc[subset.obs_names].values
    subset.obsm["X_umap"] = umap_list[label].loc[subset.obs_names].values
    # Attach the cluster assignment columns to the cell metadata
    subset.obs = subset.obs.merge(clusters, left_index=True, right_index=True, how="inner")
    # Keep a copy of the incoming matrix before X is replaced by normalised values
    subset.layers['counts'] = subset.X.copy()
    sc.pp.normalize_total(subset, target_sum=1e4)
    sc.pp.log1p(subset)
    subset.layers['normalized'] = subset.X.copy()
    adata_list[label] = subset
    
## Updated clinical metadata, restricted to the columns listed below
clin_columns = [
    "disease_group",
    "bio_replicate",
    "study_disease",
    "scanpy_index",
    "study_ID",
    "Treatment_status",
    "Ethnicity",
    "Sex",
    "Age",
    "HBV_serostatus",
    "HBV_sAg_titre",
    "HBV_DNA_VL",
]
clin_meta = pd.read_csv(newdata_dir + "index_demo.csv", index_col=0)
clin_meta = clin_meta[clin_columns]

## Swap the disease annotation of every per-label object for the values in clin_meta.
## new_obs keeps a copy of each resulting obs table, keyed by label.
new_obs = {}
merge_keys = ["bio_replicate", "scanpy_index"]

for name in labels:
    old_obs = adata_list[name].obs
    # Drop the two columns that clin_meta supplies again, and keep the previous
    # study_disease as study_disease_orig so both versions can be compared.
    # Working on a new frame leaves the AnnData untouched if the merge fails.
    obs_prev = old_obs.drop(columns=["disease_group", "study_disease"]).assign(
        study_disease_orig=old_obs["study_disease"].copy()
    )
    new_df = pd.merge(obs_prev, clin_meta, how="left", on=merge_keys)
    # A left merge returns extra rows when a key pair matches more than one row
    # of clin_meta; fail with an explicit message instead of a length mismatch
    # when the index is restored below.
    if len(new_df) != len(obs_prev):
        raise ValueError(
            f"Merging clinical metadata changed the number of cells for {name} "
            f"({len(obs_prev)} -> {len(new_df)}); "
            f"check clin_meta for duplicated {merge_keys} pairs"
        )
    # Merging on columns discards the cell-name index, so put it back
    new_df.index = obs_prev.index
    new_obs[name] = new_df.copy()
    adata_list[name].obs = new_df.copy()

                 
## Consensus cell-type columns: the label itself at the top level and l1,
## then the label-specific annotation columns copied into l2 and l3
for name in labels:
    ad = adata_list[name]
    ad.obs["celltype_consensus"] = name
    ad.obs["celltype_consensus.l1"] = name
    ad.obs["celltype_consensus.l2"] = ad.obs[f"{name}.annotation.l1"]
    ad.obs["celltype_consensus.l3"] = ad.obs[f"{name}.annotation.l2"]

## concat: stack all per-label objects into one AnnData
## (index_unique=None leaves the cell names as they are)
adata_all = anndata.concat(adata_list, join="outer", index_unique=None)



reading in NK
reading in CD8
reading in CD4
reading in Bcells
reading in Myeloid_Plt


In [6]:
# Cross-tabulate the previous disease label against the one taken from clin_meta
pd.crosstab(index=adata_all.obs["study_disease_orig"], columns=adata_all.obs["study_disease"])

study_disease,CTRL,HBV,HBV_HIV
study_disease_orig,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CTRL,122601,0,0
HBV,0,86895,0
HBV_HIV,0,0,43315


In [11]:
# Display the hashtag ID, solo classification and disease label per cell
adata_all.obs.loc[:, ["HTO_maxID", "solo_classification", "study_disease"]]

Unnamed: 0,HTO_maxID,solo_classification,study_disease
AAACCTGTCGCATGGC-1-HIVHBV_C,Hashtag-2,singlet,CTRL
AAACGGGAGATATACG-1-HIVHBV_C,Hashtag-2,doublet,CTRL
AAACGGGAGTTACCCA-1-HIVHBV_C,Hashtag-2,doublet,CTRL
AAACGGGCATCGGTTA-1-HIVHBV_C,Hashtag-2,doublet,CTRL
AAAGATGTCTCGCTTG-1-HIVHBV_C,Hashtag-1,singlet,HBV_HIV
...,...,...,...
E2L8_TTTGGTTCATAATGCC,,,CTRL
E2L8_TTTGGTTGTATGGGAC,,,CTRL
E2L8_TTTGTTGGTGTGCCTG,,,CTRL
E2L8_TTTGTTGGTTGGCTAT,,,CTRL


In [11]:
# Objects handed to the following %%R cell through its `-i` arguments.
#
# adata_all.X holds the normalize_total + log1p values produced when the
# per-label objects were built, but the R cell passes this matrix to
# CreateSeuratObject(counts = ...). Export the pre-normalisation matrix that
# was saved in layers["counts"] instead.
data_mat = adata_all.layers["counts"]

# Cell names (rows of data_mat)
barcodes = adata_all.obs_names.values

# Gene names (columns of data_mat)
genes = adata_all.var_names.values

# Per-cell metadata table
meta = adata_all.obs

In [12]:
%%R -i data_mat -i barcodes -i genes -i meta

## Create seurat object for downstream analysis
## data_mat is labelled with genes as columns and barcodes as rows
colnames(data_mat) <- genes
rownames(data_mat) <- barcodes

## Transposed so that genes become rows and cells become columns
peppa <- CreateSeuratObject(counts = t(data_mat), meta.data = meta)

## peppa_all is not created in any visible cell; remove it only when it is
## present so a fresh kernel does not emit an "object not found" warning
if (exists("peppa_all", inherits = FALSE)) rm(peppa_all)

##save
print("saving rds object")
saveRDS(peppa, paste0(out.dir, "/peppa_pbmc_all.rds"))



[1] "adding "
[1] "saving rds object"
