In [1]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.font_manager
import seaborn as sns
import anndata
import scvi
import scanpy as sc
import re
from scipy import sparse


Global seed set to 0
  new_rank_zero_deprecation(
  return new_rank_zero_deprecation(*args, **kwargs)


In [2]:
# Project locations (absolute paths on the cluster filesystem).
# cur_dir points at the DIMITRA_FASTQ data folder; it is not referenced by the cells visible in this notebook.
cur_dir = "/well/immune-rep/users/vbw431/Projects/Peppa/data/DIMITRA_FASTQ/"
# Project root.
work_dir = "/well/immune-rep/users/vbw431/Projects/Peppa/"
# Folder holding the input .h5ad that is loaded below.
out_dir = "/well/immune-rep/users/vbw431/Projects/Peppa/out/"
# Folder the mapped AnnData is written to at the end of the notebook.
new_out_dir = "/well/immune-rep/users/vbw431/Projects/Peppa/new_analysis/data/"
# Folder the UMAP / MDE figures are saved to (note the trailing slash).
plot_dir = "/well/immune-rep/users/vbw431/Projects/Peppa/new_analysis/out/plots/"

# Path to a refdata-gex-GRCh38-2020-A folder; not referenced by the cells visible here.
references = "/well/immune-rep/users/vbw431/reference/reference/refdata-gex-GRCh38-2020-A/"


In [5]:
## Load the query dataset.
# Build the path from the configured `out_dir` instead of repeating the
# absolute location, so the notebook follows the config cell if it moves.
adata = sc.read_h5ad(os.path.join(out_dir, "peppa_azi_combat.h5ad"))


In [9]:
## One sequencing-batch label per cell: the first non-null value found when
## scanning sample_ID, then lane, then Pool_ID (back-fill across columns and
## keep the left-most column).
batch_source_cols = ["sample_ID", "lane", "Pool_ID"]
tmp = adata.obs[batch_source_cols].bfill(axis=1).iloc[:, 0]

adata.obs["sequencing_batch"] = tmp.copy()

AAACCTGAGAACAACT-1-HIVHBV_C    HIVHBV_C
AAACCTGAGAGACTAT-1-HIVHBV_C    HIVHBV_C
AAACCTGAGAGTACAT-1-HIVHBV_C    HIVHBV_C
AAACCTGAGAGTCGGT-1-HIVHBV_C    HIVHBV_C
AAACCTGAGCCAGTTT-1-HIVHBV_C    HIVHBV_C
                                 ...   
TTTATGCGTTCGGCAC-1-gPlexK7       gPlexK
TTTCCTCCACATGACT-1-gPlexK7       gPlexK
TTTGTCACAATGGTCT-1-gPlexK7       gPlexK
TTTGTCACATGTCGAT-1-gPlexK7       gPlexK
TTTGTCATCCTCTAGC-1-gPlexK7       gPlexK
Name: sample_ID, Length: 421390, dtype: object

(421390,)

## Aim: broadly map the query datasets onto a reference atlas
(https://www.biorxiv.org/content/10.1101/2022.11.10.515939v1)

In [None]:
## Load the Dann reference (v2) AnnData from disk.
dann_ref_path = "/well/immune-rep/users/vbw431/reference/dann_ref/adata.h5ad"
adata_ref = sc.read_h5ad(dann_ref_path)


In [None]:
## Use the Dann reference gene names as an anchor to attach the reference
## gene annotation (including "gene_id") to the query genes.
annot = adata_ref.var.set_index("gene_name").copy()
# Gene symbols are not guaranteed to be unique. A duplicated index makes the
# index-aligned assignment below raise ("cannot reindex on an axis with
# duplicate labels"), so keep only the first reference entry per symbol.
annot = annot[~annot.index.duplicated(keep="first")]

# Column-wise assignment aligns on the index: query var_names are matched to
# reference gene names; query genes without a match get NaN.
adata.var[annot.columns] = annot
# Keep only the query genes that received a reference gene_id.
cleanedList = adata.var_names[adata.var["gene_id"].notna()]
adata = adata[:, cleanedList].copy()

adata.shape


In [None]:
## Switch the gene index to the "gene_id" column, keeping the previous
## var_names in "orig_gene_name".
previous_var_names = adata.var_names
adata.var["orig_gene_name"] = previous_var_names

gene_ids = adata.var["gene_id"]
adata.var_names = gene_ids

adata.var

In [None]:
## Store a copy of the current X matrix as the "counts" layer.
# NOTE(review): the input file name mentions "combat" - confirm that adata.X
# really holds raw counts at this point.
x_snapshot = adata.X.copy()
adata.layers["counts"] = x_snapshot


In [None]:

from sklearn.preprocessing import LabelEncoder

## Encode the sequencing-batch labels as integer codes in obs["batch"].
LE = LabelEncoder()
LE.fit(adata.obs["sequencing_batch"])
adata.obs["batch"] = LE.transform(adata.obs["sequencing_batch"])



In [None]:
# Mirror the integer batch codes into "_scvi_batch".
# NOTE(review): presumably the column scvi-tools reads for batch - confirm
# against the installed scvi-tools version / the reference model's setup.
batch_codes_copy = adata.obs["batch"].copy()
adata.obs["_scvi_batch"] = batch_codes_copy


In [None]:
## Prepare the query data (Peppa lab) for mapping onto the saved reference model.
# Every query cell gets the same placeholder label code (0).
adata.obs["_scvi_labels"] = 0

# Folder containing the saved reference model.
dir_path = "/well/immune-rep/users/vbw431/reference/dann_ref/model_covid"

scvi.model.SCVI.prepare_query_anndata(adata, dir_path)


In [None]:
# Load the saved reference SCVI model from dir_path.
vae_ref = scvi.model.SCVI.load(dir_path)

In [None]:
# Display the loaded reference model's repr as the cell output.
vae_ref

In [None]:
## Set up the query model: the query AnnData combined with the loaded reference model.
vae_q = scvi.model.SCVI.load_query_data(adata, vae_ref)

In [None]:
## Surgery (query fine-tuning) parameters.

# Upper bound on training epochs; early stopping may end training sooner.
surgery_epochs = 200

# Extra keyword arguments forwarded to `vae_q.train`: early stopping monitors
# "elbo_train" with patience 10 and min delta 0.001; weight decay is set to 0.
train_kwargs_surgery = dict(
    early_stopping=True,
    early_stopping_monitor="elbo_train",
    early_stopping_patience=10,
    early_stopping_min_delta=0.001,
    plan_kwargs=dict(weight_decay=0.0),
)



In [None]:
## Train the query model with the surgery settings, then save it (together
## with its AnnData), overwriting any earlier save at that location.
vae_q.train(
    max_epochs=surgery_epochs,
    use_gpu=1,
    batch_size=256,
    **train_kwargs_surgery,
)

vae_q.save(
    "/well/immune-rep/users/vbw431/reference/dann_ref/model_covid_q",
    save_anndata=True,
    overwrite=True,
)


In [None]:
## Latent representation from the query model, then neighbours + UMAP on it.

query_latent = vae_q.get_latent_representation(adata)
adata.obsm["X_scVI"] = query_latent

# Neighbour graph on the scVI latent space (cosine metric, 50 neighbours).
sc.pp.neighbors(
    adata,
    n_neighbors=50,
    use_rep="X_scVI",
    metric="cosine",
)
sc.tl.umap(adata)


In [None]:
## Plot the UMAP coloured by reference/QC annotations and save it to plot_dir.

with plt.rc_context():
    # show=False keeps scanpy from displaying the figure itself; if it is
    # shown before plt.savefig runs, the file written to disk can be blank.
    sc.pl.umap(
        adata,
        color=["Ref_lab", "batch", "percent.mt", "QC_label", "solo_classification"],
        frameon=False,
        ncols=1,
        title="SCVI_Core+Extension",
        show=False,
    )
    # os.path.join avoids the doubled "/" that plot_dir + "/..." produced.
    plt.savefig(
        os.path.join(plot_dir, "SCVI_dann_ref_umap.png"),
        bbox_inches="tight",
        dpi=300,
    )
    plt.show()


In [None]:
## Generate a 2D MDE embedding of the scVI latent space.
import pymde  # not used directly in this cell; presumably needed by scvi's mde helper - verify
from scvi.model.utils import mde

scvi_latent = adata.obsm["X_scVI"]
adata.obsm["X_mde"] = mde(scvi_latent)


In [None]:
## Plot the MDE embedding and save it to plot_dir.
with plt.rc_context():
    # show=False keeps scanpy from displaying the figure itself; if it is
    # shown before plt.savefig runs, the file written to disk can be blank.
    sc.pl.embedding(
        adata,
        basis="X_mde",
        color=["Ref_lab", "batch", "percent.mt"],
        frameon=False,
        ncols=1,
        show=False,
    )
    plt.savefig(
        os.path.join(plot_dir, "SCVI_dann_ref_mde.png"),
        bbox_inches="tight",
        dpi=300,
    )
    plt.show()


In [None]:
## Save the mapped AnnData.
# Build the path from the configured `new_out_dir` instead of repeating the
# absolute location, so the output follows the config cell if it changes.
adata.write_h5ad(os.path.join(new_out_dir, "Peppa_azimuth_SCVI_dann_mapped.h5ad"))


In [None]:
# Distinct values present in obs["batch"].
batch_values = adata.obs["batch"].tolist()
set(batch_values)