In [None]:
import scanpy as sc
import pandas as pd
import anndata
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import requests
import sklearn.decomposition #import TruncatedSVD
import scipy 
import sparse, io
import scvi


#Locations of the 10x-formatted HDF5 count matrices for the two spleen runs
spleen_1_h5 = "/home/nikvaku/snic2022-6-312/LabMemberScratchDir/Nikhilesh/Raw_data/Anno/matrix_filtered_clear/run2/output_filtered.h5"
spleen_2_h5 = '/home/nikvaku/snic2022-6-312/LabMemberScratchDir/Nikhilesh/Raw_data/Anno/matrix_filtered_clear/second_spleen/output_filtered.h5'

#Read the first spleen dataset into an AnnData object
adata_pl_1_raw = sc.read_10x_h5(spleen_1_h5)
#Read the second spleen dataset into an AnnData object
adata_pl_2_raw = sc.read_10x_h5(spleen_2_h5)

In [None]:
#Read the souporcell cluster tables, one per spleen run
souporcell_1 = pd.read_table('/home/nikvaku/snic2022-6-312/LabMemberScratchDir/Nick/Pleuro/outs/20230704_spleen_R1_soup/20230603_24h_soup_res/clusters.tsv')
souporcell_2 = pd.read_table('/home/nikvaku/snic2022-6-312/LabMemberScratchDir/Nick/Pleuro/outs/20230704_spleen_R2_soup/20230603_24h_soup_res/clusters.tsv')

#Read the genome annotation table and build a lookup from '#gene_id' to 'EggNM.Preferred_name'
mapper = pd.read_table("/home/nikvaku/snic2022-6-312/LabMemberScratchDir/Nikhilesh/Raw_data/aPlwal.pri.V2.genome.annots.tsv")
preferred_names = mapper.set_index('#gene_id')['EggNM.Preferred_name']
mapper_dict = preferred_names.to_dict()


def _barcode_lookup(cluster_table, column):
    """Return a dict mapping every value of the 'barcode' column to its value in ``column``."""
    return dict(zip(cluster_table['barcode'], cluster_table[column]))


#Barcode -> souporcell 'status' / 'assignment' lookups for each run
status_1_mapper = _barcode_lookup(souporcell_1, 'status')
assignment_1_mapper = _barcode_lookup(souporcell_1, 'assignment')
status_2_mapper = _barcode_lookup(souporcell_2, 'status')
assignment_2_mapper = _barcode_lookup(souporcell_2, 'assignment')

In [None]:
#Add the souporcell annotation and a batch label to the first and second spleen datasets
def _annotate_with_souporcell(adata, status_lookup, assignment_lookup, batch_label):
    """Write souporcell calls and a batch label into ``adata.obs``.

    Parameters
    ----------
    adata : AnnData whose ``obs`` index holds the cell barcodes; modified in place.
    status_lookup : dict mapping barcode -> souporcell 'status'.
    assignment_lookup : dict mapping barcode -> souporcell 'assignment'.
    batch_label : value stored in the 'batch' column for every cell.

    Barcodes missing from a lookup receive the placeholder string 'NA'.
    Each column is built in a single pass and assigned at once, which avoids
    one ``.loc`` write per cell.
    """
    barcodes = adata.obs.index
    adata.obs['status'] = [status_lookup.get(barcode, 'NA') for barcode in barcodes]
    adata.obs['assignment'] = [assignment_lookup.get(barcode, 'NA') for barcode in barcodes]
    adata.obs['batch'] = batch_label


_annotate_with_souporcell(adata_pl_1_raw, status_1_mapper, assignment_1_mapper, '1')
_annotate_with_souporcell(adata_pl_2_raw, status_2_mapper, assignment_2_mapper, '2')

In [None]:
#Rename gene IDs to their annotated preferred names
def _preferred_gene_name(gene_id):
    """Return the annotated name for ``gene_id``, or ``gene_id`` itself when unusable.

    The original ID is kept when the gene is absent from ``mapper_dict``, when
    the annotation is the placeholder '.', or when the annotation is not a
    string (e.g. a NaN read from an empty table field).
    """
    name = mapper_dict.get(gene_id, gene_id)
    if not isinstance(name, str) or name == '.':
        return gene_id
    return name


adata_pl_1_raw.var_names = [_preferred_gene_name(x) for x in adata_pl_1_raw.var_names]
adata_pl_2_raw.var_names = [_preferred_gene_name(x) for x in adata_pl_2_raw.var_names]

#Preprocess the data: cell/gene filtering and mitochondrial QC
mt_gene_patterns = ['COX1', 'COX2', 'ATP8', 'ATP6', 'COX3', 'NU1M', 'NU2M', 'NU3M', 'NU4M', 'NU4LM', 'NU5M', 'NU6M', 'CYB']
mt_gene_pattern = '|'.join(mt_gene_patterns)


def _qc_filter(adata, min_genes=400, min_cells=3, max_pct_mt=10):
    """Apply the QC filters to ``adata`` and return the filtered dataset.

    Steps: drop cells with fewer than ``min_genes`` detected genes, drop genes
    seen in fewer than ``min_cells`` cells, flag mitochondrial genes, compute
    QC metrics, then keep cells with ``pct_counts_mt`` below ``max_pct_mt``.

    ``adata`` is modified in place by the scanpy filters; the returned object
    is an independent copy with unique ``var_names``. The caller must rebind
    its variable to the return value, because rebinding a loop variable would
    leave the mitochondrial filter unapplied.
    """
    sc.pp.filter_cells(adata, min_genes=min_genes)
    sc.pp.filter_genes(adata, min_cells=min_cells)
    #Whole-name match: a prefix match would also flag e.g. ATP6V1A, COX10 or CYB5A
    adata.var['mt'] = adata.var_names.str.fullmatch(mt_gene_pattern)
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
    filtered = adata[adata.obs.pct_counts_mt < max_pct_mt, :].copy()
    filtered.var_names_make_unique()
    return filtered


adata_pl_1_raw = _qc_filter(adata_pl_1_raw)
adata_pl_2_raw = _qc_filter(adata_pl_2_raw)

In [None]:
def _solo_predictions(adata):
    """Train scVI and then SOLO on ``adata`` and return ``(vae, solo, predictions)``.

    ``predictions`` is the table returned by ``solo.predict()`` with an extra
    'prediction' column holding the result of ``solo.predict(soft=False)``.
    NOTE(review): no random seed is set in this cell, so calls may differ
    between runs — confirm whether one is configured elsewhere.
    """
    scvi.model.SCVI.setup_anndata(adata)
    vae_model = scvi.model.SCVI(adata)
    vae_model.train()
    solo_model = scvi.external.SOLO.from_scvi_model(vae_model)
    solo_model.train()
    predictions = solo_model.predict()
    predictions['prediction'] = solo_model.predict(soft=False)
    return vae_model, solo_model, predictions


#SOLO doublet calls for the first spleen dataset
vae, solo, df_1 = _solo_predictions(adata_pl_1_raw)

#SOLO doublet calls for the second spleen dataset (vae/solo end up bound to this run's models)
vae, solo, df_2 = _solo_predictions(adata_pl_2_raw)

In [None]:
#Remove doublets from the second spleen dataset: a cell is flagged when either
#souporcell or SOLO calls it a doublet
from matplotlib_venn import venn2
import matplotlib.pyplot as plt

#Barcodes souporcell marked as doublets
souporcell_doublet_mask = adata_pl_2_raw.obs['status'] == 'doublet'
adata_pl_2_raw_dob = adata_pl_2_raw[souporcell_doublet_mask]
cellid_soup = set(adata_pl_2_raw_dob.obs.index)

#Barcodes SOLO marked as doublets
cellid_scvi = set(df_2.index[df_2['prediction'] == 'doublet'])

#Overlap between the two callers, shown as a Venn diagram
common_elements = cellid_scvi & cellid_soup
venn2([cellid_scvi, cellid_soup], ('cellid_scvi', 'cellid_soup'))

#Union of both call sets, restricted to barcodes present in the dataset.
#NOTE(review): SOLO labels that do not appear in obs_names are silently
#ignored here — confirm df_2's index uses the same barcodes as the AnnData.
doublets = list(cellid_scvi | cellid_soup)
doublets_in_adata_pl_2_raw = list(set(adata_pl_2_raw.obs_names).intersection(doublets))
adata_pl_2_raw.obs.loc[doublets_in_adata_pl_2_raw, 'status'] = 'doublet'

#Keep singlets only; cells whose status is 'NA' or any other value are dropped as well
singlet_mask = adata_pl_2_raw.obs['status'] == 'singlet'
adata_pl_2_raw = adata_pl_2_raw[singlet_mask]

In [None]:
#Collect doublet barcodes for the first spleen dataset from souporcell and SOLO
from matplotlib_venn import venn2
import matplotlib.pyplot as plt

#Barcodes souporcell marked as doublets
souporcell_doublet_mask = adata_pl_1_raw.obs['status'] == 'doublet'
adata_pl_1_raw_dob = adata_pl_1_raw[souporcell_doublet_mask]
cellid_soup = set(adata_pl_1_raw_dob.obs.index)

#Barcodes SOLO marked as doublets
cellid_scvi = set(df_1.index[df_1['prediction'] == 'doublet'])

#Overlap between the two callers, shown as a Venn diagram
common_elements = cellid_scvi & cellid_soup
venn2([cellid_scvi, cellid_soup], ('cellid_scvi', 'cellid_soup'))

#Union of both call sets; consumed by the next cell under the name `doublets`
doublets = list(cellid_scvi | cellid_soup)

In [None]:
#Flag the collected doublets in the first spleen dataset (relies on `doublets`
#still holding the first dataset's calls from the previous cell)
doublets_in_adata_pl_1_raw = list(set(adata_pl_1_raw.obs_names).intersection(doublets))
adata_pl_1_raw.obs.loc[doublets_in_adata_pl_1_raw, 'status'] = 'doublet'

#Keep singlets only; cells whose status is 'NA' or any other value are dropped as well
singlet_mask = adata_pl_1_raw.obs['status'] == 'singlet'
adata_pl_1_raw = adata_pl_1_raw[singlet_mask]

#Rename the animals in the second spleen dataset to be 3 and 4
#NOTE(review): only the labels '1' and '2' are rewritten; any other label
#(e.g. '0', if the cluster IDs are zero-based) is left unchanged and could
#coincide with a label of the first dataset — confirm against clusters.tsv.
animal_renames = {'1': '3', '2': '4'}
adata_pl_2_raw.obs['assignment'] = adata_pl_2_raw.obs['assignment'].replace(animal_renames)

In [None]:
#Per-dataset preprocessing before integration: normalise, log-transform,
#flag highly variable genes, scale, then run PCA on each spleen dataset
_spleen_datasets = (adata_pl_1_raw, adata_pl_2_raw)

#Scale every cell to a total of 1e4 counts
for _ds in _spleen_datasets:
    sc.pp.normalize_total(_ds, target_sum=1e4)

#log(1 + x) transform
for _ds in _spleen_datasets:
    sc.pp.log1p(_ds)

#Annotate highly variable genes using mean/dispersion thresholds
for _ds in _spleen_datasets:
    sc.pp.highly_variable_genes(_ds, min_mean=0.0125, max_mean=3, min_disp=0.5)

#Scale each gene, clipping values at 10
for _ds in _spleen_datasets:
    sc.pp.scale(_ds, max_value=10)

#PCA followed by the variance-ratio plot, one dataset at a time
for _ds in _spleen_datasets:
    sc.tl.pca(_ds, svd_solver='arpack')
    sc.pl.pca_variance_ratio(_ds, log=True)

In [None]:
#Cluster each spleen dataset on its own: neighbour graph, UMAP, Leiden, plot.
#The two datasets use a different number of principal components (40 vs 30).
for _ds, _n_pcs in ((adata_pl_1_raw, 40), (adata_pl_2_raw, 30)):
    sc.pp.neighbors(_ds, n_neighbors=30, n_pcs=_n_pcs)
    sc.tl.umap(_ds)
    sc.tl.leiden(_ds, resolution=0.5)
    sc.pl.umap(_ds, color=['leiden'])

In [None]:
#Merge the two spleen datasets: label each one, keep only shared genes, concatenate
for _ds, _dataset_label in ((adata_pl_1_raw, "1"), (adata_pl_2_raw, "2")):
    _ds.var_names_make_unique()
    _ds.obs["dataset"] = _dataset_label

#Genes present in both datasets
var_names = adata_pl_1_raw.var_names.intersection(adata_pl_2_raw.var_names)
adata_pl_1_raw, adata_pl_2_raw = adata_pl_1_raw[:, var_names], adata_pl_2_raw[:, var_names]

#NOTE(review): concatenate() presumably writes its own 'batch' column, which
#would replace the 'batch' labels assigned earlier — confirm if those are needed.
spleen_merged = adata_pl_1_raw.concatenate(adata_pl_2_raw)

In [None]:
#Compute a joint PCA on the merged object before batch correction.
#Harmony adjusts the embedding stored in obsm['X_pca']; each dataset's PCA was
#fitted separately, so their components do not share axes and an embedding
#stacked from them is not a common space for the two runs.
sc.tl.pca(spleen_merged, svd_solver='arpack')

#Batch correct using Harmony, treating both the animal ('assignment') and the
#run ('dataset') as batch variables; the result is stored in obsm['X_pca_harmony']
sc.external.pp.harmony_integrate(spleen_merged, ['assignment','dataset'])

#Re cluster the integrated spleen dataset on the Harmony-corrected embedding
sc.pp.neighbors(spleen_merged, n_neighbors=30, n_pcs=30,use_rep='X_pca_harmony')
sc.tl.umap(spleen_merged)
sc.tl.leiden(spleen_merged,resolution=0.5)
sc.pl.umap(spleen_merged, color=['leiden'])