In [None]:
import scanpy as sc
import anndata as ann
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import glob
from matplotlib import rcParams
from matplotlib import colors

import seaborn as sb

# Verbose scanpy logging so each preprocessing step reports what it did.
sc.settings.verbosity = 3

# sc.set_figure_params() rewrites matplotlib's rcParams (including
# figure.figsize) with scanpy's own defaults, so the notebook-wide figure size
# has to be applied afterwards, otherwise it is silently overwritten.
sc.set_figure_params(dpi=200, dpi_save=300)
plt.rcParams['figure.figsize'] = (8, 8)  # rescale figures
sc.logging.print_versions()

In [None]:
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

In [None]:
# Select seaborn's "paper" plotting context, which controls the scaling of plot elements
sb.set_context('paper')

In [None]:
# Analysis version tag (prefixed to the input file name when the data is loaded)
version = "V1"

# Root directory holding the analysis files
output_files_path = "/Sunshine_DeRisi_RSV_files/"

# Figures go into a sub-directory of the root; derived from it so that the two
# locations cannot drift apart when the root is changed.
fig_path = output_files_path + "figures/"

In [None]:
# Input file: <output_files_path><version>_<name>.h5ad
name = "2024_RSV_annotated_unfiltered_mouse_virus"
preprocessed_path = f"{output_files_path}{version}_{name}.h5ad"

adata_mouse_virus = sc.read_h5ad(preprocessed_path)

In [None]:
# Per-cell number of features with a positive value in X. When X is a scipy
# sparse matrix the row-wise sum comes back as a 2-D (n_cells, 1) np.matrix, so
# it is flattened to a plain 1-D array before being stored as an obs column
# (for a dense X the flattening changes nothing).
adata_mouse_virus.obs['mouse_n_genes'] = np.asarray((adata_mouse_virus.X > 0).sum(axis=1)).ravel()

## Quick QC plots

In [None]:
# QC histograms of obs['n_counts']: every cell, then only cells below 5000,
# then only cells above 10000.
rcParams['figure.figsize'] = (20, 5)
fig_ind = np.arange(131, 134)  # subplot codes 131, 132, 133: one row, three panels
fig = plt.figure()
fig.subplots_adjust(hspace=0.4, wspace=0.6)

total_counts = adata_mouse_virus.obs['n_counts']

# kde=False: plain histogram bars, no kernel-density curve on top
p3_adata_mouse_virus = sb.histplot(total_counts, kde=False, ax=fig.add_subplot(fig_ind[0]))

low_count_cells = total_counts[total_counts < 5000]
p4_adata_mouse_virus = sb.histplot(low_count_cells, kde=False, bins=60, ax=fig.add_subplot(fig_ind[1]))

high_count_cells = total_counts[total_counts > 10000]
p5_adata_mouse_virus = sb.histplot(high_count_cells, kde=False, bins=60, ax=fig.add_subplot(fig_ind[2]))

plt.show()

In [None]:
# Thresholding decision: genes. Histogram of obs['n_genes'] for every cell and
# for the subset of cells below 2500.
rcParams['figure.figsize'] = (20, 5)
fig_ind = np.arange(131, 133)  # codes 131 and 132: first two slots of a 1 x 3 grid
fig = plt.figure()
fig.subplots_adjust(hspace=0.4, wspace=0.6)

genes_per_cell = adata_mouse_virus.obs['n_genes']
hist_options = dict(kde=False, bins=60)

p6_adata_mouse_virus = sb.histplot(genes_per_cell, ax=fig.add_subplot(fig_ind[0]), **hist_options)
p7_adata_mouse_virus = sb.histplot(genes_per_cell[genes_per_cell < 2500],
                                   ax=fig.add_subplot(fig_ind[1]), **hist_options)

plt.show()

In [None]:
# Summary statistics of obs['n_genes'] across cells
mouse_n_genes_only = adata_mouse_virus.obs['n_genes']

mouse_n_genes_only_mean = np.mean(mouse_n_genes_only)
mouse_n_genes_only_median = np.median(mouse_n_genes_only)
mouse_n_genes_only_range = np.ptp(mouse_n_genes_only)  # max - min
mouse_n_genes_only_standard_deviation = np.std(mouse_n_genes_only)  # square root of the variance

# Printed in the order: mean, median, range, standard deviation
for summary_value in (
    mouse_n_genes_only_mean,
    mouse_n_genes_only_median,
    mouse_n_genes_only_range,
    mouse_n_genes_only_standard_deviation,
):
    print(summary_value)

# Candidate cut-offs for n_genes: mean +/- 1, 2 and 3 standard deviations
print(mouse_n_genes_only_mean)

mouse_n_genes_only_1SD_upper = mouse_n_genes_only_mean + mouse_n_genes_only_standard_deviation
mouse_n_genes_only_1SD_lower = mouse_n_genes_only_mean - mouse_n_genes_only_standard_deviation
mouse_n_genes_only_2SD_upper = mouse_n_genes_only_mean + (2*mouse_n_genes_only_standard_deviation)
mouse_n_genes_only_2SD_lower = mouse_n_genes_only_mean - (2*mouse_n_genes_only_standard_deviation)
mouse_n_genes_only_3SD_upper = mouse_n_genes_only_mean + (3*mouse_n_genes_only_standard_deviation)
mouse_n_genes_only_3SD_lower = mouse_n_genes_only_mean - (3*mouse_n_genes_only_standard_deviation)

# Printed as upper/lower pairs for 1 SD, 2 SD and 3 SD
for cutoff_value in (
    mouse_n_genes_only_1SD_upper,
    mouse_n_genes_only_1SD_lower,
    mouse_n_genes_only_2SD_upper,
    mouse_n_genes_only_2SD_lower,
    mouse_n_genes_only_3SD_upper,
    mouse_n_genes_only_3SD_lower,
):
    print(cutoff_value)


# Summary statistics of obs['mouse_n_counts'] across cells
mouse_n_counts_only = adata_mouse_virus.obs['mouse_n_counts']

mouse_n_counts_only_mean = np.mean(mouse_n_counts_only)
mouse_n_counts_only_median = np.median(mouse_n_counts_only)
mouse_n_counts_only_range = np.ptp(mouse_n_counts_only)  # max - min
mouse_n_counts_only_standard_deviation = np.std(mouse_n_counts_only)  # square root of the variance

# Printed in the order: mean, median, range, standard deviation
for summary_value in (
    mouse_n_counts_only_mean,
    mouse_n_counts_only_median,
    mouse_n_counts_only_range,
    mouse_n_counts_only_standard_deviation,
):
    print(summary_value)

# Candidate cut-offs for mouse_n_counts: mean +/- 1, 2 and 3 standard deviations
mouse_n_counts_only_1SD_upper = mouse_n_counts_only_mean + mouse_n_counts_only_standard_deviation
mouse_n_counts_only_1SD_lower = mouse_n_counts_only_mean - mouse_n_counts_only_standard_deviation
mouse_n_counts_only_2SD_upper = mouse_n_counts_only_mean + (2*mouse_n_counts_only_standard_deviation)
mouse_n_counts_only_2SD_lower = mouse_n_counts_only_mean - (2*mouse_n_counts_only_standard_deviation)
mouse_n_counts_only_3SD_upper = mouse_n_counts_only_mean + (3*mouse_n_counts_only_standard_deviation)
mouse_n_counts_only_3SD_lower = mouse_n_counts_only_mean - (3*mouse_n_counts_only_standard_deviation)

# Printed as upper/lower pairs for 1 SD, 2 SD and 3 SD
for cutoff_value in (
    mouse_n_counts_only_1SD_upper,
    mouse_n_counts_only_1SD_lower,
    mouse_n_counts_only_2SD_upper,
    mouse_n_counts_only_2SD_lower,
    mouse_n_counts_only_3SD_upper,
    mouse_n_counts_only_3SD_lower,
):
    print(cutoff_value)

# Thresholding decision: genes. Same two n_genes histograms as in the quick QC
# plots, now with the candidate lower cut-offs drawn as vertical lines.
rcParams['figure.figsize'] = (20, 5)
fig_ind = np.arange(131, 133)
fig = plt.figure()
fig.subplots_adjust(hspace=0.4, wspace=0.6)

genes_per_cell = adata_mouse_virus.obs['n_genes']

# Left panel: every cell, with mean - 2 SD in blue
all_cells_ax = fig.add_subplot(fig_ind[0])
p6_adata_mouse_virus = sb.histplot(genes_per_cell, kde=False, bins=60, ax=all_cells_ax)
all_cells_ax.axvline(mouse_n_genes_only_2SD_lower, color='b')

# Right panel: cells below 2500, with mean - 3 SD (green), - 2 SD (blue), - 1 SD (red)
low_gene_ax = fig.add_subplot(fig_ind[1])
p7_adata_mouse_virus = sb.histplot(genes_per_cell[genes_per_cell < 2500],
                                   kde=False, bins=60, ax=low_gene_ax)
for cutoff, colour in (
    (mouse_n_genes_only_3SD_lower, 'g'),
    (mouse_n_genes_only_2SD_lower, 'b'),
    (mouse_n_genes_only_1SD_lower, 'r'),
):
    low_gene_ax.axvline(cutoff, color=colour)
plt.show()




# Histograms of obs['mouse_n_counts'] with the candidate cut-offs overlaid
# (blue = mean +/- 2 SD, red = mean +/- 1 SD).
rcParams['figure.figsize'] = (20, 5)
fig_ind = np.arange(131, 134)
fig = plt.figure()
fig.subplots_adjust(hspace=0.4, wspace=0.6)

mouse_counts = adata_mouse_virus.obs['mouse_n_counts']

# Panel 1: every cell, upper and lower cut-offs
all_cells_ax = fig.add_subplot(fig_ind[0])
p3_adata_mouse_virus = sb.histplot(mouse_counts, kde=False, ax=all_cells_ax)  # kde=False: no density curve
for cutoff, colour in (
    (mouse_n_counts_only_2SD_upper, 'b'),
    (mouse_n_counts_only_2SD_lower, 'b'),
    (mouse_n_counts_only_1SD_upper, 'r'),
    (mouse_n_counts_only_1SD_lower, 'r'),
):
    all_cells_ax.axvline(cutoff, color=colour)

# Panel 2: cells below 6000, lower cut-offs only
low_count_ax = fig.add_subplot(fig_ind[1])
p4_adata_mouse_virus = sb.histplot(mouse_counts[mouse_counts < 6000],
                                   kde=False, bins=60, ax=low_count_ax)
low_count_ax.axvline(mouse_n_counts_only_2SD_lower, color='b')
low_count_ax.axvline(mouse_n_counts_only_1SD_lower, color='r')

# Panel 3: cells above 10000, upper cut-offs only
high_count_ax = fig.add_subplot(fig_ind[2])
p5_adata_mouse_virus = sb.histplot(mouse_counts[mouse_counts > 10000],
                                   kde=False, bins=60, ax=high_count_ax)
high_count_ax.axvline(mouse_n_counts_only_2SD_upper, color='b')
high_count_ax.axvline(mouse_n_counts_only_1SD_upper, color='r')

plt.show()

In [None]:
# Show the object's summary representation as the cell output (state before filtering)
adata_mouse_virus

In [None]:
# Filter mouse cells according to identified QC thresholds:
print('Total number of cells: {:d}'.format(adata_mouse_virus.n_obs))

# Keep only genes detected in at least 3 cells. This step removes genes
# (variables), not cells, so the number to report afterwards is n_vars.
sc.pp.filter_genes(adata_mouse_virus, min_cells=3)
print('Number of genes after min cell filter: {:d}'.format(adata_mouse_virus.n_vars))

# Drop cells whose total counts are below mean - 2 SD of obs['mouse_n_counts'].
# NOTE(review): filter_cells computes counts from adata.X, while the threshold
# was derived from the 'mouse_n_counts' column - confirm both describe the same
# quantity for this object.
sc.pp.filter_cells(adata_mouse_virus, min_counts = mouse_n_counts_only_2SD_lower)
print('Number of cells after min count filter: {:d}'.format(adata_mouse_virus.n_obs))

# Drop cells with fewer detected genes than mean - 2 SD of obs['n_genes']
sc.pp.filter_cells(adata_mouse_virus, min_genes = mouse_n_genes_only_2SD_lower)
print('Number of cells after gene filter: {:d}'.format(adata_mouse_virus.n_obs))

# Normalize, log transform, scale

In [None]:
import logging

In [None]:
# Normalize per-cell counts, then log-transform. Every call below modifies
# adata_mouse_virus in place, so the order of the statements matters.
# No target count is passed to normalize_per_cell; per the log message below the
# intent is normalization to the median cell count.
sc.pp.normalize_per_cell(adata_mouse_virus)
# NOTE(review): these logging.info() calls go to the root logger, whose default
# level is WARNING, so they are only visible if logging has been configured.
logging.info('Normalizing total counts to median cell count')
# Log-transform the normalized data (log1p)
sc.pp.log1p(adata_mouse_virus)
logging.info('Log transforming data')
# Keep the normalized, log-transformed values in .raw; this is assigned before
# scaling, so .raw holds the data as it was prior to sc.pp.scale.
adata_mouse_virus.raw = adata_mouse_virus
logging.info('Saving log(counts)+1 in adata_mouse_virus.raw')
# Identify highly variable genes
sc.pp.highly_variable_genes(adata_mouse_virus, min_mean=0.0125, max_mean=3, min_disp=0.5) #scanpy default settings
# Scale the data, clipping values at 10
sc.pp.scale(adata_mouse_virus, max_value=10) #Ye lab lupus paper

In [None]:

# PCA restricted to the highly variable genes (50 components, ARPACK solver),
# followed by the neighbourhood graph with default parameters.
sc.pp.pca(adata_mouse_virus, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata_mouse_virus)

# Embeddings / layouts, each computed with its default parameters:
# t-SNE, UMAP, diffusion map, force-directed graph.
for compute_embedding in (sc.tl.tsne, sc.tl.umap, sc.tl.diffmap, sc.tl.draw_graph):
    compute_embedding(adata_mouse_virus)

## Figure S1A

In [None]:
# Figure size and font settings are applied BEFORE the plot is created.
# matplotlib consults rcParams while the plot's text elements are being built,
# so setting them after sc.pl.umap() only affects this figure when the cell has
# already been run once in the same kernel (hidden state).
rcParams['figure.figsize'] = (4, 4)
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = 'Arial'
plt.rcParams['font.size'] = 12.0
plt.rcParams['legend.fontsize'] = 12.0

# UMAP coloured by obs['viral_transcript_n_counts']. show=False keeps the figure
# open so it can be saved below.
# NOTE: despite its name, `fig` holds whatever sc.pl.umap returns (an Axes for a
# single colour key, per the scanpy docs), not a Figure; the name is kept as is.
fig = sc.pl.umap(adata_mouse_virus, color='viral_transcript_n_counts', cmap='magma_r',
                 frameon=False, show=False)

# Saves the current figure, i.e. the UMAP created above
plt.savefig(fig_path + 'umap_mouse_viral_transcripts.pdf')

In [None]:
# Two-column per-cell table used by the violin plot: viral counts and batch label
violin_columns = ['viral_transcript_n_counts', 'batch']
mouse_virus_plot_df = pd.DataFrame(adata_mouse_virus.obs[violin_columns])

In [None]:
rcParams['figure.figsize'] = (3, 3)

# Violin plot of raw viral transcript counts per batch. scale="count" makes the
# width of each violin reflect the number of observations in it.
# `jitter` is a strip-plot option, not a violinplot parameter: depending on the
# seaborn version it is either ignored or forwarded to matplotlib and rejected,
# so it is not passed here.
ax = sb.violinplot(x='batch', y='viral_transcript_n_counts', data=mouse_virus_plot_df,
                   color='slategray', scale="count",
                   order=['3', '2', '1', '0'])
sb.despine(right=True)

# NOTE(review): batches are drawn in the order '3', '2', '1', '0' and labelled
# 0, 4, 8, 12 hours - confirm that batch '3' really is the 0 h time point.
ax.set_xlabel('Time Point Batch (hours)')
ax.set_ylabel('Raw Viral Transcript Counts')
ax.set_xticklabels(['0','4','8','12'])
ax.grid(False)

plt.savefig(fig_path + 'violin_mouse_viral_transcripts_tp.pdf')