In [None]:
import scanpy as sc
import anndata as ann
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import glob
from matplotlib import rcParams
from matplotlib import colors
import logging

import seaborn as sb

# scanpy logging level used for the rest of the notebook.
sc.settings.verbosity = 3

# Apply scanpy's figure defaults first: this call installs scanpy's own
# matplotlib rcParams (including its own default figure size).
sc.set_figure_params(dpi=200, dpi_save=300)

# Rescale figures. This must come AFTER sc.set_figure_params, otherwise the
# scanpy defaults applied above overwrite the requested 8x8 inch size.
plt.rcParams['figure.figsize'] = (8, 8)

# Record the package versions used for this run.
sc.logging.print_versions()

In [None]:
import warnings

# Silence FutureWarning messages for the remainder of the session.
warnings.simplefilter("ignore", FutureWarning)

In [None]:
# Use seaborn's 'paper' plotting context for all figures that follow.
sb.set_context('paper')

In [None]:
# Analysis version tag (string-concatenated into file names further down).
version = "V1"

# Directory for analysis files, and the figures sub-directory inside it.
# Both are kept as plain strings because paths are built with `+` below.
output_files_path = "/Sunshine_DeRisi_RSV_files/"
fig_path = output_files_path + "figures/"

In [None]:
# Point scanpy's figure directory setting at the figures folder configured above.
sc.settings.figdir = fig_path

In [None]:
# Read the versioned .h5ad file "<output_files_path><version>_<name>.h5ad".
name = "2024_RSV_annotated_filtered_human_virus"
preprocessed_path = f"{output_files_path}{version}_{name}.h5ad"
adata_human_virus = sc.read_h5ad(preprocessed_path)

In [None]:
# Classify features by read origin using their var_names prefix:
#   'RSV...'        -> RSV           (list of feature names)
#   'genome_RSV...' -> RSV_genome    (list of feature names)
#   'GRCh38_...'    -> human_genes   (boolean mask aligned to the current var_names)
_var_names = adata_human_virus.var_names
RSV = _var_names[_var_names.str.startswith('RSV')].tolist()
RSV_genome = _var_names[_var_names.str.startswith('genome_RSV')].tolist()
human_genes = _var_names.str.startswith('GRCh38_')
virus_genes = [*RSV, *RSV_genome]

## Prepare the ISG gene set for downstream ISG analysis

In [None]:
# ISG panel used for gene scoring (gene symbols without the genome prefix).
Hein_isgs = [
    'PSMB8', 'PSMB9', 'PSME1', 'PSME2', 'ISG15', 'ISG20', 'IRF7', 'MX1', 'MX2',
    'GBP1', 'GBP2', 'GBP3', 'IFI6', 'IFI44', 'IFI35', 'IFI16', 'IFI27', 'IFIH1',
    'IFI44L', 'IFIT1', 'IFIT2', 'IFIT3', 'IFIT5', 'IFITM1', 'IFITM2', 'IFITM3',
    'EIF2AK2', 'OAS1', 'OAS2', 'OAS3', 'CNP', 'PLSCR1', 'BST2', 'BTN3A2', 'XAF1',
    'CASP1', 'CASP4', 'CASP7', 'GSDMD',
]
# Human features carry the 'GRCh38_' prefix in var_names.
isg_gene_list = [f'GRCh38_{gene}' for gene in Hein_isgs]

# Keep only panel genes that have >= 5 counts in MORE THAN 10 cells.
# While this should not change the gene score, the panel is restricted to
# well-expressed genes for the gene scoring analysis downstream.
adata_human_virus_isgsubset = adata_human_virus[:, adata_human_virus.var_names.isin(isg_gene_list)]
cell_ids = adata_human_virus_isgsubset.obs.index
count_matrix = adata_human_virus_isgsubset.X.toarray()  # dense cells x panel-genes array
counts_df = pd.DataFrame(
    count_matrix,
    index=cell_ids,
    columns=adata_human_virus_isgsubset.var_names,
)

# Per gene: number of cells in which that gene reaches at least 5 counts.
cells_with_counts = counts_df.ge(5).sum(axis=0)
genes_above_10 = cells_with_counts.loc[cells_with_counts.gt(10)]
isg_subset = list(genes_above_10.index)

# Drop the viral features so everything downstream runs on the remaining
# (non-viral) features only.
# NOTE: `human_genes`, `RSV` and `RSV_genome` were derived from the var_names
# as they were BEFORE this removal and are not updated here.
adata_human_virus = adata_human_virus[:, ~adata_human_virus.var_names.isin(virus_genes)].copy()
adata_human_virus

# Normalize, log1p, scale

In [None]:
# Per-cell library-size normalisation of the count matrix (in place).
# NOTE(review): scanpy documents `normalize_per_cell` as superseded by
# `normalize_total`; kept as-is here so the results do not change.
sc.pp.normalize_per_cell(adata_human_virus)


def _row_sums(matrix):
    """Return the per-row sums of a sparse or dense 2-D matrix as a flat 1-D array."""
    return np.asarray(matrix.sum(axis=1)).ravel()


# Build the feature selectors against the CURRENT var_names. The `human_genes`
# mask and the `RSV` name list are created before viral features are removed
# from the matrix, so using them directly here would fail (boolean mask of the
# wrong length / feature names that no longer exist).
_human_mask = adata_human_virus.var_names.str.startswith('GRCh38_')
_viral_mask = adata_human_virus.var_names.isin(RSV)
_n_viral_missing = len(RSV) - int(_viral_mask.sum())
if _n_viral_missing > 0:
    logging.warning(
        '%d of %d RSV features are not present in adata_human_virus; '
        'viral transcript totals only cover the features still present '
        '(all zeros if none remain).',
        _n_viral_missing, len(RSV),
    )

_human_totals = _row_sums(adata_human_virus[:, _human_mask].X)
_viral_totals = _row_sums(adata_human_virus[:, _viral_mask].X)

# Total normalised counts per cell, raw and ln(x + 1).
adata_human_virus.obs['n_counts_norm'] = _row_sums(adata_human_virus.X)
adata_human_virus.obs['n_counts_norm_log'] = np.log1p(adata_human_virus.obs['n_counts_norm'])

# Sum the number of human and viral transcripts per cell POST NORM
adata_human_virus.obs['human_n_counts_norm'] = _human_totals
adata_human_virus.obs['viral_transcript_n_counts_norm'] = _viral_totals

# Same per-cell sums, log transformed (ln + 1)
adata_human_virus.obs['viral_transcript_n_counts_norm_log'] = np.log1p(_viral_totals)
adata_human_virus.obs['human_n_counts_norm_log'] = np.log1p(_human_totals)

In [None]:
# Drop genes detected in fewer than 3 cells (min_cells=3), i.e. genes that are
# effectively not present within this time point.
sc.pp.filter_genes(adata_human_virus, min_cells=3)
# The set of features just changed, so rebuild the human-feature boolean mask
# to keep it aligned with the current var_names.
human_genes = adata_human_virus.var_names.str.startswith('GRCh38_')

In [None]:
# Log-transform the normalised matrix, then store that state in .raw.
# This runs before the scaling step further down, so .raw is assigned the
# log-transformed, unscaled data. Scaling and HVG selection happen in later cells.
sc.pp.log1p(adata_human_virus)
# The messages are emitted after each step has completed; they are only shown
# if logging is configured to display INFO-level records.
logging.info('Log transforming data')
adata_human_virus.raw = adata_human_virus
logging.info('Saving log(counts)+1 in .raw')

In [None]:
# Flag highly variable genes using mean / dispersion cut-offs; the result is
# read back from the boolean var['highly_variable'] column.
sc.pp.highly_variable_genes(
    adata_human_virus,
    min_mean=0.0125,
    max_mean=10,
    min_disp=0.5,
)
n_highly_variable = np.sum(adata_human_virus.var['highly_variable'])
print('\n', f'Number of highly variable genes: {n_highly_variable:d}')

In [None]:
# Scale the expression matrix with scanpy, passing max_value=10 as the cap on
# scaled values. This modifies adata_human_virus in place.
sc.pp.scale(adata_human_virus, max_value=10)

# ISG analysis

In [None]:
# Score every cell for the filtered ISG panel (`isg_subset`); the result is
# written to obs['isg_subset_score'].
# NOTE(review): `use_raw` is left at its default, so scanpy presumably reads
# expression from .raw (log-transformed, unscaled) because .raw is set - confirm.
sc.tl.score_genes(adata_human_virus,isg_subset, score_name='isg_subset_score')

In [None]:
# Combined "<treatment>_<infection_status>" label per cell, stored as a
# categorical column.
adata_human_virus.obs['treatment_infectionstatus'] = (
    adata_human_virus.obs['treatment'].astype(str)
    + '_'
    + adata_human_virus.obs['infection_status'].astype(str)
).astype("category")

# Subset to infected/bystander cells only (drop `buffer` cells) for each time point

In [None]:
# Keep every cell whose infection_status is anything other than 'buffer';
# .copy() makes the subset an independent AnnData rather than a view.
adata_human_virus_subset = adata_human_virus[
    adata_human_virus.obs['infection_status'].ne('buffer'), :
].copy()
adata_human_virus_subset

In [None]:
# Per-cell metadata needed for plotting, copied out of .obs into a standalone
# DataFrame so later edits do not touch the AnnData object.
cols_of_interest = [
    'batch',
    'new_multiseq_id',
    'treatment',
    'infection_status',
    'treatment_infectionstatus',
    'isg_subset_score',
]
adata_human_virus_subset_df = adata_human_virus_subset.obs.loc[:, cols_of_interest].copy()
adata_human_virus_subset_df

In [None]:
# Cell counts per sample (rows: new_multiseq_id) and infection status (columns).
pd.crosstab(
    index=adata_human_virus_subset_df['new_multiseq_id'],
    columns=adata_human_virus_subset_df['infection_status'],
)

## Figure S3B

In [None]:
# x-axis order of the samples: time points grouped by treatment (VC, HK, RSV).
order = ['0hr_VC','4hr_VC','8hr_VC','12hr_VC',
         '0hr_HK','4hr_HK','8hr_HK','12hr_HK',
         '0hr_RSV','4hr_RSV','8hr_RSV','12hr_RSV',
        ]

# Font settings go BEFORE the figure is built. Sizes such as legend.fontsize
# are read when the text/legend artists are created, so setting them after
# plotting only affected later figures (or a second run of this cell).
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = 'Arial'
plt.rcParams['font.size'] = 12.0
plt.rcParams['legend.fontsize'] = 12.0

# One explicit colour per infection status, shared by both layers so the
# colours cannot silently swap if the hue level order changes.
palette = {'infected': 'lightcoral', 'uninfected': 'gainsboro'}

fig, ax = plt.subplots(figsize=(16, 4))

# Layer 1: split violins of the ISG score per sample, one half per status.
# `density_norm='count'` is the current name of the deprecated `scale='count'`
# (seaborn >= 0.13, which the `gap` argument below already requires).
sb.violinplot(
    data=adata_human_virus_subset_df,
    x='new_multiseq_id',
    y='isg_subset_score',
    hue='infection_status',
    order=order,
    palette=palette,
    saturation=0.9,
    split=True,
    dodge=False,
    inner=None,
    density_norm='count',
    linewidth=0.5,
    gap=5,
    alpha=0.1,
    ax=ax,
)

# Layer 2: the individual cells as small points on top of the violins.
sb.stripplot(
    data=adata_human_virus_subset_df,
    x='new_multiseq_id',
    y='isg_subset_score',
    hue='infection_status',
    order=order,
    palette=palette,
    dodge=True,
    marker='o',
    alpha=1,
    size=0.2,
    edgecolor=['black'],
    linewidth=0.1,
    ax=ax,
)

# Legend outside the plotting area, to the right of the axes.
ax.legend(title='legend', loc='upper left', bbox_to_anchor=(1, 1))

ax.set_xlabel("Time Point - Treatment")  # Rename x-axis
ax.set_ylabel("ISG Score")

sb.despine(ax=ax)

#plt.savefig(fig_path+'violin_ISGscore_alltimepoints.pdf')