# Notebook to run Differential Gene Expression analysis across Genotypes using `PyDESeq2`

**Developed by** : **Srivalli Kolla**

**Created on** : 15 July, 2024

**Last modified on** : 15 July, 2024

**Würzburg Institute for Systems Immunology & Julius-Maximilian-Universität Würzburg**


# Importing packages

In [1]:
import os
import time

import anndata as ad
import numpy as np
import pandas as pd
import scanpy as sc

from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats
from anndata import AnnData

In [2]:
# Day_month_year stamp reused in the names of files written by this notebook.
timestamp = time.strftime("%d_%m_%Y")

# Scanpy logging verbosity, followed by a report of the installed package versions.
sc.settings.verbosity = 3
sc.logging.print_versions()

# Figure defaults applied through scanpy.
figure_params = dict(
    dpi=180,
    dpi_save=300,
    color_map='magma',
    format='svg',
    vector_friendly=True,
)
sc.settings.set_figure_params(**figure_params)

-----
anndata     0.10.8
scanpy      1.10.2
-----
PIL                 10.3.0
asttokens           NA
colorama            0.4.6
comm                0.2.2
cycler              0.12.1
cython_runtime      NA
dateutil            2.9.0.post0
debugpy             1.8.2
decorator           5.1.1
django              5.0.6
executing           2.0.1
h5py                3.11.0
ipykernel           6.29.5
jedi                0.19.1
joblib              1.4.2
kiwisolver          1.4.5
legacy_api_wrap     NA
llvmlite            0.43.0
matplotlib          3.9.1
mpl_toolkits        NA
natsort             8.4.0
numba               0.60.0
numpy               1.26.4
packaging           24.1
pandas              2.2.2
parso               0.8.4
pickleshare         0.7.5
platformdirs        4.2.2
prompt_toolkit      3.0.47
psutil              6.0.0
pure_eval           0.2.2
pydeseq2            0.4.10
pydev_ipython       NA
pydevconsole        NA
pydevd              2.9.5
pydevd_file_utils   NA
pydevd_plugins      

# Data import

In [3]:
# Directory (relative to the notebook) holding the input .h5ad files; the concatenated
# object is written back to the same directory further down.
inputs = '../data/'

In [4]:
# One .h5ad file per genotype, all located in the `inputs` directory.
file1 = f"{inputs}DMD-Mdx_CMC-Immune_ctl240711.log.h5ad"
file2 = f"{inputs}DMD-MdxSCID_CMC-Immune_ctl240711.log.h5ad"
file3 = f"{inputs}DMD-WT_CMC-Immune_ctl240711.log.h5ad"

# Load the three objects in the same order as the paths above.
adata1, adata2, adata3 = (sc.read_h5ad(path) for path in (file1, file2, file3))

# Inspect the structure of the data
print(adata1, adata2, adata3, sep="\n")

AnnData object with n_obs × n_vars = 16196 × 16060
    obs: 'cell_source', 'cell_type', 'donor', 'cell_states', 'genotype', 'compartment', 'object', 'samples', 'n_counts'
    var: 'gene_ids-CMC'
    uns: 'log1p'
AnnData object with n_obs × n_vars = 29502 × 16060
    obs: 'cell_source', 'cell_type', 'donor', 'cell_states', 'genotype', 'compartment', 'object', 'samples', 'n_counts'
    var: 'gene_ids-CMC'
    uns: 'log1p'
AnnData object with n_obs × n_vars = 69819 × 16060
    obs: 'cell_source', 'cell_type', 'donor', 'cell_states', 'genotype', 'compartment', 'object', 'samples', 'n_counts'
    var: 'gene_ids-CMC'
    uns: 'log1p'


## Data concatenation

In [5]:
# Stack the three genotype datasets along the cell axis.
# `AnnData.concatenate` is deprecated in favour of `anndata.concat`; the arguments
# below are chosen to reproduce its defaults: shared genes only, a 'batch' column
# with labels '0'/'1'/'2', and '-<batch>' appended to the cell names.
# merge="same" keeps the var columns that are identical in all three inputs.
# The former `AnnData(adata_combined)` re-wrap was redundant and has been dropped.
adata_combined = ad.concat(
    [adata1, adata2, adata3],
    join="inner",
    label="batch",
    keys=["0", "1", "2"],
    index_unique="-",
    merge="same",
)
adata_combined

AnnData object with n_obs × n_vars = 115517 × 16060
    obs: 'cell_source', 'cell_type', 'donor', 'cell_states', 'genotype', 'compartment', 'object', 'samples', 'n_counts', 'batch'
    var: 'gene_ids-CMC'

In [6]:
# Tally how many observations carry each `cell_states` label in the combined object.
adata_combined.obs['cell_states'].value_counts()

cell_states
CD4Tnaive        26482
NK               11156
CD8Tnaive        10600
MHCII+MØtr        8702
NØ                6975
MØinf             6756
Ly6CloMo          6212
CD4Th             4856
TLF+MØ            4683
CD8Tctl           2880
vCM2              2869
Isg15+MØ          2845
vCM1              2733
vCM4              2548
B_cells           2087
B_naive           1703
Ly6ChiMo          1691
CD8Tcm            1584
DC2               1394
Mast              1376
T                 1149
CD8Temra          1048
Ccr2+MHCII+MØ      949
B_mem              751
gdT                385
Treg               363
CD4Tctl            280
ILC                195
vCM3               107
DC                 105
Spp1+Gpnmb+MØ       28
CD8Tem              15
Plasma_cells         9
MAIT                 1
Name: count, dtype: int64

## Raw counts checking

In [7]:
def X_is_raw(adata_combined):
    """Return True when every value in ``adata_combined.X`` is a non-negative integer.

    The check is done element-wise. Comparing per-gene sums, as was done before,
    reports transformed data as raw whenever the fractional parts happen to add
    up to a whole number.

    Parameters
    ----------
    adata_combined
        Any object exposing the expression matrix as ``.X`` (dense array or
        SciPy sparse matrix).

    Returns
    -------
    bool
        True if all values are integer-valued and >= 0, False otherwise
        (including when NaN is present).
    """
    X = adata_combined.X
    if hasattr(X, "toarray"):
        # Sparse input: implicit zeros are valid counts, so only the explicitly
        # stored values need to be inspected.
        stored = X.data if hasattr(X, "indptr") else X.tocoo().data
    else:
        stored = X
    stored = np.asarray(stored)
    is_whole = np.mod(stored, 1) == 0  # NaN compares unequal, hence counts as non-raw
    return bool(np.all(is_whole) and np.all(stored >= 0))

# Check whether the concatenated matrix already passes the raw-count test.
is_raw = X_is_raw(adata_combined)
print(f"Is X raw? {is_raw}")

Is X raw? False


In [8]:
# The inputs are log1p-transformed ('*.log.h5ad', uns['log1p']), so truncating X with
# astype(int) would yield floor(log1p(count)) instead of counts. Invert the transform
# and stop if the result is not integer-valued: that would mean X was normalised as
# well, and the raw counts have to be loaded from another source.
if X_is_raw(adata_combined):
    # Already integer-valued (e.g. the cell was re-run): only the dtype changes.
    adata_combined.X = adata_combined.X.astype(int)
else:
    # scanpy stores a non-default logarithm base under uns['log1p']['base'].
    log_base = adata1.uns.get("log1p", {}).get("base")
    log_scale = 1.0 if log_base is None else np.log(log_base)

    X_log = adata_combined.X
    is_sparse = hasattr(X_log, "toarray")
    stored = np.asarray(X_log.data if is_sparse else X_log, dtype=np.float64)

    recovered = np.expm1(stored * log_scale)
    counts = np.rint(recovered)
    # Tolerances absorb the rounding error of values stored in single precision.
    if not np.allclose(recovered, counts, rtol=1e-4, atol=1e-2):
        raise ValueError(
            "Inverting log1p did not give integer values: X appears to be normalised. "
            "DESeq2 needs raw counts; load them from the unnormalised data instead."
        )

    if is_sparse:
        X_counts = X_log.astype(int)  # same sparsity structure, integer dtype
        X_counts.data[:] = counts     # overwrite the truncated values with real counts
        adata_combined.X = X_counts
    else:
        adata_combined.X = counts.astype(int)

In [9]:
# Repeat the raw-count test now that X has been converted in the previous cell.
is_raw = X_is_raw(adata_combined)
print(f"Is X raw? {is_raw}")

Is X raw? True


In [10]:
# Save the concatenated object into the input directory, tagged with the run's date stamp.
adata_combined.write_h5ad(inputs + f'concatenated_CMC-{timestamp}.h5ad')

## Data preparation

In [11]:
# Dense observations x genes count table labelled with the observation and gene names.
# Passing a bare ndarray would make PyDESeq2 index its results by column position,
# so the gene-name based subsetting of `adata_combined` later on could not match.
X_counts = adata_combined.X
counts_df = pd.DataFrame(
    X_counts.toarray() if hasattr(X_counts, "toarray") else np.asarray(X_counts),
    index=adata_combined.obs_names,
    columns=adata_combined.var_names,
)

# Work on a copy so the category cast below cannot leak back into adata_combined.obs.
metadata = adata_combined.obs
metadata_df = metadata.copy()
metadata_df["genotype"] = metadata_df["genotype"].astype('category')

# PyDESeq2 analysis

## DESeq2 dataset construction

In [12]:
# Build the PyDESeq2 dataset from the count matrix and the per-observation metadata,
# with `genotype` as the only design factor.
# NOTE(review): every row of `counts_df` is a single cell, so each cell is modelled as
# its own sample. Aggregating counts per sample/donor first (pseudobulk) is presumably
# what is intended for a genotype comparison; confirm before trusting the results.
dds = DeseqDataSet(
    counts=counts_df,
    metadata=metadata_df,
    design_factors=["genotype"],
    n_cpus = 16  # number of CPUs requested for the fit
)

## Differential expression analysis

In [13]:
# Run the PyDESeq2 fitting pipeline on `dds` (the log below shows size factors and
# dispersions being fitted). The recorded run was stopped with a KeyboardInterrupt,
# so `dds` was left unfitted in that session.
dds.deseq2()

Fitting size factors...
  self.fit_size_factors()
Fitting dispersions...
... done in 77.23 seconds.

Fitting MAP dispersions...
... done in 72.09 seconds.



KeyboardInterrupt: 

## Results

In [None]:
def _genotype_contrast(dds, tested, reference):
    """Run the Wald test for one genotype contrast and return its results table.

    `DeseqStats` takes the contrast at construction time (it is not callable), and
    `summary()` returns None: it prints the table and stores it in `results_df`.

    Parameters
    ----------
    dds
        A `DeseqDataSet` on which `deseq2()` has completed.
    tested, reference
        Levels of the `genotype` factor; fold changes are reported as tested
        relative to reference.

    Returns
    -------
    pandas.DataFrame
        A copy of the per-gene results, safe to extend with extra columns.
    """
    stats = DeseqStats(dds, contrast=["genotype", tested, reference])
    stats.summary()
    return stats.results_df.copy()


# NOTE(review): the level names below are assumed to match the values stored in
# obs['genotype'] -- confirm against the data.

# Extract results for WT vs. Mdx
print("WT vs. Mdx Comparison:")
results_wt_vs_mdx = _genotype_contrast(dds, 'WT', 'Mdx')

# Extract results for WT vs. MdxSCID
print("WT vs. MdxSCID Comparison:")
results_wt_vs_mdxscid = _genotype_contrast(dds, 'WT', 'MdxSCID')

# Extract results for Mdx vs. MdxSCID
print("Mdx vs. MdxSCID Comparison:")
results_mdx_vs_mdxscid = _genotype_contrast(dds, 'Mdx', 'MdxSCID')

# Save results to CSV files; create the output directory first so to_csv cannot fail
# on a fresh checkout.
os.makedirs("./outputs", exist_ok=True)
results_wt_vs_mdx.to_csv(f"./outputs/results_wt_vs_mdx-{timestamp}.csv")
results_wt_vs_mdxscid.to_csv(f"./outputs/results_wt_vs_mdxscid-{timestamp}.csv")
results_mdx_vs_mdxscid.to_csv(f"./outputs/results_mdx_vs_mdxscid-{timestamp}.csv")

AssertionError: Please provide a fitted DeseqDataSet by first running the `deseq2` method.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Define the function to create and save a volcano plot
def plot_volcano(results_df, title, filename, padj_threshold=0.05, lfc_threshold=1.0):
    """Draw a volcano plot for one contrast, save it, and show it.

    Parameters
    ----------
    results_df
        Results table with the columns 'log2FoldChange', '-log10(padj)' and a
        boolean 'significant' column (True points are red, False points black).
    title
        Title placed on the plot.
    filename
        Path the figure is saved to.
    padj_threshold
        Adjusted p-value cut-off drawn as the horizontal guide line.
    lfc_threshold
        Absolute log2 fold change cut-off drawn as the two vertical guide lines.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(
        x='log2FoldChange', y='-log10(padj)', data=results_df,
        hue='significant', edgecolor=None, palette={True: 'red', False: 'black'},
        legend=False, ax=ax,
    )
    ax.set_title(title)
    ax.set_xlabel('Log2 Fold Change')
    ax.set_ylabel('-Log10 Adjusted P-value')
    ax.axhline(y=-np.log10(padj_threshold), color='blue', linestyle='--')
    for boundary in (-lfc_threshold, lfc_threshold):
        ax.axvline(x=boundary, color='green', linestyle='--')
    fig.savefig(filename)
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

# One volcano plot per contrast: (results table, plot title, output file).
volcano_jobs = [
    (results_wt_vs_mdx, 'Volcano Plot: WT vs Mdx', 'volcano_plot_wt_vs_mdx.png'),
    (results_wt_vs_mdxscid, 'Volcano Plot: WT vs MdxSCID', 'volcano_plot_wt_vs_mdxscid.png'),
    (results_mdx_vs_mdxscid, 'Volcano Plot: Mdx vs MdxSCID', 'volcano_plot_mdx_vs_mdxscid.png'),
]

for contrast_df, plot_title, out_png in volcano_jobs:
    adjusted_p = contrast_df['padj']
    # Columns are added to the results table itself; the heatmap cell reads 'significant'.
    contrast_df['-log10(padj)'] = -np.log10(adjusted_p)
    strong_effect = np.abs(contrast_df['log2FoldChange']) > 1
    contrast_df['significant'] = (adjusted_p < 0.05) & strong_effect

    plot_volcano(contrast_df, plot_title, out_png)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc

def plot_heatmap(adata, title, filename, n_genes=50):
    """Draw a scanpy heatmap of the leading genes grouped by genotype and save it.

    Parameters
    ----------
    adata
        AnnData object whose first `n_genes` variables are plotted; must carry a
        'genotype' column in `.obs`.
    title
        Title placed above the whole figure.
    filename
        Path the figure is saved to.
    n_genes
        Maximum number of genes (taken from the start of `adata.var_names`) to show.
    """
    if adata.n_vars == 0:
        # No genes passed the upstream filter, so there is nothing to draw.
        print(f"{title}: no genes to plot, skipping.")
        return

    sc.pl.heatmap(
        adata, var_names=adata.var_names[:n_genes], groupby='genotype',
        cmap='viridis', dendrogram=True, show=False
    )
    # The heatmap figure has several axes; a figure-level title avoids attaching
    # the text to whichever axes happens to be current.
    fig = plt.gcf()
    fig.suptitle(title)
    fig.savefig(filename, bbox_inches='tight')
    plt.show()

# For each contrast: collect the genes flagged 'significant', restrict the combined
# AnnData object to those genes, and draw the heatmap.

# WT vs Mdx
mask_wt_vs_mdx = results_wt_vs_mdx['significant'].to_numpy()
significant_genes_wt_vs_mdx = results_wt_vs_mdx.index[mask_wt_vs_mdx]
adata_wt_vs_mdx = adata_combined[:, significant_genes_wt_vs_mdx]
plot_heatmap(adata_wt_vs_mdx, 'Heatmap: WT vs Mdx', 'heatmap_wt_vs_mdx.png')

# WT vs MdxSCID
mask_wt_vs_mdxscid = results_wt_vs_mdxscid['significant'].to_numpy()
significant_genes_wt_vs_mdxscid = results_wt_vs_mdxscid.index[mask_wt_vs_mdxscid]
adata_wt_vs_mdxscid = adata_combined[:, significant_genes_wt_vs_mdxscid]
plot_heatmap(adata_wt_vs_mdxscid, 'Heatmap: WT vs MdxSCID', 'heatmap_wt_vs_mdxscid.png')

# Mdx vs MdxSCID
mask_mdx_vs_mdxscid = results_mdx_vs_mdxscid['significant'].to_numpy()
significant_genes_mdx_vs_mdxscid = results_mdx_vs_mdxscid.index[mask_mdx_vs_mdxscid]
adata_mdx_vs_mdxscid = adata_combined[:, significant_genes_mdx_vs_mdxscid]
plot_heatmap(adata_mdx_vs_mdxscid, 'Heatmap: Mdx vs MdxSCID', 'heatmap_mdx_vs_mdxscid.png')
