In [None]:
import warnings

import anndata as ad
import numpy as np
import pandas as pd
import scanpy as sc
from anndata import AnnData
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats

In [None]:
# Directory (relative to this notebook) that holds the .h5ad input files.
inputs = "../data/DE_data/"

In [None]:
# One .h5ad file per genotype, all located under `inputs`.
file1 = inputs + "DMD-Mdx_CMC-Immune_ctl240711.log.h5ad"
file2 = inputs + "DMD-MdxSCID_CMC-Immune_ctl240711.log.h5ad"
file3 = inputs + "DMD-WT_CMC-Immune_ctl240711.log.h5ad"

# Read the three files in order into their own AnnData objects.
adata1, adata2, adata3 = (sc.read_h5ad(path) for path in (file1, file2, file3))

# Print each object's summary to inspect the structure of the data.
for loaded in (adata1, adata2, adata3):
    print(loaded)

In [None]:
# Create a combined AnnData object
# Stack the three datasets along the observation axis into one AnnData object.
#
# `anndata.concat` replaces the deprecated `AnnData.concatenate`. The arguments
# reproduce that method's defaults so downstream cells see the same layout:
#   - join="inner":       keep only variables present in every input
#   - label="batch":      add an obs column "batch" recording the source object
#   - keys=["0","1","2"]: the values stored in that column
#   - index_unique="-":   append "-<key>" to obs names so they stay unique
#   - merge="same":       keep var annotation columns that are identical in all
#                         inputs (columns that differ are dropped rather than
#                         suffixed per batch)
# The result is already an AnnData, so no re-wrapping with AnnData(...) is needed.
adata_combined = ad.concat(
    [adata1, adata2, adata3],
    join="inner",
    label="batch",
    keys=["0", "1", "2"],
    index_unique="-",
    merge="same",
)
adata_combined

In [None]:
def X_is_raw(adata_combined):
    """Return True when every entry of ``adata_combined.X`` is integer-valued.

    The check is done per entry. Comparing only column sums (as a shortcut)
    would accept non-integer data whose columns happen to sum to whole
    numbers, e.g. two entries of 0.5 in the same column.

    Parameters
    ----------
    adata_combined
        Any object exposing a numeric matrix as ``.X``. Both dense NumPy
        arrays and scipy sparse matrices are supported, since the check only
        uses ``astype``, element-wise ``!=`` and ``sum``.

    Returns
    -------
    bool
        True if casting ``X`` to ``int`` changes no entry (an empty matrix
        counts as raw); False otherwise, including when ``X`` contains NaN.
    """
    matrix = adata_combined.X
    # Entries that differ from their integer cast are exactly the non-integer ones.
    n_non_integer = (matrix != matrix.astype(int)).sum()
    return bool(n_non_integer == 0)

# Report the X_is_raw verdict for the combined matrix before any dtype
# conversion is applied to it.
is_raw = X_is_raw(adata_combined)
print(f"Is X raw? {is_raw}")

In [None]:
# Convert X to an integer dtype for the count-based DESeq2 workflow below.
# The cast truncates any fractional part, so first measure how many entries it
# changes and warn instead of discarding information silently.
# NOTE(review): the input file names contain ".log", which may mean X holds
# log-transformed values rather than raw counts - confirm before trusting results.
x_as_int = adata_combined.X.astype(int)
# Element-wise `!=` works for dense arrays and scipy sparse matrices alike.
n_altered = int((adata_combined.X != x_as_int).sum())
if n_altered:
    warnings.warn(
        f"{n_altered} entries of X are not integer-valued and are truncated by "
        "the integer cast; X may not contain raw counts.",
        stacklevel=2,
    )
adata_combined.X = x_as_int

In [None]:
# Re-run the check after the preceding cell cast X to an integer dtype.
# An integer matrix always passes, so True here confirms the conversion only;
# it says nothing about whether the values were raw counts before the cast.
is_raw = X_is_raw(adata_combined)
print(f"Is X raw? {is_raw}")

In [None]:
# Build the observations x variables count table as a labelled DataFrame.
# Carrying obs/var names keeps the rows aligned with the metadata index and
# lets results be reported per variable name instead of per column position.
x_matrix = adata_combined.X
# X may be a scipy sparse matrix or already a dense array; handle both.
dense_counts = x_matrix.toarray() if hasattr(x_matrix, "toarray") else np.asarray(x_matrix)
counts_df = pd.DataFrame(
    dense_counts,
    index=adata_combined.obs_names,
    columns=adata_combined.var_names,
)

# Extract the per-observation metadata.
metadata = adata_combined.obs

# Work on an explicit copy so the dtype change below cannot leak back into
# adata_combined.obs.
metadata_df = metadata.copy()

# Ensure the genotype column is categorical.
metadata_df["genotype"] = metadata_df["genotype"].astype("category")

In [None]:
# Prepare the DESeq2 dataset
# Assemble the DESeq2 dataset from the count table and the metadata, using
# the "genotype" column as the single design factor.
dds = DeseqDataSet(metadata=metadata_df, counts=counts_df, design_factors=["genotype"])

# Fit the model: run pyDESeq2's differential expression pipeline on the dataset.
dds.deseq2()

In [None]:
# Extract results
# Run the statistical tests on the fitted dataset.
# NOTE(review): no contrast is given; if "genotype" has more than two levels,
# confirm which comparison this reports (newer pyDESeq2 releases may require
# an explicit `contrast` argument).
results = DeseqStats(dds)

# `summary()` computes and prints the statistics but returns None; the table
# itself is stored on the `results_df` attribute.
results.summary()
results_df = results.results_df

# Display the first rows of the results table.
results_df.head()