# DIFFERENTIAL GENE EXPRESSION ANALYSIS

# Python packages

In [None]:
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import pandas as pd
import numpy as np
import random
import logging

import sc_toolbox
import pertpy
import anndata2ri

import rpy2.robjects as robjects
import rpy2.rinterface_lib.callbacks
from rpy2.robjects import pandas2ri, r
from rpy2.robjects.conversion import localconverter

from modules.visualize import *
from modules.deag_tools import *
from modules.utils import *

# R interface setup (rpy2 / anndata2ri)

In [None]:
# Set scanpy to its lowest verbosity level (0).
sc.settings.verbosity = 0
# Only let rpy2 callback messages of level ERROR or higher through.
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
# Enable the pandas <-> R and AnnData <-> R converters globally.
pandas2ri.activate()
anndata2ri.activate()
# IPython magic (not plain Python): loads the rpy2 notebook extension.
%load_ext rpy2.ipython

# Load dataset

In [None]:
# Read the .h5ad file below into `adata`. Later cells expect it to carry the
# obs columns 'Sample_Tag' and 'cluster_subclass_name' (the latter categorical).
adata = sc.read(
    filename="data/fede_data/scdi_hypothalamus_clustered.h5ad"
)

# Create pseudosamples

In [None]:
# Build the pseudobulk AnnData: aggregate_and_filter is called once per
# category of obs['cluster_subclass_name'] and the per-cell-type results are
# stacked into a single object, adata_pb.
# Per the original author's note this yields 3 pseudosamples for each
# combination of experimental group ('Sample_Tag') and cell type; that number
# is decided inside aggregate_and_filter and is not visible here.
# NOTE(review): AnnData.concatenate is deprecated in favour of anndata.concat;
# it is kept because switching would likely change the obs names / 'batch'
# column it produces.
cell_types = adata.obs["cluster_subclass_name"].cat.categories


def _pseudobulk_for(cell_type):
    """Return the aggregated pseudosamples for a single cell type."""
    return aggregate_and_filter(
        adata,
        cell_type,
        condition_key="Sample_Tag",
        cell_identity_key="cluster_subclass_name",
        obs_to_keep=['Sample_Tag', 'cluster_subclass_name'],
    )


adata_pb = _pseudobulk_for(cell_types[0])
for cell_type in cell_types[1:]:
    adata_pb = adata_pb.concatenate(_pseudobulk_for(cell_type))

# Keep an untouched copy of the aggregated matrix before X gets normalised.
adata_pb.layers['counts'] = adata_pb.X.copy()

### Visualize the pseudosample embedding

In [None]:
# Scale every pseudosample to a total of 1e6 (CPM) and apply log1p. Both steps
# overwrite adata_pb.X; the untransformed matrix was stored in
# adata_pb.layers['counts'] when adata_pb was built.
sc.pp.normalize_total(adata_pb, target_sum=1e6)
sc.pp.log1p(adata_pb)
# PCA of the transformed matrix, needed by the sc.pl.pca plots that follow.
sc.pp.pca(adata_pb)

In [None]:
# Save one PCA scatter plot per obs column of adata_pb, coloured by that column.
# NOTE(review): scanpy appends `save` to its default file name ('pca') inside
# `figdir`, so the files presumably end up as
# 'figures/pca_plot_/pcapca_plot_<column>.png' -- confirm this is the intended
# location. The figdir setting is global and stays in effect for later cells.
sc.settings.figdir = 'figures/pca_plot_'
color_vars = adata_pb.obs.columns
for obs_column in color_vars:
    sc.pl.pca(
        adata_pb,
        color=obs_column,
        ncols=1,
        size=100,
        save=f'pca_plot_{obs_column}.png',
    )

# Save/Load adata object

In [None]:
# Replace the 'counts' layer with an integer NumPy array of the same values.
# NOTE(review): dtype=int truncates fractional values instead of rounding;
# presumably the aggregated counts are already whole numbers -- confirm.
adata_pb.layers['counts'] = np.array(adata_pb.layers['counts'], dtype=int)

In [None]:
# Write the pseudobulk object (adata_pb, including its 'counts' layer) to the
# path below; the next cell reads the same file back.
adata_pb.write("data/fede_data/scdi_hypothalamus_pb.h5ad")

In [None]:
# Load the pseudobulk object back from the path it was written to, replacing
# whatever `adata_pb` currently holds in memory.
adata_pb = sc.read(
    filename="data/fede_data/scdi_hypothalamus_pb.h5ad"
)

# Differential gene expression analysis

### Create control and condition anndata objects

In [None]:
# Specify the names of control and condition groups.
# In this notebook the two labels are interpolated into figure titles and
# output paths (e.g. f'figures/{ctr_name}_{cnd_name}_volcano_plot.png'); while
# they are empty that path evaluates to 'figures/__volcano_plot.png'.
ctr_name = ''
cnd_name = ''

In [None]:
# Experimental groups to include for control and condition.
# As written, both lists hold every distinct 'Sample_Tag' value found in
# `adata`, so the two sides are not separated by experimental group; the
# commented-out single-tag lists show how to restrict a side.

ctr_sample_tags = list(set(adata.obs['Sample_Tag'].values))
#ctr_sample_tags = ['WT-DMSO']

cnd_sample_tags = list(set(adata.obs['Sample_Tag'].values))
#cnd_sample_tags = ['WT-SCDi']

In [None]:
# Make the stored pseudobulk counts the working matrix again (X was
# CPM/log1p-transformed for the PCA plots) before the count-based DESeq2 step.
# Assign a copy: without it X and layers['counts'] would be the same object,
# and any later in-place operation on X would silently alter the stored counts.
adata_pb.X = adata_pb.layers['counts'].copy()

### DESeq2

In [None]:
# Cell types to include for control and condition (pseudobulk / DESeq2).
# Control: every 'Astro-NT*' label except 'Astro-NT_1'; condition: every
# 'Astro-NT*' label.
# NOTE(review): as written every control label is also a condition label, so
# the two groups overlap; presumably one of the commented-out single-label
# lists is meant to be enabled for a real comparison.
pb_labels = adata_pb.obs['cluster_subclass_name'].values
ctr_cell_types = list({
    label
    for label in pb_labels
    if label.startswith('Astro-NT') and label != 'Astro-NT_1'
})
#ctr_cell_types = ['Astro-NT_3']

cnd_cell_types = list({
    label
    for label in pb_labels
    if label.startswith('Astro-NT')
})
#cnd_cell_types = ['Astro-NT_1']

In [None]:
# Build the control and condition tables for DESeq2 from the pseudobulk
# object: keep the pseudosamples whose 'Sample_Tag' and 'cluster_subclass_name'
# are both in the selected lists, then export adata_pb.X as a DataFrame.
pb_obs = adata_pb.obs
ctr_mask = (
    pb_obs['Sample_Tag'].isin(ctr_sample_tags)
    & pb_obs['cluster_subclass_name'].isin(ctr_cell_types)
)
cnd_mask = (
    pb_obs['Sample_Tag'].isin(cnd_sample_tags)
    & pb_obs['cluster_subclass_name'].isin(cnd_cell_types)
)
control_df = adata_pb[ctr_mask].to_df()
condition_df = adata_pb[cnd_mask].to_df()

In [None]:
# Run the project's DESeq2 helper on the two pseudobulk tables. The result is
# stored in `results_df`, the same name the MAST section assigns to, so the
# downstream cells use whichever analysis ran last.
# save_path=None presumably means "do not write the result to disk" -- confirm
# in modules.deag_tools.
results_df = deseq2_dea(control_df, condition_df, save_path=None)

### MAST

In [None]:
# Cell types to include for control and condition (single-cell / MAST).
# Control: every 'Astro-NT*' label except 'Astro-NT_1'; condition: every
# 'Astro-NT*' label.
# NOTE(review): as written every control label is also a condition label, so
# the two groups overlap; presumably one of the commented-out single-label
# lists is meant to be enabled for a real comparison.
sc_labels = adata.obs['cluster_subclass_name'].values
ctr_cell_types = list({
    label
    for label in sc_labels
    if label.startswith('Astro-NT') and label != 'Astro-NT_1'
})
#ctr_cell_types = ['Astro-NT_3']

cnd_cell_types = list({
    label
    for label in sc_labels
    if label.startswith('Astro-NT')
})
#cnd_cell_types = ['Astro-NT_1']

In [None]:
# Build the control and condition tables for MAST from the single-cell object:
# keep the cells whose 'Sample_Tag' and 'cluster_subclass_name' are both in the
# selected lists, then export adata.X as a DataFrame.
# NOTE(review): when the notebook is run top to bottom these tables are taken
# before the CPM/log1p cell transforms adata.X, so they hold the untransformed
# values -- confirm this is what mast_dea expects.
sc_obs = adata.obs
ctr_mask = (
    sc_obs['Sample_Tag'].isin(ctr_sample_tags)
    & sc_obs['cluster_subclass_name'].isin(ctr_cell_types)
)
cnd_mask = (
    sc_obs['Sample_Tag'].isin(cnd_sample_tags)
    & sc_obs['cluster_subclass_name'].isin(cnd_cell_types)
)
control_df = adata[ctr_mask].to_df()
condition_df = adata[cnd_mask].to_df()

In [None]:
# shifted logarithm with Counts per Millions (CPM)
# Both calls overwrite adata.X in place, so executing this cell a second time
# would normalise and log-transform values that are already transformed, and
# the notebook's logging/warning settings would hide any notice about it.
# A marker in adata.uns records that the transform was applied, which makes the
# cell safe to re-run; the first execution behaves exactly as before.
if not adata.uns.get('cpm_log1p_applied', False):
    sc.pp.normalize_total(adata, target_sum=1e6)
    sc.pp.log1p(adata)
    adata.uns['cpm_log1p_applied'] = True

In [None]:
# Run the project's MAST helper on the single-cell object and the two group
# tables. The result overwrites `results_df`, the same name the DESeq2 section
# assigns to, so the downstream cells use whichever analysis ran last.
# save_path=None presumably means "do not write the result to disk" -- confirm
# in modules.deag_tools.
results_df = mast_dea(adata, control_df, condition_df, save_path=None)

### Volcano plot

In [None]:
# Volcano plot of the current `results_df`. The cut-offs (0.25 and 0.05) are
# the same values used by the gene-name export and get_DEGs cells below.
# The title and output path are built from ctr_name / cnd_name.
volcano_plot(results_df,
             min_fold_change=0.25,
             max_p_value=0.05,
             fig_title=f'{ctr_name} vs {cnd_name}',
             save_path=f'figures/{ctr_name}_{cnd_name}_volcano_plot.png')

### Sort DEGs by lowest adjusted p-value

In [None]:
# Sorted copy of the results, smallest adjusted p-value ('padj') first;
# `results_df` itself is left in its original order.
ranked_genes = results_df.sort_values(by='padj')

In [None]:
# Display the 10 rows with the smallest 'padj' (shown as the cell's output).
ranked_genes.head(10)

In [None]:
# Export the names of the genes with padj < 0.05 AND log2FoldChange > 0.25
# (i.e. only the positive-fold-change side), upper-cased, one per line, to
# 'filtered_names.txt' in the current working directory.
df = results_df
passes_padj = df['padj'] < 0.05
passes_fold_change = df['log2FoldChange'] > 0.25
filtered_df = df[passes_padj & passes_fold_change]
names_list = filtered_df['names'].str.upper().tolist()

with open('filtered_names.txt', 'w') as f:
    f.writelines(f"{name}\n" for name in names_list)

print(f"Number of genes after filtering: {len(names_list)}")
print("Names have been written to 'filtered_names.txt'")

# Gene set and pathway enrichment analysis

In [None]:
# Get the two gene-name collections used by the enrichment cells below --
# presumably the up- and down-regulated genes of `results_df` (confirm in the
# project's get_DEGs helper). Cut-offs match the volcano plot: 0.05 and 0.25.
UP_genes_name, DOWN_genes_name = get_DEGs(results_df,
                                          max_pval=0.05,
                                          min_fold_change=0.25)

In [None]:
# GO enrichment, run separately on the UP and DOWN gene lists; the results
# feed the display_go_enrichment cells below.
# save_path=None presumably means "do not write the result to disk" -- confirm.
UP_GO = go_enrichment_analysis(UP_genes_name, save_path=None)
DOWN_GO = go_enrichment_analysis(DOWN_genes_name, save_path=None)

In [None]:
# Plot the GO enrichment of the UP gene list, one figure per namespace code
# ('BP', 'MF', 'CC'). Title and output path are built from the namespace code
# and ctr_name / cnd_name.
for go_namespace in ('BP', 'MF', 'CC'):
    display_go_enrichment(
        UP_GO,
        namespace=go_namespace,
        fig_title=f'UP {go_namespace} - {ctr_name} vs {cnd_name}',
        save_path=f'figures/{ctr_name}_{cnd_name}_display_GO_enrichment_UP_{go_namespace}',
    )

In [None]:
# Plot the GO enrichment of the DOWN gene list, one figure per namespace code
# ('BP', 'MF', 'CC'). Title and output path are built from the namespace code
# and ctr_name / cnd_name.
for go_namespace in ('BP', 'MF', 'CC'):
    display_go_enrichment(
        DOWN_GO,
        namespace=go_namespace,
        fig_title=f'DOWN {go_namespace} - {ctr_name} vs {cnd_name}',
        save_path=f'figures/{ctr_name}_{cnd_name}_display_GO_enrichment_DOWN_{go_namespace}',
    )

In [None]:
# KEGG enrichment for the UP gene list; the result is plotted further down.
# save_path=None presumably means "do not write the result to disk" -- confirm.
UP_KEGG = kegg_enrichment_analysis(UP_genes_name, 
                                   save_path=None)

In [None]:
# KEGG enrichment for the DOWN gene list; the result is plotted further down.
# save_path=None presumably means "do not write the result to disk" -- confirm.
DOWN_KEGG = kegg_enrichment_analysis(DOWN_genes_name, 
                                     save_path=None)

In [None]:
# Plot the KEGG enrichment result of the UP gene list; title and output path
# are built from ctr_name / cnd_name.
display_kegg_enrichment(UP_KEGG,
                        fig_title=f'UP pathway - {ctr_name} vs {cnd_name}',
                        save_path=f'figures/{ctr_name}_{cnd_name}_display_KEGG_enrichment_UP')

In [None]:
# Plot the KEGG enrichment result of the DOWN gene list; title and output path
# are built from ctr_name / cnd_name.
display_kegg_enrichment(DOWN_KEGG,
                        fig_title=f'DOWN pathway - {ctr_name} vs {cnd_name}',
                        save_path=f'figures/{ctr_name}_{cnd_name}_display_KEGG_enrichment_DOWN')