In [None]:
import anndata
import scanpy as sc
import pandas as pd
from scipy import sparse
from modules.process_data import *
from collections import Counter
from matplotlib.ticker import PercentFormatter
from modules.utils import *
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from scipy import stats
from statsmodels.stats.multitest import multipletests
import re
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import fdrcorrection
from tqdm import tqdm

In [None]:
# Translate raw Sample_Tag values into human-readable condition labels.
mapping = {
    'SampleTag17_flex': 'WT-DMSO',
    'SampleTag18_flex': '3xTg-DMSO',
    'SampleTag19_flex': 'WT-SCDi',
    'SampleTag20_flex': '3xTg-SCDi',
    # Non-sample calls map to themselves so they stay identifiable after mapping.
    **{call: call for call in ('Multiplet', 'Undetermined')},
}

In [None]:
# Load the single-cell dataset (AnnData .h5ad file) from the project's data directory.
adata = anndata.read_h5ad(filename="data/fede_count.h5ad")

In [None]:
# Data cleaning with the project helpers (star-imported from the modules package):
#   - rm_high_mt: presumably drops cells whose mitochondrial fraction exceeds `threshold`
#     (TODO confirm against the helper's definition)
#   - filter_cells_by_gene_counts: presumably filters cells on their gene counts using
#     the helper's default settings (TODO confirm)
adata = filter_cells_by_gene_counts(rm_high_mt(adata, threshold=0.6))

In [None]:
# Apply the project helper rm_low_exp (presumably removes lowly expressed genes at the
# 0.01 threshold - TODO confirm), then normalise every cell in place to a total of 1.
# Genes that make up more than 20% of a cell's counts are left out of that cell's
# size factor (exclude_highly_expressed / max_fraction).
adata = rm_low_exp(adata, threshold=0.01)
sc.pp.normalize_total(
    adata,
    target_sum=1,
    exclude_highly_expressed=True,
    max_fraction=0.2,
)

In [None]:
# Condition label per cell: map the raw Sample_Tag through `mapping` and key it by the
# integer cell index so it can be joined onto the expression table.
tag_labels = adata.obs['Sample_Tag'].map(mapping)
tag_labels.index = tag_labels.index.astype('int64')
sample_tags = tag_labels.to_frame()

# Dense cells x genes expression table (densify only when X is a sparse matrix).
expr_matrix = adata.X.toarray() if hasattr(adata.X, 'toarray') else adata.X
sc_df = pd.DataFrame(
    expr_matrix,
    index=adata.obs_names.astype('int64'),
    columns=adata.var_names,
)

# Append the Sample_Tag column, matched on the cell index.
sc_df = sc_df.join(sample_tags)

In [None]:
# Remove cells whose sample tag is "Multiplet" or "Undetermined".
is_unassigned = sc_df['Sample_Tag'].isin(['Multiplet', 'Undetermined'])
sc_df = sc_df.loc[~is_unassigned]

In [None]:
# Expression table for the WT-DMSO cells only, with the label column removed.
adata1_df = sc_df.loc[sc_df['Sample_Tag'] == 'WT-DMSO'].drop(columns='Sample_Tag')

In [None]:
# Expression table for the 3xTg-DMSO cells only, with the label column removed.
adata2_df = sc_df.loc[sc_df['Sample_Tag'] == '3xTg-DMSO'].drop(columns='Sample_Tag')

In [None]:
# Differential expression between 3xTg-DMSO (adata2_df) and WT-DMSO (adata1_df).
# Every gene is tested in a single vectorised call instead of one Python-level
# ttest_ind call per column; results agree with the per-gene computation up to
# floating-point rounding.
gene_names = list(adata1_df.columns)

# Rows are cells and columns are genes; the 3xTg columns are aligned to the WT gene
# order by name, mirroring the per-gene lookup this replaces.
wt_matrix = adata1_df.to_numpy()
tg_matrix = adata2_df[gene_names].to_numpy()

# Welch's t-test (equal_var=False); axis=0 runs the test down each gene column.
# A gene with zero variance in both groups can come back with a NaN p-value.
t_stats, p_value_arr = ttest_ind(tg_matrix, wt_matrix, axis=0, equal_var=False)
p_value_arr = np.asarray(p_value_arr, dtype=float)

mean_tg = tg_matrix.mean(axis=0)
mean_wt = wt_matrix.mean(axis=0)

# Zero group means and zero p-values are expected for some genes, so the resulting
# divide/invalid warnings are silenced rather than emitted once per gene.
with np.errstate(divide='ignore', invalid='ignore'):
    # 0/0 -> NaN, replaced by a ratio of 1 (log2 fold change 0).
    # x/0 -> +inf, which nan_to_num replaces with the largest finite float, i.e. a very
    #        large positive log2 fold change.
    # 0/x -> 0, whose log2 is -inf.
    expression_ratio = np.nan_to_num(np.divide(mean_tg, mean_wt), nan=1)
    log2_fc_arr = np.log2(expression_ratio)
    # A p-value that underflowed to 0 becomes +inf on the -log10 scale.
    neg_log10_p_arr = -np.log10(p_value_arr)

# Plain Python lists, one entry per gene, in the same order as gene_names.
fold_changes = log2_fc_arr.tolist()
p_values = p_value_arr.tolist()
log10_pvals = neg_log10_p_arr.tolist()


In [None]:
# Benjamini-Hochberg adjusted p-values across all tested genes. NaN p-values are kept
# out of the correction (and stay NaN) so they cannot propagate into the other genes.
raw_p = np.asarray(p_values, dtype=float)
testable = np.isfinite(raw_p)
adjusted_p = np.full(raw_p.shape, np.nan)
if testable.any():
    adjusted_p[testable] = fdrcorrection(raw_p[testable])[1]

# One row per gene. 'log10_p-value' stores -log10(p). 'p-adj' is additive: the existing
# columns are unchanged.
# NOTE(review): the notebook's DEG calls are still based on the raw 'p-value' column;
# consider switching them to 'p-adj'.
results_df = pd.DataFrame({
    'Gene': gene_names,
    'log2_fold_change': fold_changes,
    'p-value': p_values,
    'log10_p-value': log10_pvals,
    'p-adj': adjusted_p,
})

In [None]:
# Pull the per-gene statistics out of results_df as Series for masking and plotting.
fold_change, p_value = results_df['log2_fold_change'], results_df['p-value']
log10_pvalue = np.negative(np.log10(p_value))

In [None]:
# Boolean masks over genes: raw p < 0.05 for significance, |log2 FC| cutoff of 1.
significant = p_value.lt(0.05)
non_significant = ~significant
# Strictly beyond the +/-1 cutoff.
significant_outside_range = significant & (fold_change.abs() > 1)
# Within [-1, 1], both ends included.
significant_inside_range = significant & fold_change.between(-1, 1)

In [None]:
# Volcano plot: log2 fold change (x) against -log10 p-value (y) for every gene.

# Genes whose -log10 p-value is infinite cannot be placed on the axis, so they are
# excluded from the labelled set.
finite_log10_pvalue = log10_pvalue != np.inf

# Label only the 20 most significant genes beyond the +/-1 log2 FC cutoff to keep the
# figure readable.
top_genes = results_df[significant_outside_range & finite_log10_pvalue].nlargest(20, 'log10_p-value')

# Compute each colour group once and reuse it for plotting and counting.
up_mask = significant_outside_range & (fold_change > 0)
down_mask = significant_outside_range & (fold_change <= 0)
background_mask = non_significant | significant_inside_range

fig, ax = plt.subplots()
ax.scatter(fold_change[up_mask], log10_pvalue[up_mask], s=1, c='red')
ax.scatter(fold_change[down_mask], log10_pvalue[down_mask], s=1, c='blue')
ax.scatter(fold_change[background_mask], log10_pvalue[background_mask], s=1, c='grey')

# Dashed guides at the cutoffs that define the coloured groups: |log2 FC| = 1 and
# p = 0.05. The horizontal guide sits at -log10(0.05) rather than at y = 0, which
# would correspond to p = 1.
ax.axvline(x=-1, color='black', linestyle='--', linewidth=0.5)
ax.axvline(x=1, color='black', linestyle='--', linewidth=0.5)
ax.axhline(y=-np.log10(0.05), color='black', linestyle='--', linewidth=0.5)

# Annotate the top genes, nudged slightly right of their point; the stored
# 'log10_p-value' column already holds -log10(p).
for gene_label, log2_fc, neg_log10_p in zip(
    top_genes['Gene'], top_genes['log2_fold_change'], top_genes['log10_p-value']
):
    ax.annotate(gene_label, (log2_fc + 0.2, neg_log10_p), ha='left', va='center', fontsize=6)

# Count and annotate DEGs in the top corners (positions are in data coordinates and
# rely on the fixed axis limits below).
positive_deg_num = up_mask.sum()
negative_deg_num = down_mask.sum()

ax.annotate(f'{positive_deg_num} DEGs', xy=(9, 230), ha='right', va='top', fontsize=10, color='red')
ax.annotate(f'{negative_deg_num} DEGs', xy=(-9, 230), ha='left', va='top', fontsize=10, color='blue')

ax.set_title('3xTg-DMSO vs WT-DMSO')
ax.set_xlabel('Log2 Fold Change')
ax.set_ylabel('-log10(p-value)')
ax.set_xlim(-10, 10)
ax.set_ylim(-10, 250)

fig.savefig('volcano_plot.png')
plt.show()

In [None]:
# Upper-cased gene symbols of the up- and down-regulated DEGs (raw p < 0.05 and log2
# fold change strictly beyond +/-1). Both cutoffs are strict so these lists agree with
# the DEG masks and counts used for the volcano plot.
is_significant = results_df["p-value"] < 0.05
log2_fc = results_df["log2_fold_change"]

positive_deg = [
    name.upper()
    for name in results_df.loc[is_significant & (log2_fc > 1), 'Gene'].tolist()
]
negative_deg = [
    name.upper()
    for name in results_df.loc[is_significant & (log2_fc < -1), 'Gene'].tolist()
]

In [None]:
# Write each DEG list to its own text file in the working directory, one gene per line.
for out_name, gene_list in (('positive_deg', positive_deg), ('negative_deg', negative_deg)):
    with open(out_name, 'w') as file:
        file.writelines(gene + '\n' for gene in gene_list)

In [None]:
# Tab-separated GO enrichment table. The first 11 lines are skipped (presumably the
# exporting tool's header block - TODO confirm against the file).
tsv_file_path = 'positive_go.txt'
df = pd.read_csv(tsv_file_path, sep='\t', skiprows=11)

In [None]:
# Horizontal bar chart of the 20 most significant enriched GO biological processes:
# bar length is fold enrichment, bar colour is -log10(FDR).

# Column names as they appear in the enrichment table held in `df`.
fold_col = 'upload_1 (fold Enrichment)'
fdr_col = 'upload_1 (FDR)'
term_col = 'GO biological process complete'

# Coerce both statistics to numbers (unparseable entries become NaN) and drop rows
# missing either one.
df[fold_col] = pd.to_numeric(df[fold_col], errors='coerce')
df[fdr_col] = pd.to_numeric(df[fdr_col], errors='coerce')
df = df.dropna(subset=[fold_col, fdr_col])

# Keep enriched terms only. The explicit copy makes `df` an independent frame, so
# adding a column to it below does not raise SettingWithCopyWarning.
df = df[df[fold_col] >= 1].copy()
df['-log10(FDR)'] = -np.log10(df[fdr_col])

# nlargest already returns rows ordered by descending -log10(FDR), so no extra sort
# is needed.
top_processes = df.nlargest(20, '-log10(FDR)').copy()

# Strip parenthesised suffixes from the term labels.
top_processes[term_col] = top_processes[term_col].apply(lambda x: re.sub(r'\s*\([^)]*\)', '', x))

fig, ax = plt.subplots(figsize=(10, 8))
norm = plt.Normalize(top_processes['-log10(FDR)'].min(), top_processes['-log10(FDR)'].max())
colors = plt.cm.viridis(norm(top_processes['-log10(FDR)']))
bars = ax.barh(top_processes[term_col], top_processes[fold_col], color=colors)

ax.set_xlabel('Fold Enrichment')
ax.set_ylabel('UP-Biological Process')
ax.invert_yaxis()  # To display the highest -log10(FDR) at the top

# Colour bar tied to the same normalisation as the bars.
sm = plt.cm.ScalarMappable(cmap='viridis', norm=norm)
sm.set_array([])
cbar = fig.colorbar(sm, ax=ax)
cbar.ax.set_title('-log10(FDR)', pad=20)

fig.tight_layout()
fig.savefig('up_bio_process.png')
plt.show()


In [None]:
# The 100 most significant up- and down-regulated genes (ranked by -log10 p-value).
# The masks get their own names instead of temporarily living in positive_deg /
# negative_deg, and they reuse significant_outside_range so the cutoff (|log2 FC|
# strictly greater than 1) matches the DEG definition used for the volcano plot.
top_up_mask = significant_outside_range & (fold_change > 0)
top_down_mask = significant_outside_range & (fold_change < 0)

positive_deg = results_df[top_up_mask].nlargest(100, 'log10_p-value')['Gene'].tolist()
negative_deg = results_df[top_down_mask].nlargest(100, 'log10_p-value')['Gene'].tolist()

In [None]:
# Wrap the WT-DMSO expression table back into an AnnData: rows (cells) become obs,
# columns (genes) become var.
adata1 = anndata.AnnData(
    X=adata1_df.to_numpy(),
    obs=pd.DataFrame(index=adata1_df.index),
    var=pd.DataFrame(index=adata1_df.columns),
)

In [None]:
# Wrap the 3xTg-DMSO expression table back into an AnnData: rows (cells) become obs,
# columns (genes) become var.
adata2 = anndata.AnnData(
    X=adata2_df.to_numpy(),
    obs=pd.DataFrame(index=adata2_df.index),
    var=pd.DataFrame(index=adata2_df.columns),
)

In [None]:
def plot_deg_heatmap(wt_adata, tg_adata, genes, out_path, labels=('WT-DMSO', '3xTg-DMSO')):
    """Draw, save and show a clustered heatmap of mean scaled expression per condition.

    Parameters
    ----------
    wt_adata, tg_adata : anndata.AnnData
        One object per condition; both must contain every name in `genes` in var_names.
    genes : list of str
        Genes to include, one heatmap row per gene.
    out_path : str
        File the figure is saved to.
    labels : tuple of (str, str)
        Condition names written to obs['batch'] for `wt_adata` and `tg_adata`,
        in that order; they become the heatmap column labels.

    Returns
    -------
    seaborn.matrix.ClusterGrid
        The grid returned by seaborn.clustermap.
    """
    wt_label, tg_label = labels

    # Work on copies restricted to the requested genes so the inputs are not modified.
    wt_subset = wt_adata[:, genes].copy()
    tg_subset = tg_adata[:, genes].copy()
    wt_subset.obs['batch'] = wt_label
    tg_subset.obs['batch'] = tg_label

    # Scale each gene across the cells of both conditions together (values clipped at
    # 10), then average per condition: genes as rows, conditions as columns.
    combined = anndata.concat([wt_subset, tg_subset])
    sc.pp.scale(combined, max_value=10)
    mean_expr = combined.to_df().groupby(combined.obs['batch']).mean().T

    sns.set(context='notebook', font_scale=1.2)
    cg = sns.clustermap(
        mean_expr,
        cmap='coolwarm',
        linewidths=.5,
        figsize=(10, 8),
        row_cluster=True,
        col_cluster=False,
    )

    # Hide the gene labels, move the colour bar to the right-hand side, and put the
    # condition labels on top, unrotated.
    cg.ax_heatmap.set_yticklabels([])
    cg.cax.set_position([0.9, .2, .03, .45])
    cg.ax_heatmap.xaxis.set_label_position('top')
    cg.ax_heatmap.xaxis.tick_top()
    cg.ax_heatmap.set_xlabel('')
    for tick in cg.ax_heatmap.get_xticklabels():
        tick.set_color('black')
        tick.set_ha('center')
        tick.set_rotation(0)

    # bbox_inches='tight' keeps the repositioned colour bar inside the saved image.
    plt.savefig(out_path, bbox_inches='tight')
    plt.show()
    return cg


# One heatmap per DEG list: up-regulated genes first, then down-regulated genes.
for deg_genes, heatmap_path in (
    (positive_deg, 'positive_heatmap.png'),
    (negative_deg, 'negative_heatmap.png'),
):
    cg = plot_deg_heatmap(adata1, adata2, deg_genes, heatmap_path)