In [None]:
import anndata
import scanpy as sc
import pandas as pd
from scipy import sparse
from modules.process_data import *
from collections import Counter
from matplotlib.ticker import PercentFormatter
from modules.process_data import mapping1, mapping2
from modules.utils import *
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import umap
from sklearn.cluster import KMeans
from matplotlib.lines import Line2D
from scipy import stats
from statsmodels.stats.multitest import multipletests
import gseapy as gp

In [None]:
# Lookup from the raw sample-tag identifiers stored in the count data to the
# experimental condition labels used throughout this notebook. The two
# quality-control categories ('Multiplet', 'Undetermined') map to themselves.
mapping = dict([
    ('Multiplet', 'Multiplet'),
    ('SampleTag17_flex', 'WT-DMSO'),
    ('SampleTag18_flex', '3xTg-DMSO'),
    ('SampleTag19_flex', 'WT-SCDi'),
    ('SampleTag20_flex', '3xTg-SCDi'),
    ('Undetermined', 'Undetermined'),
])

In [None]:
# Read the single-cell count data (AnnData stored as .h5ad) from disk.
COUNTS_H5AD = "data/fede_count.h5ad"
adata = anndata.read_h5ad(COUNTS_H5AD)

In [None]:
# Quality control, applied as two chained project helpers (brought in by the
# star imports at the top of the notebook). Judging by their names they drop
# cells with a high mitochondrial fraction (threshold 0.6) and then filter
# cells by gene counts -- TODO confirm against the helper implementations.
adata = filter_cells_by_gene_counts(rm_high_mt(adata, threshold=0.6))

In [None]:
# Disabled step (kept for reference): would set the expression of every gene
# whose name starts with 'mt-' to zero before normalisation.
#mt_genes = adata.var_names.str.startswith('mt-')
#adata.X[:, mt_genes] = 0

In [None]:
# Library-size normalisation only: every cell is rescaled so that its counts
# sum to TARGET_SUM (i.e. values become per-cell fractions).
TARGET_SUM = 1
sc.pp.normalize_total(adata, target_sum=TARGET_SUM)
# The log-transform and scaling steps below are currently disabled, so all
# downstream analysis runs on non-logarithmised values.
# NOTE(review): sc.pp.highly_variable_genes (default flavor) and
# sc.tl.rank_genes_groups are documented as expecting logarithmised data --
# confirm that skipping log1p is intentional.
#sc.pp.log1p(adata)
#sc.pp.scale(adata, max_value=2)

In [None]:
# Read the cell-type annotation table (the first 4 lines of the CSV are
# skipped) and keep only the 'class_name' column, keyed by 'cell_id'.
anno_df = (
    pd.read_csv("data/fede_mapping.csv", skiprows=4)
    .set_index('cell_id')
    .loc[:, ['class_name']]
)
# Cast the key to int64 so it can be joined with the expression table later.
anno_df.index = anno_df.index.astype('int64')

In [None]:
# Build one cell x gene DataFrame carrying two metadata columns:
#   'Sample_Tag' -> experimental condition (translated through `mapping`)
#   'class_name' -> annotated cell class (from `anno_df`)
# All three tables are keyed by the cell identifier cast to int64.

# Condition label per cell.
tag_labels = adata.obs['Sample_Tag'].map(mapping)
tag_labels.index = tag_labels.index.astype('int64')
sample_tags = tag_labels.to_frame()

# Expression matrix, densified when the backing store offers `toarray`.
if hasattr(adata.X, 'toarray'):
    dense_expr = adata.X.toarray()
else:
    dense_expr = adata.X
sc_df = pd.DataFrame(
    dense_expr,
    index=adata.obs_names.astype('int64'),
    columns=adata.var_names,
)

# Left-join both metadata tables onto the expression frame.
sc_df = sc_df.join(sample_tags).join(anno_df)

In [None]:
# Discard cells tagged 'Multiplet' or 'Undetermined'. Cells whose tag is
# missing (NaN) are not matched by `isin` and therefore stay in the frame.
unassigned_tags = ['Multiplet', 'Undetermined']
sc_df = sc_df.loc[~sc_df['Sample_Tag'].isin(unassigned_tags)]

In [None]:
# Optional filter (disabled): uncomment to restrict every downstream step to
# cells whose 'class_name' equals "30 Astro-Epen".
#keep only a specific cell type
#sc_df = sc_df[sc_df["class_name"] == "30 Astro-Epen"]

In [None]:
# Separate the numeric expression matrix from the two metadata columns.
meta_cols = ['Sample_Tag', 'class_name']
X = sc_df.drop(columns=meta_cols).values      # cells x genes
cell_pheno = sc_df['Sample_Tag'].values       # condition label per cell
cell_type = sc_df['class_name'].values        # annotated class per cell

In [None]:
# Linear dimensionality reduction: keep the first 10 principal components.
# The seed is fixed so that randomised solvers give reproducible scores.
pca_kwargs = dict(n_components=10, random_state=42)
pca = PCA(**pca_kwargs)
pca_result = pca.fit_transform(X)

In [None]:
# Non-linear 2-D embedding of the PCA scores with UMAP (fixed seed).
# The notebook imports the package as `import umap`, so the estimator has to
# be referenced as `umap.UMAP`; the bare name `UMAP` is only defined if one of
# the project star imports happens to re-export it.
reducer = umap.UMAP(n_neighbors=100, n_components=2, random_state=42)
umap_result = reducer.fit_transform(pca_result)

In [None]:
# K-means with 5 clusters and a fixed seed. Note that the clustering input is
# the 2-D UMAP embedding, not the PCA scores or the expression matrix.
N_CLUSTERS = 5
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42).fit(umap_result)
kmeans_result = kmeans.labels_

In [None]:
# Project helper (from the star imports) called with the UMAP embedding and
# the per-cell class labels; presumably draws the embedding coloured by cell
# class -- TODO confirm against the helper.
visualize_umap(umap_result, cell_type)

In [None]:
# Project helper called with the UMAP embedding and the k-means labels;
# presumably plots the embedding coloured by cluster -- TODO confirm.
get_clustering(umap_result, kmeans_result)

In [None]:
# Project helper called with the k-means labels and the per-cell class labels.
# The result is indexed by an integer in the next cell; presumably it holds
# the cell-class make-up of each cluster -- TODO confirm the return type.
cluster_composition = get_cluster_composition(kmeans_result, cell_type)

In [None]:
# Pie chart for the entry at index 1 of `cluster_composition` only.
# NOTE(review): index 1 is hard-coded and `min_pct=5` presumably hides or
# groups slices below 5 % -- confirm both against the helper.
get_pie_chart(cluster_composition[1], min_pct=5)

In [None]:
# Expression-only frames (metadata columns removed) for the two DMSO groups.
# NOTE: the next cell assigns the same two variable names, so on a top-to-
# bottom run these DMSO selections are replaced before they are used.
_dmso_meta_cols = ['class_name', 'Sample_Tag']
wt_dmso = sc_df.loc[sc_df['Sample_Tag'] == 'WT-DMSO'].drop(columns=_dmso_meta_cols)
tg_dmso = sc_df.loc[sc_df['Sample_Tag'] == '3xTg-DMSO'].drop(columns=_dmso_meta_cols)

In [None]:
# Expression-only frames (metadata columns removed) for the two SCDi groups.
# NOTE(review): the results are stored under the names `wt_dmso` / `tg_dmso`,
# overwriting the DMSO selections from the previous cell. Every later cell
# that reads those names therefore sees SCDi data on a top-to-bottom run --
# confirm which comparison is intended before trusting plot titles/filenames.
_scdi_meta_cols = ['class_name', 'Sample_Tag']
wt_dmso = sc_df.loc[sc_df['Sample_Tag'] == 'WT-SCDi'].drop(columns=_scdi_meta_cols)
tg_dmso = sc_df.loc[sc_df['Sample_Tag'] == '3xTg-SCDi'].drop(columns=_scdi_meta_cols)

In [None]:
# Differential expression between the two groups currently held in
# `wt_dmso` (labelled 'wt') and `tg_dmso` (labelled 'tg').
# NOTE(review): those frames contain whatever the most recently executed
# selection cell assigned (DMSO or SCDi) -- check before interpreting results.
# This cell rebinds the notebook-level name `adata`.

# Stack both groups; the label list follows the same row order as the concat.
combined_data = pd.concat([wt_dmso, tg_dmso], axis=0)
labels = ['wt'] * wt_dmso.shape[0] + ['tg'] * tg_dmso.shape[0]

# Wrap in an AnnData object and attach the group label.
adata = sc.AnnData(combined_data)
adata.obs['condition'] = labels

# Flag the 3000 most variable genes.
sc.pp.highly_variable_genes(adata, n_top_genes=3000)

# Keep the full gene set in `.raw`, then restrict `adata` to the highly
# variable genes. `.copy()` materialises the subset so that results can be
# written into `adata.uns` without operating on a view.
adata.raw = adata
adata = adata[:, adata.var['highly_variable']].copy()

# t-test of each group against the rest. `use_raw=False` is required here:
# when `.raw` is set, rank_genes_groups defaults to testing on `.raw` (all
# genes), which would silently ignore the highly-variable-gene subset above
# and return gene names that are not in `adata.var_names`.
sc.tl.rank_genes_groups(adata, groupby='condition', method='t-test', use_raw=False)

# Pull out the per-gene statistics for the 'tg' group (tg compared with wt).
degs = adata.uns['rank_genes_groups']
pvals = degs['pvals']['tg']
logfoldchanges = degs['logfoldchanges']['tg']
genes = degs['names']['tg']

# Benjamini-Hochberg correction across the tested genes.
adjusted_pvals = multipletests(pvals, method='fdr_bh')[1]

# One row per tested gene; consumed by the volcano-plot cell.
volcano_data = pd.DataFrame({
    'gene': genes,
    'log2_fold_change': logfoldchanges,
    'p_value': pvals,
    'adjusted_p_value': adjusted_pvals
})

In [None]:
# Volcano plot of the differential-expression table in `volcano_data`.
# NOTE(review): the title and output filename say "wt_dmso vs tg_dmso", but
# the plotted groups are whatever `wt_dmso` / `tg_dmso` held when the DE cell
# ran (the SCDi selection cell overwrites them) -- confirm they match.

# y axis: -log10 of the adjusted p-value. Adjusted p-values can underflow to
# exactly 0, which would give +inf and break both the axis scaling and the
# annotation placement below, so clamp to the smallest positive float first.
smallest_positive = np.finfo(float).tiny
volcano_data['-log10_p_value'] = -np.log10(
    volcano_data['adjusted_p_value'].clip(lower=smallest_positive)
)

plt.figure(figsize=(10, 8))

# Background layer: every gene in semi-transparent black.
plt.scatter(volcano_data['log2_fold_change'], volcano_data['-log10_p_value'], c='black', alpha=0.5, s=10)

# Significance and direction masks (also reused by later cells via `significant`).
significant = volcano_data['adjusted_p_value'] < 0.01
upregulated = significant & (volcano_data['log2_fold_change'] > 0)
downregulated = significant & (volcano_data['log2_fold_change'] < 0)

# Overlay significant genes: red = positive fold change, blue = negative.
plt.scatter(volcano_data.loc[upregulated, 'log2_fold_change'],
            volcano_data.loc[upregulated, '-log10_p_value'],
            c='red', s=10)

plt.scatter(volcano_data.loc[downregulated, 'log2_fold_change'],
            volcano_data.loc[downregulated, '-log10_p_value'],
            c='blue', s=10)

# Number of significant genes in each direction.
num_upregulated = upregulated.sum()
num_downregulated = downregulated.sum()

# DEG counts near the top of the plot. The x positions (-13 / 13) are in data
# units and assume the fold-change axis extends that far.
y_top = volcano_data['-log10_p_value'].max()
plt.text(-13, y_top - 1, f'{num_downregulated} DEGs', color='blue', fontsize=12)
plt.text(13, y_top - 1, f'{num_upregulated} DEGs', color='red', fontsize=12, horizontalalignment='right')

# Label the 20 most significant genes at their plotted coordinates.
top_genes = volcano_data[significant].nlargest(20, '-log10_p_value')
for x_pos, y_pos, gene_name in zip(top_genes['log2_fold_change'],
                                   top_genes['-log10_p_value'],
                                   top_genes['gene']):
    plt.text(x_pos, y_pos, gene_name, fontsize=8)

plt.xlabel('Log2 Fold Change')
plt.ylabel('-Log10 Adjusted P-value')
plt.title('Volcano plot (wt_dmso vs tg_dmso DEGs)')
plt.savefig('volcano_plot_wt_tg_dmso.png')
plt.show()

In [None]:
# Names of significant genes with a negative log2 fold change, selected with a
# single row/column `.loc` lookup.
downregulated_genes = volcano_data.loc[
    significant & (volcano_data['log2_fold_change'] < 0), 'gene'
].values

In [None]:
# Names of significant genes with a positive log2 fold change, selected with a
# single row/column `.loc` lookup.
upregulated_genes = volcano_data.loc[
    significant & (volcano_data['log2_fold_change'] > 0), 'gene'
].values

In [None]:
# Display the upregulated gene names as one comma-separated string.
', '.join(upregulated_genes)

In [None]:
# Display the downregulated gene names as one comma-separated string.
', '.join(downregulated_genes)

In [None]:
# Heatmap styling demo on synthetic data: uniform random "expression" values
# for 1000 cells x 250 genes, with each cell randomly assigned to one of two
# condition labels. Nothing here uses the real dataset.
# NOTE: this cell rebinds the notebook-level names `adata`, `genes` and
# `top_genes`.
np.random.seed(0)
n_genes = 250
n_cells = 1000
genes = [f'Gene{idx}' for idx in range(1, n_genes + 1)]
conditions = ['WT-DMSO', '3xTg-DMSO']

# Draw order is part of the seeded result: expression first, labels second.
data = np.random.rand(n_cells, n_genes)
cell_conditions = np.random.choice(conditions, n_cells)

# Wrap the synthetic matrix in an AnnData object.
adata = sc.AnnData(
    X=data,
    obs={'condition': cell_conditions},
    var=pd.DataFrame(index=genes),
)

# "Top genes" are simply all genes in this demo.
top_genes = pd.DataFrame({'gene': genes})
markers = top_genes['gene'].values
adata_subset = adata[:, markers]

# Mean expression per condition, then genes as rows / conditions as columns.
subset_df = adata_subset.to_df()
mean_expression = subset_df.groupby(adata_subset.obs['condition']).mean()
heatmap_data = mean_expression.T

# Tick-label colour per condition (both black), in heatmap column order.
condition_colors = dict.fromkeys(conditions, 'black')
condition_palette = [condition_colors[name] for name in heatmap_data.columns]

# Clustered heatmap: genes are clustered, the condition columns are not.
sns.set(context='notebook', font_scale=1.2)
clustermap_kwargs = dict(
    cmap='coolwarm',
    linewidths=.5,
    figsize=(10, 8),
    row_cluster=True,
    col_cluster=False,
)
cg = sns.clustermap(heatmap_data, **clustermap_kwargs)
heatmap_ax = cg.ax_heatmap

# The styling calls below are kept in this exact order: moving the ticks to
# the top changes which label objects get_xticklabels() returns.
for tick_label, tick_color in zip(heatmap_ax.get_xticklabels(), condition_palette):
    tick_label.set_color(tick_color)

# No gene names on the y axis.
heatmap_ax.set_yticklabels([])

# Colour bar placed to the right of the heatmap (figure coordinates).
cg.cax.set_position([1, .2, .03, .45])

# Condition labels go on top of the heatmap.
heatmap_ax.xaxis.set_label_position('top')
heatmap_ax.xaxis.tick_top()

# Horizontally centre the condition labels.
for tick in heatmap_ax.get_xticklabels():
    tick.set_ha('center')

# Drop the axis title and keep the condition labels unrotated.
heatmap_ax.set_xlabel('')
heatmap_ax.set_xticklabels(heatmap_ax.get_xticklabels(), rotation=0)

#plt.ylabel('')

plt.show()


In [None]:
# Heatmap of mean expression per group for the significantly differentially
# expressed genes between `wt_dmso` ('wt') and `tg_dmso` ('tg').
# NOTE(review): those frames contain whatever the most recently executed
# selection cell assigned (DMSO or SCDi) -- check before interpreting results.
# This cell rebinds the notebook-level names `adata`, `genes` and
# `volcano_data`.

# Stack both groups; the label list follows the same row order as the concat.
combined_data = pd.concat([wt_dmso, tg_dmso], axis=0)
labels = ['wt'] * wt_dmso.shape[0] + ['tg'] * tg_dmso.shape[0]

# Wrap in an AnnData object and attach the group label.
adata = sc.AnnData(combined_data)
adata.obs['condition'] = labels

# Flag the 500 most variable genes.
sc.pp.highly_variable_genes(adata, n_top_genes=500)

# Keep the full gene set in `.raw`, then restrict `adata` to the highly
# variable genes. `.copy()` materialises the subset so that results can be
# written into `adata.uns` without operating on a view.
adata.raw = adata
adata = adata[:, adata.var['highly_variable']].copy()

# t-test of each group against the rest. `use_raw=False` is required here:
# when `.raw` is set, rank_genes_groups defaults to testing on `.raw` (all
# genes), which would silently ignore the highly-variable-gene subset above.
sc.tl.rank_genes_groups(adata, groupby='condition', method='t-test', use_raw=False)

# Pull out the per-gene statistics for the 'tg' group (tg compared with wt).
degs = adata.uns['rank_genes_groups']
pvals = degs['pvals']['tg']
logfoldchanges = degs['logfoldchanges']['tg']
genes = degs['names']['tg']

# Benjamini-Hochberg correction across the tested genes.
adjusted_pvals = multipletests(pvals, method='fdr_bh')[1]

volcano_data = pd.DataFrame({
    'gene': genes,
    'log2_fold_change': logfoldchanges,
    'p_value': pvals,
    'adjusted_p_value': adjusted_pvals
})

# Genes passing the 1 % FDR threshold.
most_deg_genes = volcano_data.loc[volcano_data['adjusted_p_value'] < 0.01]

# Defensive filter: only keep names that can actually be used to index `adata`.
valid_top_genes = [gene for gene in most_deg_genes['gene'].values if gene in adata.var_names]

# Fail with a clear message instead of a cryptic clustering error when
# nothing passes the threshold.
if not valid_top_genes:
    raise ValueError('No genes with adjusted p-value < 0.01; nothing to plot in the heatmap.')

adata_subset = adata[:, valid_top_genes]

# Mean expression per group, then genes as rows / groups as columns.
mean_expression = adata_subset.to_df().groupby(adata_subset.obs['condition']).mean()
heatmap_data = mean_expression.T

# Tick-label colour per group (both black), in heatmap column order.
condition_colors = {
    'wt': 'black',
    'tg': 'black',
}
condition_palette = [condition_colors[condition] for condition in heatmap_data.columns]

# Clustered heatmap: genes are clustered (this needs at least two genes),
# the group columns are not.
sns.set(context='notebook', font_scale=1.2)
cg = sns.clustermap(
    heatmap_data,
    cmap='coolwarm',
    linewidths=.5,
    figsize=(10, 8),
    row_cluster=heatmap_data.shape[0] > 1,
    col_cluster=False
)

# The styling calls below are kept in this exact order: moving the ticks to
# the top changes which label objects get_xticklabels() returns.
for tick_label, tick_color in zip(cg.ax_heatmap.get_xticklabels(), condition_palette):
    tick_label.set_color(tick_color)

# No gene names on the y axis.
cg.ax_heatmap.set_yticklabels([])

# Colour bar placed to the right of the heatmap (figure coordinates).
cg.cax.set_position([1, .2, .03, .45])

# Group labels go on top of the heatmap.
cg.ax_heatmap.xaxis.set_label_position('top')
cg.ax_heatmap.xaxis.tick_top()

# Horizontally centre the group labels.
for tick in cg.ax_heatmap.get_xticklabels():
    tick.set_ha('center')

# Drop the axis title and keep the group labels unrotated.
cg.ax_heatmap.set_xlabel('')
cg.ax_heatmap.set_xticklabels(cg.ax_heatmap.get_xticklabels(), rotation=0)

plt.show()
