# Volcano Plot

In [22]:
import numpy as np
import pandas as pd
import anndata as ad
from scipy.sparse import csr_matrix
from scipy.sparse import issparse
import scanpy as sc
import scipy.stats
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import seaborn as sns
print(ad.__version__)

0.11.4


In [23]:
# There are 9 excel files in the folder results
# The excel files have two collumns: 'Gene' and 'Number of Significant Differences'

# The excel files are named: "results/{}_significant_differences.xlsx".format(cell_type)
cell_types = [
    # "CD8-positive, alpha-beta T cell",
    # "CD8-positive, alpha-beta memory T cell",
    "CD4-positive, alpha-beta T cell",
    # "central memory CD4-positive, alpha-beta T cell",
    "effector memory CD4-positive, alpha-beta T cell",
    "gamma-delta T cell",
    "regulatory T cell",
    # "double negative T regulatory cell",
    # "innate lymphoid cell"
]
    

In [24]:
# Load all the excel files of the same cell type

excel_files_top_diff_genes = {}

for cell_type in cell_types:
    file_path = f"results/{cell_type}_top_diff_genes.xlsx"
    excel_files_top_diff_genes[cell_type] = pd.read_excel(file_path)


In [None]:
# Create a volcano plot for each cell type
# On the x-axis: is the difference between 'Correlation Young' and 'Correlation Old'
# On the y-axis: is the -log10 of the p-value

def z_score_to_p_value(z):
    """Convert a z-score to a p-value. for a two-tailed test."""
    return 2 * (1 - scipy.stats.norm.cdf(np.abs(z)))


def create_volcano_plot(df, cell_type):
    df['log_p_value'] = -np.log10(z_score_to_p_value(df['Z Diff Young-Old']))
    df['Significant'] = df['log_p_value'] > -np.log10(0.01)
    df['correlation_diff'] = df['Correlation Young'] - df['Correlation Old']
    
    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=df, x='correlation_diff', y='log_p_value', hue='Significant', alpha=0.7)
    
    plt.axhline(y=-np.log10(0.01), color='red', linestyle='--', label='p-value = 0.01')
    plt.axvline(x=0, color='grey', linestyle='--')
    
    plt.title(f'Volcano Plot for {cell_type}')
    plt.xlabel('Difference in Correlation (Young - Old)')
    plt.ylabel('-log10(p-value)')
    plt.legend()
    plt.grid()

    # add labels for the points that have a -log10(p-value) > 0.9 * max(log_p_value)
    max_log_p_value = df['log_p_value'].max()
    threshold = 0.9 * max_log_p_value
    for i, row in df.iterrows():
        if row['log_p_value'] > threshold and (i % 2 == 0):
            label = row['Gene1'] + '-' + row['Gene2']
            print(f"Labeling point: {label} at ({row['correlation_diff']}, {row['log_p_value']})")

            # If the point is left of the y-axis, adjust the label position
            if row['correlation_diff'] < 0:
                plt.text(row['correlation_diff'], row['log_p_value'], label, fontsize=8, ha='left', va='bottom')
            else:
                plt.text(row['correlation_diff'], row['log_p_value'], label, fontsize=8, ha='right', va='bottom')
    
    plt.savefig(f"results/volcano_plot_{cell_type}.png")
    plt.close()



In [30]:
for cell_type, df in excel_files_top_diff_genes.items():
    # Ensure the DataFrame has the expected columns
    if 'Correlation Young' in df.columns and 'Correlation Old' in df.columns:
        create_volcano_plot(df, cell_type)
    else:
        print(f"Skipping {cell_type} due to missing columns.")

Labeling point: ENSG00000125691-ENSG00000124614 at (-0.8629148271310099, 9.115057549373226)
Labeling point: ENSG00000272888-ENSG00000169564 at (-0.8610180938458138, 9.708645490021697)
Labeling point: ENSG00000009307-ENSG00000241343 at (-1.0332775288474583, 11.034508204992855)
Labeling point: ENSG00000054654-ENSG00000128340 at (-1.0256427131087253, 10.501396843135854)
Labeling point: ENSG00000166441-ENSG00000118816 at (1.0143106422223658, 9.997943110523664)
Labeling point: ENSG00000124942-ENSG00000117523 at (1.2014289993571166, 15.35252977886304)
Labeling point: ENSG00000122566-ENSG00000100201 at (-0.9261038806628832, 10.17315898144163)
