# Correlation Analysis

In [10]:
import numpy as np
import pandas as pd
import anndata as ad
from scipy.sparse import csr_matrix
from scipy.sparse import issparse
import scanpy as sc
import scipy.stats
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import seaborn as sns
print(ad.__version__)

0.11.4


In [4]:
# Load young and old data
# cell_type = "CD8-positive, alpha-beta cytotoxic T cell"
cell_type = "erythrocyte"
young_path = "subsets/{}_young_donors.h5ad".format(cell_type)
old_path = "subsets/{}_old_donors.h5ad".format(cell_type)
adata_young = ad.read_h5ad(young_path)
adata_old = ad.read_h5ad(old_path)
print(adata_young)
print(adata_old)

AnnData object with n_obs × n_vars = 26 × 132
    obs: 'reference_genome', 'gene_annotation_version', 'alignment_software', 'intronic_reads_counted', 'library_id', 'assay_ontology_term_id', 'sequenced_fragment', 'cell_number_loaded', 'institute', 'is_primary_data', 'cell_type_ontology_term_id', 'author_cell_type', 'sample_id', 'sample_preservation_method', 'tissue_ontology_term_id', 'development_stage_ontology_term_id', 'sample_collection_method', 'donor_BMI_at_collection', 'tissue_type', 'suspension_derivation_process', 'suspension_enriched_cell_types', 'cell_viability_percentage', 'suspension_uuid', 'suspension_type', 'self_reported_ethnicity_ontology_term_id', 'donor_living_at_sample_collection', 'organism_ontology_term_id', 'disease_ontology_term_id', 'sex_ontology_term_id', 'Country', 'nCount_RNA', 'nFeature_RNA', 'TCR_VDJdb', 'TCRa_V_gene', 'TCRa_D_gene', 'TCRa_J_gene', 'TCRa_C_gene', 'TCRb_V_gene', 'TCRb_D_gene', 'TCRb_J_gene', 'TCRb_C_gene', 'TCR_Clonality', 'TCR_Clone_ID', 'BC

In [5]:
# Check that every gene is expressed at least once in each dataset in .X
young_check = np.sum(adata_young.X != 0, axis=0)
old_check = np.sum(adata_old.X != 0, axis=0)

# Check that every number in young_check is greater than 0
if np.all(young_check > 0):
    print("All genes expressed at least once in young dataset")

# Check that every number in old_check is greater than 0
if np.all(old_check > 0):
    print("All genes expressed at least once in old dataset")

All genes expressed at least once in young dataset
All genes expressed at least once in old dataset


In [14]:
# Get the expression matrix (cells x genes)
if issparse(adata_young.X):
    expr_matrix = adata_young.X.toarray()  # Convert sparse to dense if needed
else:
    expr_matrix = adata_young.X

# Transpose to genes x cells for correlation calculation
expr_matrix = expr_matrix.T

# Calculate correlation matrix (Pearson)
corr_matrix = np.corrcoef(expr_matrix)

# Convert to DataFrame for easier handling
gene_names = adata_young.var_names
corr_df_young = pd.DataFrame(corr_matrix, index=gene_names, columns=gene_names)

In [15]:
# # Get the expression matrix (cells x genes)
# if issparse(adata_young.X):
#     expr_matrix = adata_young.X.toarray()  # Convert sparse to dense if needed
# else:
#     expr_matrix = adata_young.X

# # Transpose to genes x cells for correlation calculation
# expr_matrix = expr_matrix.T

# # Get gene names
# gene_names = adata_young.var_names
# n_genes = len(gene_names)

# # Initialize correlation and p-value matrices
# corr_matrix = np.zeros((n_genes, n_genes))
# p_matrix = np.zeros((n_genes, n_genes))

# # Calculate correlation and p-value for each gene pair
# for i in range(n_genes):
#     for j in range(i, n_genes):  # Take advantage of symmetry
#         corr, p_val = pearsonr(expr_matrix[i], expr_matrix[j])
#         corr_matrix[i, j] = corr
#         corr_matrix[j, i] = corr  # Symmetric
#         p_matrix[i, j] = p_val
#         p_matrix[j, i] = p_val  # Symmetric

# # Convert to DataFrames
# corr_df_young = pd.DataFrame(corr_matrix, index=gene_names, columns=gene_names)
# p_value_df_young = pd.DataFrame(p_matrix, index=gene_names, columns=gene_names)

# # Print the p-value DataFrame
# print(corr_df_young)
# print(p_value_df_young)

In [16]:
# Print the diagonal of the correlation matrix and check for NaN values
print("Diagonal of the correlation matrix (should be 1):")
print(corr_df_young.values.diagonal())
print("Number of NaN values in the correlation matrix:")
print(np.isnan(corr_df_young.values).sum())

Diagonal of the correlation matrix (should be 1):
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
Number of NaN values in the correlation matrix:
0


In [8]:
# Print the first few rows of the correlation matrix
print("First few rows of the correlation matrix:")
print(corr_df_young.head())

# Plot the correlation matrix
# plt.figure(figsize=(10, 8))
# sns.heatmap(corr_df_young, cmap='coolwarm', center=0, square=True, cbar_kws={"shrink": .8})
# plt.title("Correlation Matrix of Gene Expression")
# plt.xlabel("Genes")
# plt.ylabel("Genes")
# plt.show()

First few rows of the correlation matrix:
                 ENSG00000116251  ENSG00000142676  ENSG00000142669  \
ENSG00000116251         1.000000         0.721429        -0.248207   
ENSG00000142676         0.721429         1.000000        -0.209977   
ENSG00000142669        -0.248207        -0.209977         1.000000   
ENSG00000142937         0.768505         0.909732        -0.452678   
ENSG00000177606         0.242342         0.367620        -0.179291   

                 ENSG00000142937  ENSG00000177606  ENSG00000122406  \
ENSG00000116251         0.768505         0.242342         0.668391   
ENSG00000142676         0.909732         0.367620         0.874849   
ENSG00000142669        -0.452678        -0.179291        -0.154893   
ENSG00000142937         1.000000         0.388147         0.845123   
ENSG00000177606         0.388147         1.000000         0.395468   

                 ENSG00000163220  ENSG00000197956  ENSG00000196154  \
ENSG00000116251        -0.050978         0.065

In [9]:
# Find top correlated genes for a specific gene of interest
target_gene = "ENSG00000166710"
if target_gene in corr_df_young.columns:
    target_correlations = corr_df_young[target_gene].sort_values(ascending=False)
    print(f"Top genes correlated with {target_gene}:")
    print(target_correlations.head(10))
else:
    print(f"{target_gene} not found in the dataset")

Top genes correlated with ENSG00000166710:
ENSG00000166710    1.000000
ENSG00000204525    0.879935
ENSG00000234745    0.860340
ENSG00000206503    0.831529
ENSG00000108518    0.758690
ENSG00000177954    0.732153
ENSG00000118181    0.701874
ENSG00000108654    0.700075
ENSG00000198242    0.696914
ENSG00000184009    0.692602
Name: ENSG00000166710, dtype: float64


In [74]:
# Get the expression matrix for old donors
if issparse(adata_old.X):
    expr_matrix_old = adata_old.X.toarray()  # Convert sparse to dense if needed
else:
    expr_matrix_old = adata_old.X

# Transpose to genes x cells for correlation calculation
expr_matrix_old = expr_matrix_old.T

# Calculate correlation matrix (Pearson) for old donors
corr_matrix_old = np.corrcoef(expr_matrix_old)

# Convert to DataFrame for easier handling
corr_df_young_old = pd.DataFrame(corr_matrix_old, index=gene_names, columns=gene_names)

In [75]:
# Print the diagonal of the correlation matrix and check for NaN values
print("Diagonal of the correlation matrix (should be 1):")
print(corr_df_young_old.values.diagonal())
print("Number of NaN values in the correlation matrix:")
print(np.isnan(corr_df_young_old.values).sum())

Diagonal of the correlation matrix (should be 1):
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
Number of NaN values in the correlation matrix:
0


In [76]:
# Find top correlated genes for a specific gene of interest in old donors
if target_gene in corr_df_young_old.columns:
    target_correlations_old = corr_df_young_old[target_gene].sort_values(ascending=False)
    print(f"Top genes correlated with {target_gene} in old donors:")
    print(target_correlations_old.head(10))
else:
    print(f"{target_gene} not found in the old donor dataset")

Top genes correlated with ENSG00000166710 in old donors:
ENSG00000166710    1.000000
ENSG00000204525    0.794538
ENSG00000105374    0.734490
ENSG00000234745    0.728718
ENSG00000271503    0.712339
ENSG00000206503    0.690381
ENSG00000204592    0.667698
ENSG00000115523    0.614709
ENSG00000108518    0.574090
ENSG00000127528    0.557906
Name: ENSG00000166710, dtype: float64


In [77]:
# Find the differences in correlation between young and old donors
diff_corr = corr_df_young - corr_df_young_old
print("Difference in correlation between young and old donors:")
print(diff_corr.head())

# Assert that the diagonal of the difference correlation matrix is 0
assert np.all(diff_corr.values.diagonal() <= 1.0e-10), "Diagonal of the difference correlation matrix is not zero"

# Print the dimensions of the correlation matrices
print("Dimensions of the correlation matrix for young donors:", corr_df_young.shape)
print("Dimensions of the correlation matrix for old donors:", corr_df_young_old.shape)

Difference in correlation between young and old donors:
                 ENSG00000116251  ENSG00000142676  ENSG00000142669  \
ENSG00000116251         0.000000         0.028675     2.425776e-01   
ENSG00000142676         0.028675         0.000000     3.019102e-01   
ENSG00000142669         0.242578         0.301910     1.110223e-16   
ENSG00000142937         0.066972         0.041282     1.739678e-01   
ENSG00000177606         0.168205         0.355668    -1.426803e-01   

                 ENSG00000142937  ENSG00000177606  ENSG00000122406  \
ENSG00000116251         0.066972     1.682046e-01        -0.019041   
ENSG00000142676         0.041282     3.556684e-01        -0.008988   
ENSG00000142669         0.173968    -1.426803e-01         0.442757   
ENSG00000142937         0.000000     2.872648e-01        -0.112311   
ENSG00000177606         0.287265     2.220446e-16         0.283976   

                 ENSG00000163220  ENSG00000197956  ENSG00000196154  \
ENSG00000116251        -0.046119

In [78]:
# Find the top 10 genes pairs with the largest absolute difference in correlation
top_diff_genes = diff_corr.abs().unstack().sort_values(ascending=False)

In [79]:
print("Top 10 gene pairs with the largest absolute difference in correlation:")
print(top_diff_genes.head(10))

Top 10 gene pairs with the largest absolute difference in correlation:
ENSG00000198712  ENSG00000197756    1.151111
ENSG00000197756  ENSG00000198712    1.151111
ENSG00000147403  ENSG00000184009    1.142995
ENSG00000184009  ENSG00000147403    1.142995
ENSG00000161016  ENSG00000034510    1.141795
ENSG00000034510  ENSG00000161016    1.141795
ENSG00000184009  ENSG00000122406    1.133046
ENSG00000122406  ENSG00000184009    1.133046
ENSG00000184009  ENSG00000161016    1.119225
ENSG00000161016  ENSG00000184009    1.119225
dtype: float64


In [80]:
# We define a significant difference as a difference greater than 0.3
significant_diff = 0.3

# We want to create a network of genes with a significant difference in correlation between young and old donors
# So we want to make a list of all the genes and for every gene we want a list with all the genes that are significantly different
# in correlation with that gene so for example if gene A has a significant difference in correlation with gene B and gene C
# we want to add B and C to the list of gene A [A: [B, C]]
# We will use a dictionary to store the results
significant_diff_dict = {}
for gene in corr_df_young.columns:
    # Get the differences in correlation for the current gene
    diff_for_gene = diff_corr[gene]
    
    # Find genes with significant differences
    significant_genes = diff_for_gene[diff_for_gene.abs() > significant_diff].index.tolist()
    
    # Store the results in the dictionary
    significant_diff_dict[gene] = significant_genes

In [81]:
# Plot the genes with the most significant differences
# We will plot the top 10 genes with the most significant differences in the dictionary
top_genes = sorted(significant_diff_dict.items(), key=lambda x: len(x[1]), reverse=True)


In [83]:
print("Top 10 gene pairs with the largest absolute difference in correlation:")
top_genes_replace_list_with_length = {}
for gene, diff_genes in top_genes[:10]:
    print(f"{gene}: {len(diff_genes)}")
    top_genes_replace_list_with_length[gene] = len(diff_genes)
print(top_genes_replace_list_with_length)

# Plot the top 10 genes with the least significant differences in the dictionary
top_genes_replace_list_with_length = {}
for gene, diff_genes in top_genes[-10:]:
    print(f"{gene}: {len(diff_genes)}")
    top_genes_replace_list_with_length[gene] = len(diff_genes)
print(top_genes_replace_list_with_length)

Top 10 gene pairs with the largest absolute difference in correlation:
ENSG00000184009: 103
ENSG00000034510: 99
ENSG00000150991: 98
ENSG00000198712: 95
ENSG00000198804: 94
ENSG00000132475: 93
ENSG00000146278: 92
ENSG00000197728: 92
ENSG00000198840: 92
ENSG00000198886: 92
{'ENSG00000184009': 103, 'ENSG00000034510': 99, 'ENSG00000150991': 98, 'ENSG00000198712': 95, 'ENSG00000198804': 94, 'ENSG00000132475': 93, 'ENSG00000146278': 92, 'ENSG00000197728': 92, 'ENSG00000198840': 92, 'ENSG00000198886': 92}
ENSG00000100906: 22
ENSG00000087086: 22
ENSG00000251562: 18
ENSG00000090382: 17
ENSG00000130066: 17
ENSG00000173812: 16
ENSG00000204592: 15
ENSG00000163220: 14
ENSG00000206172: 11
ENSG00000188536: 10
{'ENSG00000100906': 22, 'ENSG00000087086': 22, 'ENSG00000251562': 18, 'ENSG00000090382': 17, 'ENSG00000130066': 17, 'ENSG00000173812': 16, 'ENSG00000204592': 15, 'ENSG00000163220': 14, 'ENSG00000206172': 11, 'ENSG00000188536': 10}
