# Canonical Correlation Analysis (CCA)
Here I calculate the Canonical Correlation Coefficients and the canonical variables for the two datasets. 
I also plot the correlation coefficients and the canonical variables.

This analysis is based on the following article:
https://brainder.org/2019/12/27/redundancy-in-canonical-correlation-analysis/


In [1]:
# import libraries
import pathlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cross_decomposition import CCA
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [2]:
# Parameters
cell_type = "PBMC"
Shuffle = True

In [3]:
# set paths to data
morphology_data_path = pathlib.Path(
    f"../../data/{cell_type}_preprocessed_sc_norm_aggregated.parquet"
).resolve(strict=True)
nomic_data_path = pathlib.Path(
    f"../../2.Nomic_nELISA_Analysis/Data/clean/Plate2/nELISA_plate_430420_{cell_type}_clean.parquet"
).resolve(strict=True)

# output path
results_file_path = pathlib.Path(f"../results/{cell_type}_redundancy_analysis.csv")
results_file_path.parent.mkdir(parents=True, exist_ok=True)

# read data
morphology_data = pd.read_parquet(morphology_data_path)
nomic_data = pd.read_parquet(nomic_data_path)

In [4]:
# get the columns that contain metadata
morphology_metadata = morphology_data[
    morphology_data.columns[morphology_data.columns.str.contains("Metadata")]
]
morphology_data = morphology_data.drop(morphology_metadata.columns, axis=1)

nomic_data_values = nomic_data[
    nomic_data.columns[nomic_data.columns.str.contains("[NSU]", regex=True)]
]
nomic_metadata = nomic_data.drop(nomic_data_values.columns, axis=1)

#### Data Needs to be in standard scalar format for CCA

In [5]:
# standardize the data for nomic standard scalar
scaler = StandardScaler()
nomic_data_values = scaler.fit_transform(nomic_data_values)
nomic_data_values = pd.DataFrame(
    nomic_data_values,
    columns=nomic_data.columns[nomic_data.columns.str.contains("[NSU]", regex=True)],
)

In [6]:
# check the scale of the data
nomic_data_values.describe()

Unnamed: 0,Activin A [NSU],AITRL (GITR Ligand) [NSU],Amphiregulin [NSU],Amyloid beta [NSU],APRIL [NSU],BAFF [NSU],BCMA (TNFRSF17) [NSU],BDNF [NSU],BMP2 [NSU],BMP3 [NSU],...,TWEAK [NSU],uPA [NSU],VCAM-1 [NSU],VEGF Receptor 2 (Flk-1) [NSU],VEGF-A (165) [NSU],VEGF-C [NSU],VEGF-D [NSU],VEGFR-1 [NSU],WISP-1 (CCN4) [NSU],XCL1 (Lymphotactin) [NSU]
count,154.0,154.0,154.0,154.0,154.0,154.0,154.0,154.0,154.0,154.0,...,154.0,154.0,154.0,154.0,154.0,154.0,154.0,154.0,154.0,154.0
mean,-1.038131e-16,2.883696e-17,3.784851e-16,-1.787892e-16,-3.388343e-16,5.767392e-18,-4.6139140000000003e-17,1.557196e-16,-5.306001e-16,0.0,...,6.113436e-16,4.6139140000000003e-17,-2.3069570000000002e-17,2.3069570000000002e-17,2.883696e-18,-3.258577e-16,-2.306957e-16,6.920871e-17,0.0,3.979501e-16
std,1.003263,1.003263,1.003263,1.003263,1.003263,1.003263,1.003263,1.003263,1.003263,1.003263,...,1.003263,1.003263,1.003263,1.003263,1.003263,1.003263,1.003263,1.003263,1.003263,1.003263
min,-0.7753361,-2.40466,-1.626729,-2.623379,-2.502733,-2.063749,-3.267475,-2.126498,-2.74497,-2.045397,...,-2.444083,-2.35935,-2.39054,-2.812137,-2.100477,-3.24126,-2.075041,-1.405828,-2.423406,-2.034209
25%,-0.5852376,-0.797728,-0.7345144,-0.6888167,-0.7773053,-0.6836681,-0.7810839,-0.6254525,-0.5719509,-0.664659,...,-0.7489428,-0.6543184,-0.7170485,-0.5184526,-0.6810349,-0.7408295,-0.6358911,-0.6591589,-0.681727,-0.6995932
50%,-0.5077906,-0.06492239,-0.2339748,-0.1011396,-0.08989044,0.0205661,0.09340619,-0.08593829,-0.1425891,-0.229429,...,-0.03713431,-0.0831994,0.01192415,-0.07199159,-0.07061398,0.0717067,-0.1133556,-0.3474098,0.033378,-0.2057293
75%,0.2378032,0.6132403,0.8189925,0.5894068,0.638516,0.6156226,0.763854,0.495348,0.4344105,0.502222,...,0.6029268,0.6905801,0.6410902,0.5976565,0.5186115,0.7108837,0.5324321,0.425747,0.697235,0.631363
max,2.910307,2.647989,2.247566,3.007529,2.991157,3.399755,2.458591,3.847185,3.326857,3.710686,...,2.7143,3.273316,3.54974,3.46328,2.442682,2.662569,2.864553,3.071173,2.991102,3.797019


In [7]:
# shuffle the data both rows and columns
if Shuffle:
    for column in nomic_data_values:
        np.random.shuffle(nomic_data_values[column].values)
    for column in morphology_data:
        np.random.shuffle(morphology_data[column].values)

### Variables
$Y_{N \times P} = MorphologyData$  
$X_{N \times Q} = NomicData$  
Where  
$N = Rows of each data set$   
note that each data set is paired so N is the same for both  
$P = Columns of MorphologyData$  
$Q = Columns of NomicData$  
$K = Number of Canonical Variables$  
Where  
$K = min(P,Q)$  
unless $N < min(P,Q)$  
then $K = min(N, P, Q)$

In [8]:
# define the variables
N = morphology_data.shape[0]
P = morphology_data.shape[1]

N = nomic_data_values.shape[0]
Q = nomic_data_values.shape[1]
print("N:", N, "P:", P, "Q:", Q)
K = min(N, P, Q)
print("K:", K)

N: 154 P: 1245 Q: 187
K: 154


#### Calculate the Canonical Correlation Coefficients  
X = Morphology Data  
Y = Nomic Data

In [9]:
# define the cca model
cca = CCA(n_components=K)
# fit the model to the paired data sets
cca.fit(morphology_data, nomic_data_values)
# transform the data to the canonical space
# get the canonical coefficients for both data sets
X_c, Y_c = cca.transform(morphology_data, nomic_data_values)
# r2 score of the model fit
r2_model = [cca.score(morphology_data, nomic_data_values), X_c, Y_c][0]
print("The R2 score for the Canonical Correlation is:", r2_model)

The R2 score for the Canonical Correlation is: -5.375612848080362




#### Extract the canonical loadings from the CCA
In the absence of scikit-learn canonical loadings.  
We would calculate the loads as follows:  
$\tilde{A} = corr(Y,U)$  
$\tilde{B} = corr(X,V)$  
Where $X$ and $Y$ are the original data matrices  
and $U$ and $V$ are the canonical variates

In [10]:
A_tilde = cca.x_loadings_.T
B_tilde = cca.y_loadings_.T

From the canonical coefficients we can calculate the variance extracted by each canonical variable.  
$u_k = \frac{1}{P} \sum^P_{p=1} \tilde a^2_{pk}$  
  
$v_k = \frac{1}{Q} \sum^Q_{q=1} \tilde b^2_{qk}$  
Where $k$ is the canonical variable number and $p$ and $q$ are the variables in the original data sets.


In [11]:
u_k = []
v_k = []
for i in A_tilde:
    u_k.append(np.mean(i**2))
for i in B_tilde:
    v_k.append(np.mean(i**2))

We can caluculate the r2 score for each canonical variable as follows:

In [12]:
# coefficients of determination for each canonical variable
r2 = r2_score(u_k, v_k)
r2

-22498.077451058954

We then caluclate the Redundancy Index (RI) for each canonical variable as follows:  
$RI_u = u_k * r^2_k$  
$RI_v = v_k * r^2_k$

In [13]:
# calculate the redundancy index for each canonical variable
RI_u = []
RI_v = []

for i in u_k:
    RI_u.append(i * r2)
for i in v_k:
    RI_v.append(i * r2)

We then caculate the total redundancy of both data sets as follows:  
#### $RI_{total} = \sum^K_{k=1} RI_u + \sum^K_{k=1} RI_v$  
From the total redundancy we can calculate the percentage contribution of each data set to the total redundancy as follows:  
#### $RI_{u\%} = \frac{\sum^K_{k=1} RI_u}{RI_{total}}$  
#### $RI_{v\%} = \frac{\sum^K_{k=1} RI_v}{RI_{total}}$

In [14]:
RI_u_min = np.min(RI_u)
RI_v_min = np.min(RI_v)
RI_u_max = np.max(RI_u)
RI_v_max = np.max(RI_v)
global_min = np.min([RI_u_min, RI_v_min])
global_max = np.max([RI_u_max, RI_v_max])

# Calulate the global redundancy index
global_RI_u_v = np.sum(RI_u) + np.sum(RI_v)
global_RI_u = np.sum(RI_u) / global_RI_u_v * 100
global_RI_v = np.sum(RI_v) / global_RI_u_v * 100

In [15]:
# make a dataframe of the results
results_df = pd.DataFrame(
    {
        "RI_u": RI_u,
        "RI_v": RI_v,
        "u_k": u_k,
        "v_k": v_k,
        "r2": r2,
        "Shuffle": Shuffle,
        "global_RI_u": global_RI_u,
        "global_RI_v": global_RI_v,
        "global_RI_u_v": global_RI_u_v,
        "global_min": global_min,
        "global_max": global_max,
    }
)
results_df.head(5)

Unnamed: 0,RI_u,RI_v,u_k,v_k,r2,Shuffle,global_RI_u,global_RI_v,global_RI_u_v,global_min,global_max
0,-20.680679,-269.941699,0.000919,0.011998,-22498.077451,True,7.541602,92.458398,-39019.982604,-928.382353,-0.0
1,-20.620108,-928.382353,0.000917,0.041265,-22498.077451,True,7.541602,92.458398,-39019.982604,-928.382353,-0.0
2,-20.331814,-202.780922,0.000904,0.009013,-22498.077451,True,7.541602,92.458398,-39019.982604,-928.382353,-0.0
3,-20.269692,-579.151852,0.000901,0.025742,-22498.077451,True,7.541602,92.458398,-39019.982604,-928.382353,-0.0
4,-20.081381,-215.744773,0.000893,0.009589,-22498.077451,True,7.541602,92.458398,-39019.982604,-928.382353,-0.0


In [16]:
# check for file existence
if results_file_path.is_file():
    print("The results file exists.")
    #  read the results file
    existing_file_df = pd.read_csv(results_file_path)
    # check for if it is full for shuffle type
    if len(existing_file_df["Shuffle"].unique()) > 1:
        # delete the existing file
        results_file_path.unlink()
    elif not existing_file_df["Shuffle"].unique() == Shuffle:
        pd.concat([existing_file_df, results_df]).to_csv(results_file_path, index=False)
else:
    results_df.to_csv(results_file_path, index=False)
    print("The results file is created.")

The results file exists.
