In [1]:
import pandas as pd
import numpy as np 
from sklearn.preprocessing import StandardScaler
from sklearn.cross_decomposition import CCA
from scipy.stats import chi2

In [2]:
df = pd.read_csv('./cca_data.csv')

df.head(1)

Unnamed: 0,locus_of_control,self_concept,motivation,read,write,math,science
0,-0.84,-0.24,1.0,54.8,64.5,44.5,52.6


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   locus_of_control  600 non-null    float64
 1   self_concept      600 non-null    float64
 2   motivation        600 non-null    float64
 3   read              600 non-null    float64
 4   write             600 non-null    float64
 5   math              600 non-null    float64
 6   science           600 non-null    float64
dtypes: float64(7)
memory usage: 32.9 KB


In [5]:
df.describe()

Unnamed: 0,locus_of_control,self_concept,motivation,read,write,math,science
count,600.0,600.0,600.0,600.0,600.0,600.0,600.0
mean,0.096533,0.004917,0.660833,51.901833,52.384833,51.849,51.763333
std,0.67028,0.705513,0.342729,10.102983,9.726455,9.414736,9.706179
min,-2.23,-2.62,0.0,28.3,25.5,31.8,26.0
25%,-0.3725,-0.3,0.33,44.2,44.3,44.5,44.4
50%,0.21,0.03,0.67,52.1,54.1,51.3,52.6
75%,0.51,0.44,1.0,60.1,59.9,58.375,58.65
max,1.36,1.19,1.0,76.0,67.1,75.5,74.2


In [6]:
df.shape

(600, 7)

In [7]:
df.shape[1]

7

In [20]:
# 'split' the data set into phychological and subject sets
x = df[['locus_of_control', 'self_concept', 'motivation']]
y = df[['read', 'write', 'math', 'science']]

# standardize
X = StandardScaler().fit_transform(x)
Y = StandardScaler().fit_transform(y)

# instantiate cca with Physocological group
cca = CCA(n_components=X.shape[1])
U, V = cca.fit_transform(X, Y)

# canonical correlations
corrs = np.array([np.corrcoef(U[:, i], V[:, i])[0, 1] for i in range(X.shape[1])])

# Compute Wilks' Lambda, chi-square, p-values
n, p, q = X.shape[0], X.shape[1], Y.shape[1]
wilks = np.cumprod(1 - corrs**2)
stats = []
for i, j in enumerate(wilks):
    df_val = (p - i) * (q - i)
    m = n - 1 - 0.5 * (p + q + 1)
    chi_stat = -m * np.log(j)
    p_val = 1 - chi2.cdf(chi_stat, df_val)
    stats.append({
        'Dimension': i + 1,
        'Canonical Corr': corrs[i],
        "Wilks' Lambda": j,
        'Chi-Square': chi_stat,
        'df': df_val,
        'p-value': f"{p_val:.5f}"
    })
summary_df = pd.DataFrame(stats)

# Canonical loadings (coeffs)
loadings_X = np.corrcoef(X.T, U.T)[0:p, p:]
loadings_Y = np.corrcoef(Y.T, V.T)[0:q, q:]
loadings_X_df = pd.DataFrame(loadings_X, index=x.columns, columns=[f'Dim{i+1}' for i in range(X.shape[1])])
loadings_Y_df = pd.DataFrame(loadings_Y, index=y.columns, columns=[f'Dim{i+1}' for i in range(X.shape[1])])

# Redundancy indices
redundancy_X = loadings_X_df.pow(2).mean()
redundancy_Y = loadings_Y_df.pow(2).mean()
redundancy = pd.DataFrame({
    'redundancy_X': redundancy_X,
    'redundancy_Y': redundancy_Y,
    'Redundancy_X to Y': redundancy_X * corrs**2,
    'Redundancy_Y to X': redundancy_Y * corrs**2
}, index=[f'Dim{j+1}' for j in range(X.shape[1])])

summary_df, loadings_X_df, loadings_Y_df, redundancy

(   Dimension  Canonical Corr  Wilks' Lambda  Chi-Square  df  p-value
 0          1        0.446436       0.800694  132.254127  12  0.00000
 1          2        0.153359       0.781863  146.415113   6  0.00000
 2          3        0.022503       0.781467  146.716501   2  0.00000,
                       Dim1      Dim2      Dim3
 locus_of_control  0.914292 -0.393641 -0.095478
 self_concept      0.099976 -0.421308  0.901390
 motivation        0.585314  0.606133  0.538526,
              Dim1      Dim2      Dim3
 read     0.880434 -0.244905  0.273057
 write    0.910126  0.220975 -0.339797
 math     0.799992 -0.187928  0.283571
 science  0.694107 -0.675884 -0.237673,
       redundancy_X  redundancy_Y  Redundancy_X to Y  Redundancy_Y to X
 Dim1      0.396173      0.681316           0.078959           0.135790
 Dim2      0.233283      0.150236           0.005487           0.003533
 Dim3      0.370544      0.081731           0.000188           0.000041)

In [28]:
# cross-loadings
# corr between phychological and subjects
cross_X_V = np.corrcoef(X.T, V.T)[0:X.shape[1], X.shape[1]:]

# corr between subjects and phychological 
cross_Y_U = np.corrcoef(Y.T, U.T)[0:Y.shape[1], Y.shape[1]:]

# to df
cross_X_V_df = pd.DataFrame(cross_X_V, index=x.columns, columns=[f'Dim{i+1}' for i in range(X.shape[1])])
cross_Y_U_df = pd.DataFrame(cross_Y_U, index=y.columns, columns=[f'Dim{i+1}' for i in range(X.shape[1])])

cross_X_V_df, cross_Y_U_df

(                      Dim1      Dim2      Dim3
 locus_of_control  0.408171 -0.060368 -0.002149
 self_concept      0.044630 -0.064611  0.020284
 motivation        0.261310  0.092956  0.012119,
              Dim1      Dim2      Dim3
 read     0.393058 -0.037552  0.006145
 write    0.406313  0.033895 -0.007647
 math     0.357146 -0.028815  0.006381
 science  0.309875 -0.103648 -0.005349)

In [21]:
summary_df

Unnamed: 0,Dimension,Canonical Corr,Wilks' Lambda,Chi-Square,df,p-value
0,1,0.446436,0.800694,132.254127,12,0.0
1,2,0.153359,0.781863,146.415113,6,0.0
2,3,0.022503,0.781467,146.716501,2,0.0


In [36]:
print('physcological group loadings')
loadings_X_df

physcological group loadings


Unnamed: 0,Dim1,Dim2,Dim3
locus_of_control,0.914292,-0.393641,-0.095478
self_concept,0.099976,-0.421308,0.90139
motivation,0.585314,0.606133,0.538526


In [37]:
print('subject group loadings')
loadings_Y_df

subject group loadings


Unnamed: 0,Dim1,Dim2,Dim3
read,0.880434,-0.244905,0.273057
write,0.910126,0.220975,-0.339797
math,0.799992,-0.187928,0.283571
science,0.694107,-0.675884,-0.237673


In [25]:
redundancy

Unnamed: 0,redundancy_X,redundancy_Y,Redundancy_X to Y,Redundancy_Y to X
Dim1,0.396173,0.681316,0.078959,0.13579
Dim2,0.233283,0.150236,0.005487,0.003533
Dim3,0.370544,0.081731,0.000188,4.1e-05


In [34]:
print('Cross loadings of psychological variables (X) on academic canonical variates (V)')
cross_X_V_df

Cross loadings of psychological variables (X) on academic canonical variates (V)


Unnamed: 0,Dim1,Dim2,Dim3
locus_of_control,0.408171,-0.060368,-0.002149
self_concept,0.04463,-0.064611,0.020284
motivation,0.26131,0.092956,0.012119


In [35]:
print('Cross loadings of academic variables (Y) on psychological canonical variates (U)')
cross_Y_U_df

Cross loadings of academic variables (Y) on psychological canonical variates (U)


Unnamed: 0,Dim1,Dim2,Dim3
read,0.393058,-0.037552,0.006145
write,0.406313,0.033895,-0.007647
math,0.357146,-0.028815,0.006381
science,0.309875,-0.103648,-0.005349


In [38]:
# Get canonical weights (coefficients used to compute U and V from X and Y)
weights_X = pd.DataFrame(cca.x_weights_, index=x.columns, columns=[f'Dim{i+1}' for i in range(X.shape[1])])
weights_Y = pd.DataFrame(cca.y_weights_, index=y.columns, columns=[f'Dim{i+1}' for i in range(X.shape[1])])

weights_X, weights_Y


(                      Dim1      Dim2      Dim3
 locus_of_control  0.876809 -0.472235 -0.090557
 self_concept     -0.174754 -0.488415  0.854934
 motivation        0.447959  0.733788  0.510771,
              Dim1      Dim2      Dim3
 read     0.617204 -0.335039  0.496473
 write    0.743148  0.257804 -0.599997
 math     0.253335 -0.121205  0.465700
 science -0.051115 -0.898107 -0.420289)

In [42]:
print('Psy weights')
weights_X

Psy weights


Unnamed: 0,Dim1,Dim2,Dim3
locus_of_control,0.876809,-0.472235,-0.090557
self_concept,-0.174754,-0.488415,0.854934
motivation,0.447959,0.733788,0.510771


In [43]:
print('subject weights')
weights_Y

subject weights


Unnamed: 0,Dim1,Dim2,Dim3
read,0.617204,-0.335039,0.496473
write,0.743148,0.257804,-0.599997
math,0.253335,-0.121205,0.4657
science,-0.051115,-0.898107,-0.420289
