In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [5]:
# Load the TV data set from the Excel workbook; the bare `df` on the last
# line renders the frame as the cell output.
# NOTE(review): the rendered frame contains an `Unnamed: 0` column, which
# looks like a saved row index -- confirm against the workbook and consider
# reading it with index_col=0.
tv_workbook = "tvdata.xlsx"
df = pd.read_excel(tv_workbook)
df

Unnamed: 0,led,hed,net,arti,com,man
0,86,43,85,43,93,71
1,99,74,99,78,99,89
2,37,22,10,27,24,33
3,5,19,56,13,11,38
4,45,43,55,39,54,58
5,21,32,21,34,35,32
6,36,78,48,75,42,78
7,69,31,85,32,70,52
8,40,98,36,99,64,86
9,26,14,40,8,25,21


## 典型相关分析（CCA）

借助 Python 的 `sklearn` 包进行典型相关分析。

In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.cross_decomposition import CCA

标准化数据：均值化为0，方差化为1

In [21]:
# Standardise both variable groups column-wise (zero mean, unit variance).
# A single scaler object is refit for the second group, so after this cell
# `ss` holds the statistics of `df2` only.
ss = StandardScaler()
df1 = df[["led", "hed", "net"]]
df2 = df[["arti", "com", "man"]]
df1_std = ss.fit_transform(df1)
df2_std = ss.fit_transform(df2)
# Sanity check on the first group: per-column mean and variance.
df1_std.mean(axis=0), df1_std.var(axis=0)

(array([-9.25185854e-17,  1.03620816e-16, -2.96059473e-17]),
 array([1., 1., 1.]))

In [22]:
# Fit a CCA model with two components (the library default, spelled out here)
# on the standardised groups and project both groups onto their scores.
cca = CCA(n_components=2)
xc, yc = cca.fit_transform(df1_std, df2_std)

In [23]:
# (rows, components) of the two score matrices.
xc.shape, yc.shape

((30, 2), (30, 2))

In [24]:
# Pearson correlation between matching score columns of the two groups,
# one value for each of the two fitted components.
tuple(np.corrcoef(xc[:, k], yc[:, k])[0, 1] for k in range(2))

(0.9954346966586444, 0.9528250144492953)

## 另一份数据

In [26]:
# Read the second data set and keep it as a plain NumPy array, since the
# following cells slice it by column position.
health = pd.read_excel("health.xlsx").to_numpy()
health

array([[191,  36,  50,   5, 162,  60],
       [189,  37,  52,   2, 110,  60],
       [193,  38,  58,  12, 101, 101],
       [162,  35,  62,  12, 105,  37],
       [189,  35,  46,  13, 155,  58],
       [182,  36,  56,   4, 101,  42],
       [211,  38,  56,   8, 101,  38],
       [167,  34,  60,   6, 125,  40],
       [176,  31,  74,  15, 200,  40],
       [154,  33,  56,  17, 251, 250],
       [169,  34,  50,  17, 120,  38],
       [166,  33,  52,  13, 210, 115],
       [154,  34,  64,  14, 215, 105],
       [247,  46,  50,   1,  50,  50],
       [193,  36,  46,   6,  70,  31],
       [202,  37,  62,  12, 210, 120],
       [176,  37,  54,   4,  60,  25],
       [157,  32,  52,  11, 230,  80],
       [156,  33,  54,  15, 225,  73],
       [138,  33,  68,   2, 110,  43]], dtype=int64)

In [29]:
# Split the array column-wise into the two variable groups:
# the first three columns go to x, the next three to y.
x, y = health[:, :3], health[:, 3:6]
x, y

(array([[191,  36,  50],
        [189,  37,  52],
        [193,  38,  58],
        [162,  35,  62],
        [189,  35,  46],
        [182,  36,  56],
        [211,  38,  56],
        [167,  34,  60],
        [176,  31,  74],
        [154,  33,  56],
        [169,  34,  50],
        [166,  33,  52],
        [154,  34,  64],
        [247,  46,  50],
        [193,  36,  46],
        [202,  37,  62],
        [176,  37,  54],
        [157,  32,  52],
        [156,  33,  54],
        [138,  33,  68]], dtype=int64),
 array([[  5, 162,  60],
        [  2, 110,  60],
        [ 12, 101, 101],
        [ 12, 105,  37],
        [ 13, 155,  58],
        [  4, 101,  42],
        [  8, 101,  38],
        [  6, 125,  40],
        [ 15, 200,  40],
        [ 17, 251, 250],
        [ 17, 120,  38],
        [ 13, 210, 115],
        [ 14, 215, 105],
        [  1,  50,  50],
        [  6,  70,  31],
        [ 12, 210, 120],
        [  4,  60,  25],
        [ 11, 230,  80],
        [ 15, 225,  73],
        [ 

In [35]:
# Fit a three-component CCA on the two groups and compute the scores of
# each group. Note that this rebinds `cca`, replacing the earlier model.
cca = CCA(n_components=3)
u, v = cca.fit(x, y).transform(x, y)

In [33]:
# (rows, components) of the two score matrices.
u.shape, v.shape

((20, 3), (20, 3))

计算典型相关系数

In [34]:
# Correlation between the i-th score columns of the two groups, for every
# fitted component. The component count is taken from the score matrix
# rather than hard-coded, so the cell stays correct if the number of
# components passed to CCA changes.
r = [np.corrcoef(u[:, i], v[:, i])[0, 1] for i in range(u.shape[1])]
r  # canonical correlation coefficients

[0.7956081542547285, 0.2005560410240742, 0.07257028625549243]

假设检验：检验典型相关系数是否显著不为零，以判断对应的典型变量是否需要保留。

In [42]:
from scipy.stats import chi2

In [41]:
# Quantities for the chi-square test of the canonical correlations
# (Bartlett's approximation):
#   n -- number of observations
#   p -- number of variables in the first group (columns of x)
#   q -- number of variables in the second group (columns of y)
#   f -- degrees of freedom of the chi-square distribution, p * q
#   m -- multiplier applied to the log term of the test statistic
# p and q are read from the arrays so they cannot drift out of sync with
# the column split that defines x and y.
n = health.shape[0]
p, q = x.shape[1], y.shape[1]
f = p * q
m = n - 1 - (p + q + 1) / 2
n, p, q, m, f

(20, 3, 3, 15.5, 9)

In [51]:
# Significance level of the test.
alpha = 0.05
# Product of (1 - r_i^2) over all canonical correlations.
lam = np.prod(1 - np.square(r))
# Test statistic, and the upper-alpha critical value of chi-square with f
# degrees of freedom; the null hypothesis is rejected when Q exceeds it.
Q = -m * np.log(lam)
chi2_alpha = chi2.ppf(1 - alpha, f)
Q, chi2_alpha, Q > chi2_alpha

(16.254957511492844, 16.918977604620448, False)

In [54]:
# Upper-tail probability of observing a statistic at least as large as Q.
# The survival function is used instead of `1 - cdf`, which loses precision
# through cancellation when the tail probability is small.
p_value = chi2.sf(Q, f)
p_value

0.06174455787956812

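检验统计量 $Q \approx 16.25$ 小于临界值 $\chi^2_{0.05}(9) \approx 16.92$（p 值约为 0.062，大于 0.05），落入接受域，因此在 5% 的显著性水平下不能拒绝原假设（所有典型相关系数均为零），即两组变量之间的典型相关关系不显著，相应的典型变量不必保留。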