# 第10章 典型相关分析

## 10.1 引言

## 10.2 相关分析的架构

In [None]:
#%run init.py

In [None]:
#%cd "F:\\python"

### 10.2.1 简单相关分析

In [None]:
import pandas as pd

# Show four decimal places whenever a DataFrame is displayed.
pd.set_option('display.precision', 4)
# Read worksheet 'd101' of the workbook into a DataFrame.
d101 = pd.read_excel('mvsData.xlsx', sheet_name='d101')

In [None]:
# Pairwise correlation matrix of the columns of d101 (shown as the cell output).
d101.corr()

In [None]:
# Scatter-plot matrix of every pair of columns in d101.
# Alternative with seaborn: sns.pairplot(d101)
pd.plotting.scatter_matrix(d101);

### 10.2.2 多元相关分析

In [None]:
from statsmodels.formula.api import ols
# Regress y1 on x1, x2 and x3 by ordinary least squares and print the fit summary.
fmy1=ols(formula='y1~x1+x2+x3',data=d101).fit()
print(fmy1.summary())

In [None]:
# Regress y2 on x1, x2 and x3 by ordinary least squares and print the fit summary.
fmy2=ols(formula='y2~x1+x2+x3',data=d101).fit()
print(fmy2.summary())

In [None]:
# Regress y3 on x1, x2 and x3 by ordinary least squares and print the fit summary.
fmy3=ols(formula='y3~x1+x2+x3',data=d101).fit()
print(fmy3.summary())

## 10.3 典型相关分析原理

### 10.3.1 典型相关的思想

### 10.3.2 典型相关的求法

In [None]:
# First variable set (x1..x3) and second variable set (y1..y3) taken from d101.
X = d101.loc[:, ['x1', 'x2', 'x3']]
Y = d101.loc[:, ['y1', 'y2', 'y3']]

In [None]:
from sklearn.cross_decomposition import CCA
import numpy as np

# Number of rows and number of variables in each set.
n, p = X.shape
n, q = Y.shape
# Extract as many canonical pairs as the smaller variable set allows.
n_pairs = min(p, q)
ca = CCA(n_components=n_pairs).fit(X, Y)

In [None]:
from pandas import DataFrame as DF

# Canonical variate coefficients, transposed so that each row is one variate
# and each column is one original variable.
u_coef = ca.x_rotations_.T  # coefficients for the X set
v_coef = ca.y_rotations_.T  # coefficients for the Y set
print(DF(u_coef, index=['u1', 'u2', 'u3'], columns=X.columns))
print(DF(v_coef, index=['v1', 'v2', 'v3'], columns=Y.columns))

In [None]:
# Scores of the canonical variates u and v for every observation.
u_scores, v_scores = ca.transform(X, Y)
# Wrap both score matrices as DataFrames and display them.
U, V = DF(u_scores), DF(v_scores)
(U, V)

In [None]:
# Correlation between matching columns of U and V: the canonical correlations.
CR = U.corrwith(V)
CR

### 10.3.3 典型变量的性质

### 10.3.4 典型相关的检验

In [None]:
def CR_test(n,p,q,r):  # chi-square significance test for canonical correlations
    """Sequentially test the significance of canonical correlations.

    For k = 0, 1, ..., m-1 (m = len(r)) the statistic

        Q_k = -(n - k - (p + q + 3) / 2) * ln( prod_{i >= k} (1 - r_i**2) )

    is compared with a chi-square distribution with (p - k) * (q - k)
    degrees of freedom.

    Parameters
    ----------
    n : int
        Number of observations.
    p, q : int
        Number of variables in the first and in the second set.
    r : array-like of float
        Canonical correlations in the order they should be tested.  A
        pandas Series keeps its index in the returned table.

    Returns
    -------
    pandas.DataFrame
        One row per correlation with columns 'CR' (the correlation),
        'Q' (chi-square statistic) and 'P' (upper-tail p-value), all
        rounded to 4 decimals.

    Notes
    -----
    A correlation of exactly +/-1 makes the product zero; the statistic is
    then reported as infinite instead of raising a math domain error.
    """
    # Local imports keep the function usable without notebook-level aliases.
    import numpy as np
    import pandas as pd
    from scipy import stats

    index = r.index if isinstance(r, pd.Series) else None
    # Positional array, so a Series with a non-default index works as well.
    rho = np.asarray(r, dtype=float)
    k = np.arange(rho.size)

    # Tail products: element k is prod_{i >= k} (1 - r_i**2), accumulated
    # from the last correlation backwards.
    tail = np.cumprod((1.0 - rho ** 2)[::-1])[::-1]
    with np.errstate(divide='ignore'):
        Q = (n - k - 1 / 2 * (p + q + 3)) * -np.log(tail)

    # Survival function instead of 1 - cdf: accurate for very small p-values.
    P = stats.chi2.sf(Q, (p - k) * (q - k))

    return pd.DataFrame({'CR': rho, 'Q': Q, 'P': P}, index=index).round(4)

In [None]:
# Print the significance table for the canonical correlations in CR.
print(CR_test(n,p,q,CR))

## 10.4 典型相关分析步骤

### 10.4.1 计算典型系数及变量

In [None]:
def cancor(X,Y,pq=None,plot=False):  # pq: number of canonical variate pairs to extract
    """Run a canonical correlation analysis of X against Y.

    Fits scikit-learn's CCA, prints the significance table returned by
    CR_test for the canonical correlations, optionally plots the scores of
    the first pair of canonical variates, and returns the coefficient
    tables.

    Parameters
    ----------
    X, Y : pandas.DataFrame
        The two variable sets; their ``columns`` label the coefficients.
    pq : int, optional
        Number of canonical pairs to extract.  Defaults to the smaller of
        the two column counts.
    plot : bool, default False
        If true, plot the first pair of canonical variate scores.

    Returns
    -------
    (u_coef, v_coef) : tuple of pandas.DataFrame
        Transposed ``x_rotations_`` / ``y_rotations_`` of the fitted model:
        rows 'u1'..'u<pq>' (resp. 'v1'..'v<pq>'), columns taken from
        X.columns (resp. Y.columns).
    """
    # Local imports so the function does not rely on notebook-level names.
    import numpy as np
    from pandas import DataFrame as DF
    from sklearn.cross_decomposition import CCA

    n, p = np.shape(X)
    n, q = np.shape(Y)
    if pq is None:
        pq = min(p, q)

    cca = CCA(n_components=pq).fit(X, Y)
    u_scores, v_scores = cca.transform(X, Y)

    # Canonical correlations: correlation between matching score columns.
    r = DF(u_scores).corrwith(DF(v_scores))
    CR = CR_test(n, p, q, r)
    print('典型相关系数检验：\n',CR)
    print('\n典型相关变量系数：\n')

    u_coef = DF(cca.x_rotations_.T, ['u%d' % (i + 1) for i in range(pq)], X.columns)
    v_coef = DF(cca.y_rotations_.T, ['v%d' % (i + 1) for i in range(pq)], Y.columns)

    if plot:  # scatter of the first pair of canonical variate scores
        import matplotlib.pyplot as plt
        plt.plot(u_scores[:, 0], v_scores[:, 0], 'o')
    return u_coef, v_coef

In [None]:
# Run the analysis on X and Y and plot the first pair of canonical variate scores.
cancor(X,Y,plot=True)

### 10.4.2 典型相关的实证分析

In [None]:
# Read worksheet 'd102' of the workbook and display it.
d102 = pd.read_excel('mvsData.xlsx', sheet_name='d102')
d102

In [None]:
# Equivalent one-liner:
# cancor(d102[['x1','x2','x3','x4']],d102[['y1','y2','y3','y4','y5','y6']])
X = d102.loc[:, ['x1', 'x2', 'x3', 'x4']]
Y = d102.loc[:, ['y1', 'y2', 'y3', 'y4', 'y5', 'y6']]
cancor(X, Y)

In [None]:
# Keep the first two pairs of canonical variates and draw the scatter plot
# of the first pair.
cancor(X, Y, pq=2, plot=True)

## 案例10：R&D投入与产出的典型相关分析

In [None]:
# Read worksheet 'Case10' of the case-study workbook and display it.
Case10 = pd.read_excel('mvsCase.xlsx', sheet_name='Case10')
Case10

In [None]:
# Pairwise correlation matrix of the columns of Case10 (shown as the cell output).
Case10.corr()

In [None]:
# Scatter-plot matrix of every pair of columns in Case10 on a 10x8 figure.
pd.plotting.scatter_matrix(Case10,figsize=(10,8));

In [None]:
# Canonical correlation analysis of columns x1..x3 against columns y1..y6 of Case10.
cancor(Case10[['x1','x2','x3']],Case10[['y1','y2','y3','y4','y5','y6']])

In [None]:
# Same analysis restricted to the first two canonical pairs, with the
# scatter plot of the first pair.
cancor(
    Case10[['x1', 'x2', 'x3']],
    Case10[['y1', 'y2', 'y3', 'y4', 'y5', 'y6']],
    pq=2,
    plot=True,
)