In [1]:
import pymc as pm
import arviz as az
import seaborn as sns
import scipy.stats as st
import numpy as np
import matplotlib.pyplot as plt
import xarray as xr
import pandas as pd
import ipywidgets

import warnings
warnings.filterwarnings("ignore")

In [2]:
df_raw = pd.read_csv('/home/mw/input/bayes20238001/Data_Sum_HPP_Multi_Site_Share.csv')
# 选取本小组站点的数据
df = df_raw[df_raw["Site"] == "VCU"]

In [3]:
# 选取本次作业涉及的变量
df = df[["romantic", "avoidance_r", "anxiety_r"]]
#重新编码，编码后的数据：1 = "yes"; 2 = "no"
df["romantic"] =  np.where(df['romantic'] == 2, 0, 1)
#设置索引
df["index"] = range(len(df))
df = df.set_index("index")

#剔除缺失值
df=df.dropna()

In [4]:
#检查是否有缺失值
df.isnull().values.any()

False

In [5]:
#展示数据
df

Unnamed: 0_level_0,romantic,avoidance_r,anxiety_r
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,0.099036,2.132340
1,1,-0.982006,0.295121
2,0,0.038978,1.064189
3,1,-1.102122,-0.730304
4,0,0.579499,-0.473948
...,...,...,...
146,0,1.300193,0.166943
147,0,0.459383,1.106915
148,1,1.720599,-0.773030
149,1,-0.381427,-2.482070


# 模型定义  
1. 自变量：回避倾向（avoidance_r）、焦虑程度（anxiety_r）  

2. 因变量：恋爱关系（romantic）  

3. 数据关系：  

  $$ \begin{array}{lcrl}\text{data:} & \hspace{.01in} & Y_i|\beta_0,\beta_1,\beta_2 & \stackrel{ind}{\sim}\text{Bern}(\pi_i)\;\;\text{ with }\;\;\pi_i=\frac{e^{\beta_0+\beta_1X_{i1}+\beta_2X_{i2}}}{1+e^{\beta_0+\beta_1X_{i1}+\beta_2X_{i2}}} \\ \text{priors:} & & \beta_0 & \sim N\left(0,0.5^2\right) \\ & & \beta_1 & \sim N\left(0,0.5^2\right) \\& & \beta_2 & \sim N\left(0,0.5^2\right) \\{}\end{array} $$

In [6]:
with pm.Model() as log_model1:
    log_model1.add_coord('obs_id',df.index.values, mutable=True)
    avoidance = pm.MutableData("avoidance", df.avoidance_r, dims="obs_id")
    anxiety = pm.MutableData("anxiety", df.anxiety_r, dims="obs_id")
    y = pm.MutableData('y', df.romantic, dims = 'obs_id')
    #先验
    beta_0 = pm.Normal("beta_0", mu=0, sigma=0.5)                  
    beta_1 = pm.Normal("beta_1", mu=0, sigma=0.5)         
    beta_2 = pm.Normal("beta_2", mu=0, sigma=0.5)
    #线性关系
    mu = pm.Deterministic("mu", beta_0 + beta_1 * avoidance+ beta_2 * anxiety, dims="obs_id")
    #使用Logistic sigmoid function：pm.math.invlogit
    #相当于进行了如下计算 (1 / (1 + exp(-mu))
    pi = pm.Deterministic("pi", pm.math.invlogit(mu), dims="obs_id")
    #似然
    likelihood = pm.Bernoulli("y_est",p=pi, observed=y,dims="obs_id")


In [7]:
log1_prior = pm.sample_prior_predictive(samples=50, 
                                          model=log_model1,
                                          random_seed=84735)

Sampling: [beta_0, beta_1, beta_2, y_est]


In [8]:
log1_prior

# MCMC采样 & 模型诊断  


In [9]:
with log_model1:
    # MCMC 近似后验分布
    log_model1_trace = pm.sample(
                                draws=5000,                   # draws为采样次数
                                tune=1000,                    # tune为调整采样策略的次数
                                chains=4,                     # 链数
                                discard_tuned_samples= True,  
                                random_seed=84735)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_0, beta_1, beta_2]


Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 19 seconds.


In [10]:
az.plot_trace(log_model1_trace,
              var_names=["beta_0","beta_1","beta_2"],
              figsize=(15,8),
              compact=False)
plt.show()

# 后验参数解释  
下图的结果显示：  
- $\beta_0 = 0.12$，那么 $e^{\beta_0} = 1.3$， 表明 X1、X2为 0时，个体恋爱的发生比为 1.3。  
- $\beta_1 = -0.067$， $e^{\beta_0} = 0.95$， 表明回避分数每增加1个单位，个体恋爱的发生比变为之前的0.95倍。  
- $\beta_2 = -0.715$， $e^{\beta_0} = 0.5$， 表明焦虑分数每增加1个单位，个体恋爱的发生比变为之前的0.5倍。  
- 然而，$\beta_1$ 的94%HDI包括0，说明回避分数不能有效预测恋爱发生的概率;  $\beta_2$ 的94%HDI不包括0，说明焦虑分数能有效预测恋爱发生的概率

In [11]:
az.summary(log_model1_trace, var_names=["beta_0","beta_1","beta_2"])

Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
beta_0,0.221,0.164,-0.097,0.523,0.001,0.001,30406.0,16085.0,1.0
beta_1,-0.067,0.165,-0.383,0.239,0.001,0.001,31044.0,15666.0,1.0
beta_2,-0.715,0.178,-1.052,-0.377,0.001,0.001,29472.0,16126.0,1.0


In [12]:
# 通过 np.exp 将 beta 参数进行转换
az.plot_posterior(log_model1_trace, var_names=["beta_0","beta_1","beta_2"], transform = np.exp)
plt.show()

In [13]:
with log_model1:
    log_model1_ppc = pm.sample_posterior_predictive(log_model1_trace, random_seed=84735)

Sampling: [y_est]


In [14]:
log_model1_trace

# 绘制后验回归模型  
使用az.plot_hdi

In [15]:
az.plot_hdi(
    df.avoidance_r,
    log_model1_trace.posterior.pi,
    hdi_prob=0.95,
    fill_kwargs={"alpha": 0.25, "linewidth": 0},
    color="C8"
)
post_mean = log_model1_trace.posterior.pi.mean(("chain", "draw"))
sns.lineplot(x = df.avoidance_r, 
             y= post_mean, 
             label="posterior mean", 
             color="C8")
sns.scatterplot(x = df.avoidance_r, 
                y= df.romantic,label="observed data", 
                color='#C00000', 
                alpha=0.5)
plt.legend(loc="upper right",
           bbox_to_anchor=(1.5, 1),
           fontsize=12)
sns.despine()

az.plot_hdi(
    df.anxiety_r,
    log_model1_trace.posterior.pi,
    hdi_prob=0.95,
    fill_kwargs={"alpha": 0.25, "linewidth": 0},
    color="C2"
)
post_mean = log_model1_trace.posterior.pi.mean(("chain", "draw"))
sns.lineplot(x = df.anxiety_r, 
             y= post_mean, 
             label="posterior mean", 
             color="C2")
sns.scatterplot(x = df.anxiety_r, 
                y= df.romantic,label="observed data", 
                color='#C00000', 
                alpha=0.5)
plt.legend(loc="upper right",
           bbox_to_anchor=(1.5, 1),
           fontsize=12)
sns.despine()

# 新数据预测 & 评估  


In [16]:
df_raw = pd.read_csv('/home/mw/input/bayes20238001/Data_Sum_HPP_Multi_Site_Share.csv')
# 选取新组站点的数据
df = df_raw[df_raw["Site"] == "Oxford"]

In [17]:
# 选取本次作业涉及的变量
df = df[["romantic", "avoidance_r", "anxiety_r"]]
#重新编码，编码后的数据：1 = "yes"; 2 = "no"
df["romantic"] =  np.where(df['romantic'] == 2, 0, 1)
#设置索引
df["index"] = range(len(df))
df = df.set_index("index")

#剔除缺失值
df=df.dropna()

In [18]:
df

Unnamed: 0_level_0,romantic,avoidance_r,anxiety_r
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,0.781027,-1.975260
1,1,1.681138,-0.323469
2,1,0.893541,-1.172962
3,1,0.218458,0.384442
4,0,-1.244223,0.148472
...,...,...,...
132,0,-0.737910,0.337248
133,1,-0.681653,-0.370663
134,1,0.612256,-1.644902
135,0,-0.512882,0.903577


In [19]:
pred_coords ={"obs_id":range(0,137)} 

with log_model1:
    pm.set_data({"avoidance":df["avoidance_r"],
                 "anxiety": df["anxiety_r"],
                 "y": df["romantic"]},
                 coords=pred_coords
                ) 
#生成对因变量的预测
    prediction = pm.sample_posterior_predictive(log_model1_trace, 
                                                var_names=["y_est"],
                                                predictions=True,
                                                extend_inferencedata=True,
                                                random_seed=84735)


Sampling: [y_est]


In [20]:
prediction

In [21]:
y_pred = prediction.predictions["y_est"].stack(sample=("chain","draw","obs_id")).values
y_pred_freq = np.bincount(y_pred)/len(y_pred)
bars = plt.bar([0, 1], y_pred_freq, color="#70AD47")
for bar, freq in zip(bars, y_pred_freq):
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), f"{freq:.2f}", ha='center', va='bottom')
plt.xticks([0, 1])
plt.suptitle("Out-of-sample prediction(X=1)")
plt.xlabel("romantic")
plt.ylabel("proportion")
sns.despine()
print(y_pred)

[1 1 1 ... 1 1 0]


In [22]:
coords = {"obs_id": df.index}
with pm.Model(coords=coords) as log_model1:
    avoidance = pm.MutableData("avoidance", df.avoidance_r, dims="obs_id")
    anxiety=pm.MutableData("anxiety", df.anxiety_r, dims="obs_id")

    beta_0 = pm.Normal("beta_0", mu=0, sigma=0.5)                   
    beta_1 = pm.Normal("beta_1", mu=0, sigma=0.5)           
    beta_2 = pm.Normal("beta_2", mu=0, sigma=0.5) 
    #线性关系
    mu = pm.Deterministic("mu", beta_0 + beta_1 * avoidance+beta_2*anxiety, dims="obs_id")
    pi = pm.Deterministic("pi", pm.math.invlogit(mu), dims="obs_id")
    #似然
    likelihood = pm.Bernoulli("y_est",p=pi, observed=df.romantic,dims="obs_id")

    log_model1_trace = pm.sample(draws=5000,                 
                                tune=1000,                  
                                chains=4,                     
                                discard_tuned_samples= True, 
                                random_seed=84735)
    log_model1_ppc = pm.sample_posterior_predictive(log_model1_trace)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_0, beta_1, beta_2]


Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 19 seconds.
Sampling: [y_est]


In [23]:
y_pred = log_model1_ppc.posterior_predictive.y_est.stack(sample = ("chain", "draw")).mean(dim="sample")
y_pred_df = pd.DataFrame(y_pred, columns=["y_pred"])
log_model1_df = pd.DataFrame({"avoidance": log_model1_ppc.constant_data.avoidance, "anxiety": log_model1_ppc.constant_data.anxiety,"romantic": log_model1_ppc.observed_data.y_est})
y_pred_df = pd.concat([y_pred_df, log_model1_df], axis=1)
#根据分类标准（50-50）生成最终的分类结果
y_pred_df["romantic_2"] = np.where(np.array(y_pred) >= 0.5, 1, 0)
print(y_pred_df)

      y_pred  avoidance   anxiety  romantic  romantic_2
0    0.79870   0.781027 -1.975260         1           1
1    0.57455   1.681138 -0.323469         1           1
2    0.69205   0.893541 -1.172962         1           1
3    0.43765   0.218458  0.384442         1           0
4    0.45010  -1.244223  0.148472         0           0
..       ...        ...       ...       ...         ...
132  0.42255  -0.737910  0.337248         0           0
133  0.54550  -0.681653 -0.370663         1           1
134  0.75730   0.612256 -1.644902         1           1
135  0.34110  -0.512882  0.903577         0           0
136  0.21775  -0.231598  1.894651         0           0

[137 rows x 5 columns]


In [24]:
# 使用`pd.crosstab`生成混淆矩阵
confusion_matrix = pd.crosstab(y_pred_df["romantic"], y_pred_df["romantic_2"], 
                              rownames=['Actual'], colnames=['Predicted'])
confusion_matrix

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,45,24
1,26,42


In [25]:
true_positive = confusion_matrix.at[0,1]
false_positive = confusion_matrix.at[1,1]
true_negative = confusion_matrix.at[0, 0]
false_negative = confusion_matrix.at[1, 0]
accuracy = (true_positive + false_negative) /(true_positive + false_positive + true_negative + false_negative)
sensitivity = (true_positive) /(true_positive + false_negative)
specificity = (true_negative) / (true_negative + false_positive)

print("准确性:", accuracy)
print("敏感性:", sensitivity)
print("特异性:", specificity)

准确性: 0.36496350364963503
敏感性: 0.48
特异性: 0.5172413793103449


**模型对于数据的预测准确性为0.36，说明模型对于预测被试是否会恋爱的准确率为0.36  
敏感性为0.48，说明模型对正例的预测能力为0.48  
特异性为0.52，说明模型对负例的预测能力为0.52**

# 对本数据集的预测结果进行评估

In [40]:
df_raw = pd.read_csv('/home/mw/input/bayes20238001/Data_Sum_HPP_Multi_Site_Share.csv')
# 选取本小组站点的数据
df = df_raw[df_raw["Site"] == "VCU"]

In [41]:
# 选取本次作业涉及的变量
df = df[["romantic", "avoidance_r", "anxiety_r"]]
#重新编码，编码后的数据：1 = "yes"; 2 = "no"
df["romantic"] =  np.where(df['romantic'] == 2, 0, 1)
#设置索引
df["index"] = range(len(df))
df = df.set_index("index")

#剔除缺失值
df=df.dropna()

In [42]:
#需要重新运行模型定义和MCMC采样部分

coords = {"obs_id": df.index}
with pm.Model(coords=coords) as log_model1:
    avoidance = pm.MutableData("avoidance", df.avoidance_r, dims="obs_id")
    anxiety = pm.MutableData("anxiety", df.anxiety_r, dims="obs_id")
    #先验
    beta_0 = pm.Normal("beta_0", mu=0, sigma=0.5)                   
    beta_1 = pm.Normal("beta_1", mu=0, sigma=0.5)
    beta_2 = pm.Normal("beta_2", mu=0, sigma=0.5)           
    #线性关系
    mu = pm.Deterministic("mu", beta_0 + beta_1 * avoidance+ beta_2 * anxiety, dims="obs_id")
    pi = pm.Deterministic("pi", pm.math.invlogit(mu), dims="obs_id")
    #似然
    likelihood = pm.Bernoulli("y_est",p=pi, observed=df.romantic,dims="obs_id")

    log_model1_trace = pm.sample(draws=5000,                 
                                tune=1000,                  
                                chains=4,                     
                                discard_tuned_samples= True, 
                                random_seed=84735)
    log_model1_ppc = pm.sample_posterior_predictive(log_model1_trace)


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_0, beta_1, beta_2]


Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 19 seconds.
Sampling: [y_est]


In [43]:
az.plot_ppc(log_model1_ppc, num_pp_samples=50)

<Axes: xlabel='y_est / y_est'>

In [44]:
log_model1_ppc.posterior_predictive.y_est.stack(sample = ("chain", "draw"))

In [45]:
pred_pi = log_model1_ppc.posterior_predictive.y_est.stack(sample = ("chain", "draw")).mean(dim="sample")
# 转换为数据框
pred_pi = pred_pi.to_dataframe()

In [46]:
#将x、y存入数据框
pred_pi["avoidance"] = log_model1_ppc.constant_data.avoidance.values
pred_pi["anxiety"] = log_model1_ppc.constant_data.anxiety.values
pred_pi["romantic"] = log_model1_ppc.observed_data.y_est.values
#根据分类标准（50-50）生成最终的分类结果
pred_pi["romantic_pred"] = np.where(pred_pi["y_est"] >= 0.5, 1, 0)
pred_pi

Unnamed: 0_level_0,y_est,avoidance,anxiety,romantic,romantic_pred
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.22275,0.099036,2.132340,0,0
1,0.51955,-0.982006,0.295121,1,1
2,0.36860,0.038978,1.064189,0,0
3,0.69200,-1.102122,-0.730304,1,1
4,0.63230,0.579499,-0.473948,0,1
...,...,...,...,...,...
146,0.50090,1.300193,0.166943,0,1
147,0.35945,0.459383,1.106915,0,0
148,0.64940,1.720599,-0.773030,1,1
149,0.87345,-0.381427,-2.482070,1,1


In [47]:
# 使用`pd.crosstab`生成混淆矩阵
confusion_matrix = pd.crosstab(pred_pi["romantic"], pred_pi["romantic_pred"], 
                              rownames=['Actual'], colnames=['Predicted'])
confusion_matrix

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,36,31
1,20,64


In [48]:
# 计算a b c d的数量
true_positive = confusion_matrix.at[0,1]
false_positive = confusion_matrix.at[1,1]
true_negative = confusion_matrix.at[0, 0]
false_negative = confusion_matrix.at[1, 0]
# 代入公式
accuracy = (true_positive + false_negative) /(true_positive + false_positive + true_negative + false_negative)
sensitivity = (true_positive) /(true_positive + false_negative)
specificity = (true_negative) / (true_negative + false_positive)

print("准确性:", accuracy)
print("敏感性:", sensitivity)
print("特异性:", specificity)

准确性: 0.33774834437086093
敏感性: 0.6078431372549019
特异性: 0.36


**模型对于数据的预测准确性为0.34，说明模型对于预测被试是否会恋爱的准确率为0.34  
敏感性为0.61，说明模型对正例的预测能力为0.61  
特异性为0.36，说明模型对负例的预测能力为0.36**

# 模型比较

In [49]:
df_raw = pd.read_csv('/home/mw/input/bayes20238001/Data_Sum_HPP_Multi_Site_Share.csv')
#进入VCU站点
df = df_raw[df_raw["Site"] == "VCU"]
df = df[["romantic", "avoidance_r", "sex","anxiety_r"]]
df["romantic"] =  np.where(df['romantic'] == 2, 0, 1)
df["index"] = range(len(df))
df = df.set_index("index")
#剔除缺失值
df=df.dropna()
print(df)

       romantic  avoidance_r  sex  anxiety_r
index                                       
0             0     0.099036  2.0   2.132340
1             1    -0.982006  2.0   0.295121
2             0     0.038978  2.0   1.064189
3             1    -1.102122  1.0  -0.730304
4             0     0.579499  1.0  -0.473948
...         ...          ...  ...        ...
143           0    -0.861890  2.0   0.508751
144           0     0.759672  2.0   0.679655
145           0    -0.441485  2.0   1.833257
146           0     1.300193  1.0   0.166943
147           0     0.459383  2.0   1.106915

[148 rows x 4 columns]


**定义模型**

In [50]:
with pm.Model() as log_model2:
    log_model2.add_coord('obs_id',df.index, mutable=True)
    avoidance = pm.MutableData("avoidance", df.avoidance_r, dims="obs_id")
    anxiety= pm.MutableData("anxiety", df.anxiety_r, dims="obs_id")
    y = pm.MutableData('y', df.romantic, dims = 'obs_id')

    beta_0 = pm.Normal("beta_0", mu=0, sigma=0.5)                    
    beta_1 = pm.Normal("beta_1", mu=0, sigma=0.5)
    beta_2 = pm.Normal("beta_2", mu=0, sigma=0.5)

    mu = pm.Deterministic("mu", beta_0 + beta_1 * avoidance+beta_2*anxiety, dims="obs_id")
    pi = pm.Deterministic("pi", pm.math.invlogit(mu), dims="obs_id")
    likelihood = pm.Bernoulli("y_est",p=pi, observed=y,dims="obs_id")
    
with pm.Model() as log_model3:
    log_model3.add_coord('obs_id',df.index, mutable=True)
    avoidance = pm.MutableData("avoidance", df.avoidance_r, dims="obs_id")
    y = pm.MutableData('y', df.romantic, dims = 'obs_id')

    beta_0 = pm.Normal("beta_0", mu=0, sigma=0.5)                   
    beta_1 = pm.Normal("beta_1", mu=0, sigma=0.5)          
    mu = pm.Deterministic("mu", beta_0 + beta_1 * avoidance, dims="obs_id")
    pi = pm.Deterministic("pi", pm.math.invlogit(mu), dims="obs_id")
    likelihood = pm.Bernoulli("y_est",p=pi, observed=y,dims="obs_id")

with pm.Model() as log_model4:
    log_model4.add_coord('obs_id',df.index, mutable=True)
    sex= pm.MutableData("sex",df.sex, dims="obs_id")
    y = pm.MutableData('y', df.romantic, dims = 'obs_id')

    beta_0 = pm.Normal("beta_0", mu=0, sigma=0.5)                   
    beta_2 = pm.Normal("beta_2", mu=0, sigma=0.5)           
    mu = pm.Deterministic("mu", beta_0 + beta_2 *sex, dims="obs_id")
    pi = pm.Deterministic("pi", pm.math.invlogit(mu), dims="obs_id")
    likelihood = pm.Bernoulli("y_est",p=pi, observed=y,dims="obs_id")

**后验参数采样**

In [51]:
with log_model2:
    log_model2_trace = pm.sample(
                                draws=5000,                   # 使用mcmc方法进行采样，draws为采样次数
                                tune=1000,                    # tune为调整采样策略的次数，可以决定这些结果是否要被保留
                                chains=4,                     # 链数
                                discard_tuned_samples= True,  # tune的结果将在采样结束后被丢弃
                                random_seed=84735)

with log_model3:
    log_model3_trace = pm.sample(draws=5000,                  # 使用mcmc方法进行采样，draws为采样次数
                      tune=1000,                    # tune为调整采样策略的次数，可以决定这些结果是否要被保留
                      chains=4,                     # 链数
                      discard_tuned_samples= True,  # tune的结果将在采样结束后被丢弃
                      idata_kwargs={"log_likelihood": True},
                      random_seed=84735)

with log_model4:
    log_model4_trace = pm.sample(draws=5000,                   # 使用mcmc方法进行采样，draws为采样次数
                        tune=1000,                   # tune为调整采样策略的次数，可以决定这些结果是否要被保留
                        chains=4,                    # 链数
                        discard_tuned_samples= True, # tune的结果将在采样结束后被丢弃
                        idata_kwargs={"log_likelihood": True},
                        random_seed=84735)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_0, beta_1, beta_2]


Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 19 seconds.
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_0, beta_1]


Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 16 seconds.
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_0, beta_2]


Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 24 seconds.


In [52]:
#后验预测
with log_model2:
    log_model2_ppc = pm.sample_posterior_predictive(log_model2_trace, random_seed=84735)
with log_model3:
    log_model3_ppc = pm.sample_posterior_predictive(log_model3_trace, random_seed=84735)
with log_model4:
    log_model4_ppc = pm.sample_posterior_predictive(log_model4_trace, random_seed=84735)

Sampling: [y_est]


Sampling: [y_est]


Sampling: [y_est]


**用MAE评估后验预测**

In [53]:
from statistics import median
def MAE(model_ppc):
    pre_x = model_ppc.posterior_predictive["y_est"].stack(sample=("chain", "draw"))
    pre_y_mean = pre_x.mean(axis=1).values

    MAE = pd.DataFrame({
        "romantic_ppc_mean": pre_y_mean,
        "romantic_original": df.romantic
    })

    MAE["pre_error"] = abs(MAE["romantic_original"] -\
                            MAE["romantic_ppc_mean"])

    MAE = median(MAE.pre_error)
    return MAE

log_model2_MAE = MAE(log_model2_ppc)
print(f"模型2 MAE: {log_model2_MAE:.2f}")
log_model3_MAE = MAE(log_model3_ppc)
print(f"模型3 MAE: {log_model3_MAE:.2f}")
log_model4_MAE = MAE(log_model4_ppc)
print(f"模型4 MAE: {log_model4_MAE:.2f}")

模型2 MAE: 0.42
模型3 MAE: 0.49
模型4 MAE: 0.48


模型2的误差小于模型3、4，即2的预测能力强于3、4的

In [54]:
log_likelihood = pm.compute_log_likelihood(log_model2_trace, model=log_model2)
print(log_model2_trace)
print(log_model3_trace)
print(log_model4_trace)

Inference data with groups:
	> posterior
	> log_likelihood
	> sample_stats
	> observed_data
	> constant_data
Inference data with groups:
	> posterior
	> log_likelihood
	> sample_stats
	> observed_data
	> constant_data
Inference data with groups:
	> posterior
	> log_likelihood
	> sample_stats
	> observed_data
	> constant_data


In [55]:
comparison_list = {
    "log_model2(aviodance,anxiety)":log_model4_trace,
    "log_model3(aviodance)":log_model2_trace,
    "log_model4(sex)":log_model3_trace,
}
az.compare(comparison_list)

Unnamed: 0,rank,elpd_loo,p_loo,elpd_diff,weight,se,dse,warning,scale
log_model3(aviodance),0,-95.52332,2.636752,0.0,0.9987327,3.819293,0.0,False,log
"log_model2(aviodance,anxiety)",1,-102.960799,1.249903,7.437479,0.001267303,1.260216,3.694906,False,log
log_model4(sex),2,-103.606955,1.853957,8.083635,2.220446e-16,1.211354,3.689784,False,log
