### 导入并清洗数据

In [4]:
# 导入 pymc 模型包，和 arviz 等分析工具 
import pymc as pm
import arviz as az
import seaborn as sns
import scipy.stats as st
import numpy as np
import matplotlib.pyplot as plt
import xarray as xr
import pandas as pd
import ipywidgets

# 忽略不必要的警告
import warnings
warnings.filterwarnings("ignore")

In [5]:
# 通过 pd.read_csv 加载数据 Data_Sum_HPP_Multi_Site_Share.csv
df_raw = pd.read_csv('/home/mw/input/bayes20238001/Data_Sum_HPP_Multi_Site_Share.csv')

# 选取牛津站点的数据
df = df_raw[df_raw["Site"] == "Oxford"]

# 选取需要研究的变量
df = df[["romantic", "attachhome", "attachphone"]]

#重新编码，编码后的数据：1 = "yes"; 2 = "no"
df["romantic"] = np.where(df['romantic'] == 2, 0, 1)

#设置索引
df["index"] = range(len(df))
df = df.set_index("index")

# 展示数据
df.head(10)

Unnamed: 0_level_0,romantic,attachhome,attachphone
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,2.222222,1.0
1,1,3.333333,1.777778
2,1,4.111111,2.555556
3,1,3.777778,2.555556
4,0,4.0,3.0
5,0,3.888889,1.0
6,0,3.444444,1.0
7,1,4.111111,2.0
8,0,3.555556,1.666667
9,1,1.555556,2.0


### 模型定义  

1 自变量：回家水平、手机使用水平  

2 因变量：恋爱情况  

3 数据关系：  

$$  
\begin{array}{lcrl}  
\text{data:} & \hspace{.01in} & Y_i|\beta_0,\beta_3,\beta_4 & \stackrel{ind}{\sim} \text{Bern}(\pi_i) \;\; \text{ with } \;\; \pi_i = \frac{e^{\beta_0 + \beta_3 X_{i1} + \beta_4 X_{i2}}}{1 + e^{\beta_0 + \beta_3 X_{i1} + \beta_4 X_{i2}}} \\  
\text{priors:} & & \beta_{0}  &  \sim N\left(0, 0.5^2 \right)  \\  
               & & \beta_3  & \sim N\left(0, 0.5^2 \right)\\  
							 & & \beta_4  & \sim N\left(0, 0.5^2 \right)\\  
\end{array}  
$$  

In [6]:
with pm.Model() as log_model3:
    log_model3.add_coord('obs_id',df.index, mutable=True)
    attachhome = pm.MutableData("attachhome", df.attachhome, dims="obs_id")
    attachphone = pm.MutableData("attachphone", df.attachphone, dims="obs_id")
    y = pm.MutableData('y', df.romantic, dims = 'obs_id')

    #先验
    beta_0 = pm.Normal("beta_0", mu=0, sigma=0.5)          #定义beta_0          
    beta_3 = pm.Normal("beta_3", mu=0, sigma=0.5)          #定义beta_3
    beta_4 = pm.Normal("beta_4", mu=0, sigma=0.5)          #定义beta_4
    #线性关系
    mu = pm.Deterministic("mu", beta_0 + beta_3 * attachhome + beta_4 * attachphone, dims="obs_id")
    #注意此处使用了Logistic sigmoid function：pm.math.invlogit
    #相当于进行了如下计算 (1 / (1 + exp(-mu))
    pi = pm.Deterministic("pi", pm.math.invlogit(mu), dims="obs_id")
    #似然
    likelihood = pm.Bernoulli("y_est",p=pi, observed=y,dims="obs_id")

In [7]:
# 可视化模型（贝叶斯变量因果图）
pm.model_to_graphviz(log_model3)

### MCMC采样

In [8]:
with log_model3:
    # MCMC 近似后验分布
    log_model3_trace = pm.sample(
                                draws=5000,                   # 使用mcmc方法进行采样，draws为采样次数
                                tune=1000,                    # tune为调整采样策略的次数，可以决定这些结果是否要被保留
                                chains=4,                     # 链数
                                discard_tuned_samples= True,  # tune的结果将在采样结束后被丢弃
                                random_seed=84735)                          

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_0, beta_3, beta_4]


Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 30 seconds.


In [9]:
az.plot_trace(log_model3_trace,
              var_names=["beta_0","beta_3","beta_4"],
              figsize=(15,8),
              compact=False)
plt.show()

### 后验参数解释  

In [69]:
az.summary(log_model3_trace, var_names=["beta_0","beta_3","beta_4"])

Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
beta_0,0.13,0.441,-0.708,0.945,0.004,0.004,9740.0,9495.0,1.0
beta_3,0.026,0.14,-0.231,0.293,0.001,0.001,9643.0,10071.0,1.0
beta_4,-0.093,0.159,-0.393,0.202,0.002,0.001,9330.0,9822.0,1.0


In [70]:
# 通过 np.exp 将 beta 参数进行转换
az.plot_posterior(log_model3_trace, var_names=["beta_0","beta_3","beta_4"], transform = np.exp)
plt.show()

以上的结果显示：  
- $\beta_0 = 0.13$，那么 $e^{\beta_0} = 1.3$， 表明 X1 为 0时，个体恋爱的可能性为 1.3。  
- $\beta_3 = 0.03$， $e^{\beta_0} = 1$， 表明回家水平每增加1个单位，个体恋爱的发生比变为之前的1倍。  
- $\beta_4 = -0.09$， $e^{\beta_0} = 0.92$， 表明手机使用水平每增加1个单位，个体恋爱的发生比变为之前的0.92倍。   
- 然而，$\beta_3$、$\beta_4$ 的94%HDI包括0，说明回家水平、手机使用水平不能有效预测恋爱发生的概率。 

### 绘制后验预测回归线

In [10]:
#画出每个自变量对应的恋爱概率94%hdi值
az.plot_hdi(
    df.attachhome,
    log_model3_trace.posterior.pi,
    hdi_prob=0.95,
    fill_kwargs={"alpha": 0.25, "linewidth": 0},
    color="C1"
)

#得到每个自变量对应的恋爱概率均值，并使用sns.lineplot连成一条光滑的曲线
post_mean = log_model3_trace.posterior.pi.mean(("chain", "draw"))
sns.lineplot(x = df.attachhome, 
             y= post_mean, 
             label="posterior mean", 
             color="C1")

#绘制真实数据散点图
sns.scatterplot(x = df.attachhome, 
                y= df.romantic,label="observed data", 
                color='#C00000', 
                alpha=0.5)

#设置图例位置
plt.legend(loc="upper right",
           bbox_to_anchor=(1.5, 1),
           fontsize=12)
sns.despine()

In [11]:
#画出每个自变量对应的恋爱概率94%hdi值
az.plot_hdi(
    df.attachphone,
    log_model3_trace.posterior.pi,
    hdi_prob=0.95,
    fill_kwargs={"alpha": 0.25, "linewidth": 0},
    color="C2"
)

#得到每个自变量对应的恋爱概率均值，并使用sns.lineplot连成一条光滑的曲线
sns.lineplot(x = df.attachphone, 
             y= post_mean, 
             label="posterior mean", 
             color="C2")  

#绘制真实数据散点图
sns.scatterplot(x = df.attachphone, 
                y= df.romantic,label="observed data", 
                color='#C00000', 
                alpha=0.5)  

#设置图例位置
plt.legend(loc="upper right",
           bbox_to_anchor=(1.5, 1),
           fontsize=12)
sns.despine()

### 对新数据进行预测&分类

In [12]:
# 通过 pd.read_csv 加载数据 Data_Sum_HPP_Multi_Site_Share.csv
df_raw = pd.read_csv('/home/mw/input/bayes20238001/Data_Sum_HPP_Multi_Site_Share.csv')

# 选取波兰站点的数据
df_new = df_raw[df_raw["Site"] == "Poland"]

# 选取需要研究的变量
df_new = df_new[["romantic", "attachhome", "attachphone"]]

#重新编码，编码后的数据：1 = "yes"; 2 = "no"
df_new["romantic"] = np.where(df_new['romantic'] == 2, 0, 1)

#设置索引
df_new["index"] = range(len(df_new))
df_new = df_new.set_index("index")

# 展示数据
df.head(10)

Unnamed: 0_level_0,romantic,attachhome,attachphone
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,2.222222,1.0
1,1,3.333333,1.777778
2,1,4.111111,2.555556
3,1,3.777778,2.555556
4,0,4.0,3.0
5,0,3.888889,1.0
6,0,3.444444,1.0
7,1,4.111111,2.0
8,0,3.555556,1.666667
9,1,1.555556,2.0


In [13]:
pred_coords = {"obs_id":range(0,136)}
with log_model3:
    # 传入数据
    pm.set_data({"attachhome": df_new["attachhome"],
                "attachphone": df_new["attachphone"],
                "y": df_new["romantic"]},
                coords=pred_coords
                )   
    # 生成对因变量的预测
    prediction = pm.sample_posterior_predictive(log_model3_trace, 
                                                var_names=["y_est"],
                                                predictions=True,
                                                extend_inferencedata=True,
                                                random_seed=84735)

Sampling: [y_est]


In [14]:
# 提取储存在 predicitons中的预测值
y_pred = prediction.predictions["y_est"].stack(sample=("chain","draw","obs_id")).values
# 统计其中0和1的个数，并除以总数，得到0和1对应的比例值
y_pred_freq = np.bincount(y_pred)/len(y_pred)
#绘制柱状图
bars = plt.bar([0, 1], y_pred_freq, color="#70AD47")
#用于在柱状图上标明比例值
for bar, freq in zip(bars, y_pred_freq):
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), f"{freq:.2f}", ha='center', va='bottom')
#对刻度、标题、坐标轴标题进行设置
plt.xticks([0, 1])
plt.suptitle("Out-of-sample prediction(X=1)")
plt.xlabel("romantic")
plt.ylabel("proportion")
sns.despine()

* 由图可得，在对新结果的预测中，有50%的可能将其预测为0，有50%的可能将其预测为1。

### 后验预测评估

In [60]:
coords = {"obs_id": df_new.index}

with pm.Model(coords=coords) as log_model3:
    attachhome = pm.MutableData("attachhome", df_new.attachhome, dims="obs_id")
    attachphone = pm.MutableData("attachphone", df_new.attachphone, dims="obs_id")
    y = pm.MutableData('y', df_new.romantic, dims = 'obs_id')

    #先验
    beta_0 = pm.Normal("beta_0", mu=0, sigma=0.5)          #定义beta_0          
    beta_3 = pm.Normal("beta_3", mu=0, sigma=0.5)          #定义beta_3
    beta_4 = pm.Normal("beta_4", mu=0, sigma=0.5)          #定义beta_4    #线性关系
    mu = pm.Deterministic("mu", beta_0 + beta_3 * attachhome + beta_4 * attachphone, dims="obs_id")
    #注意此处使用了Logistic sigmoid function：pm.math.invlogit
    #相当于进行了如下计算 (1 / (1 + exp(-mu))
    pi = pm.Deterministic("pi", pm.math.invlogit(mu), dims="obs_id")
    #似然
    likelihood = pm.Bernoulli("y_est",p=pi, observed=y,dims="obs_id")

    log_model3_trace = pm.sample(draws=5000,                 
                                tune=1000,                  
                                chains=4,                     
                                discard_tuned_samples= True, 
                                random_seed=84735)
    log_model3_ppc = pm.sample_posterior_predictive(log_model3_trace)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_0, beta_3, beta_4]


Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 29 seconds.
Sampling: [y_est]


In [61]:
#stack(sample = ("chain", "draw")：将每一个X对应的4*5000个后验预测值合并到一个维度sample
#对于每一个X，需要计算其20000个值的平均值，因此将dim设置为sample
pred_pi = log_model3_ppc.posterior_predictive.y_est.stack(sample = ("chain", "draw")).mean(dim="sample")
# 转换为数据框
pred_pi = pred_pi.to_dataframe()

In [62]:
#将原数据中的X 和Y存入数据框
pred_pi["attachhome"] = log_model3_ppc.constant_data.attachhome.values
pred_pi["attachphone"] = log_model3_ppc.constant_data.attachphone.values
pred_pi["romantic"] = log_model3_ppc.observed_data.y_est.values

#根据分类标准（50-50）生成最终的分类结果
pred_pi["romantic_pred"] = np.where(pred_pi["y_est"] >= 0.5, 1, 0)
pred_pi

Unnamed: 0_level_0,y_est,attachhome,attachphone,romantic,romantic_pred
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.80800,3.555556,3.333333,1,1
1,0.83645,4.444444,3.111111,1,1
2,0.85435,5.000000,3.000000,1,1
3,0.76865,3.444444,1.444444,1,1
4,0.73640,2.444444,2.555556,1,1
...,...,...,...,...,...
131,0.84200,4.777778,3.000000,1,1
132,0.80220,3.555556,3.000000,1,1
133,0.83040,4.444444,2.777778,1,1
134,0.78225,3.111111,3.000000,1,1


In [63]:
# 使用`pd.crosstab`生成混淆矩阵，前两个值表明你需要统计的列名
# 由于要生成一个2*2的联表，需要指定行的名称和列的名称
confusion_matrix1 = pd.crosstab(pred_pi["romantic"], pred_pi["romantic_pred"], 
                              rownames=['Actual'], colnames=['Predicted'])
confusion_matrix1

Predicted,1
Actual,Unnamed: 1_level_1
0,25
1,111


In [64]:
coords = {"obs_id": df.index}

with pm.Model(coords=coords) as log_model3:
    attachhome = pm.MutableData("attachhome", df.attachhome, dims="obs_id")
    attachphone = pm.MutableData("attachphone", df.attachphone, dims="obs_id")
    y = pm.MutableData('y', df.romantic, dims = 'obs_id')

    #先验
    beta_0 = pm.Normal("beta_0", mu=0, sigma=0.5)          #定义beta_0          
    beta_3 = pm.Normal("beta_3", mu=0, sigma=0.5)          #定义beta_3
    beta_4 = pm.Normal("beta_4", mu=0, sigma=0.5)          #定义beta_4    #线性关系
    mu = pm.Deterministic("mu", beta_0 + beta_3 * attachhome + beta_4 * attachphone, dims="obs_id")
    #注意此处使用了Logistic sigmoid function：pm.math.invlogit
    #相当于进行了如下计算 (1 / (1 + exp(-mu))
    pi = pm.Deterministic("pi", pm.math.invlogit(mu), dims="obs_id")
    #似然
    likelihood = pm.Bernoulli("y_est",p=pi, observed=y,dims="obs_id")

    log_model3_trace = pm.sample(draws=5000,                 
                                tune=1000,                  
                                chains=4,                     
                                discard_tuned_samples= True, 
                                random_seed=84735)
    log_model3_ppc = pm.sample_posterior_predictive(log_model3_trace)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_0, beta_3, beta_4]


Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 31 seconds.
Sampling: [y_est]


In [65]:
#stack(sample = ("chain", "draw")：将每一个X对应的4*5000个后验预测值合并到一个维度sample
#对于每一个X，需要计算其20000个值的平均值，因此将dim设置为sample
pred_pi = log_model3_ppc.posterior_predictive.y_est.stack(sample = ("chain", "draw")).mean(dim="sample")
# 转换为数据框
pred_pi = pred_pi.to_dataframe()

In [66]:
#将原数据中的X 和Y存入数据框
pred_pi["attachhome"] = log_model3_ppc.constant_data.attachhome.values
pred_pi["attachphone"] = log_model3_ppc.constant_data.attachphone.values
pred_pi["romantic"] = log_model3_ppc.observed_data.y_est.values

#根据分类标准（50-50）生成最终的分类结果
pred_pi["romantic_pred"] = np.where(pred_pi["y_est"] >= 0.5, 1, 0)
pred_pi

Unnamed: 0_level_0,y_est,attachhome,attachphone,romantic,romantic_pred
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.52365,2.222222,1.000000,1,1
1,0.51410,3.333333,1.777778,1,1
2,0.50150,4.111111,2.555556,1,1
3,0.49995,3.777778,2.555556,1,0
4,0.49105,4.000000,3.000000,0,0
...,...,...,...,...,...
132,0.46630,2.777778,3.666667,0,0
133,0.49550,3.555556,2.666667,1,0
134,0.49465,3.888889,2.666667,1,0
135,0.48630,4.000000,3.000000,0,0


In [67]:
# 使用`pd.crosstab`生成混淆矩阵，前两个值表明你需要统计的列名
# 由于要生成一个2*2的联表，需要指定行的名称和列的名称
confusion_matrix2 = pd.crosstab(pred_pi["romantic"], pred_pi["romantic_pred"], 
                              rownames=['Actual'], colnames=['Predicted'])
confusion_matrix2

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,42,27
1,44,24


In [59]:
# 计算a b c d的数量
true_positive = 0
false_positive = 0
true_negative = 25
false_negative = 111
# 代入公式
accuracy = (true_positive + false_negative) /(true_positive + false_positive + true_negative + false_negative)
sensitivity = (true_positive) /(true_positive + false_negative)
specificity = (true_negative) / (true_negative + false_positive)

print("准确性（新站点）:", accuracy)
print("敏感性（新站点）:", sensitivity)
print("特异性（新站点）:", specificity)

准确性（新站点）: 0.8161764705882353
敏感性（新站点）: 0.0
特异性（新站点）: 1.0


In [68]:
# 计算a b c d的数量
true_positive = 24
false_positive = 27
true_negative = 42
false_negative = 44
# 代入公式
accuracy = (true_positive + false_negative) /(true_positive + false_positive + true_negative + false_negative)
sensitivity = (true_positive) /(true_positive + false_negative)
specificity = (true_negative) / (true_negative + false_positive)

print("准确性（原数据）:", accuracy)
print("敏感性（原数据）:", sensitivity)
print("特异性（原数据）:", specificity)

准确性（原数据）: 0.49635036496350365
敏感性（原数据）: 0.35294117647058826
特异性（原数据）: 0.6086956521739131


* 由上面的数据可得，model3预测新站点数据结果的敏感性极低，这说明在所有实际为正例的样本中，被正确预测为正例的比例极低，因此model3对正例的预测能力不好；同时，model3预测新站点数据结果的准确性和特异性较高，因此model3的整体性能较好。  
* model3预测原数据集的准确性、敏感性、特异性均不高，其中特异性最高，敏感性最低，因此model3对负例的预测能力较好，对正例的预测能力较弱，这与model3预测新站点数据的结果相一致。

### 与model1和model2进行模型比较

In [45]:
df_raw = pd.read_csv('/home/mw/input/bayes20238001/Data_Sum_HPP_Multi_Site_Share.csv')

df = df_raw[df_raw["Site"] == "Oxford"]

df = df[["romantic", "avoidance_r", "sex"]]

df["romantic"] =  np.where(df['romantic'] == 2, 0, 1)
df["sex"] =  np.where(df['sex'] == 1, 0, 1)

df["index"] = range(len(df))
df = df.set_index("index")

with pm.Model() as log_model1:
    # 此处对coords的定义方式进行了更改，因为后续我们需要进行对新数据的预测
    # 因此将维度定义成可更改的
    log_model1.add_coord('obs_id',df.index, mutable=True)
    avoidance = pm.MutableData("avoidance", df.avoidance_r, dims="obs_id")
    y = pm.MutableData('y', df.romantic, dims = 'obs_id')

    #先验
    beta_0 = pm.Normal("beta_0", mu=0, sigma=0.5)          #定义beta_0          
    beta_1 = pm.Normal("beta_1", mu=0, sigma=0.5)          #定义beta_1
    #线性关系
    mu = pm.Deterministic("mu", beta_0 + beta_1 * avoidance, dims="obs_id")
    #注意此处使用了Logistic sigmoid function：pm.math.invlogit
    #相当于进行了如下计算 (1 / (1 + exp(-mu))
    pi = pm.Deterministic("pi", pm.math.invlogit(mu), dims="obs_id")
    #似然
    likelihood = pm.Bernoulli("y_est",p=pi, observed=y,dims="obs_id")

with log_model1:
    # MCMC 近似后验分布
    log_model1_trace = pm.sample(
                                draws=5000,                   # 使用mcmc方法进行采样，draws为采样次数
                                tune=1000,                    # tune为调整采样策略的次数，可以决定这些结果是否要被保留
                                chains=4,                     # 链数
                                discard_tuned_samples= True,  # tune的结果将在采样结束后被丢弃
                                idata_kwargs={"log_likelihood": True},
                                random_seed=84735)


with pm.Model() as log_model2:
    # 此处对coords的定义方式进行了更改，因为后续我们需要进行对新数据的预测
    # 因此将维度定义成可更改的
    log_model2.add_coord('obs_id',df.index, mutable=True)
    sex = pm.MutableData("sex", df.sex, dims="obs_id")
    y = pm.MutableData('y', df.romantic, dims = 'obs_id')

    #先验
    beta_0 = pm.Normal("beta_0", mu=0, sigma=0.5)          #定义beta_0          
    beta_2 = pm.Normal("beta_2", mu=0, sigma=0.5)           #定义beta_1
    #线性关系
    mu = pm.Deterministic("mu", beta_0 + beta_2 * sex, dims="obs_id")
    #注意此处使用了Logistic sigmoid function：pm.math.invlogit
    #相当于进行了如下计算 (1 / (1 + exp(-mu))
    pi = pm.Deterministic("pi", pm.math.invlogit(mu), dims="obs_id")
    #似然
    likelihood = pm.Bernoulli("y_est",p=pi, observed=y,dims="obs_id")

with log_model2:
    # MCMC 近似后验分布
    log_model2_trace = pm.sample(
                                draws=5000,                   # 使用mcmc方法进行采样，draws为采样次数
                                tune=1000,                    # tune为调整采样策略的次数，可以决定这些结果是否要被保留
                                chains=4,                     # 链数
                                discard_tuned_samples= True,
                                idata_kwargs={"log_likelihood": True},  # tune的结果将在采样结束后被丢弃
                                random_seed=84735)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_0, beta_1]


Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 12 seconds.
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_0, beta_2]


Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 14 seconds.


In [46]:
# 通过 pd.read_csv 加载数据 Data_Sum_HPP_Multi_Site_Share.csv
df_raw = pd.read_csv('/home/mw/input/bayes20238001/Data_Sum_HPP_Multi_Site_Share.csv')

# 选取牛津站点的数据
df = df_raw[df_raw["Site"] == "Oxford"]

# 选取需要研究的变量
df = df[["romantic", "attachhome", "attachphone"]]

#重新编码，编码后的数据：1 = "yes"; 2 = "no"
df["romantic"] = np.where(df['romantic'] == 2, 0, 1)

#设置索引
df["index"] = range(len(df))
df = df.set_index("index")

with pm.Model() as log_model3:
    log_model3.add_coord('obs_id',df.index, mutable=True)
    attachhome = pm.MutableData("attachhome", df.attachhome, dims="obs_id")
    attachphone = pm.MutableData("attachphone", df.attachphone, dims="obs_id")
    y = pm.MutableData('y', df.romantic, dims = 'obs_id')

    #先验
    beta_0 = pm.Normal("beta_0", mu=0, sigma=0.5)          #定义beta_0          
    beta_3 = pm.Normal("beta_3", mu=0, sigma=0.5)          #定义beta_3
    beta_4 = pm.Normal("beta_4", mu=0, sigma=0.5)          #定义beta_4
    #线性关系
    mu = pm.Deterministic("mu", beta_0 + beta_3 * attachhome + beta_4 * attachphone, dims="obs_id")
    #注意此处使用了Logistic sigmoid function：pm.math.invlogit
    #相当于进行了如下计算 (1 / (1 + exp(-mu))
    pi = pm.Deterministic("pi", pm.math.invlogit(mu), dims="obs_id")
    #似然
    likelihood = pm.Bernoulli("y_est",p=pi, observed=y,dims="obs_id")

with log_model3:
    # MCMC 近似后验分布
    log_model3_trace = pm.sample(
                                draws=5000,                   # 使用mcmc方法进行采样，draws为采样次数
                                tune=1000,                    # tune为调整采样策略的次数，可以决定这些结果是否要被保留
                                chains=4,                     # 链数
                                discard_tuned_samples= True,  # tune的结果将在采样结束后被丢弃
                                idata_kwargs={"log_likelihood": True},
                                random_seed=84735)  

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_0, beta_3, beta_4]


Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 31 seconds.


In [47]:
comparison_list = {
    "model1(avoidance)":log_model1_trace,
    "model2(sex)":log_model2_trace,
    "model3(attachhome&attachphone)":log_model3_trace,
}
az.compare(comparison_list)

Unnamed: 0,rank,elpd_loo,p_loo,elpd_diff,weight,se,dse,warning,scale
model2(sex),0,-96.495929,1.56853,0.0,0.9360443,0.176821,0.0,False,log
model1(avoidance),1,-96.622943,1.834341,0.127014,0.0639557,0.551328,0.541495,False,log
model3(attachhome&attachphone),2,-96.81462,2.092904,0.318691,1.110223e-16,0.542333,0.564979,False,log


* 由表可得，model1、model2、model3的elpd_loo相差不大，这说明三个模型预测性相差不大，三者之中model2的预测性更好，model3的预测性更差。