# 1.模型定义  


In [59]:
# 数据包以及数据库导入
# 导入 pymc 模型包，和 arviz 等分析工具 
import pymc as pm
import arviz as az
import seaborn as sns
import scipy.stats as st
import numpy as np
import matplotlib.pyplot as plt
import xarray as xr
import pandas as pd
import ipywidgets

# 忽略不必要的警告
import warnings
warnings.filterwarnings("ignore")

In [80]:
#对数据进行初步处理
#通过 pd.read_csv 加载数据 Data_Sum_HPP_Multi_Site_Share.csv
df_raw = pd.read_csv('/home/mw/input/bayes20238001/Data_Sum_HPP_Multi_Site_Share.csv')

# 选取UCSB站点的数据
df = df_raw[df_raw["Site"] =="UCSB"]

# 选取变量romantic、anxiety、avoidance
df = df[["romantic", "anxiety_r", "avoidance_r","sex"]]

#重新编码，编码后的数据：1 = "yes"; 2 = "no"
df["romantic"] =  np.where(df['romantic'] == 2, 0, 1)
#0 表示男性，1表示女性
df["sex"] =  np.where(df['sex'] == 1, 0, 1)

#设置索引
df["index"] = range(len(df))
df = df.set_index("index")

In [87]:
with pm.Model() as log_model1:
    log_model1.add_coord('obs_id',df.index, mutable=True)
    anxiety = pm.MutableData("anxiety", df.anxiety_r, dims="obs_id")
    avoidance = pm.MutableData("avoidance", df.avoidance_r, dims="obs_id")
    y = pm.MutableData('y', df.romantic, dims = 'obs_id')

    #先验
    beta_0 = pm.Normal("beta_0", mu=0, sigma=0.5)          #定义beta_0          
    beta_1 = pm.Normal("beta_1", mu=0, sigma=0.5)           #定义beta_1
    beta_2 = pm.Normal("beta_2", mu=0, sigma=0.5)           #定义beta_2
    #线性关系
    mu = pm.Deterministic("mu", beta_0 + beta_1 * anxiety + beta_2 * avoidance, dims="obs_id")
    pi = pm.Deterministic("pi", pm.math.invlogit(mu), dims="obs_id")
    #似然
    likelihood = pm.Bernoulli("y_est",p=pi, observed=y,dims="obs_id")

In [62]:
# 可视化模型（贝叶斯变量因果图）
pm.model_to_graphviz(log_model1)

## 模型定义  
1. 自变量：anxiety acoidance  

2. 因变量：romantic  

3. 数据关系：  
$$  
\begin{array}{lcrl}  
\text{data:} & \hspace{.01in} & Y_i|\beta_0,\beta_1,\beta_2  & {\sim} \text{Bern}(\pi_i) \;\; \text{ with } \;\; \pi_i = \frac{e^{\beta_0 + \beta_1 X_{i1}+ \beta_2 X{i2}}}{1 + e^{\beta_0 + \beta_1 X_{i1}+ \beta_2 X_{i2}}} \\  
\text{priors:} & & \beta_{0}  &  \sim N(0,0.5)  \\  
               & & \beta_1  & \sim N(0,0.5). \\  
							 & & \beta_2  & \sim N(0,0.5). \\  
\end{array}  
$$  

## 先验预测检验 

In [63]:
#先验预测检验
log1_prior = pm.sample_prior_predictive(samples=50, 
                                          model=log_model1,
                                          random_seed=84735)

Sampling: [beta_0, beta_1, beta_2, y_est]


In [64]:
log1_prior

# 2.MCMC采样，模型诊断图，后验参数解释

##  MCMC采样

In [65]:
#mcmc采样
with log_model1:
    # MCMC 近似后验分布
    log_model1_trace = pm.sample(
                                draws=5000,                   # 使用mcmc方法进行采样，draws为采样次数
                                tune=1000,                    # tune为调整采样策略的次数，可以决定这些结果是否要被保留
                                chains=4,                     # 链数
                                discard_tuned_samples= True,  # tune的结果将在采样结束后被丢弃
                                random_seed=84735)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_0, beta_1, beta_2]


Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 22 seconds.


## 模型诊断图  


In [66]:
#模型诊断图
az.plot_trace(log_model1_trace,
              var_names=["beta_0","beta_1", "beta_2"],
              figsize=(20,15),
              compact=False)
plt.show()

In [67]:
az.summary(log_model1_trace, var_names=["beta_0","beta_1", "beta_2"])

Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
beta_0,-0.105,0.184,-0.443,0.243,0.001,0.001,29842.0,14160.0,1.0
beta_1,-0.398,0.192,-0.769,-0.058,0.001,0.001,30871.0,15786.0,1.0
beta_2,0.12,0.187,-0.225,0.472,0.001,0.001,29594.0,15162.0,1.0


In [68]:
# 通过 np.exp 将 beta 参数进行转换
az.plot_posterior(log_model1_trace, var_names=["beta_0","beta_1", "beta_2"], transform = np.exp)
plt.show()

## 后验参数解释  

结果显示：  
- $\beta_0 = -0.105$，那么 $e^{\beta_0} = 0.90$， 表明 X1 为 0时，个体恋爱的可能性为 0.9。  
- $\beta_1 = -0.398$， $e^{\beta_1} = 0.67$， 表明焦虑分数每增加1个单位，个体恋爱的发生比变为之前的0.67倍。  
- $\beta_2 = 0.120$， $e^{\beta_2} = 3.32$， 表明回避分数每增加1个单位，个体恋爱的发生比变为之前的3.32倍。  
- 然而，$\beta_0$,$\beta_2$ 的94%HDI包括0，说明回避分数不能有效预测恋爱发生的概率。 

In [69]:
#后验回归模型
with log_model1:
    log_model1_ppc = pm.sample_posterior_predictive(log_model1_trace, random_seed=84735)

Sampling: [y_est]


In [70]:
az.summary(log_model1_trace, kind = "stats")

Unnamed: 0,mean,sd,hdi_3%,hdi_97%
beta_0,-0.105,0.184,-0.443,0.243
beta_1,-0.398,0.192,-0.769,-0.058
beta_2,0.120,0.187,-0.225,0.472
mu[0],0.480,0.429,-0.347,1.281
mu[1],-0.150,0.372,-0.815,0.584
...,...,...,...,...
pi[103],0.355,0.076,0.216,0.498
pi[104],0.435,0.093,0.264,0.610
pi[105],0.390,0.100,0.205,0.576
pi[106],0.438,0.081,0.284,0.588


In [71]:
#画出每个自变量对应的恋爱概率94%hdi值
fig, axes = plt.subplots(figsize=(18, 8))

az.plot_hdi(
    df.anxiety_r,
    log_model1_trace.posterior.pi,
    hdi_prob=0.95,
    fill_kwargs={"alpha": 0.25, "linewidth": 0},
    color="C2"
)
#得到每个自变量对应的恋爱概率均值，并使用sns.lineplot连成一条光滑的曲线
post_mean = log_model1_trace.posterior.pi.mean(("chain", "draw"))
sns.lineplot(x = df.anxiety_r, 
             y= post_mean, 
             label="posterior mean", 
             color="C2"
             )
#绘制真实数据散点图
sns.scatterplot(x = df.avoidance_r, 
                y= df.romantic,label="observed data", 
                color='#C00000', 
                alpha=0.5)
#设置图例位置
plt.legend(loc="upper right",
           bbox_to_anchor=(1.5, 1),
           fontsize=12)
sns.despine()

#图2
az.plot_hdi(
    df.avoidance_r,
    log_model1_trace.posterior.pi,
    hdi_prob=0.95,
    fill_kwargs={"alpha": 0.25, "linewidth": 0},
    color="C1"
)
#得到每个自变量对应的恋爱概率均值，并使用sns.lineplot连成一条光滑的曲线
post_mean = log_model1_trace.posterior.pi.mean(("chain", "draw"))
sns.lineplot(x = df.avoidance_r, 
             y= post_mean, 
             label="posterior mean", 
             color="C1"
             )
#绘制真实数据散点图
sns.scatterplot(x = df.avoidance_r, 
                y= df.romantic,label="observed data", 
                color='#C00000', 
                alpha=0.5
                )
#设置图例位置
plt.legend(loc="upper right",
           bbox_to_anchor=(1.5, 1),
           fontsize=12)
sns.despine()

# 3. 使用定义好的模型，对新站点的结果进行预测并评估  

## 传入新站点数据

In [72]:
df_raw = pd.read_csv('/home/mw/input/bayes20238001/Data_Sum_HPP_Multi_Site_Share.csv')

# 选取Serbie站点的数据
df_new = df_raw[df_raw["Site"] == "Serbia"]

# 选取变量romantic、anxiety、avoidance
df_new = df_new[["romantic", "anxiety_r", "avoidance_r","sex"]]

#重新编码，编码后的数据：1 = "yes"; 2 = "no"
df_new["romantic"] =  np.where(df_new['romantic'] == 2, 0, 1)
#0 表示男性，1表示女性
df_new["sex"] =  np.where(df_new['sex'] == 1, 0, 1)

#设置索引
df_new["index"] = range(len(df_new))
df_new= df_new.set_index("index")
df_new

Unnamed: 0_level_0,romantic,anxiety_r,avoidance_r,sex
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1,-1.453003,-1.011965,1
1,1,-0.984477,1.728803,1
2,0,1.358155,-0.195566,1
3,0,0.186839,-1.011965,1
4,1,-1.780971,-1.070279,1
...,...,...,...,...
159,1,0.608513,-0.078938,0
160,0,-0.609656,-1.186907,1
161,1,-0.187982,0.212633,0
162,0,0.467955,-1.070279,1


In [92]:
pred_coords = {"obs_id": df_new.index}
with log_model1:
    # 传入数据
    pm.set_data({"anxiety": df_new["anxiety_r"],
                 "avoidance":df_new["avoidance_r"],
                 "y":df_new["romantic"]},
                coords=pred_coords
                ) 
    # 生成对因变量的预测
    prediction = pm.sample_posterior_predictive(log_model1_trace,
                                                var_names=["y_est"],
                                                predictions=True,
                                                extend_inferencedata=True,
                                                random_seed=84735)


Sampling: [y_est]


In [93]:
prediction

## 组合预测数据与真实数据  


In [102]:
log_model1_ppc.posterior_predictive.y_est.stack(sample = ("chain", "draw"))

In [103]:
#stack(sample = ("chain", "draw")：将每一个X对应的4*5000个后验预测值合并到一个维度sample
#对于每一个X，需要计算其20000个值的平均值，因此将dim设置为sample
pred_pi = log_model1_ppc.posterior_predictive.y_est.stack(sample = ("chain", "draw")).mean(dim="sample")
# 转换为数据框
pred_pi = pred_pi.to_dataframe()

In [104]:
#将原数据中的X 和Y存入数据框
pred_pi["avoidance"] = log_model1_ppc.constant_data.avoidance.values
pred_pi["romantic"] = log_model1_ppc.observed_data.y_est.values

#根据分类标准（50-50）生成最终的分类结果
pred_pi["romantic_pred"] = np.where(pred_pi["y_est"] >= 0.5, 1, 0)
pred_pi

Unnamed: 0_level_0,y_est,avoidance,romantic,romantic_pred
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.63585,-1.011965,1,1
1,0.74880,1.728803,1,1
2,0.24770,-0.195566,0,0
3,0.36795,-1.011965,0,0
4,0.69175,-1.070279,1,1
...,...,...,...,...
159,0.35905,-0.078938,1,0
160,0.49110,-1.186907,0,0
161,0.51990,0.212633,1,1
162,0.31905,-1.070279,0,0


## 评估

In [105]:
confusion_matrix = pd.crosstab(pred_pi["romantic"], pred_pi["romantic_pred"], 
                              rownames=['Actual'], colnames=['Predicted'])
confusion_matrix

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,59,27
1,30,48


In [114]:
# 计算a b c d的数量
true_positive = 0
false_positive = 0
true_negative = confusion_matrix.at[0, 0]
false_negative = confusion_matrix.at[1, 0]
# 代入公式
accuracy_n = (true_positive + false_negative) /(true_positive + false_positive + true_negative + false_negative)
sensitivity_n = (true_positive) /(true_positive + false_negative)
specificity_n = (true_negative) / (true_negative + false_positive)

print("准确性:", accuracy_n)
print("敏感性:", sensitivity_n)
print("特异性:", specificity_n)

准确性: 0.40625
敏感性: 0.0
特异性: 1.0


# 4. 对本数据集的预测结果进行评估 

In [107]:
coords = {"obs_id": df.index}

with pm.Model(coords=coords) as log_model1:
    log_model1.add_coord('obs_id',df.index, mutable=True)
    anxiety = pm.MutableData("anxiety", df.anxiety_r, dims="obs_id")
    avoidance = pm.MutableData("avoidance", df.avoidance_r, dims="obs_id")
    y = pm.MutableData('y', df.romantic, dims = 'obs_id')

    #先验
    beta_0 = pm.Normal("beta_0", mu=0, sigma=0.5)          #定义beta_0          
    beta_1 = pm.Normal("beta_1", mu=0, sigma=0.5)           #定义beta_1
    beta_2 = pm.Normal("beta_2", mu=0, sigma=0.5)           #定义beta_2
    #线性关系
    mu = pm.Deterministic("mu", beta_0 + beta_1 * anxiety + beta_2 * avoidance, dims="obs_id")
    pi = pm.Deterministic("pi", pm.math.invlogit(mu), dims="obs_id")
    #似然
    likelihood = pm.Bernoulli("y_est",p=pi, observed=y,dims="obs_id")


    log_model1_trace = pm.sample(draws=5000,                 
                                tune=1000,                  
                                chains=4,                     
                                discard_tuned_samples= True, 
                                random_seed=84735)
    log_model1_ppc = pm.sample_posterior_predictive(log_model1_trace)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_0, beta_1, beta_2]


Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 22 seconds.
Sampling: [y_est]


In [108]:
az.plot_ppc(log_model1_ppc, num_pp_samples=50)

<Axes: xlabel='y_est / y_est'>

In [109]:
log_model1_ppc.posterior_predictive.y_est.stack(sample = ("chain", "draw"))

In [110]:
#stack(sample = ("chain", "draw")：将每一个X对应的4*5000个后验预测值合并到一个维度sample
#对于每一个X，需要计算其20000个值的平均值，因此将dim设置为sample
pred_pi = log_model1_ppc.posterior_predictive.y_est.stack(sample = ("chain", "draw")).mean(dim="sample")
# 转换为数据框
pred_pi = pred_pi.to_dataframe()

In [111]:
#将原数据中的X 和Y存入数据框
pred_pi["avoidance"] = log_model1_ppc.constant_data.avoidance.values
pred_pi["romantic"] = log_model1_ppc.observed_data.y_est.values

#根据分类标准（50-50）生成最终的分类结果
pred_pi["romantic_pred"] = np.where(pred_pi["y_est"] >= 0.5, 1, 0)
pred_pi

Unnamed: 0_level_0,y_est,avoidance,romantic,romantic_pred
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.60740,-1.083724,0,1
1,0.46435,-1.698926,1,0
2,0.63250,0.813149,0,1
3,0.46155,-0.212188,0,0
4,0.36405,-1.134991,1,0
...,...,...,...,...
103,0.34900,-1.083724,1,0
104,0.43745,1.633418,1,0
105,0.39100,1.633418,1,0
106,0.43175,1.325817,1,0


In [112]:
# 使用`pd.crosstab`生成混淆矩阵，前两个值表明你需要统计的列名
# 由于要生成一个2*2的联表，需要指定行的名称和列的名称
confusion_matrix = pd.crosstab(pred_pi["romantic"], pred_pi["romantic_pred"], 
                              rownames=['Actual'], colnames=['Predicted'])
confusion_matrix

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,38,19
1,26,25


In [115]:
# 计算a b c d的数量
true_positive = 0
false_positive = 0
true_negative = confusion_matrix.at[0, 0]
false_negative = confusion_matrix.at[1, 0]
# 代入公式
accuracy = (true_positive + false_negative) /(true_positive + false_positive + true_negative + false_negative)
sensitivity = (true_positive) /(true_positive + false_negative)
specificity = (true_negative) / (true_negative + false_positive)

print("准确性:", accuracy)
print("敏感性:", sensitivity)
print("特异性:", specificity)

准确性: 0.40625
敏感性: 0.0
特异性: 1.0


# 5. 与课上的模型进行模型比较

(log_model2:自变量为回避倾向; log_model3:自变量为性别)

In [118]:
df_raw = pd.read_csv(
    '/home/mw/input/bayes20238001/Data_Sum_HPP_Multi_Site_Share.csv')
df = df_raw[df_raw["Site"] == "UCSB"]
df = df[["romantic", "anxiety_r", "avoidance_r", "sex"]]
df["romantic"] = np.where(df['romantic'] == 2, 0, 1)
df["sex"] = np.where(df['sex'] == 1, 0, 1)
df["index"] = range(len(df))
df = df.set_index("index")

In [132]:

with pm.Model() as log_model2:
    # 此处对coords的定义方式进行了更改，因为后续我们需要进行对新数据的预测
    # 因此将维度定义成可更改的
    log_model2.add_coord('obs_id',df.index, mutable=True)
    avoidance = pm.MutableData("avoidance", df.avoidance_r, dims="obs_id")
    y = pm.MutableData('y', df.romantic, dims = 'obs_id')

    #先验
    beta_0 = pm.Normal("beta_0", mu=0, sigma=0.5)          #定义beta_0          
    beta_1 = pm.Normal("beta_1", mu=0, sigma=0.5)           #定义beta_1
    #线性关系
    mu = pm.Deterministic("mu", beta_0 + beta_1 * avoidance, dims="obs_id")
    #注意此处使用了Logistic sigmoid function：pm.math.invlogit
    #相当于进行了如下计算 (1 / (1 + exp(-mu))
    pi = pm.Deterministic("pi", pm.math.invlogit(mu), dims="obs_id")
    #似然
    likelihood = pm.Bernoulli("y_est",p=pi, observed=y,dims="obs_id")
with log_model2:
    # MCMC 近似后验分布
    log_model3_trace = pm.sample(
                                draws=5000,                   # 使用mcmc方法进行采样，draws为采样次数
                                tune=1000,                    # tune为调整采样策略的次数，可以决定这些结果是否要被保留
                                chains=4,                     # 链数
                                discard_tuned_samples= True,
                                idata_kwargs={"log_likelihood": True},  # tune的结果将在采样结束后被丢弃
                                random_seed=84735)


coords = {"obs_id": df.index}

with pm.Model() as log_model3:
    log_model3.add_coord('obs_id',df.index, mutable=True)
    sex= pm.MutableData("sex", df.sex, dims="obs_id")
    y = pm.MutableData('y', df.romantic, dims = 'obs_id')

    #先验
    beta_0 = pm.Normal("beta_0", mu=0, sigma=0.5)          #定义beta_0          
    beta_2 = pm.Normal("beta_2", mu=0, sigma=0.5)           #定义beta_2
    #线性关系
    mu = pm.Deterministic("mu", beta_0 + beta_2 *sex, dims="obs_id")
    #注意此处使用了Logistic sigmoid function：pm.math.invlogit
    #相当于进行了如下计算 (1 / (1 + exp(-mu))
    pi = pm.Deterministic("pi", pm.math.invlogit(mu), dims="obs_id")
    #似然
    likelihood = pm.Bernoulli("y_est",p=pi, observed=y,dims="obs_id")

with log_model3:
    # MCMC 近似后验分布
    log_model4_trace = pm.sample(
                                draws=5000,                   # 使用mcmc方法进行采样，draws为采样次数
                                tune=1000,                    # tune为调整采样策略的次数，可以决定这些结果是否要被保留
                                chains=4,                     # 链数
                                discard_tuned_samples= True, 
                                idata_kwargs={"log_likelihood": True}, # tune的结果将在采样结束后被丢弃
                                random_seed=84735)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_0, beta_1]


Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 20 seconds.
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_0, beta_2]


Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 24 seconds.


In [133]:
log_likelihood = pm.compute_log_likelihood(log_model1_trace,model=log_model1)
log_likelihood = pm.compute_log_likelihood(log_model2_trace,model=log_model2)
log_likelihood = pm.compute_log_likelihood(log_model3_trace,model=log_model3)

ValueError: ['log_likelihood'] group(s) already exists.

In [130]:
comparison_list = {
    "model1(anxiety、avoidance)":log_model1_trace,
    "model2(avoidance)":log_model2_trace,
    "model3(sex)":log_model3_trace,
}
az.compare(comparison_list)

Unnamed: 0,rank,elpd_loo,p_loo,elpd_diff,weight,se,dse,warning,scale
model1(anxiety、avoidance),0,-74.663065,2.711062,0.0,0.896731,2.162426,0.0,False,log
model3(sex),1,-76.30349,1.806491,1.640425,0.103269,0.767736,2.010355,False,log
model2(avoidance),2,-80.093415,2.160355,5.43035,0.0,2.29575,3.369024,False,log


## 模型比较  

模型一的elpd_loo更高，预测性最好  
模型二和模型三预测较差