# 模型定义       
### 1. 自变量：didf，anxiety_r  
### 2.  因变量：romantic  
### 3.  数据关系：  $$ \begin{array}{lcrl}\text{data:} & \hspace{.01in} & Y_i|\beta_0,\beta_1,\beta_2 & \stackrel{ind}{\sim}\text{Bern}(\pi_i)\;\;\text{ with }\;\;\pi_i=\frac{e^{\beta_0+\beta_1X_{i1}+\beta_2X_{i2}}}{1+e^{\beta_0+\beta_1X_{i1}+\beta_2X_{i2}}} \\ \text{priors:} & & \beta_0 & \sim N\left(0,0.5^2\right) \\ & & \beta_1 & \sim N\left(0,0.5^2\right) \\& & \beta_2 & \sim N\left(0,0.5^2\right) \\{}\end{array} $$

In [62]:
# 导入 pymc 模型包，和 arviz 等分析工具 
import pymc as pm
import arviz as az
import seaborn as sns
import scipy.stats as st
import numpy as np
import matplotlib.pyplot as plt
import xarray as xr
import pandas as pd
import ipywidgets

# 忽略不必要的警告
import warnings
warnings.filterwarnings("ignore")

In [63]:
# 通过 pd.read_csv 加载数据 Data_Sum_HPP_Multi_Site_Share.csv
df_raw = pd.read_csv('/home/mw/input/bayes20238001/Data_Sum_HPP_Multi_Site_Share.csv')


# 选取站点的数据
df = df_raw[df_raw["Site"] == "VCU"]

# 选取本节课涉及的变量
df = df[["romantic", "didf", "anxiety_r"]]

#重新编码，编码后的数据：1 = "yes"; 2 = "no"
df["romantic"] =  np.where(df['romantic'] == 2, 0, 1)

#设置索引
df["index"] = range(len(df))
df = df.set_index("index")

In [64]:
#展示数据
df.head(10)

Unnamed: 0_level_0,romantic,didf,anxiety_r
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,3.363636,2.13234
1,1,2.454545,0.295121
2,0,2.181818,1.064189
3,1,1.272727,-0.730304
4,0,3.090909,-0.473948
5,1,1.363636,-0.5594
6,0,2.818182,-0.901208
7,1,3.727273,-1.713002
8,0,3.909091,0.594203
9,0,1.727273,1.790531


In [65]:
#因变量分布
plt.hist(df["romantic"])
plt.xticks([0,1])
plt.xlabel("romantic")
plt.ylabel("count")
sns.despine()
plt.show()

In [66]:
#绘制散点图
sns.scatterplot(data=df,
                x="anxiety_r",
                y="romantic",
                alpha=0.6)
#设置x轴标题
plt.xlabel("anxiety")
#设置y轴刻度
plt.yticks([0,1],['no','yes'])
sns.despine()

In [68]:
print(f"因变量“恋爱情况”为“是”的概率为{df.romantic.mean():.3f}")

因变量“恋爱情况”为“是”的概率为0.556


In [69]:
print(f"当“焦虑倾向”<0，“恋爱情况”为“是”的概率为{df[df.anxiety_r<0].romantic.mean():.3f}")
print(f"当“焦虑倾向”<0，“恋爱情况”为“是”的概率为{df[df.anxiety_r>0].romantic.mean():.3f}")

当“焦虑倾向”<0，“恋爱情况”为“是”的概率为0.720
当“焦虑倾向”<0，“恋爱情况”为“是”的概率为0.362


In [70]:
print(f"当“焦虑倾向”<-0.5，“是”的概率为{df[df.anxiety_r<-0.5].romantic.mean():.3f}")
print(f"当“焦虑倾向”在[-0.5,0.5]，“是”的概率为{df[(df.anxiety_r >=-0.5) & (df.anxiety_r <=0.5)].romantic.mean():.3f}")
print(f"当“焦虑倾向”>0.5，“是”的概率为{df[df.anxiety_r>0.5].romantic.mean():.3f}")

当“焦虑倾向”<-0.5，“是”的概率为0.765
当“焦虑倾向”在[-0.5,0.5]，“是”的概率为0.585
当“焦虑倾向”>0.5，“是”的概率为0.298


## 定义模型

In [71]:
with pm.Model() as log_model3:

    #先验
    beta_0 = pm.Normal("beta_0", mu=0, sigma=0.5)          #定义beta_0          
    beta_1 = pm.Normal("beta_1", mu=0, sigma=0.5)
    beta_2 = pm.Normal("beta_2", mu=0, sigma=0.5)

    log_model3.add_coord('obs_id',df.index, mutable=True)
    anxiety=pm.MutableData("anxiety", df.anxiety_r, dims="obs_id")
    didf=pm.MutableData("didf", df.didf, dims="obs_id")

    y = pm.MutableData('y', df.romantic, dims = 'obs_id')

    
    #线性关系
    mu = pm.Deterministic("mu", beta_0 + beta_1*anxiety + beta_2*didf, dims="obs_id")
    pi = pm.Deterministic("pi", pm.math.invlogit(mu), dims="obs_id")
    #似然
    likelihood = pm.Bernoulli("y_est",p=pi,observed=y, dims="obs_id")

In [72]:
#可视化模型（贝叶斯变量因果图）
pm.model_to_graphviz(log_model3)

In [74]:
log3_prior = pm.sample_prior_predictive(samples=50,
                                        model=log_model3,
                                        random_seed=84735)

Sampling: [beta_0, beta_1, beta_2, y_est]


In [75]:
log3_prior

In [76]:
for i in range(log3_prior.prior.dims["draw"]):
    sns.lineplot(x = log3_prior.constant_data["anxiety"],
                y = log3_prior.prior["pi"].stack(sample=("chain", "draw"))[:,i], c="grey")
#设置x、y轴标题和总标题
plt.xlabel("anxiety",
            fontsize=12)
plt.ylabel("probability of romantic",
            fontsize=12)  
plt.suptitle("Relationships between anxiety and the probability of romantic",
           fontsize=14)

Text(0.5, 0.98, 'Relationships between anxiety and the probability of romantic')

In [77]:
for i in range(log3_prior.prior.dims["draw"]):
    sns.lineplot(x = log3_prior.constant_data["didf"],
                y = log3_prior.prior["pi"].stack(sample=("chain", "draw"))[:,i], c="grey")
#设置x、y轴标题和总标题
plt.xlabel("didf",
            fontsize=12)
plt.ylabel("probability of romantic",
            fontsize=12)  
plt.suptitle("Relationships between didf and the probability of romantic",
           fontsize=14)

Text(0.5, 0.98, 'Relationships between didf and the probability of romantic')

In [78]:
#MCMC采样 & 模型诊断
with log_model3:
    log_model3_trace = pm.sample(
                                draws=5000,
                                tune=1000,
                                chains=4,
                                discard_tuned_samples = True,
                                random_seed=84735
    )

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_0, beta_1, beta_2]


Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 28 seconds.


In [79]:
az.plot_trace(log_model3_trace,
                var_names=["beta_0","beta_1","beta_2"],
                figsize=(15,8),
                compact= False)
plt.show()

## 后验参数解释  

以下的结果显示：  
- $\beta_0 = 0.028$，那么 $e^{\beta_0} = 1.1$， 表明 X1 为 0，X2为0时，个体恋爱的可能性为 1。  
- $\beta_1 = -0.716$， $e^{\beta_0} = 0.50$， 表明焦虑分数每增加1个单位，个体恋爱的发生比变为之前的0.50倍。  
- $\beta_2 = -0.078$， $e^{\beta_0} = 1.1， 表明didf分数每增加1个单位，个体恋爱的发生比变为之前的0.90倍。  
- $\beta_1$ 的94%HDI不包括0，说明焦虑分数能有效预测恋爱发生的概率；然而$\beta_2$ 的94%HDI包括0，说明didf分数不能有效预测恋爱发生的概率

In [80]:
az.summary(log_model3_trace, var_names=["beta_0","beta_1","beta_2"])

Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
beta_0,0.028,0.391,-0.705,0.757,0.004,0.003,8532.0,9133.0,1.0
beta_1,-0.716,0.176,-1.052,-0.396,0.002,0.001,11046.0,9712.0,1.0
beta_2,0.078,0.145,-0.198,0.347,0.002,0.001,8601.0,9467.0,1.0


In [81]:
#通过 np.exp 将 beta 参数进行转换
az.plot_posterior(log_model3_trace, var_names=["beta_0","beta_1","beta_2"], transform = np.exp)
plt.show()

In [116]:
with log_model3:
    log_model3_ppc = pm.sample_posterior_predictive(log_model3_trace)

Sampling: [y_est]


In [82]:
log_model3_trace

In [83]:
for i in range(100):
    sns.lineplot(x = log_model3_trace.constant_data["anxiety"],
                y = log_model3_trace.posterior["pi"].stack(sample=("chain","draw"))[:,i],
                c="grey",
                alpha = 0.4)
#设置x、y轴标题和总标题
plt.xlabel("anxiety",fontsize=12)
plt.ylabel("probability of romantic",fontsize=12)
plt.suptitle("100 posterior plausible models",fontsize=14)
sns.despine()
plt.show()

In [84]:
for i in range(100):
    sns.lineplot(x = log_model3_trace.constant_data["didf"],
                y = log_model3_trace.posterior["pi"].stack(sample=("chain","draw"))[:,i],
                c="grey",
                alpha = 0.4)
#设置x、y轴标题和总标题
plt.xlabel("didf",fontsize=12)
plt.ylabel("probability of romantic",fontsize=12)
plt.suptitle("100 posterior plausible models",fontsize=14)
sns.despine()
plt.show()

## 画出HDI值

In [85]:
#画出每个自变量对应的恋爱概率94%hdi值
az.plot_hdi(
    df.anxiety_r,
    log_model3_trace.posterior.pi,
    fill_kwargs={"alpha":0.25,"linewidth":0},
    color="C1"
)
#得到每个自变量对应的恋爱概率均值，并使用sns.lineplot连成一条光滑的曲线
post_mean = log_model3_trace.posterior.pi.mean(("chain","draw"))
sns.lineplot(x = df.anxiety_r,
            y = post_mean,
            label="posterior mean",
            color="C1")
#绘制真实数据散点图
sns.scatterplot(x = df.anxiety_r,
                y = df.romantic, label="observed data",
                color='#C00000',
                alpha=0.5)
#设置图例位置
plt.legend(loc="upper right",
            bbox_to_anchor=(1.5,1),
            fontsize=12)
sns.despine()

## 对新数据进行预测&分类  

### 传入新站点Oxford的数据

In [86]:
# 通过 pd.read_csv 加载数据 Data_Sum_HPP_Multi_Site_Share.csv
df1_raw = pd.read_csv('/home/mw/input/bayes20238001/Data_Sum_HPP_Multi_Site_Share.csv')


# 选取站点的数据
df1 = df1_raw[df1_raw["Site"] == "Oxford"]

# 选取本节课涉及的变量
df1 = df1[["romantic", "didf", "anxiety_r"]]

#重新编码，编码后的数据：1 = "yes"; 2 = "no"
df1["romantic"] =  np.where(df1['romantic'] == 2, 0, 1)

#设置索引
df1["index"] = range(len(df1))
df1 = df1.set_index("index")

In [87]:
df1

Unnamed: 0_level_0,romantic,didf,anxiety_r
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,3.818182,-1.975260
1,1,3.000000,-0.323469
2,1,2.000000,-1.172962
3,1,2.545455,0.384442
4,0,2.818182,0.148472
...,...,...,...
132,0,2.545455,0.337248
133,1,2.636364,-0.370663
134,1,2.272727,-1.644902
135,0,2.454545,0.903577


In [88]:
pred_coords = {"obs_id":range(0,137)}

with log_model3:
    #传入数据
    pm.set_data({"anxiety": df1["anxiety_r"],
                "didf": df1["didf"],
                "y": df1["romantic"]},
                coords=pred_coords
                )
    #生成对因变量的预测
    prediction = pm.sample_posterior_predictive(log_model3_trace,
                                                var_names=["y_est"],
                                                predictions=True,
                                                extend_inferencedata=True,
                                                random_seed=84735)

Sampling: [y_est]


In [89]:
prediction

In [90]:
# 提取储存在 predicitons中的预测值
y_pred = prediction.predictions["y_est"].stack(sample=("chain","draw","obs_id")).values
# 统计其中0和1的个数，并除以总数，得到0和1对应的比例值
y_pred_freq = np.bincount(y_pred)/len(y_pred)
#绘制柱状图
bars = plt.bar([0, 1], y_pred_freq, color="#70AD47")
#用于在柱状图上标明比例值
for bar, freq in zip(bars, y_pred_freq):
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), f"{freq:.2f}", ha='center', va='bottom')
#对刻度、标题、坐标轴标题进行设置
plt.xticks([0, 1])
plt.suptitle("Out-of-sample prediction(Oxford)")
plt.xlabel("romantic")
plt.ylabel("proportion")
sns.despine()

## 后验预测评估

In [118]:
coords = {"obs_id": df.index}

with pm.Model(coords=coords) as log_model3:
    anxiety = pm.MutableData("anxiety",df.anxiety_r, dims="obs_id")
    didf = pm.MutableData("didf",df.didf, dims="obs_id")


    #先验
    beta_0 = pm.Normal("beta_0", mu=0, sigma=0.5)
    beta_1 = pm.Normal("beta_1", mu=0, sigma=0.5)
    beta_2 = pm.Normal("beta_2", mu=0, sigma=0.5)
    #线性关系
    mu = pm.Deterministic("mu", beta_0 + beta_1 * anxiety + beta_2 * didf, dims="obs_id")
    pi = pm.Deterministic("pi", pm.math.invlogit(mu), dims="obs_id")
    #似然
    likelihood = pm.Bernoulli("y_est",p=pi, observed= df.romantic, dims="obs_id")

    log_model3_trace = pm.sample(draws=5000,
                                tune=1000,
                                chains=4,
                                discard_tuned_samples=True,
                                random_seed=84735)
    log_model3_ppc = pm.sample_posterior_predictive(log_model3_trace)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_0, beta_1, beta_2]


Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 26 seconds.
Sampling: [y_est]


In [121]:
az.plot_ppc(log_model3_ppc, num_pp_samples=50)

<Axes: xlabel='y_est / y_est'>

In [122]:
log_model3_ppc.posterior_predictive.y_est.stack(sample = ("chain","draw"))

In [123]:
pred_pi = log_model3_ppc.posterior_predictive.y_est.stack(sample = ("chain","draw")).mean(dim="sample")

pred_pi = pred_pi.to_dataframe()

In [125]:

#将原数据中的X 和Y存入数据框
pred_pi["anxiety"] = log_model3_ppc.constant_data.anxiety.values
pred_pi["romantic"] = log_model3_ppc.observed_data.y_est.values

#根据分类标准（50-50）生成最终的分类结果
pred_pi["romantic_pred"] = np.where(pred_pi["y_est"] >= 0.5, 1, 0)
pred_pi

Unnamed: 0_level_0,y_est,anxiety,romantic,romantic_pred
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.23725,2.132340,0,0
1,0.50615,0.295121,1,1
2,0.36730,1.064189,0,0
3,0.64285,-0.730304,1,1
4,0.63285,-0.473948,0,1
...,...,...,...,...
143,0.47100,0.508751,0,0
144,0.43650,0.679655,0,0
145,0.27655,1.833257,0,0
146,0.52580,0.166943,0,1


In [126]:
confusion_matrix = pd.crosstab(pred_pi["romantic"], pred_pi["romantic_pred"],
                                rownames=['Actual'],colnames=['Predicted'])
confusion_matrix

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,35,32
1,19,62


In [127]:
#计算a b c d 的数量
true_positive = 0
false_positive = 0
true_negative = confusion_matrix.at[0,0]
false_negative = confusion_matrix.at[1,0]
#代入公式
accuracy = (true_positive + false_negative) / (true_positive + false_positive + true_negative + false_negative)
sensitivity = (true_positive) /(true_positive + false_negative)
specificity = (true_negative) / (true_negative + false_positive)

print("准确性:", accuracy)
print("敏感性:", sensitivity)
print("特异性:", specificity)

准确性: 0.35185185185185186
敏感性: 0.0
特异性: 1.0


## 模型比较

In [128]:
# 通过 pd.read_csv 加载数据 Data_Sum_HPP_Multi_Site_Share.csv
df_raw = pd.read_csv('/home/mw/input/bayes20238001/Data_Sum_HPP_Multi_Site_Share.csv')


# 选取站点的数据
df = df_raw[df_raw["Site"] == "VCU"]

# 选取本节课涉及的变量
df = df[["romantic", "didf", "anxiety_r","avoidance_r","sex"]]

#重新编码，编码后的数据：1 = "yes"; 2 = "no"
df["romantic"] =  np.where(df['romantic'] == 2, 0, 1)

#设置索引
df["index"] = range(len(df))
df = df.set_index("index")

In [129]:
df=df.dropna()
df

Unnamed: 0_level_0,romantic,didf,anxiety_r,avoidance_r,sex
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,3.363636,2.132340,0.099036,2.0
1,1,2.454545,0.295121,-0.982006,2.0
2,0,2.181818,1.064189,0.038978,2.0
3,1,1.272727,-0.730304,-1.102122,1.0
4,0,3.090909,-0.473948,0.579499,1.0
...,...,...,...,...,...
143,0,3.000000,0.508751,-0.861890,2.0
144,0,2.545455,0.679655,0.759672,2.0
145,0,3.181818,1.833257,-0.441485,2.0
146,0,2.818182,0.166943,1.300193,1.0


In [130]:
with pm.Model() as log_model1:
    # 此处对coords的定义方式进行了更改，因为后续我们需要进行对新数据的预测
    # 因此将维度定义成可更改的
    log_model1.add_coord('obs_id',df.index, mutable=True)
    avoidance = pm.MutableData("avoidance", df.avoidance_r, dims="obs_id")
    y = pm.MutableData('y', df.romantic, dims = 'obs_id')

    #先验
    beta_0 = pm.Normal("beta_0", mu=0, sigma=0.5)          #定义beta_0          
    beta_1 = pm.Normal("beta_1", mu=0, sigma=0.5)           #定义beta_1
    #线性关系
    mu = pm.Deterministic("mu", beta_0 + beta_1 * avoidance, dims="obs_id")
    #注意此处使用了Logistic sigmoid function：pm.math.invlogit
    #相当于进行了如下计算 (1 / (1 + exp(-mu))
    pi = pm.Deterministic("pi", pm.math.invlogit(mu), dims="obs_id")
    #似然
    likelihood = pm.Bernoulli("y_est",p=pi, observed=y,dims="obs_id")



with pm.Model() as log_model2:
    log_model2.add_coord('obs_id',df.index,mutable = True)
    sex= pm.MutableData("sex", df.sex, dims="obs_id")
    y = pm.MutableData('y', df.romantic, dims = 'obs_id')

    #先验
    beta_0 = pm.Normal("beta_0", mu=0, sigma=0.5)          #定义beta_0          
    beta_2 = pm.Normal("beta_2", mu=0, sigma=0.5)           #定义beta_2
   # 线性关系
    mu = pm.Deterministic("mu", beta_0 + beta_2 *sex, dims="obs_id")
   # 注意此处使用了Logistic sigmoid function：pm.math.invlogit
   # 相当于进行了如下计算 (1 / (1 + exp(-mu))
    pi = pm.Deterministic("pi", pm.math.invlogit(mu), dims="obs_id")
   # 似然
    likelihood = pm.Bernoulli("y_est",p=pi, observed=y,dims="obs_id")


with pm.Model() as log_model3:

    #先验
    beta_0 = pm.Normal("beta_0", mu=0, sigma=0.5)          #定义beta_0          
    beta_1 = pm.Normal("beta_1", mu=0, sigma=0.5)
    beta_2 = pm.Normal("beta_2", mu=0, sigma=0.5)

    log_model3.add_coord('obs_id',df.index, mutable=True)
    anxiety=pm.MutableData("anxiety", df.anxiety_r, dims="obs_id")
    didf=pm.MutableData("didf", df.didf, dims="obs_id")

    y = pm.MutableData('y', df.romantic, dims = 'obs_id')

    
    #线性关系
    mu = pm.Deterministic("mu", beta_0 + beta_1*anxiety + beta_2*didf, dims="obs_id")
    pi = pm.Deterministic("pi", pm.math.invlogit(mu), dims="obs_id")
    #似然
    likelihood = pm.Bernoulli("y_est",p=pi,observed=y, dims="obs_id")



with log_model1:
    log_model1_trace = pm.sample(
                                draws=5000,                   # 使用mcmc方法进行采样，draws为采样次数
                                tune=1000,                    # tune为调整采样策略的次数，可以决定这些结果是否要被保留
                                chains=4,                     # 链数
                                discard_tuned_samples= True,  # tune的结果将在采样结束后被丢弃
                                #idata_kwargs={"log_likelihood":True},  
                                random_seed=84735)

with log_model2:
    log_model2_trace = pm.sample(
                                draws=5000,                   # 使用mcmc方法进行采样，draws为采样次数
                                tune=1000,                    # tune为调整采样策略的次数，可以决定这些结果是否要被保留
                                chains=4,                     # 链数
                                discard_tuned_samples= True,  # tune的结果将在采样结束后被丢弃
                                #idata_kwargs={"log_likelihood":True}, 
                                random_seed=84735)

with log_model3:
    log_model3_trace = pm.sample(
                                draws=5000,
                                tune=1000,
                                chains=4,
                                discard_tuned_samples = True,
                                #idata_kwargs={"log_likelihood":True}, 
                                random_seed=84735
    )

with log_model1:
    pm.compute_log_likelihood(log_model1_trace)

with log_model2:
    pm.compute_log_likelihood(log_model2_trace)

with log_model3:
    pm.compute_log_likelihood(log_model3_trace)
    

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_0, beta_1]


Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 12 seconds.
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_0, beta_2]


Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 22 seconds.
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_0, beta_1, beta_2]


Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 25 seconds.


In [131]:
comparison_list = {
    "log_model1(contiunous)":log_model1_trace,
    "log_model2(category)":log_model2_trace,
    "log_model3(contiunous)":log_model3_trace,
}
az.compare(comparison_list)

Unnamed: 0,rank,elpd_loo,p_loo,elpd_diff,weight,se,dse,warning,scale
log_model3(contiunous),0,-95.155484,2.135387,0.0,1.0,3.853104,0.0,False,log
log_model2(category),1,-102.947879,1.236367,7.792395,1.498801e-15,1.249743,3.689529,False,log
log_model1(contiunous),2,-103.606955,1.853957,8.451471,0.0,1.211354,3.790849,False,log


### 模型3最能预测恋爱情况。