## 02210610叶子芸第二次作业  


# 模型定义

### 1.自变量：依恋回避、依恋焦虑  

### 2.因变量：是否恋爱  

### 3.数据关系：  
$$  
\begin{array}{lcrl}  
\text{data:} & \hspace{.01in} & Y_i|\beta_0,\beta_1,\beta_2 & \stackrel{ind}{\sim} \text{Bern}(\pi_i) \;\; \text{ with } \;\; \pi_i = \frac{e^{\beta_0 + \beta_1 X_{i1}+\beta_2 X_{i2}}}{1 + e^{\beta_0 + \beta_1 X_{i1}+ \beta_2 X_{i2}}} \\  
\text{priors:} & & \beta_{0}  &  \sim N\left(0, 0.5^2 \right)  \\  
               & & \beta_1  & \sim N\left(0, 0.5^2 \right)\\  
							 & & \beta_2  & \sim N\left(0, 0.5^2 \right)\\  
\end{array}  
$$

In [2]:
import pymc as pm
import arviz as az
import seaborn as sns
import scipy.stats as st
import numpy as np
import matplotlib.pyplot as plt
import xarray as xr
import pandas as pd
import ipywidgets
import warnings
warnings.filterwarnings("ignore")

In [6]:
df_raw = pd.read_csv('/home/mw/input/bayes20238001/Data_Sum_HPP_Multi_Site_Share.csv')
#进入Oxford站点
df = df_raw[df_raw["Site"] == "Oxford"]
# 选取变量：是否恋爱、依恋回避、依恋焦虑
df = df[["romantic", "avoidance_r", "anxiety_r"]]
df["romantic"] =  np.where(df['romantic'] == 2, 0, 1)
df["index"] = range(len(df))
df = df.set_index("index")
print(df['romantic'].isnull().sum(axis=0))

0


In [7]:
# 展示数据
df

Unnamed: 0_level_0,romantic,avoidance_r,anxiety_r
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,0.781027,-1.975260
1,1,1.681138,-0.323469
2,1,0.893541,-1.172962
3,1,0.218458,0.384442
4,0,-1.244223,0.148472
...,...,...,...
132,0,-0.737910,0.337248
133,1,-0.681653,-0.370663
134,1,0.612256,-1.644902
135,0,-0.512882,0.903577


In [8]:
with pm.Model() as log_model1:
    log_model1.add_coord('obs_id',df.index, mutable=True)
    avoidance = pm.MutableData("avoidance", df.avoidance_r, dims="obs_id")
    anxiety= pm.MutableData("anxiety", df.anxiety_r, dims="obs_id")
    y = pm.MutableData('y', df.romantic, dims = 'obs_id')

    beta_0 = pm.Normal("beta_0", mu=0, sigma=0.5)                    
    beta_1 = pm.Normal("beta_1", mu=0, sigma=0.5)
    beta_2 = pm.Normal("beta_2", mu=0, sigma=0.5)

    mu = pm.Deterministic("mu", beta_0 + beta_1 * avoidance+beta_2*anxiety, dims="obs_id")
    pi = pm.Deterministic("pi", pm.math.invlogit(mu), dims="obs_id")
    likelihood = pm.Bernoulli("y_est",p=pi, observed=y,dims="obs_id")

In [9]:
pm.model_to_graphviz(log_model1)

In [10]:
log1_prior = pm.sample_prior_predictive(samples=50, 
                                          model=log_model1,
                                          random_seed=84735)

Sampling: [beta_0, beta_1, beta_2, y_est]


In [11]:
log1_prior

## MCMC采样 & 模型诊断 & 后验参数解释

In [12]:
with log_model1:
    log_model1_trace = pm.sample(
                                draws=5000,                   
                                tune=1000,                   
                                chains=4,                    
                                discard_tuned_samples= True,  
                                random_seed=84735)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_0, beta_1, beta_2]


Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 24 seconds.


### 模型诊断图

In [13]:
az.plot_trace(log_model1_trace,
              var_names=["beta_0","beta_1","beta_2"],
              figsize=(15,8),
              compact=False)
plt.show()

### 后验参数解释  

以下的结果显示：  
- $\beta_0 = -0.02$，那么 $e^{\beta_0} = 1$， 表明 X1 为 0时，个体恋爱的可能性为 1。  
- $\beta_1 = 0.07$， $e^{\beta_0} = 1.1$， 表明回避分数每增加1个单位，个体恋爱的发生比变为之前的1.1倍。  
- $\beta_2 = -0.70$， $e^{\beta_0} = 0.51$， 表明焦虑分数每增加1个单位，个体恋爱的发生比变为之前的0.51倍。  
- 然而，$\beta_1$ 的94%HDI包括0，说明回避分数不能有效预测恋爱发生的概率；$\beta_2$ 的94%HDI不包括0，说明焦虑分数能有效预测恋爱发生的概率。 

In [14]:
az.summary(log_model1_trace, var_names=["beta_0","beta_1","beta_2"])

Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
beta_0,-0.018,0.172,-0.331,0.314,0.001,0.001,30658.0,15805.0,1.0
beta_1,0.068,0.173,-0.251,0.402,0.001,0.001,28593.0,16374.0,1.0
beta_2,-0.691,0.187,-1.04,-0.336,0.001,0.001,29646.0,15674.0,1.0


In [15]:
az.plot_posterior(log_model1_trace, var_names=["beta_0","beta_1","beta_2"], transform = np.exp)
plt.show()

In [16]:
with log_model1:
    log_model1_ppc = pm.sample_posterior_predictive(log_model1_trace, random_seed=84735)

Sampling: [y_est]


In [17]:
log_model1_trace

### 后验回归模型图

In [18]:
az.plot_hdi(
    df.avoidance_r,
    log_model1_trace.posterior.pi,
    hdi_prob=0.95,
    fill_kwargs={"alpha": 0.25, "linewidth": 0},
    color="C1"
)
post_mean = log_model1_trace.posterior.pi.mean(("chain", "draw"))
sns.lineplot(x = df.avoidance_r, 
             y= post_mean, 
             label="posterior mean", 
             color="C1")
sns.scatterplot(x = df.avoidance_r, 
                y= df.romantic,label="observed data", 
                color='#C00000', 
                alpha=0.5)
plt.legend(loc="upper right",
           bbox_to_anchor=(1.5, 1),
           fontsize=12)
sns.despine()

az.plot_hdi(
    df.anxiety_r,
    log_model1_trace.posterior.pi,
    hdi_prob=0.95,
    fill_kwargs={"alpha": 0.25, "linewidth": 0},
    color="C2"
)
post_mean = log_model1_trace.posterior.pi.mean(("chain", "draw"))
sns.lineplot(x = df.anxiety_r, 
             y= post_mean, 
             label="posterior mean", 
             color="C2")
sns.scatterplot(x = df.anxiety_r, 
                y= df.romantic,label="observed data", 
                color='#C00000', 
                alpha=0.5)
plt.legend(loc="upper right",
           bbox_to_anchor=(1.5, 1),
           fontsize=12)
sns.despine()

# 对本数据集的预测结果进行评估

In [19]:
log_model1_ppc.posterior_predictive.y_est.stack(sample = ("chain", "draw"))

In [20]:
az.plot_ppc(log_model1_ppc, num_pp_samples=50)

<Axes: xlabel='y_est / y_est'>

In [21]:
pred_pi = log_model1_ppc.posterior_predictive.y_est.stack(sample = ("chain", "draw")).mean(dim="sample")
pred_pi = pred_pi.to_dataframe()

In [22]:
pred_pi["avoidance"] = log_model1_ppc.constant_data.avoidance.values
pred_pi["anxiety"] = log_model1_ppc.constant_data.anxiety.values
pred_pi["romantic"] = log_model1_ppc.observed_data.y_est.values
pred_pi["romantic_pred"] = np.where(pred_pi["y_est"] >= 0.5, 1, 0)
pred_pi

Unnamed: 0_level_0,y_est,avoidance,anxiety,romantic,romantic_pred
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.79600,0.781027,-1.975260,1,1
1,0.57535,1.681138,-0.323469,1,1
2,0.69980,0.893541,-1.172962,1,1
3,0.43190,0.218458,0.384442,1,0
4,0.45155,-1.244223,0.148472,0,0
...,...,...,...,...,...
132,0.42705,-0.737910,0.337248,0,0
133,0.53990,-0.681653,-0.370663,1,1
134,0.76170,0.612256,-1.644902,1,1
135,0.33960,-0.512882,0.903577,0,0


In [23]:
confusion_matrix = pd.crosstab(pred_pi["romantic"], pred_pi["romantic_pred"], 
                              rownames=['Actual'], colnames=['Predicted'])
confusion_matrix

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,45,24
1,26,42


In [24]:
true_positive =  confusion_matrix.at[0, 1]
false_positive = confusion_matrix.at[1, 1]
true_negative = confusion_matrix.at[0, 0]
false_negative = confusion_matrix.at[1, 0]

accuracy = (true_positive + false_negative) /(true_positive + false_positive + true_negative + false_negative)
sensitivity = (true_positive) /(true_positive + false_negative)
specificity = (true_negative) / (true_negative + false_positive)

print("准确性:", accuracy)
print("敏感性:", sensitivity)
print("特异性:", specificity)

准确性: 0.36496350364963503
敏感性: 0.48
特异性: 0.5172413793103449


模型对于原数据的预测结果**准确性**较低，说明对于模型的整体性不高，依恋焦虑和回避对于预测被试是否会恋爱的准确率仅为0.36；**敏感性**为0.48，说明对于依恋焦虑和回避得分低预测被试进行恋爱的可能性为0.48；**特异性**为0.51，说明依恋焦虑和依恋回避得分高预测被试不进行恋爱的可能性为0.51。

# **对新数据进行预测、分类、评估**

In [25]:
df_raw = pd.read_csv('/home/mw/input/bayes20238001/Data_Sum_HPP_Multi_Site_Share.csv')
df_new = df_raw[df_raw["Site"] == "Poland"]
df_new=df_new[["romantic","anxiety_r","avoidance_r"]]
df_new["romantic"]=np.where(df_new['romantic']==2,0,1)
df_new["index"]=range(len(df_new))
df_new=df_new.set_index("index")

In [26]:
pred_coords ={"obs_id":range(0,136)} 
with log_model1:
    pm.set_data({"anxiety":df_new["anxiety_r"],
                "avoidance": df_new["avoidance_r"],
                "y":df_new["romantic"] },
                coords=pred_coords
                )
    prediction = pm.sample_posterior_predictive(log_model1_trace, 
                                                var_names=["y_est"],
                                                predictions=True,
                                                random_seed=84735)
print(df_new)
print(df_new['romantic'].isnull().sum(axis=0))

Sampling: [y_est]


       romantic  anxiety_r  avoidance_r
index                                  
0             1   1.472294    -1.635748
1             1   1.008316     0.912629
2             1   0.698998     0.507205
3             1  -0.538274     0.623041
4             1  -0.847592    -1.577830
...         ...        ...          ...
131           1  -2.806607     0.333452
132           1  -2.806607     0.738876
133           1   0.080362     0.275535
134           1  -2.806607     0.043864
135           1  -2.806607     0.796794

[136 rows x 3 columns]
0


In [27]:
prediction

In [28]:
y_pred = prediction.predictions["y_est"].stack(sample=("chain","draw","obs_id")).values
y_pred_freq = np.bincount(y_pred)/len(y_pred)
bars = plt.bar([0, 1], y_pred_freq, color="#70AD47")
for bar, freq in zip(bars, y_pred_freq):
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), f"{freq:.2f}", ha='center', va='bottom')
plt.xticks([0, 1])
plt.suptitle("Out-of-sample prediction(X=1)")
plt.xlabel("romantic")
plt.ylabel("proportion")
sns.despine()
print(y_pred)

[1 0 0 ... 1 1 1]


In [29]:
coords = {"obs_id": df_new.index}
with pm.Model(coords=coords) as log_model1:
    avoidance = pm.MutableData("avoidance", df_new.avoidance_r, dims="obs_id")
    anxiety=pm.MutableData("anxiety", df_new.anxiety_r, dims="obs_id")

    beta_0 = pm.Normal("beta_0", mu=0, sigma=0.5)          #定义beta_0          
    beta_1 = pm.Normal("beta_1", mu=0, sigma=0.5)           #定义beta_1
    beta_2 = pm.Normal("beta_2", mu=0, sigma=0.5) 
    #线性关系
    mu = pm.Deterministic("mu", beta_0 + beta_1 * avoidance+beta_2*anxiety, dims="obs_id")
    pi = pm.Deterministic("pi", pm.math.invlogit(mu), dims="obs_id")
    #似然
    likelihood = pm.Bernoulli("y_est",p=pi, observed=df_new.romantic,dims="obs_id")

    log_model1_trace = pm.sample(draws=5000,                 
                                tune=1000,                  
                                chains=4,                     
                                discard_tuned_samples= True, 
                                random_seed=84735)
    log_model1_ppc = pm.sample_posterior_predictive(log_model1_trace)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_0, beta_1, beta_2]


Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 25 seconds.
Sampling: [y_est]


In [30]:
y_pred = log_model1_ppc.posterior_predictive.y_est.stack(sample = ("chain", "draw")).mean(dim="sample")
y_pred_df = pd.DataFrame(y_pred, columns=["y_pred"])
log_model1_df = pd.DataFrame({"avoidance": log_model1_ppc.constant_data.avoidance, "anxiety": log_model1_ppc.constant_data.anxiety,"romantic": log_model1_ppc.observed_data.y_est})
y_pred_df = pd.concat([y_pred_df, log_model1_df], axis=1)
y_pred_df["romantic_2"] = np.where(np.array(y_pred) >= 0.5, 1, 0)
print(y_pred_df)

confusion_matrix = pd.crosstab(y_pred_df["romantic"], y_pred_df["romantic_2"], 
                              rownames=['Actual'], colnames=['Predicted'])
confusion_matrix

      y_pred  avoidance   anxiety  romantic  romantic_2
0    0.71635  -1.635748  1.472294         1           1
1    0.64450   0.912629  1.008316         1           1
2    0.69550   0.507205  0.698998         1           1
3    0.81645   0.623041 -0.538274         1           1
4    0.89570  -1.577830 -0.847592         1           1
..       ...        ...       ...       ...         ...
131  0.93150   0.333452 -2.806607         1           1
132  0.92695   0.738876 -2.806607         1           1
133  0.77350   0.275535  0.080362         1           1
134  0.93860   0.043864 -2.806607         1           1
135  0.92325   0.796794 -2.806607         1           1

[136 rows x 5 columns]


Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,25
1,1,110


In [31]:
true_positive = confusion_matrix.at[0, 1]
false_positive = confusion_matrix.at[1, 1]
true_negative = confusion_matrix.at[0, 0]
false_negative = confusion_matrix.at[1, 0]
accuracy = (true_positive + false_negative) /(true_positive + false_positive + true_negative + false_negative)
sensitivity = (true_positive) /(true_positive + false_negative)
specificity = (true_negative) / (true_negative + false_positive)

print("准确性:", accuracy)
print("敏感性:", sensitivity)
print("特异性:", specificity)

准确性: 0.19117647058823528
敏感性: 0.9615384615384616
特异性: 0.0


18模型对于原数据的预测结果**准确性**较低，说明对于模型的整体性适中，依恋焦虑和回避对于预测被试是否会恋爱的准确率为0.19；**敏感性**为0.96，说明对于依恋焦虑和回避得分低预测被试进行恋爱的可能性为0.96；但**特异性**为0，说明依恋焦虑和依恋回避得分高不能预测被试不进行恋爱。

## 模型比较

In [32]:
df_raw = pd.read_csv('/home/mw/input/bayes20238001/Data_Sum_HPP_Multi_Site_Share.csv')
#进入Oxford站点
df = df_raw[df_raw["Site"] == "Oxford"]
df = df[["romantic", "avoidance_r", "sex","anxiety_r"]]
df["romantic"] =  np.where(df['romantic'] == 2, 0, 1)
df["index"] = range(len(df))
df = df.set_index("index")
print(df)

       romantic  avoidance_r  sex  anxiety_r
index                                       
0             1     0.781027  1.0  -1.975260
1             1     1.681138  1.0  -0.323469
2             1     0.893541  2.0  -1.172962
3             1     0.218458  1.0   0.384442
4             0    -1.244223  2.0   0.148472
...         ...          ...  ...        ...
132           0    -0.737910  2.0   0.337248
133           1    -0.681653  1.0  -0.370663
134           1     0.612256  2.0  -1.644902
135           0    -0.512882  2.0   0.903577
136           0    -0.231598  2.0   1.894651

[137 rows x 4 columns]


In [33]:
with pm.Model() as log_model2:
    log_model2.add_coord('obs_id',df.index, mutable=True)
    avoidance = pm.MutableData("avoidance", df.avoidance_r, dims="obs_id")
    anxiety= pm.MutableData("anxiety", df.anxiety_r, dims="obs_id")
    y = pm.MutableData('y', df.romantic, dims = 'obs_id')

    beta_0 = pm.Normal("beta_0", mu=0, sigma=0.5)                    
    beta_1 = pm.Normal("beta_1", mu=0, sigma=0.5)
    beta_2 = pm.Normal("beta_2", mu=0, sigma=0.5)

    mu = pm.Deterministic("mu", beta_0 + beta_1 * avoidance+beta_2*anxiety, dims="obs_id")
    pi = pm.Deterministic("pi", pm.math.invlogit(mu), dims="obs_id")
    likelihood = pm.Bernoulli("y_est",p=pi, observed=y,dims="obs_id")
    
with pm.Model() as log_model3:
    log_model3.add_coord('obs_id',df.index, mutable=True)
    avoidance = pm.MutableData("avoidance", df.avoidance_r, dims="obs_id")
    y = pm.MutableData('y', df.romantic, dims = 'obs_id')

    beta_0 = pm.Normal("beta_0", mu=0, sigma=0.5)                   
    beta_1 = pm.Normal("beta_1", mu=0, sigma=0.5)          
    mu = pm.Deterministic("mu", beta_0 + beta_1 * avoidance, dims="obs_id")
    pi = pm.Deterministic("pi", pm.math.invlogit(mu), dims="obs_id")
    likelihood = pm.Bernoulli("y_est",p=pi, observed=y,dims="obs_id")


with pm.Model() as log_model4:
    log_model4.add_coord('obs_id',df.index, mutable=True)
    sex= pm.MutableData("sex",df.sex, dims="obs_id")
    y = pm.MutableData('y', df.romantic, dims = 'obs_id')

    beta_0 = pm.Normal("beta_0", mu=0, sigma=0.5)                   
    beta_2 = pm.Normal("beta_2", mu=0, sigma=0.5)           
    mu = pm.Deterministic("mu", beta_0 + beta_2 *sex, dims="obs_id")
    pi = pm.Deterministic("pi", pm.math.invlogit(mu), dims="obs_id")
    likelihood = pm.Bernoulli("y_est",p=pi, observed=y,dims="obs_id")

In [80]:
with log_model2:
    log_model2_trace = pm.sample(
                                draws=5000,                   # 使用mcmc方法进行采样，draws为采样次数
                                tune=1000,                    # tune为调整采样策略的次数，可以决定这些结果是否要被保留
                                chains=4,                     # 链数
                                discard_tuned_samples= True,  # tune的结果将在采样结束后被丢弃
                                random_seed=84735)
with log_model3:
    log_model3_trace = pm.sample(draws=5000,                  # 使用mcmc方法进行采样，draws为采样次数
                      tune=1000,                    # tune为调整采样策略的次数，可以决定这些结果是否要被保留
                      chains=4,                     # 链数
                      discard_tuned_samples= True,  # tune的结果将在采样结束后被丢弃
                      idata_kwargs={"log_likelihood": True},
                      random_seed=84735)
with log_model4:
    log_model4_trace = pm.sample(draws=5000,                   # 使用mcmc方法进行采样，draws为采样次数
                        tune=1000,                   # tune为调整采样策略的次数，可以决定这些结果是否要被保留
                        chains=4,                    # 链数
                        discard_tuned_samples= True, # tune的结果将在采样结束后被丢弃
                        idata_kwargs={"log_likelihood": True},
                        random_seed=84735)  

Auto-assigning NUTS sampler...
Auto-assigning NUTS sampler...


Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 24 seconds.
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_0, beta_1]


Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 22 seconds.
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_0, beta_2]


Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 28 seconds.


In [82]:
with log_model2:
    log_model2_ppc = pm.sample_posterior_predictive(log_model2_trace, random_seed=84735)
with log_model3:
    log_model3_ppc = pm.sample_posterior_predictive(log_model3_trace, random_seed=84735)
with log_model4:
    log_model4_ppc = pm.sample_posterior_predictive(log_model4_trace, random_seed=84735)

In [83]:
from statistics import median
def MAE(model_ppc):
    pre_x = model_ppc.posterior_predictive["y_est"].stack(sample=("chain", "draw"))
    pre_y_mean = pre_x.mean(axis=1).values

    MAE = pd.DataFrame({
        "romantic_ppc_mean": pre_y_mean,
        "romantic_original": df.romantic
    })

    MAE["pre_error"] = abs(MAE["romantic_original"] -\
                            MAE["romantic_ppc_mean"])

    MAE = median(MAE.pre_error)
    return MAE

log_model3_MAE = MAE(log_model3_ppc)
print(f"模型3 MAE: {log_model3_MAE:.2f}")
log_model4_MAE = MAE(log_model4_ppc)
print(f"模型4 MAE: {log_model4_MAE:.2f}")
log_model2_MAE = MAE(log_model2_ppc)
print(f"模型2 MAE: {log_model2_MAE:.2f}")

#### MAE  
模型2的预测误差小于模型3和模型4，即模型2的预测能力强于模型3和模型4。

In [84]:
log_likelihood = pm.compute_log_likelihood(log_model2_trace, model=log_model2)
print(log_model3_trace)
print(log_model4_trace)
print(log_model2_trace)

#### ELPD  
ELPD结果表明：log_model2的elpd_loo最大，表明其预测效果最好

In [85]:
comparison_list = {
    "log_model3(aviodance)":log_model3_trace,
    "log_model4(sex)":log_model4_trace,
    "log_model2(aviodance&anxiety)":log_model2_trace,
}
az.compare(comparison_list)