### 陈可遇第二次作业  
1. 自变量：依恋回避，依恋焦虑  

2. 因变量：恋爱情况  

3. 数据关系：  

$$  
\begin{array}{lcrl}  
\text{data:} & \hspace{.01in} & Y_i|\beta_0,\beta_1 & {\sim} \text{Bern}(\pi_i) \;\; \text{ with } \;\; \pi_i = \frac{e^{\beta_0 + \beta_1 X_{i1}}}{1 + e^{\beta_0 + \beta_1 X_{i1}}} \\  
\text{priors:} & & \beta_{0}  &  \sim N\left(0, 0.5^2 \right)  \\  
               & & \beta_1  & \sim N\left(0, 0.5^2 \right). \\  
\end{array}  
$$

In [56]:
# 导入 pymc 模型包，和 arviz 等分析工具 
import pymc as pm
import arviz as az
import seaborn as sns
import scipy.stats as st
import numpy as np
import matplotlib.pyplot as plt
import xarray as xr
import pandas as pd
import ipywidgets

# 忽略不必要的警告
import warnings
warnings.filterwarnings("ignore")

In [3]:
df_raw = pd.read_csv('/home/mw/input/bayes20238001/Data_Sum_HPP_Multi_Site_Share.csv')
#进入UCSB站点
df = df_raw[df_raw["Site"] == "UCSB"]
# 选取变量：是否恋爱、依恋回避、依恋焦虑
df = df[["romantic", "avoidance_r", "anxiety_r"]]
df["romantic"] =  np.where(df['romantic'] == 2, 0, 1)
df["index"] = range(len(df))
df = df.set_index("index")
print(df['romantic'].isnull().sum(axis=0))

0
0


In [58]:
# 展示数据
df

Unnamed: 0_level_0,romantic,avoidance_r,anxiety_r
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,-1.083724,-1.795429
1,1,-1.698926,-0.397500
2,0,0.813149,-1.434673
3,0,-0.212188,0.098540
4,1,-1.134991,0.820052
...,...,...,...
103,1,-1.083724,0.955335
104,1,1.633418,0.910241
105,1,1.633418,1.406280
106,1,1.325817,0.774957


In [59]:
with pm.Model() as log_model1:
    log_model1.add_coord('obs_id',df.index, mutable=True)
    avoidance = pm.MutableData("avoidance", df.avoidance_r, dims="obs_id")
    anxiety= pm.MutableData("anxiety", df.anxiety_r, dims="obs_id")
    y = pm.MutableData('y', df.romantic, dims = 'obs_id')

    beta_0 = pm.Normal("beta_0", mu=0, sigma=0.5)                    
    beta_1 = pm.Normal("beta_1", mu=0, sigma=0.5)
    beta_2 = pm.Normal("beta_2", mu=0, sigma=0.5)

    mu = pm.Deterministic("mu", beta_0 + beta_1 * avoidance+beta_2*anxiety, dims="obs_id")
    pi = pm.Deterministic("pi", pm.math.invlogit(mu), dims="obs_id")
    likelihood = pm.Bernoulli("y_est",p=pi, observed=y,dims="obs_id")

In [60]:
pm.model_to_graphviz(log_model1)

In [61]:
log1_prior = pm.sample_prior_predictive(samples=50, 
                                          model=log_model1,
                                          random_seed=84735)


Sampling: [beta_0, beta_1, beta_2, y_est]


In [62]:
log1_prior

## MCMC采样 & 模型诊断 & 后验参数解释

In [63]:
with log_model1:
    log_model1_trace = pm.sample(
                                draws=5000,                   
                                tune=1000,                   
                                chains=4,                    
                                discard_tuned_samples= True,  
                                random_seed=84735)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_0, beta_1, beta_2]


Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 14 seconds.


### 模型诊断图

In [64]:
az.plot_trace(log_model1_trace,
              var_names=["beta_0","beta_1","beta_2"],
              figsize=(15,8),
              compact=False)
plt.show()

### 后验参数解释

In [65]:
az.summary(log_model1_trace, var_names=["beta_0","beta_1","beta_2"])

Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
beta_0,-0.104,0.183,-0.448,0.239,0.001,0.001,31435.0,15977.0,1.0
beta_1,0.123,0.186,-0.227,0.476,0.001,0.001,28105.0,15263.0,1.0
beta_2,-0.399,0.189,-0.753,-0.041,0.001,0.001,30301.0,16052.0,1.0


In [66]:
az.plot_posterior(log_model1_trace, var_names=["beta_0","beta_1","beta_2"], transform = np.exp)
plt.show()

In [67]:
with log_model1:
    log_model1_ppc = pm.sample_posterior_predictive(log_model1_trace, random_seed=84735)



Sampling: [y_est]


In [68]:
log_model1_trace

### 后验回归模型图

In [69]:
#画出每个自变量对应的恋爱概率94%hdi值
az.plot_hdi(
    df.avoidance_r,
    log_model1_trace.posterior.pi,
    hdi_prob=0.95,
    fill_kwargs={"alpha": 0.25, "linewidth": 0},
    color="C1"
)
#得到每个自变量对应的恋爱概率均值，并使用sns.lineplot连成一条光滑的曲线
post_mean = log_model1_trace.posterior.pi.mean(("chain", "draw"))
sns.lineplot(x = df.avoidance_r, 
             y= post_mean, 
             label="posterior mean", 
             color="C1")
#绘制真实数据散点图
sns.scatterplot(x = df.avoidance_r, 
                y= df.romantic,label="observed data", 
                color='#C00000', 
                alpha=0.5)
#设置图例位置
plt.legend(loc="upper right",
           bbox_to_anchor=(1.5, 1),
           fontsize=12)
sns.despine()

az.plot_hdi(
    df.anxiety_r,
    log_model1_trace.posterior.pi,
    hdi_prob=0.95,
    fill_kwargs={"alpha": 0.25, "linewidth": 0},
    color="C2"
)
post_mean = log_model1_trace.posterior.pi.mean(("chain", "draw"))
sns.lineplot(x = df.anxiety_r, 
             y= post_mean, 
             label="posterior mean", 
             color="C2")
sns.scatterplot(x = df.anxiety_r, 
                y= df.romantic,label="observed data", 
                color='#C00000', 
                alpha=0.5)
plt.legend(loc="upper right",
           bbox_to_anchor=(1.5, 1),
           fontsize=12)
sns.despine()


## 对本数据集的预测结果进行评估  


In [70]:
log_model1_ppc.posterior_predictive.y_est.stack(sample = ("chain", "draw"))

In [71]:
az.plot_ppc(log_model1_ppc, num_pp_samples=50)

<Axes: xlabel='y_est / y_est'>

In [72]:
pred_pi = log_model1_ppc.posterior_predictive.y_est.stack(sample = ("chain", "draw")).mean(dim="sample")
pred_pi = pred_pi.to_dataframe()

In [73]:
pred_pi["avoidance"] = log_model1_ppc.constant_data.avoidance.values
pred_pi["anxiety"] = log_model1_ppc.constant_data.anxiety.values
pred_pi["romantic"] = log_model1_ppc.observed_data.y_est.values
pred_pi["romantic_pred"] = np.where(pred_pi["y_est"] >= 0.5, 1, 0)
pred_pi

Unnamed: 0_level_0,y_est,avoidance,anxiety,romantic,romantic_pred
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.61005,-1.083724,-1.795429,0,1
1,0.46280,-1.698926,-0.397500,1,0
2,0.64165,0.813149,-1.434673,0,1
3,0.45840,-0.212188,0.098540,0,0
4,0.36210,-1.134991,0.820052,1,0
...,...,...,...,...,...
103,0.35425,-1.083724,0.955335,1,0
104,0.43605,1.633418,0.910241,1,0
105,0.38965,1.633418,1.406280,1,0
106,0.43590,1.325817,0.774957,1,0


In [74]:
confusion_matrix = pd.crosstab(pred_pi["romantic"], pred_pi["romantic_pred"], 
                              rownames=['Actual'], colnames=['Predicted'])
confusion_matrix

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,39,18
1,27,24


In [75]:
true_positive = confusion_matrix.at[0,1]
false_positive = confusion_matrix.at[1,1]
true_negative = confusion_matrix.at[0, 0]
false_negative = confusion_matrix.at[1, 0]

accuracy = (true_positive + false_negative) /(true_positive + false_positive + true_negative + false_negative)
sensitivity = (true_positive) /(true_positive + false_negative)
specificity = (true_negative) / (true_negative + false_positive)

print("准确性:", accuracy)
print("敏感性:", sensitivity)
print("特异性:", specificity)

准确性: 0.4166666666666667
敏感性: 0.4
特异性: 0.6190476190476191


该模型对于原数据的预测准确率为0.42，说明对于模型的预测准确性不高；  
敏感性为0.4，说明依恋焦虑和回避得分低可以在中等程度上预测被试会进行恋爱；  
特异性为0.62，说明依恋焦虑和依恋回避得分高可以较高的预测被试不会进行恋爱。

## 对新数据进行预测、分类、评估  


In [76]:
df_new = df_raw[df_raw["Site"] == "Serbia"]
df_new=df_new[["romantic","anxiety_r","avoidance_r"]]
df_new["romantic"]=np.where(df_new['romantic']==2,0,1)
df_new["index"]=range(len(df_new))
df_new=df_new.set_index("index")

In [79]:
pred_coords ={"obs_id":range(174,175)} 
with log_model1:
    pm.set_data({"anxiety":df_new["anxiety"],
                 "avoidance": df_new["avoidance"],
                 "y":df_new["avoidance"] },
                 coords=pred_coords
                 ) 

    prediction = pm.sample_posterior_predictive(log_model1_trace, 
                                                var_names=["y_est"],
                                                predictions=True,
                                                random_seed=84735)
print(set_data)
print(set_data['romantic'].isnull().sum(axis=0))

KeyError: 'anxiety'

In [80]:
prediction

In [81]:
# 提取储存在 predicitons中的预测值
y_pred = prediction.predictions["y_est"].stack(sample=("chain","draw","obs_id")).values
# 统计其中0和1的个数，并除以总数，得到0和1对应的比例值
y_pred_freq = np.bincount(y_pred)/len(y_pred)
#绘制柱状图
bars = plt.bar([0, 1], y_pred_freq, color="#70AD47")
#用于在柱状图上标明比例值
for bar, freq in zip(bars, y_pred_freq):
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), f"{freq:.2f}", ha='center', va='bottom')
#对刻度、标题、坐标轴标题进行设置
plt.xticks([0, 1])
plt.suptitle("Out-of-sample prediction(X=1)")
plt.xlabel("romantic")
plt.ylabel("proportion")
sns.despine()
print(y_pred)

[1 0 0 ... 0 1 0]


## 后验预测评估

In [82]:
coords = {"obs_id": set_data.index}

with pm.Model(coords=coords) as log_model1:
    avoidance = pm.MutableData("avoidance", set_data.avoidance_r, dims="obs_id")
    anxiety=pm.MutableData("anxiety", set_data.anxiety_r, dims="obs_id")
    
    #先验
    beta_0 = pm.Normal("beta_0", mu=0, sigma=0.5)          #定义beta_0          
    beta_1 = pm.Normal("beta_1", mu=0, sigma=0.5)           #定义beta_1
    beta_2 = pm.Normal("beta_2", mu=0, sigma=0.5) 
    #线性关系
    mu = pm.Deterministic("mu", beta_0 + beta_1 * avoidance+beta_2*anxiety, dims="obs_id")
    pi = pm.Deterministic("pi", pm.math.invlogit(mu), dims="obs_id")
    #似然
    likelihood = pm.Bernoulli("y_est",p=pi, observed=set_data.romantic,dims="obs_id")

    log_model1_trace = pm.sample(draws=5000,                 
                                tune=1000,                  
                                chains=4,                     
                                discard_tuned_samples= True, 
                                random_seed=84735)
    log_model1_ppc = pm.sample_posterior_predictive(log_model1_trace)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_0, beta_1, beta_2]


Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 14 seconds.
Sampling: [y_est]


In [83]:
y_pred = log_model1_ppc.posterior_predictive.y_est.stack(sample = ("chain", "draw")).mean(dim="sample")
y_pred_df = pd.DataFrame(y_pred, columns=["y_pred"])
log_model1_df = pd.DataFrame({"avoidance": log_model1_ppc.constant_data.avoidance, "anxiety": log_model1_ppc.constant_data.anxiety,"romantic": log_model1_ppc.observed_data.y_est})
y_pred_df = pd.concat([y_pred_df, log_model1_df], axis=1)
y_pred_df["romantic_2"] = np.where(np.array(y_pred) >= 0.5, 1, 0)
print(y_pred_df)

confusion_matrix = pd.crosstab(y_pred_df["romantic"], y_pred_df["romantic_2"], 
                              rownames=['Actual'], colnames=['Predicted'])
confusion_matrix

true_positive = 0
false_positive = 0
true_negative = confusion_matrix.at[0, 0]
false_negative = confusion_matrix.at[1, 0]
accuracy = (true_positive + false_negative) /(true_positive + false_positive + true_negative + false_negative)
sensitivity = (true_positive) /(true_positive + false_negative)
specificity = (true_negative) / (true_negative + false_positive)

print("准确性:", accuracy)
print("敏感性:", sensitivity)
print("特异性:", specificity)

      y_pred  avoidance   anxiety  romantic  romantic_2
0    0.64080  -1.011965 -1.453003         1           1
1    0.74815   1.728803 -0.984477         1           1
2    0.24900  -0.195566  1.358155         0           0
3    0.36810  -1.011965  0.186839         0           0
4    0.69205  -1.070279 -1.780971         1           1
..       ...        ...       ...       ...         ...
159  0.36520  -0.078938  0.608513         1           0
160  0.48730  -1.186907 -0.609656         0           0
161  0.52000   0.212633 -0.187982         1           1
162  0.31835  -1.070279  0.467955         0           0
163  0.60865   0.854090 -0.422245         1           1

[164 rows x 5 columns]
准确性: 0.34444444444444444
敏感性: 0.0
特异性: 1.0


## 模型比较

In [89]:
df_raw = pd.read_csv('/home/mw/input/bayes20238001/Data_Sum_HPP_Multi_Site_Share.csv')
#进入UCSB站点
df = df_raw[df_raw["Site"] == "UCSB"]
df = df[["romantic", "avoidance_r", "sex","anxiety_r"]]
df["romantic"] =  np.where(df['romantic'] == 2, 0, 1)
df["index"] = range(len(df))
df = df.set_index("index")
print(df)

       romantic  avoidance_r  sex  anxiety_r
index                                       
0             0    -1.083724  2.0  -1.795429
1             1    -1.698926  1.0  -0.397500
2             0     0.813149  2.0  -1.434673
3             0    -0.212188  2.0   0.098540
4             1    -1.134991  1.0   0.820052
...         ...          ...  ...        ...
103           1    -1.083724  2.0   0.955335
104           1     1.633418  2.0   0.910241
105           1     1.633418  1.0   1.406280
106           1     1.325817  2.0   0.774957
107           1    -0.263455  2.0  -1.840523

[108 rows x 4 columns]


In [91]:
with pm.Model() as log_model2:
    log_model2.add_coord('obs_id',df.index, mutable=True)
    avoidance = pm.MutableData("avoidance", df.avoidance_r, dims="obs_id")
    anxiety= pm.MutableData("anxiety", df.anxiety_r, dims="obs_id")
    y = pm.MutableData('y', df.romantic, dims = 'obs_id')

    beta_0 = pm.Normal("beta_0", mu=0, sigma=0.5)                    
    beta_1 = pm.Normal("beta_1", mu=0, sigma=0.5)
    beta_2 = pm.Normal("beta_2", mu=0, sigma=0.5)

    mu = pm.Deterministic("mu", beta_0 + beta_1 * avoidance+beta_2*anxiety, dims="obs_id")
    pi = pm.Deterministic("pi", pm.math.invlogit(mu), dims="obs_id")
    likelihood = pm.Bernoulli("y_est",p=pi, observed=y,dims="obs_id")
    
with pm.Model() as log_model3:
    log_model3.add_coord('obs_id',df.index, mutable=True)
    avoidance = pm.MutableData("avoidance", df.avoidance_r, dims="obs_id")
    y = pm.MutableData('y', df.romantic, dims = 'obs_id')

    beta_0 = pm.Normal("beta_0", mu=0, sigma=0.5)                   
    beta_1 = pm.Normal("beta_1", mu=0, sigma=0.5)          
    mu = pm.Deterministic("mu", beta_0 + beta_1 * avoidance, dims="obs_id")
    pi = pm.Deterministic("pi", pm.math.invlogit(mu), dims="obs_id")
    likelihood = pm.Bernoulli("y_est",p=pi, observed=y,dims="obs_id")


In [92]:
with log_model2:
    log_model2_trace = pm.sample(
                                draws=5000,                   # 使用mcmc方法进行采样，draws为采样次数
                                tune=1000,                    # tune为调整采样策略的次数，可以决定这些结果是否要被保留
                                chains=4,                     # 链数
                                discard_tuned_samples= True,  # tune的结果将在采样结束后被丢弃
                                random_seed=84735)
with log_model3:
    log_model3_trace = pm.sample(draws=5000,                  # 使用mcmc方法进行采样，draws为采样次数
                      tune=1000,                    # tune为调整采样策略的次数，可以决定这些结果是否要被保留
                      chains=4,                     # 链数
                      discard_tuned_samples= True,  # tune的结果将在采样结束后被丢弃
                      idata_kwargs={"log_likelihood": True},
                      random_seed=84735)


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_0, beta_1, beta_2]


Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 14 seconds.
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_0, beta_1]


Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 12 seconds.


In [93]:
with log_model2:
    log_model2_ppc = pm.sample_posterior_predictive(log_model2_trace, random_seed=84735)
with log_model3:
    log_model3_ppc = pm.sample_posterior_predictive(log_model3_trace, random_seed=84735)

Sampling: [y_est]


Sampling: [y_est]


In [95]:
from statistics import median
def MAE(model_ppc):
    pre_x = model_ppc.posterior_predictive["y_est"].stack(sample=("chain", "draw"))
    pre_y_mean = pre_x.mean(axis=1).values

    MAE = pd.DataFrame({
        "romantic_ppc_mean": pre_y_mean,
        "romantic_original": df.romantic
    })

    MAE["pre_error"] = abs(MAE["romantic_original"] -\
                            MAE["romantic_ppc_mean"])

    MAE = median(MAE.pre_error)
    return MAE

log_model3_MAE = MAE(log_model3_ppc)
print(f"模型3 MAE: {log_model3_MAE:.2f}")
log_model2_MAE = MAE(log_model2_ppc)
print(f"模型2 MAE: {log_model2_MAE:.2f}")

模型3 MAE: 0.49
模型2 MAE: 0.46


模型2误差小于模型3，所以模型2预测能力更好。

In [96]:
log_likelihood = pm.compute_log_likelihood(log_model2_trace, model=log_model2)
print(log_model3_trace)
print(log_model2_trace)

Inference data with groups:
	> posterior
	> log_likelihood
	> sample_stats
	> observed_data
	> constant_data
Inference data with groups:
	> posterior
	> log_likelihood
	> sample_stats
	> observed_data
	> constant_data


EPLD

In [98]:
comparison_list = {
    "log_model3(aviodance)":log_model3_trace,
    "log_model2(aviodance&anxiety)":log_model2_trace,
}
az.compare(comparison_list)

Unnamed: 0,rank,elpd_loo,p_loo,elpd_diff,weight,se,dse,warning,scale
log_model2(aviodance&anxiety),0,-74.62665,2.676953,0.0,0.903333,2.168101,0.0,False,log
log_model3(aviodance),1,-76.30349,1.806491,1.67684,0.096667,0.767736,2.01397,False,log


模型2的EPLD小于模型3，说明模型2预测效果更好。