## 欢迎进入 ModelWhale Notebook  

这里你可以编写代码，文档  

### 关于文件目录  


**project**：project 目录是本项目的工作空间，可以把将项目运行有关的所有文件放在这里，目录中文件的增、删、改操作都会被保留  


**input**：input 目录是数据集的挂载位置，所有挂载进项目的数据集都在这里，未挂载数据集时 input 目录被隐藏  


**temp**：temp 目录是临时磁盘空间，训练或分析过程中产生的不必要文件可以存放在这里，目录中的文件不会保存  


In [1]:
# 试试这个经典示例
print ("hello ModelWhale")

hello ModelWhale


In [2]:
# 查看个人持久化工作区文件
!ls /home/mw/project/

In [3]:
import pymc as pm
import arviz as az
import seaborn as sns
import scipy.stats as st
import numpy as np
import matplotlib.pyplot as plt
import xarray as xr
import pandas as pd
import ipywidgets
import warnings
warnings.filterwarnings("ignore")

In [4]:
df_raw = pd.read_csv('/home/mw/input/bayes20238001/Data_Sum_HPP_Multi_Site_Share.csv')
#进入Serbia站点
df = df_raw[df_raw["Site"] == "Serbia"]
# 选取变量：是否恋爱、依恋回避、依恋焦虑
df = df[["romantic", "avoidance_r", "anxiety_r"]]
df["romantic"] =  np.where(df['romantic'] == 2, 0, 1)
df["index"] = range(len(df))
df = df.set_index("index")
print(df['romantic'].isnull().sum(axis=0))

0


In [5]:
with pm.Model() as log_model1:
    log_model1.add_coord('obs_id',df.index, mutable=True)
    avoidance = pm.MutableData("avoidance", df.avoidance_r, dims="obs_id")
    anxiety= pm.MutableData("anxiety", df.anxiety_r, dims="obs_id")
    y = pm.MutableData('y', df.romantic, dims = 'obs_id')

    beta_0 = pm.Normal("beta_0", mu=0, sigma=0.5)                    
    beta_1 = pm.Normal("beta_1", mu=0, sigma=0.5)
    beta_2 = pm.Normal("beta_2", mu=0, sigma=0.5)

    mu = pm.Deterministic("mu", beta_0 + beta_1 * avoidance+beta_2*anxiety, dims="obs_id")
    pi = pm.Deterministic("pi", pm.math.invlogit(mu), dims="obs_id")
    likelihood = pm.Bernoulli("y_est",p=pi, observed=y,dims="obs_id")

In [6]:
pm.model_to_graphviz(log_model1)

In [7]:
log1_prior = pm.sample_prior_predictive(samples=50, 
                                          model=log_model1,
                                          random_seed=84735)

Sampling: [beta_0, beta_1, beta_2, y_est]


In [8]:
log1_prior

In [9]:
with log_model1:
    log_model1_trace = pm.sample(
                                draws=5000,                   
                                tune=1000,                   
                                chains=4,                    
                                discard_tuned_samples= True,  
                                random_seed=84735)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_0, beta_1, beta_2]


Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 23 seconds.


In [10]:
az.plot_trace(log_model1_trace,
              var_names=["beta_0","beta_1","beta_2"],
              figsize=(15,8),
              compact=False)
plt.show()

In [11]:
az.summary(log_model1_trace, var_names=["beta_0","beta_1","beta_2"])

Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
beta_0,-0.106,0.159,-0.401,0.191,0.001,0.001,28562.0,15977.0,1.0
beta_1,0.306,0.161,-0.001,0.605,0.001,0.001,28147.0,15889.0,1.0
beta_2,-0.701,0.174,-1.021,-0.365,0.001,0.001,27593.0,16108.0,1.0


In [12]:
az.plot_posterior(log_model1_trace, var_names=["beta_0","beta_1","beta_2"], transform = np.exp)
plt.show()

In [13]:
with log_model1:
    log_model1_ppc = pm.sample_posterior_predictive(log_model1_trace, random_seed=84735)

Sampling: [y_est]


In [14]:
log_model1_trace

In [15]:
az.plot_hdi(
    df.avoidance_r,
    log_model1_trace.posterior.pi,
    hdi_prob=0.95,
    fill_kwargs={"alpha": 0.25, "linewidth": 0},
    color="C1"
)
post_mean = log_model1_trace.posterior.pi.mean(("chain", "draw"))
sns.lineplot(x = df.avoidance_r, 
             y= post_mean, 
             label="posterior mean", 
             color="C1")
sns.scatterplot(x = df.avoidance_r, 
                y= df.romantic,label="observed data", 
                color='#C00000', 
                alpha=0.5)
plt.legend(loc="upper right",
           bbox_to_anchor=(1.5, 1),
           fontsize=12)
sns.despine()

az.plot_hdi(
    df.anxiety_r,
    log_model1_trace.posterior.pi,
    hdi_prob=0.95,
    fill_kwargs={"alpha": 0.25, "linewidth": 0},
    color="C2"
)
post_mean = log_model1_trace.posterior.pi.mean(("chain", "draw"))
sns.lineplot(x = df.anxiety_r, 
             y= post_mean, 
             label="posterior mean", 
             color="C2")
sns.scatterplot(x = df.anxiety_r, 
                y= df.romantic,label="observed data", 
                color='#C00000', 
                alpha=0.5)
plt.legend(loc="upper right",
           bbox_to_anchor=(1.5, 1),
           fontsize=12)
sns.despine()

In [16]:
log_model1_ppc.posterior_predictive.y_est.stack(sample = ("chain", "draw"))

In [17]:
az.plot_ppc(log_model1_ppc, num_pp_samples=50)

<Axes: xlabel='y_est / y_est'>

In [18]:
pred_pi = log_model1_ppc.posterior_predictive.y_est.stack(sample = ("chain", "draw")).mean(dim="sample")
pred_pi = pred_pi.to_dataframe()


In [19]:
pred_pi["avoidance"] = log_model1_ppc.constant_data.avoidance.values
pred_pi["anxiety"] = log_model1_ppc.constant_data.anxiety.values
pred_pi["romantic"] = log_model1_ppc.observed_data.y_est.values
pred_pi["romantic_pred"] = np.where(pred_pi["y_est"] >= 0.5, 1, 0)
pred_pi

Unnamed: 0_level_0,y_est,avoidance,anxiety,romantic,romantic_pred
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.64315,-1.011965,-1.453003,1,1
1,0.74085,1.728803,-0.984477,1,1
2,0.25125,-0.195566,1.358155,0,0
3,0.36820,-1.011965,0.186839,0,0
4,0.69040,-1.070279,-1.780971,1,1
...,...,...,...,...,...
159,0.36500,-0.078938,0.608513,1,0
160,0.49015,-1.186907,-0.609656,0,0
161,0.52475,0.212633,-0.187982,1,1
162,0.32095,-1.070279,0.467955,0,0


In [20]:
confusion_matrix = pd.crosstab(pred_pi["romantic"], pred_pi["romantic_pred"], 
                              rownames=['Actual'], colnames=['Predicted'])
confusion_matrix

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,60,26
1,30,48


In [21]:
true_positive =  confusion_matrix.at[0, 1]
false_positive = confusion_matrix.at[1, 1]
true_negative = confusion_matrix.at[0, 0]
false_negative = confusion_matrix.at[1, 0]

accuracy = (true_positive + false_negative) /(true_positive + false_positive + true_negative + false_negative)
sensitivity = (true_positive) /(true_positive + false_negative)
specificity = (true_negative) / (true_negative + false_positive)

print("准确性:", accuracy)
print("敏感性:", sensitivity)
print("特异性:", specificity)

准确性: 0.34146341463414637
敏感性: 0.4642857142857143
特异性: 0.5555555555555556


模型对于原数据的预测结果准确性较低，依恋焦虑和回避对于预测被试是否会恋爱的准确率仅为0.34；敏感性为0.46，说明对于依恋焦虑和回避得分低预测被试进行恋爱的可能性为0.46；特异性为0.56，说明依恋焦虑和依恋回避得分高预测被试不进行恋爱的可能性为0.56。

In [22]:
df_raw = pd.read_csv('/home/mw/input/bayes20238001/Data_Sum_HPP_Multi_Site_Share.csv')
df_new = df_raw[df_raw["Site"] == "VCU"]
df_new=df_new[["romantic","anxiety_r","avoidance_r"]]
df_new["romantic"]=np.where(df_new['romantic']==2,0,1)
df_new["index"]=range(len(df_new))
df_new=df_new.set_index("index")
with pm.Model() as log_model1:
    log_model1.add_coord('obs_id',df.index, mutable=True)
    avoidance = pm.MutableData("avoidance", df.avoidance_r, dims="obs_id")
    anxiety= pm.MutableData("anxiety", df.anxiety_r, dims="obs_id")
    y = pm.MutableData('y', df.romantic, dims = 'obs_id')

    beta_0 = pm.Normal("beta_0", mu=0, sigma=0.5)                    
    beta_1 = pm.Normal("beta_1", mu=0, sigma=0.5)
    beta_2 = pm.Normal("beta_2", mu=0, sigma=0.5)

    mu = pm.Deterministic("mu", beta_0 + beta_1 * avoidance+beta_2*anxiety, dims="obs_id")
    pi = pm.Deterministic("pi", pm.math.invlogit(mu), dims="obs_id")
    likelihood = pm.Bernoulli("y_est",p=pi, observed=y,dims="obs_id")

In [24]:
pred_coords ={"obs_id":range(0,151)} 
with log_model1:
    pm.set_data({"anxiety":df_new["anxiety_r"],
                "avoidance": df_new["avoidance_r"],
                "y":df_new["romantic"] },
                coords=pred_coords
                )
    prediction = pm.sample_posterior_predictive(log_model1_trace, 
                                                var_names=["y_est"],
                                                predictions=True,
                                                random_seed=84735)
print(df_new)
print(df_new['romantic'].isnull().sum(axis=0))

Sampling: [y_est]


       romantic  anxiety_r  avoidance_r
index                                  
0             0   2.132340     0.099036
1             1   0.295121    -0.982006
2             0   1.064189     0.038978
3             1  -0.730304    -1.102122
4             0  -0.473948     0.579499
...         ...        ...          ...
146           0   0.166943     1.300193
147           0   1.106915     0.459383
148           1  -0.773030     1.720599
149           1  -2.482070    -0.381427
150           1  -0.858482     0.939846

[151 rows x 3 columns]
0


In [25]:
prediction

In [26]:
y_pred = prediction.predictions["y_est"].stack(sample=("chain","draw","obs_id")).values
y_pred_freq = np.bincount(y_pred)/len(y_pred)
bars = plt.bar([0, 1], y_pred_freq, color="#70AD47")
for bar, freq in zip(bars, y_pred_freq):
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), f"{freq:.2f}", ha='center', va='bottom')
plt.xticks([0, 1])
plt.suptitle("Out-of-sample prediction(X=1)")
plt.xlabel("romantic")
plt.ylabel("proportion")
sns.despine()
print(y_pred)

[0 0 0 ... 1 1 1]


In [27]:
coords = {"obs_id": df_new.index}
with pm.Model(coords=coords) as log_model1:
    avoidance = pm.MutableData("avoidance", df_new.avoidance_r, dims="obs_id")
    anxiety=pm.MutableData("anxiety", df_new.anxiety_r, dims="obs_id")

    beta_0 = pm.Normal("beta_0", mu=0, sigma=0.5)          #定义beta_0          
    beta_1 = pm.Normal("beta_1", mu=0, sigma=0.5)           #定义beta_1
    beta_2 = pm.Normal("beta_2", mu=0, sigma=0.5) 
    #线性关系
    mu = pm.Deterministic("mu", beta_0 + beta_1 * avoidance+beta_2*anxiety, dims="obs_id")
    pi = pm.Deterministic("pi", pm.math.invlogit(mu), dims="obs_id")
    #似然
    likelihood = pm.Bernoulli("y_est",p=pi, observed=df_new.romantic,dims="obs_id")

    log_model1_trace = pm.sample(draws=5000,                 
                                tune=1000,                  
                                chains=4,                     
                                discard_tuned_samples= True, 
                                random_seed=84735)
    log_model1_ppc = pm.sample_posterior_predictive(log_model1_trace)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_0, beta_1, beta_2]


Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 24 seconds.
Sampling: [y_est]


In [28]:
y_pred = log_model1_ppc.posterior_predictive.y_est.stack(sample = ("chain", "draw")).mean(dim="sample")
y_pred_df = pd.DataFrame(y_pred, columns=["y_pred"])
log_model1_df = pd.DataFrame({"avoidance": log_model1_ppc.constant_data.avoidance, "anxiety": log_model1_ppc.constant_data.anxiety,"romantic": log_model1_ppc.observed_data.y_est})
y_pred_df = pd.concat([y_pred_df, log_model1_df], axis=1)
y_pred_df["romantic_2"] = np.where(np.array(y_pred) >= 0.5, 1, 0)
print(y_pred_df)

confusion_matrix = pd.crosstab(y_pred_df["romantic"], y_pred_df["romantic_2"], 
                              rownames=['Actual'], colnames=['Predicted'])
confusion_matrix

      y_pred  avoidance   anxiety  romantic  romantic_2
0    0.21910   0.099036  2.132340         0           0
1    0.51750  -0.982006  0.295121         1           1
2    0.36650   0.038978  1.064189         0           0
3    0.68980  -1.102122 -0.730304         1           1
4    0.62965   0.579499 -0.473948         0           1
..       ...        ...       ...       ...         ...
146  0.50455   1.300193  0.166943         0           1
147  0.35785   0.459383  1.106915         0           0
148  0.65525   1.720599 -0.773030         1           1
149  0.87360  -0.381427 -2.482070         1           1
150  0.68205   0.939846 -0.858482         1           1

[151 rows x 5 columns]


Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,36,31
1,20,64


In [29]:
true_positive = confusion_matrix.at[0, 1]
false_positive = confusion_matrix.at[1, 1]
true_negative = confusion_matrix.at[0, 0]
false_negative = confusion_matrix.at[1, 0]
accuracy = (true_positive + false_negative) /(true_positive + false_positive + true_negative + false_negative)
sensitivity = (true_positive) /(true_positive + false_negative)
specificity = (true_negative) / (true_negative + false_positive)

print("准确性:", accuracy)
print("敏感性:", sensitivity)
print("特异性:", specificity)

准确性: 0.33774834437086093
敏感性: 0.6078431372549019
特异性: 0.36


模型对于原数据的预测结果准确性较低，依恋焦虑和回避对于预测被试是否会恋爱的准确率仅为0.35；敏感性为0.48，说明对于依恋焦虑和回避得分低预测被试进行恋爱的可能性为0.48；特异性为0.55，说明依恋焦虑和依恋回避得分高预测被试不进行恋爱的可能性为0.55。

In [30]:
df_raw = pd.read_csv('/home/mw/input/bayes20238001/Data_Sum_HPP_Multi_Site_Share.csv')
#进入Serbia站点
df = df_raw[df_raw["Site"] == "Serbia"]
df = df[["romantic", "avoidance_r", "sex","anxiety_r"]]
df["romantic"] =  np.where(df['romantic'] == 2, 0, 1)
df["sex"] =  np.where(df['sex'] == 2, 0, 1)
df["index"] = range(len(df))
df = df.set_index("index")
print(df)

       romantic  avoidance_r  sex  anxiety_r
index                                       
0             1    -1.011965    0  -1.453003
1             1     1.728803    0  -0.984477
2             0    -0.195566    0   1.358155
3             0    -1.011965    0   0.186839
4             1    -1.070279    0  -1.780971
...         ...          ...  ...        ...
159           1    -0.078938    1   0.608513
160           0    -1.186907    0  -0.609656
161           1     0.212633    1  -0.187982
162           0    -1.070279    0   0.467955
163           1     0.854090    1  -0.422245

[164 rows x 4 columns]


In [31]:
with pm.Model() as log_model2:
    log_model2.add_coord('obs_id',df.index, mutable=True)
    avoidance = pm.MutableData("avoidance", df.avoidance_r, dims="obs_id")
    anxiety= pm.MutableData("anxiety", df.anxiety_r, dims="obs_id")
    y = pm.MutableData('y', df.romantic, dims = 'obs_id')

    beta_0 = pm.Normal("beta_0", mu=0, sigma=0.5)                    
    beta_1 = pm.Normal("beta_1", mu=0, sigma=0.5)
    beta_2 = pm.Normal("beta_2", mu=0, sigma=0.5)

    mu = pm.Deterministic("mu", beta_0 + beta_1 * avoidance+beta_2*anxiety, dims="obs_id")
    pi = pm.Deterministic("pi", pm.math.invlogit(mu), dims="obs_id")
    likelihood = pm.Bernoulli("y_est",p=pi, observed=y,dims="obs_id")

In [32]:
with pm.Model() as log_model3:
    log_model3.add_coord('obs_id',df.index, mutable=True)
    avoidance = pm.MutableData("avoidance", df.avoidance_r, dims="obs_id")
    y = pm.MutableData('y', df.romantic, dims = 'obs_id')

    beta_0 = pm.Normal("beta_0", mu=0, sigma=0.5)                   
    beta_1 = pm.Normal("beta_1", mu=0, sigma=0.5)          
    mu = pm.Deterministic("mu", beta_0 + beta_1 * avoidance, dims="obs_id")
    pi = pm.Deterministic("pi", pm.math.invlogit(mu), dims="obs_id")
    likelihood = pm.Bernoulli("y_est",p=pi, observed=y,dims="obs_id")

In [33]:
with pm.Model() as log_model4:
    log_model4.add_coord('obs_id',df.index, mutable=True)
    sex= pm.MutableData("sex",df.sex, dims="obs_id")
    y = pm.MutableData('y', df.romantic, dims = 'obs_id')

    beta_0 = pm.Normal("beta_0", mu=0, sigma=0.5)                   
    beta_2 = pm.Normal("beta_2", mu=0, sigma=0.5)           
    mu = pm.Deterministic("mu", beta_0 + beta_2 *sex, dims="obs_id")
    pi = pm.Deterministic("pi", pm.math.invlogit(mu), dims="obs_id")
    likelihood = pm.Bernoulli("y_est",p=pi, observed=y,dims="obs_id")

In [34]:
with log_model2:
    log_model2_trace = pm.sample(
                                draws=5000,                   # 使用mcmc方法进行采样，draws为采样次数
                                tune=1000,                    # tune为调整采样策略的次数，可以决定这些结果是否要被保留
                                chains=4,                     # 链数
                                discard_tuned_samples= True,  # tune的结果将在采样结束后被丢弃
                                random_seed=84735)
with log_model3:
    log_model3_trace = pm.sample(draws=5000,                  # 使用mcmc方法进行采样，draws为采样次数
                      tune=1000,                    # tune为调整采样策略的次数，可以决定这些结果是否要被保留
                      chains=4,                     # 链数
                      discard_tuned_samples= True,  # tune的结果将在采样结束后被丢弃
                      idata_kwargs={"log_likelihood": True},
                      random_seed=84735)
with log_model4:
    log_model4_trace = pm.sample(draws=5000,                   # 使用mcmc方法进行采样，draws为采样次数
                        tune=1000,                   # tune为调整采样策略的次数，可以决定这些结果是否要被保留
                        chains=4,                    # 链数
                        discard_tuned_samples= True, # tune的结果将在采样结束后被丢弃
                        idata_kwargs={"log_likelihood": True},
                        random_seed=84735)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_0, beta_1, beta_2]


Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 24 seconds.
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_0, beta_1]


Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 21 seconds.
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_0, beta_2]


Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 21 seconds.


In [35]:
with log_model2:
    log_model2_ppc = pm.sample_posterior_predictive(log_model2_trace, random_seed=84735)
with log_model3:
    log_model3_ppc = pm.sample_posterior_predictive(log_model3_trace, random_seed=84735)
with log_model4:
    log_model4_ppc = pm.sample_posterior_predictive(log_model4_trace, random_seed=84735)

Sampling: [y_est]


Sampling: [y_est]


Sampling: [y_est]


In [36]:
from statistics import median
def MAE(model_ppc):
    pre_x = model_ppc.posterior_predictive["y_est"].stack(sample=("chain", "draw"))
    pre_y_mean = pre_x.mean(axis=1).values

    MAE = pd.DataFrame({
        "romantic_ppc_mean": pre_y_mean,
        "romantic_original": df.romantic
    })

    MAE["pre_error"] = abs(MAE["romantic_original"] -\
                            MAE["romantic_ppc_mean"])

    MAE = median(MAE.pre_error)
    return MAE

log_model3_MAE = MAE(log_model3_ppc)
print(f"模型3 MAE: {log_model3_MAE:.2f}")
log_model4_MAE = MAE(log_model4_ppc)
print(f"模型4 MAE: {log_model4_MAE:.2f}")
log_model2_MAE = MAE(log_model2_ppc)
print(f"模型2 MAE: {log_model2_MAE:.2f}")

模型3 MAE: 0.49
模型4 MAE: 0.47
模型2 MAE: 0.43


模型2的预测误差小于模型3和模型4，即模型2的预测能力强于模型3和模型4

In [37]:
log_likelihood = pm.compute_log_likelihood(log_model2_trace, model=log_model2)
print(log_model3_trace)
print(log_model4_trace)
print(log_model2_trace)

Inference data with groups:
	> posterior
	> log_likelihood
	> sample_stats
	> observed_data
	> constant_data
Inference data with groups:
	> posterior
	> log_likelihood
	> sample_stats
	> observed_data
	> constant_data
Inference data with groups:
	> posterior
	> log_likelihood
	> sample_stats
	> observed_data
	> constant_data


In [38]:
comparison_list = {
    "log_model3(aviodance)":log_model3_trace,
    "log_model4(sex)":log_model4_trace,
    "log_model2(aviodance&anxiety)":log_model2_trace,
}
az.compare(comparison_list)

Unnamed: 0,rank,elpd_loo,p_loo,elpd_diff,weight,se,dse,warning,scale
log_model2(aviodance&anxiety),0,-105.325475,2.704009,0.0,0.982628,4.123952,0.0,False,log
log_model3(aviodance),1,-114.485728,1.871405,9.160253,0.0,1.358026,3.993764,False,log
log_model4(sex),2,-114.506609,1.521912,9.181134,0.017372,0.854521,4.055201,False,log


log_model2的elpd_loo最大，表明其预测效果最好