In [1]:
from google.colab import files

uploads = files.upload()

Saving translated_labeled_tweets.csv to translated_labeled_tweets.csv


In [2]:
! pip install "jax[cuda12_local]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
! pip install pymc arviz

Looking in links: https://storage.googleapis.com/jax-releases/jax_cuda_releases.html


In [4]:
import pymc as pm
! pip install --upgrade pymc[jax] arviz



In [5]:
import os
os.environ["PYMC_BACKEND"] = "jax"

In [8]:
import pandas as pd
import arviz as az
from sklearn.preprocessing import LabelEncoder


df = pd.read_csv('translated_labeled_tweets.csv')
df['misinformation_label'].unique()

array(['valid', 'misleading', 'invalid'], dtype=object)

In [9]:
df['is_misinfo'] = (df['misinformation_label'] != 'valid').astype(int)
user_idx = LabelEncoder().fit_transform(df['tweet.user.screen_name'])
lex = df['lexical_diversity'].to_numpy()
ent = df['entropy'].to_numpy()
mis = df['is_misinfo'].to_numpy()
eng = df['engagement'].to_numpy()
n_users = len(df['tweet.user.screen_name'].unique())

In [10]:
with pm.Model() as model:
  mu_int = pm.Normal('mu_int',0,5)
  beta_mis = pm.Normal('beta_mis',0,2)
  beta_lex = pm.Normal('beta_lex',0,2)
  beta_ent = pm.Normal('beta_ent',0,2)
  sigma_u = pm.Exponential('sigma_u',1)
  u = pm.Normal('u',0,sigma_u, shape=n_users)
  eta = mu_int + beta_mis * mis + beta_lex * lex + beta_ent * ent + u [user_idx]
  theta = pm.Exponential('theta',1)
  mu = pm.math.exp(eta)
  obs = pm.NegativeBinomial('obs',mu=mu, alpha=theta, observed = eng)

  idata = pm.sample(1000, tune=1000, target_accept =0.9)

  az.summary(idata, var_names=['mu_int','beta_mis','beta_lex','beta_ent','sigma_u','theta'])

Output()

ERROR:pymc.stats.convergence:The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details


In [11]:
az.summary(idata, var_names=['mu_int','beta_mis','beta_lex','beta_ent','sigma_u','theta'])

Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
mu_int,3.703,0.155,3.41,3.995,0.008,0.006,384.0,460.0,1.01
beta_mis,0.077,0.03,0.019,0.133,0.001,0.001,2638.0,1955.0,1.0
beta_lex,-0.109,0.057,-0.222,-0.012,0.001,0.001,2605.0,1995.0,1.0
beta_ent,0.064,0.01,0.045,0.082,0.0,0.0,2423.0,2131.0,1.0
sigma_u,0.552,0.113,0.36,0.761,0.003,0.002,1904.0,2047.0,1.0
theta,1.417,0.023,1.375,1.459,0.0,0.0,2517.0,1942.0,1.0


In [13]:
df['affiliated'].unique()

array(['right', 'left', 'center-left'], dtype=object)

In [14]:
df["p_left"]  = (df["affiliated"] == "Left").astype(int)
df["p_right"] = (df["affiliated"] == "Right").astype(int)
# if both p_left=0 & p_right=0, that tweet is “Center” by construction

p_left   = df["p_left"].to_numpy()
p_right  = df["p_right"].to_numpy()

with pm.Model() as model:
    mu_int = pm.Normal("mu_int",0,5)
    beta_mis = pm.Normal("beta_mis",0,2)
    beta_lex = pm.Normal("beta_lex",0,2)
    beta_ent = pm.Normal("beta_ent",0,2)
    beta_l = pm.Normal("beta_l",0,2)
    beta_r = pm.Normal("beta_r",0,2)
    beta_lm = pm.Normal("beta_lm",0,2)
    beta_rm = pm.Normal("beta_rm",0,2)

    sigma_u = pm.Exponential("sigma_u", 1)
    u = pm.Normal("u", 0, sigma_u, shape=n_users)

    eta = (
        mu_int
        + beta_mis * mis
        + beta_lex * lex
        + beta_ent * ent
        + beta_l * p_left
        + beta_r * p_right
        + beta_lm * (p_left * mis)
        + beta_rm * (p_right * mis)
        + u[user_idx]
    )
    theta = pm.Exponential("theta", 1)
    mu_ = pm.math.exp(eta)
    obs = pm.NegativeBinomial("obs", mu=mu_, alpha=theta, observed=eng)
    idata = pm.sample(
        draws=1000,
        tune=1000,
        chains=100,
        cores=100,
        target_accept=0.9,
    )

summary = az.summary(
    idata,
    var_names=[
        "mu_int",
        "beta_mis",
        "beta_lex",
        "beta_ent",
        "beta_l",
        "beta_r",
        "beta_lm",
        "beta_rm",
        "sigma_u",
        "theta",
    ],
)
print(summary)

Output()

           mean     sd  hdi_3%  hdi_97%  mcse_mean  mcse_sd  ess_bulk  \
mu_int    3.700  0.161   3.395    4.004      0.001    0.001   13266.0   
beta_mis  0.077  0.030   0.020    0.134      0.000    0.000  133189.0   
beta_lex -0.111  0.056  -0.219   -0.007      0.000    0.000  125650.0   
beta_ent  0.064  0.009   0.046    0.082      0.000    0.000  126767.0   
beta_l    0.019  1.995  -3.724    3.741      0.005    0.007  133069.0   
beta_r   -0.002  1.993  -3.717    3.772      0.005    0.007  133700.0   
beta_lm  -0.001  1.989  -3.790    3.658      0.005    0.007  130997.0   
beta_rm  -0.011  2.003  -3.768    3.742      0.005    0.007  136299.0   
sigma_u   0.557  0.121   0.359    0.780      0.000    0.001   78858.0   
theta     1.418  0.023   1.375    1.461      0.000    0.000  137496.0   

          ess_tail  r_hat  
mu_int     20185.0   1.01  
beta_mis   68945.0   1.00  
beta_lex   68822.0   1.00  
beta_ent   69843.0   1.00  
beta_l     68426.0   1.00  
beta_r     65925.0   1.00  


In [15]:
summary

Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
mu_int,3.7,0.161,3.395,4.004,0.001,0.001,13266.0,20185.0,1.01
beta_mis,0.077,0.03,0.02,0.134,0.0,0.0,133189.0,68945.0,1.0
beta_lex,-0.111,0.056,-0.219,-0.007,0.0,0.0,125650.0,68822.0,1.0
beta_ent,0.064,0.009,0.046,0.082,0.0,0.0,126767.0,69843.0,1.0
beta_l,0.019,1.995,-3.724,3.741,0.005,0.007,133069.0,68426.0,1.0
beta_r,-0.002,1.993,-3.717,3.772,0.005,0.007,133700.0,65925.0,1.0
beta_lm,-0.001,1.989,-3.79,3.658,0.005,0.007,130997.0,67380.0,1.0
beta_rm,-0.011,2.003,-3.768,3.742,0.005,0.007,136299.0,70765.0,1.0
sigma_u,0.557,0.121,0.359,0.78,0.0,0.001,78858.0,54648.0,1.0
theta,1.418,0.023,1.375,1.461,0.0,0.0,137496.0,69475.0,1.0


In [None]:
  # baseline tweet has intagement log(3.7) = 40.4
  # misinfo has +log(0.077)=1.08 if misinfo true => 8 percent increase
  # lexical diversity has -log(0.111)=0.895 => 10 percent decrease
  # entropy has log(0.064) = 1.066 => 6.6 increase

  # party affiliation by itself does not predict higher or lower engagement
  # sigma_u 0.557 exp(0.557) = 1.75, users vary around the global mean