In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np

import pymc3 as pm 
import pandas as pd
import utils

from talkpages import WikiCorpusReader, WikiCorpus
from alignment import Alignment

from collections import defaultdict

TOPIC = 'environment'

corpus_reader = WikiCorpusReader('../../data/controversial/')
tsv_filename = corpus_reader.json_to_tsv('tsv/', topic_list=[TOPIC])
corpus = WikiCorpus(tsv_filename)

100%|██████████| 1236/1236 [00:00<00:00, 388996.75it/s]


Loading threads from original json file...


 55%|█████▍    | 91229/166322 [00:00<00:00, 463192.11it/s]

166322 threads loaded.
Process threads.


100%|██████████| 166322/166322 [00:00<00:00, 461955.82it/s]
100%|██████████| 12395/12395 [00:00<00:00, 284255.81it/s]


12395 valid utterances found.
Utterances written to tab-separated file tsv/WikiControversial-environment.tsv


In [2]:
corpus.tokenize_posts()
    
markers, marker_words = utils.read_convokit_markers('../coord-liwc-patterns.txt')
categories = list(markers.keys())

corpus.count_marker_categories(markers)

Tokenizing posts.: 100%|██████████| 12395/12395 [05:53<00:00, 35.09it/s]
Detecting article.: 100%|██████████| 12395/12395 [00:00<00:00, 98646.34it/s] 
Detecting auxverb.:   0%|          | 0/12395 [00:00<?, ?it/s]

Filtered 0 posts with 0-length utterances


Detecting auxverb.: 100%|██████████| 12395/12395 [00:03<00:00, 3613.45it/s]
Detecting conj.: 100%|██████████| 12395/12395 [00:00<00:00, 17968.49it/s]
Detecting adverb.: 100%|██████████| 12395/12395 [00:01<00:00, 7856.93it/s]
Detecting ppron.: 100%|██████████| 12395/12395 [00:01<00:00, 7110.07it/s]
Detecting ipron.: 100%|██████████| 12395/12395 [00:01<00:00, 11586.15it/s]
Detecting preps.: 100%|██████████| 12395/12395 [00:01<00:00, 8751.44it/s]
Detecting quant.: 100%|██████████| 12395/12395 [00:02<00:00, 6044.51it/s]


In [3]:
pairs = corpus.reply_pairs()
al = Alignment(corpus, markers)

In [4]:
users = corpus.get_users()
net = corpus.social_network(prune=False)
corpus.assign_centrality('eigenvector')

100%|██████████| 6568/6568 [00:00<00:00, 6683.40it/s]
 14%|█▍        | 952/6568 [00:00<00:00, 9516.79it/s]

Build network.


100%|██████████| 6568/6568 [00:00<00:00, 10291.46it/s]


The unpruned network has 2095 nodes (users).
Centrality information has been assigned to all pairs.


In [5]:
N_base, N_align, C_base, C_align, dyad2cent = al.counts(mode='categorical', centrality='eigenvector')

100%|██████████| 6568/6568 [00:03<00:00, 2054.71it/s]


In [6]:
centr_A, centr_B = zip(*dyad2cent.values())

In [7]:
def invlogit(x):
    return pm.math.exp(x) / (1 + pm.math.exp(x))

In [9]:
indiv_traces = {}

# centr_A = corpus.pairs['eigenvector_a'].values
# centr_B = corpus.pairs['eigenvector_b'].values

for cat, category in enumerate(categories):

    N_b = [N_base[dyad][cat] for dyad in N_base]
    N_a = [N_align[dyad][cat] for dyad in N_align]
    C_b = [C_base[dyad][cat] for dyad in C_base]
    C_a = [C_align[dyad][cat] for dyad in C_align]
    
    with pm.Model() as individual_model:

        b0_base  = pm.Cauchy('beta0_base' , alpha=0, beta=2.5)
#         b1_base  = pm.Normal('beta1_base' , mu=0, sd=1)
#         b2_base  = pm.Normal('beta2_base' , mu=0, sd=1)
        b0_align = pm.Normal('beta0_align', mu=0, sd=0.25)
        b1_align = pm.Normal('beta1_align', mu=0, sd=1)
        b2_align = pm.Normal('beta2_align', mu=0, sd=1)
        
        
#         lincomb_base  = b0_base  + (b1_base  * centr_A) + (b2_base  * centr_B)
        lincomb_align = b0_base + (b1_align * centr_A) + (b2_align * centr_B) + b0_align
    
    
        mu_base  = pm.math.invlogit(b0_base)
        mu_align = pm.math.invlogit(lincomb_align)
    
        base_count  = pm.Binomial('C_base' , p=mu_base , observed=C_b, n=N_b)
        align_count = pm.Binomial('C_align', p=mu_align, observed=C_a, n=N_a)
        
    with individual_model:
        start = pm.find_MAP()
        step = pm.NUTS(scaling=start)
        indiv_traces[category] = pm.sample(draws=2000, random_seed=123, progressbar=True, tune=500, chains=4)
    

logp = -8,567.9, ||grad|| = 23.391: 100%|██████████| 29/29 [00:00<00:00, 582.62it/s]  
Auto-assigning NUTS sampler...
INFO:pymc3:Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
INFO:pymc3:Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 2 jobs)
INFO:pymc3:Multiprocess sampling (4 chains in 2 jobs)
NUTS: [beta2_align, beta1_align, beta0_align, beta0_base]
INFO:pymc3:NUTS: [beta2_align, beta1_align, beta0_align, beta0_base]
Sampling 4 chains: 100%|██████████| 10000/10000 [01:23<00:00, 120.41draws/s]
The acceptance probability does not match the target. It is 0.9140837216664544, but should be close to 0.8. Try to increase the number of tuning steps.
The acceptance probability does not match the target. It is 0.8852397799486448, but should be close to 0.8. Try to increase the number of tuning steps.
logp = -8,693.9, ||grad|| = 5.1971: 100%|██████████| 26/26 [00:00<00:00, 610.00it/s]  
Auto-assigning NUTS sampler...
INFO:pymc3:Auto-

The acceptance probability does not match the target. It is 0.8957349170343957, but should be close to 0.8. Try to increase the number of tuning steps.
logp = -8,970.1, ||grad|| = 166.56: 100%|██████████| 29/29 [00:00<00:00, 658.75it/s]  
Auto-assigning NUTS sampler...
INFO:pymc3:Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
INFO:pymc3:Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 2 jobs)
INFO:pymc3:Multiprocess sampling (4 chains in 2 jobs)
NUTS: [beta2_align, beta1_align, beta0_align, beta0_base]
INFO:pymc3:NUTS: [beta2_align, beta1_align, beta0_align, beta0_base]
Sampling 4 chains: 100%|██████████| 10000/10000 [01:28<00:00, 70.42draws/s]
The acceptance probability does not match the target. It is 0.8831547140307391, but should be close to 0.8. Try to increase the number of tuning steps.
The acceptance probability does not match the target. It is 0.9132776327957636, but should be close to 0.8. Try to increase the number 

In [None]:
for category in categories:
    df_trace = pm.trace_to_dataframe(indiv_traces[category])
    pd.plotting.scatter_matrix(df_trace[-1000:], diagonal='kde')
#     print(category)
    pm.traceplot(indiv_traces[category])

# df_trace = pm.trace_to_dataframe(indiv_traces['ppron'])
# pd.plotting.scatter_matrix(df_trace[-1000:], diagonal='kde');
# pm.traceplot(indiv_traces['ppron']);

In [None]:
for category in categories:
    print(category)
    print((indiv_traces[category]))

In [None]:
import seaborn as sn
trace = indiv_traces['article']
sn.kdeplot(trace['beta1_align'], trace['beta2_align'])
plt.xlabel('c_A', fontsize=20)
plt.ylabel('c_B', fontsize=20)
plt.style.use('ggplot')

In [None]:
# plt.figure(figsize=(8, 10))
for c in categories:
    pm.plots.plot_posterior(indiv_traces[c], varnames=['beta0_base', 'beta0_align', 'beta1_align', 'beta2_align'])
#     pm.forestplot(indiv_traces[c], varnames=['beta0_base', 'beta1_align', 'beta2_align'])

In [None]:
log_reg_code = """
data {
    int<lower=0> D;   // number of dyads
    
    int N_base [D];   // num tokens baseline
    int N_align[D];   // num tokens alignment
    int C_base [D];   // num marker occurrences baseline
    int C_align[D];   // num marker occurrences alignment
    
    real<lower=0, upper=1> c_A[D];   // centrality of addressees
    real<lower=0, upper=1> c_B[D];   // centrality of speakers
}
transformed data {}
parameters {
    real beta0_base [D];
    real beta1_base [D];
    real beta2_base [D];
    real beta0_align[D];
    real beta1_align[D];
    real beta2_align[D];
}
transformed parameters {
    real lincomb_base [D]; 
    real lincomb_align[D];
    
    real<lower=0,upper=1> mu_base [D]; 
    real<lower=0,upper=1> mu_align[D];
      
    lincomb_base  = beta0_base  + beta1_base  * c_A + beta2_base  * c_B
    lincomb_align = beta0_align + beta1_align * c_A + beta2_align * c_B
    
    mu_base  = inv_logit(lincomb_base)
    mu_align = inv_logit(lincomb_base + lincomb_align)
}
model {
    beta0_base  ~ normal(0, 1)
    beta1_base  ~ normal(0, 1)
    beta2_base  ~ normal(0, 1)
    beta0_align ~ normal(0, 1)
    beta1_align ~ normal(0, 1)
    beta2_align ~ normal(0, 1)
    
    C_base  ~ binomial(N_base,  mu_base)
    C_align ~ binomial(N_align, mu_align) 
  }
}
generated quantities {}
"""