# Alignment and centrality

#### In this notebook, we estimate alignment in conversations and quantify the extent to which alignment is influenced by the centrality of the interlocutors.
---

In [1]:
import pickle
import dill
import numpy as np
import pandas as pd
import pymc3 as pm

from collections import defaultdict, Counter

import utils
from talkpages import WikiCorpusReader, WikiCorpus
from alignment import Alignment

%matplotlib inline
import matplotlib.pyplot as plt

> The conversations are taken from a selection of 10 topics from the Controversial TalkPages corpus.

In [2]:
# Topics to analyse. Each name is also used to build the path of the
# per-topic counts file and of the trace / plot files written by the
# modelling loop.
TOPICS = [
    'religion',
    'science',
    'politics',
    'history',
    'people',
    'philosophy',
    'sports',
    'linguistics',
    'psychiatry',
    'environment',
]

> To count alignment, we use a selection of marker categories and tokens from the LIWC dictionaries. There is no overlap between any two categories due to some preprocessing (`marker selection.ipynb`).

In [3]:
# Marker categories, grouped into four meta-categories.
# The order of the groups and of the names inside each group must not change:
# the position of a category in CATEGORY_LIST is used as the index into the
# per-dyad count arrays in the modelling loop.
META_CATEGORIES = {
    'stylistic': ['articles', 'negations', 'prepositions', 'numbers', 'pronouns'],
    'rhetoric': ['tentative', 'certainty', 'discrepancy', 'inclusive', 'exclusive'],
    'discursive': ['causation', 'insight', 'inhibition', 'communication',
                   'cognitive process', 'sensory process', 'motion'],
    'stance': ['optimism', 'anger', 'anxiety', 'sadness'],
}


# Flat list of all category names, in the order they appear in META_CATEGORIES.
CATEGORY_LIST = [name for group in META_CATEGORIES.values() for name in group]

    
# Load the filtered lists of markers.
# NOTE: pickle can execute arbitrary code while loading; only open files
# that come from a trusted source.
with open('../../data/liwc/final.dict', 'rb') as f:
    MARKER_DICT = pickle.load(f)

# Flat list of the distinct markers found in any entry of MARKER_DICT.
# dict.fromkeys removes duplicates while keeping first-seen order, so the
# list is identical on every run; list(set(...)) gives no such guarantee.
marker_list = []
for markers in MARKER_DICT.values():
    marker_list.extend(markers)
MARKER_LIST = list(dict.fromkeys(marker_list))

----

> We focus on `category-not-word` alignment to exclude cases of lexical repetition ([Doyle & Frank 2016](http://www.aclweb.org/anthology/P16-1050), pp. 531-532).

In [4]:
# Which set of alignment counts to analyse. The value selects the
# './counts-<MODE>/' input folder and the '<MODE>' sub-folder of the trace
# and plot outputs.
MODE = 'category'
# MODE = 'cnw'

# Variational inference settings used by the modelling loop.
MAX_ITERS = 100000   # maximum number of ADVI iterations passed to pm.fit
N_SAMPLES = 4000     # number of draws taken from the fitted approximation
TRACE_SIZE = 1000    # number of trailing draws kept for the CSV, summary and plots

# Location (alpha) and scale (beta) of the Cauchy prior on the baseline intercept.
CAUCHY_ALPHA, CAUCHY_BETA = -2, 3

In [None]:
import os  # only needed by this cell, to create the output folders

# Create the output folders up front: a missing folder would otherwise only
# surface at the first to_csv / savefig call, after a model has been fitted.
for out_dir in ('./traces/{}/tiestrength'.format(MODE),
                'plots/traceplots/{}/tiestrength'.format(MODE),
                'plots/posteriors/{}/tiestrength'.format(MODE)):
    os.makedirs(out_dir, exist_ok=True)

# Fit one model per (topic, marker category) pair and store its trace and plots.
for TOPIC in TOPICS:

    print('{}\n{}'.format(TOPIC, '*'*15))

    # Load the pre-computed alignment counts of this topic for the selected MODE.
    # NOTE: dill, like pickle, can execute arbitrary code while loading;
    # only open files that come from a trusted source.
    with open('./counts-{}/{}.dill'.format(MODE, TOPIC), 'rb') as f:
        N_base_all, N_align_all, C_base_all, C_align_all, _, _, dyad2strength, _ = dill.load(f)

    # Statistical modelling
    for c, category in enumerate(CATEGORY_LIST):

        print('{}, {}\n{}'.format(TOPIC, category, '*'*30))

        # Data: one entry per retained dyad. N_* are the numbers of trials and
        # C_* the observed success counts of the two Binomial likelihoods below.
        N_base, N_align, C_base, C_align = [], [], [], []
        tie_strengths = []

        # collect the counts for this category of markers
        for dyad in N_base_all:
            # A count larger than its number of trials is invalid for a
            # Binomial likelihood, so such dyads are dropped.
            if C_base_all[dyad][c] > N_base_all[dyad][c]:
                continue
            if C_align_all[dyad][c] > N_align_all[dyad][c]:
                continue
            # Dyads without a known tie strength are dropped as well. The
            # lookup comes first so that the five lists stay aligned.
            try:
                tie_strengths.append(dyad2strength[dyad])
            except KeyError:
                continue
            N_base.append(N_base_all[dyad][c])
            C_base.append(C_base_all[dyad][c])
            N_align.append(N_align_all[dyad][c])
            C_align.append(C_align_all[dyad][c])

        if not any(N_base):
            print('N_base: all zeros.')
        if not any(N_align):
            print('N_align: all zeros.')
        if not any(C_align):
            print('C_align: all zeros.')
        if not any(C_base):
            print('C_base: all zeros.')

        # Nothing to model when every count is zero (this also covers the
        # case in which no dyad was retained).
        if not (any(N_base) or any(N_align) or any(C_align) or any(C_base)):
            continue

        # presumably rescales to zero mean / unit variance; see utils.standardise
        tie_strengths = utils.standardise(tie_strengths)

        # A simple logistic model
        with pm.Model() as model:
            # Parameters
            beta0 = pm.Cauchy('baseline intercept', alpha=CAUCHY_ALPHA, beta=CAUCHY_BETA)
            alpha0 = pm.Normal('alignment intercept', mu=0, sd=0.25)
            alpha1 = pm.Normal('coefficient tie-strength', mu=0, sd=1)

            # Include a guessing coefficient for robust logistic regression
            # (cfr. J. Kruschke, 2014, 'Doing Bayesian data analysis', pp. 635-636)
            guess = pm.Beta('guessing coefficient', alpha=1, beta=9)

            # Transformed parameters: a mixture of a 0.5 "guess" and the
            # logistic prediction, weighted by the guessing coefficient.
            mu_base = guess * 0.5 + (1-guess) * pm.math.invlogit(beta0)
            mu_align = guess * 0.5 + (1-guess) * pm.math.invlogit(beta0+alpha0 + alpha1*tie_strengths)

            # Model
            base_count = pm.Binomial('C_base', p=mu_base, observed=C_base, n=N_base)
            align_count = pm.Binomial('C_align', p=mu_align, observed=C_align, n=N_align)

        # Inference
        with model:

            print(model.check_test_point())

            approx = pm.fit(n=MAX_ITERS, method='advi',
                            callbacks=[pm.callbacks.CheckParametersConvergence(diff='absolute')])

            print('Sampling {} ...'.format(N_SAMPLES), end=' ')
            full_trace = approx.sample(draws=N_SAMPLES)
            print('Done.')

            # Keep only the last TRACE_SIZE draws for storage and reporting.
            trace = full_trace[-TRACE_SIZE:]
            trace_df = pm.trace_to_dataframe(trace)
            trace_df.to_csv('./traces/{}/tiestrength/{}-{}.csv'.format(MODE, TOPIC, category))

            print(pm.summary(trace))

            pm.traceplot(trace, varnames=['baseline intercept',
                                          'alignment intercept',
                                          'coefficient tie-strength',
                                          'guessing coefficient'])
            plt.savefig('plots/traceplots/{}/tiestrength/{}-{}.pdf'.format(MODE, TOPIC, category))
            # Release the figure once it is on disk; otherwise two figures per
            # (topic, category) pair stay open until the cell finishes.
            plt.close('all')

            pm.plot_posterior(trace)
            plt.savefig('plots/posteriors/{}/tiestrength/{}-{}.pdf'.format(MODE, TOPIC, category))
            plt.close('all')


religion
***************
religion, articles
******************************
baseline intercept                    -2.06
alignment intercept                    0.47
coefficient tie-strength              -0.92
guessing coefficient_logodds__        -1.05
C_base                             -3596.19
C_align                          -494324.09
Name: Log-probability of test_point, dtype: float64


Average Loss = 85,135: 100%|██████████| 100000/100000 [07:01<00:00, 237.32it/s]   
Finished [100%]: Average Loss = 85,135
INFO:pymc3.variational.inference:Finished [100%]: Average Loss = 85,135


Sampling 4000 ... Done.
                              mean        sd  mc_error   hpd_2.5  hpd_97.5
baseline intercept       -4.563773  0.034286  0.001008 -4.626820 -4.495294
alignment intercept       0.454950  0.036409  0.001065  0.383105  0.524691
coefficient tie-strength  0.024793  0.001815  0.000051  0.021389  0.028443
guessing coefficient      0.623259  0.000438  0.000013  0.622442  0.624163


The `ymin` argument was deprecated in Matplotlib 3.0 and will be removed in 3.2. Use `bottom` instead.
  alternative='`bottom`', obj_type='argument')


religion, negations
******************************
baseline intercept                     -2.06
alignment intercept                     0.47
coefficient tie-strength               -0.92
guessing coefficient_logodds__         -1.05
C_base                           -4168890.04
C_align                            -42492.48
Name: Log-probability of test_point, dtype: float64


Average Loss = 11,935:  42%|████▏     | 41897/100000 [02:37<04:20, 222.64it/s]    
Interrupted at 41,903 [41%]: Average Loss = 4.5626e+05
INFO:pymc3.variational.inference:Interrupted at 41,903 [41%]: Average Loss = 4.5626e+05


Sampling 4000 ... Done.
                              mean        sd      mc_error   hpd_2.5  hpd_97.5
baseline intercept       -7.653564  0.021411  7.293027e-04 -7.695549 -7.615448
alignment intercept       1.431192  0.102633  3.498371e-03  1.243548  1.634716
coefficient tie-strength -0.166236  0.066985  2.072410e-03 -0.294051 -0.048959
guessing coefficient      0.000070  0.000018  5.392273e-07  0.000038  0.000106
religion, prepositions
******************************
baseline intercept                     -2.06
alignment intercept                     0.47
coefficient tie-strength               -0.92
guessing coefficient_logodds__         -1.05
C_base                           -4173076.59
C_align                            -42641.00
Name: Log-probability of test_point, dtype: float64


Average Loss = 8,132.5:  51%|█████     | 51012/100000 [03:30<04:05, 199.75it/s]   
Interrupted at 51,014 [51%]: Average Loss = 3.7518e+05
INFO:pymc3.variational.inference:Interrupted at 51,014 [51%]: Average Loss = 3.7518e+05


Sampling 4000 ... 