# Alignment without social influence

#### In this notebook, we estimate alignment in conversations.
---

In [7]:
import pickle
import dill
import numpy as np
import pandas as pd
import pymc3 as pm 
from math import ceil

from collections import defaultdict, Counter

import utils
from talkpages import WikiCorpusReader, WikiCorpus
from alignment import Alignment

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

ModuleNotFoundError: No module named 'pymc3.graph'

> The conversations are taken from a selection of 10 topics from the Controversial TalkPages corpus.

In [3]:
# The ten discussion topics analysed in this notebook, in processing order.
TOPICS = [
    'religion',
    'science',
    'politics',
    'history',
    'people',
    'philosophy',
    'sports',
    'linguistics',
    'psychiatry',
    'environment',
]

> To count alignment, we use a selection of marker categories and tokens from the LIWC dictionaries. A preprocessing step (`marker selection.ipynb`) ensures that no two categories overlap.

In [4]:
# Marker categories, grouped into four meta-categories.
META_CATEGORIES = {
    'stylistic': [
        'articles',
        'negations',
        'prepositions',
        'numbers',
        'pronouns',
    ],
    'rhetoric': [
        'tentative',
        'certainty',
        'discrepancy',
        'inclusive',
        'exclusive',
    ],
    'discursive': [
        'causation',
        'insight',
        'inhibition',
        'communication',
        'cognitive process',
        'sensory process',
        'motion',
    ],
    'stance': [
        'optimism',
        'anger',
        'anxiety',
        'sadness',
    ],
}

# Flat list of every category name, in the order the meta-categories are
# declared above.
CATEGORY_LIST = [
    category
    for group in META_CATEGORIES.values()
    for category in group
]

    
# Load the filtered lists of markers.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only load this file if it comes from a trusted source.
with open('../../data/liwc/final.dict', 'rb') as f:
    MARKER_DICT = pickle.load(f)

# Flatten the per-category marker lists into one list of unique markers.
# The result is sorted so that its order is reproducible across kernel
# restarts (a bare list(set(...)) has an arbitrary, hash-dependent order).
# Assumes the markers are mutually comparable (e.g. all strings) -- TODO confirm.
MARKER_LIST = sorted({
    marker
    for markers in MARKER_DICT.values()
    for marker in markers
})

----

> Do we focus on `category-not-word` alignment to exclude cases of lexical repetition ([Doyle & Frank 2016](http://www.aclweb.org/anthology/P16-1050), pp. 531-532)? The `MODE` setting below selects which set of pre-computed counts is used.

In [5]:
# Selects the directory of pre-computed counts to read ('./counts-<MODE>/').
# Swap in the commented-out line to use the 'cnw' counts instead.
MODE = 'category'
# MODE = 'cnw'

# Variational inference settings.
MAX_ITERS = 100000    # maximum number of ADVI iterations (the `n` given to pm.fit)
N_SAMPLES = 4000      # number of draws taken from the fitted approximation
TRACE_SIZE = 1000     # number of final draws kept for the summaries and plots

# Parameters of the Cauchy prior placed on the baseline intercept.
CAUCHY_ALPHA, CAUCHY_BETA = -2, 3

In [10]:
# Debugging switches.
# With both set to True (the defaults) this cell behaves as it did when the
# early exits were hard-coded `break` statements: for every topic it prints the
# model graph of the first category that has data and then moves on to the
# next topic, without fitting anything.
GRAPH_ONLY = True            # True: only print the model graph, skip ADVI and plots
FIRST_CATEGORY_ONLY = True   # True: stop after the first category that has data


def _collect_category_counts(c, N_base_all, N_align_all, C_base_all, C_align_all):
    """Gather the per-dyad counts for the marker category at index ``c``.

    Each ``*_all`` argument is indexed first by dyad and then by category
    index. A dyad is skipped when one of its ``C`` counts exceeds the matching
    ``N`` count, in either the baseline or the alignment condition (such a
    pair cannot be a valid binomial observation).

    Returns four parallel lists: ``N_base, N_align, C_base, C_align``.
    """
    N_base, N_align, C_base, C_align = [], [], [], []
    for dyad in N_base_all:
        if C_base_all[dyad][c] > N_base_all[dyad][c]:
            continue
        if C_align_all[dyad][c] > N_align_all[dyad][c]:
            continue
        N_base.append(N_base_all[dyad][c])
        C_base.append(C_base_all[dyad][c])
        N_align.append(N_align_all[dyad][c])
        C_align.append(C_align_all[dyad][c])
    return N_base, N_align, C_base, C_align


def _build_alignment_model(N_base, N_align, C_base, C_align):
    """Build the robust logistic model for one category and return it.

    The baseline counts are modelled as Binomial(N_base, mu_base) and the
    alignment counts as Binomial(N_align, mu_align), where ``mu_align`` differs
    from ``mu_base`` only through the additive 'alignment intercept'.
    """
    with pm.Model() as model:
        # Parameters
        beta0 = pm.Cauchy('baseline intercept', alpha=CAUCHY_ALPHA, beta=CAUCHY_BETA)
        alpha0 = pm.Normal('alignment intercept', mu=0, sd=0.05)

        # Include a guessing coefficient for robust logistic regression
        # (cfr. J. Kruschke, 2014, 'Doing Bayesian data analysis', pp. 635-636)
        guess = pm.Beta('guessing coefficient', alpha=1, beta=9)

        # Transformed parameters: a mixture of a coin flip (weight `guess`)
        # and the logistic prediction (weight `1 - guess`).
        mu_base = guess * (1/2) + (1-guess) * pm.math.invlogit(beta0)
        mu_align = guess * (1/2) + (1-guess) * pm.math.invlogit(beta0 + alpha0)

        # Likelihood
        pm.Binomial('C_base', p=mu_base, observed=C_base, n=N_base)
        pm.Binomial('C_align', p=mu_align, observed=C_align, n=N_align)

    return model


for TOPIC in TOPICS:

    print('{}\n{}'.format(TOPIC, '*'*15))

    # Load the pre-computed alignment counts for this topic; MODE decides
    # which set of counts is read. Only the first four pickled objects are used.
    with open('./counts-{}/{}.dill'.format(MODE, TOPIC), 'rb') as f:
        N_base_all, N_align_all, C_base_all, C_align_all, _, _, _, _ = dill.load(f)

    # Statistical modelling, one model per marker category
    for c, category in enumerate(CATEGORY_LIST):

        print('{}, {}\n{}'.format(TOPIC, category, '*'*30))

        # Data: collect the counts for this category of markers
        N_base, N_align, C_base, C_align = _collect_category_counts(
            c, N_base_all, N_align_all, C_base_all, C_align_all)

        # Report degenerate inputs.
        if not any(N_base):
            print('N_base: all zeros.')
        if not any(N_align):
            print('N_align: all zeros.')
        if not any(C_align):
            print('C_align: all zeros.')
        if not any(C_base):
            print('C_base: all zeros.')

        # Skip the category when either count vector is all zeros.
        if not (any(C_base) and any(C_align)):
            print()
            continue

        # A simple logistic model.
        model = _build_alignment_model(N_base, N_align, C_base, C_align)

        # Inference
        with model:
            # Prints the DOT source of the model graph.
            print(pm.model_to_graphviz(model))

            if not GRAPH_ONLY:
                print(model.check_test_point())

                approx = pm.fit(n=MAX_ITERS, method='advi',
                                callbacks=[pm.callbacks.CheckParametersConvergence(diff='absolute')])

                print('Sampling {} ...'.format(N_SAMPLES), end=' ')
                full_trace = approx.sample(draws=N_SAMPLES)
                print('Done.')

                # Keep only the last TRACE_SIZE draws.
                trace = full_trace[-TRACE_SIZE:]
                trace_df = pm.trace_to_dataframe(trace)
#                 trace_df.to_csv('./traces/{}/swam/{}-{}.csv'.format(MODE, TOPIC, category))

#                 print(pm.summary(trace))

                pm.traceplot(trace, varnames=['baseline intercept', 'alignment intercept', 'guessing coefficient'])
#                 plt.savefig('plots/traceplots/{}/swam/{}-{}.pdf'.format(MODE, TOPIC, category))
#                 plt.show()

                pm.plot_posterior(trace)
#                 plt.savefig('plots/posteriors/{}/swam/{}-{}.pdf'.format(MODE, TOPIC, category))
                plt.show();

        if FIRST_CATEGORY_ONLY:
            break

religion
***************
religion, articles
******************************
digraph {
	"alignment intercept" [label="alignment intercept ~ Normal"]
	"baseline intercept" [label="baseline intercept ~ Cauchy"]
	"guessing coefficient" [label="guessing coefficient ~ Beta"]
	subgraph "cluster23,223" {
		C_base [label="C_base ~ Binomial" style=filled]
		C_align [label="C_align ~ Binomial" style=filled]
		label="23,223" labeljust=r labelloc=b style=rounded
	}
	"baseline intercept" -> C_base
	"guessing coefficient" -> C_base
	"alignment intercept" -> C_align
	"baseline intercept" -> C_align
	"guessing coefficient" -> C_align
}
science
***************
science, articles
******************************
digraph {
	"alignment intercept" [label="alignment intercept ~ Normal"]
	"baseline intercept" [label="baseline intercept ~ Cauchy"]
	"guessing coefficient" [label="guessing coefficient ~ Beta"]
	subgraph "cluster35,401" {
		C_base [label="C_base ~ Binomial" style=filled]
		C_align [label="C_align ~ B

KeyboardInterrupt: 