In [5]:
import utils
import pickle

from talkpages import WikiCorpusReader, WikiCorpus
from alignment import Alignment
from matplotlib import pyplot as plt
from collections import defaultdict, Counter

from copy import copy
import numpy as np
import networkx as nx

In [2]:
# Every LIWC category considered in this notebook, grouped by meta-category.
# Dict insertion order and list order are significant: they fix the order of
# any flat category list derived from this mapping.
ALL_META_CATEGORIES = {
    'stylistic': [
        'articles', 'negations', 'prepositions', 'numbers', 'pronouns',
    ],
    'rhetoric': [
        'tentative', 'certainty', 'discrepancy', 'inclusive', 'exclusive',
    ],
    'discursive': [
        'causation', 'insight', 'inhibition', 'communication',
        'cognitive process', 'sensory process', 'motion',
    ],
    'stance': [
        'affective process', 'positive emotion', 'optimism',
        'negative emotion', 'anger', 'anxiety', 'sadness',
    ],
}

In [3]:
# Flatten the meta-category groups into one list of category names,
# keeping the order in which they are declared.
CATEGORIES = [cat for cats in ALL_META_CATEGORIES.values() for cat in cats]

# marker_dict is indexed by category name below; marker_words is only used
# for its length (denominator of the overlap percentage).
marker_dict, marker_words = utils.read_liwc_markers('../../../data/liwc/', CATEGORIES)

First strategy: remove all overlapping markers. Let's compute the percentage of markers that are retained.

In [11]:
def overlapping_markers(marker_dictionary, category_list):
    """Count the markers that are shared by more than one category.

    Parameters
    ----------
    marker_dictionary : mapping
        Maps each category name to an iterable of hashable markers.
    category_list : iterable
        Names of the categories to compare with each other. Every name
        must be a key of ``marker_dictionary``.

    Returns
    -------
    collections.Counter
        One key per marker found in at least two differently named
        categories. A marker gains 1 for every *ordered* pair of categories
        sharing it, so for distinct names a marker present in ``k``
        categories has the value ``k * (k - 1)``. Markers belonging to a
        single category are absent from the result.

    Raises
    ------
    KeyError
        If a name in ``category_list`` is missing from ``marker_dictionary``
        (unless the mapping supplies defaults itself).
    """
    # Materialise once so that one-shot iterables can be traversed twice.
    categories = list(category_list)

    # Build every category's marker set a single time instead of rebuilding
    # both sets for each ordered pair of categories.
    marker_sets = {cat: set(marker_dictionary[cat]) for cat in categories}

    excluded_markers = Counter()
    for cat in categories:
        for other in categories:
            if cat != other:
                excluded_markers.update(marker_sets[cat] & marker_sets[other])
    return excluded_markers

In [5]:
# Counter keyed by every marker that is listed under at least two categories.
taboo_list = overlapping_markers(marker_dictionary=marker_dict, category_list=CATEGORIES)

**68%** of the markers appear in more than one category.

In [6]:
# Number of overlapping markers relative to the size of marker_words, in percent.
overlap_ratio = len(taboo_list) / len(marker_words)
round(overlap_ratio * 100, 2)

67.77

These are the numbers of markers in each category before and after filtering out every marker that appears in more than one category:

In [7]:
# One row per category: name, total number of markers, and number of markers
# left once every marker present in taboo_list is dropped.
for cat, markers in marker_dict.items():
    retained = [m for m in markers if m not in taboo_list]
    print('{:20}  {:3}     {:3}'.format(cat, len(markers), len(retained)))

articles                3       3
negations              31      21
prepositions           43      32
numbers                29      28
pronouns               71      64
tentative              81      43
certainty              30      17
discrepancy            32       0
inclusive              16       9
exclusive              19      13
causation              50       0
insight               117       0
inhibition             64       0
communication         127      40
cognitive process     322      34
sensory process       112      58
motion                 73      72
affective process     618       3
positive emotion      264       0
optimism               70       0
negative emotion      345       0
anger                 120       0
anxiety                61       0
sadness                72       0


----

Let's start by tackling the stance categories. Let's remove the _affective process_ category which includes most affective markers. Now **48%** of the markers appear in more than one category.

In [8]:
# Same table as before, restricted to the categories listed in CATEGORIES.
# NOTE(review): neither CATEGORIES nor taboo_list is recomputed in this cell,
# so the counts match the previous table - confirm whether a recomputation
# without 'affective process' was intended here.
for cat, markers in marker_dict.items():
    if cat in CATEGORIES:
        n_unique = sum(1 for m in markers if m not in taboo_list)
        print('{:20}  {:3}     {:3}'.format(cat, len(markers), n_unique))

articles                3       3
negations              31      21
prepositions           43      32
numbers                29      28
pronouns               71      64
tentative              81      43
certainty              30      17
discrepancy            32       0
inclusive              16       9
exclusive              19      13
causation              50       0
insight               117       0
inhibition             64       0
communication         127      40
cognitive process     322      34
sensory process       112      58
motion                 73      72
affective process     618       3
positive emotion      264       0
optimism               70       0
negative emotion      345       0
anger                 120       0
anxiety                61       0
sadness                72       0


-----

Stance categories: _Optimism_ is subsumed by _positive emotion_ and _negative emotion_ subsumes _anger, anxiety, sadness_.

In [9]:
# For every (coarse, fine-grained) pair of emotion categories, report which
# percentage of each category's markers is shared with the other one.
coarse_categories = ['positive emotion', 'negative emotion']
fine_categories = ['anger', 'anxiety', 'sadness', 'optimism']

for c2 in coarse_categories:
    coarse_markers = set(marker_dict[c2])
    for c in fine_categories:
        overlap = coarse_markers & set(marker_dict[c])
        for name in (c2, c):
            share = round(100 * len(overlap) / len(marker_dict[name]), 2)
            print('Overlap portion of {}: {}%'.format(name, share))
        print()


Overlap portion of positive emotion: 0.0%
Overlap portion of anger: 0.0%

Overlap portion of positive emotion: 0.0%
Overlap portion of anxiety: 0.0%

Overlap portion of positive emotion: 0.0%
Overlap portion of sadness: 0.0%

Overlap portion of positive emotion: 26.52%
Overlap portion of optimism: 100.0%

Overlap portion of negative emotion: 34.78%
Overlap portion of anger: 100.0%

Overlap portion of negative emotion: 17.68%
Overlap portion of anxiety: 100.0%

Overlap portion of negative emotion: 20.87%
Overlap portion of sadness: 100.0%

Overlap portion of negative emotion: 0.0%
Overlap portion of optimism: 0.0%



After removing _affective process, optimism, anger, anxiety,_ and _sadness_, **28%** of the markers appear in more than one category.

In [10]:
# Category selection without 'affective process', 'optimism', 'anger',
# 'anxiety' and 'sadness'. The 'rhetoric' key is spelled as in the other
# meta-category mappings of this notebook.
META_CATEGORIES = {
    'stylistic': ['articles', 'negations', 'prepositions', 'numbers', 'pronouns'],
    'rhetoric': ['tentative', 'certainty', 'discrepancy', 'inclusive', 'exclusive'],
    'discursive': ['causation', 'insight', 'inhibition', 'communication',
                   'cognitive process', 'sensory process', 'motion'],
    'stance': ['positive emotion', 'negative emotion'],
}

# Flatten the groups into the list of selected category names.
CATEGORIES = [cat for cats in META_CATEGORIES.values() for cat in cats]

# Reload the markers for this selection and recompute the overlap.
marker_dict, marker_words = utils.read_liwc_markers('../../../data/liwc/', CATEGORIES)

taboo_list = overlapping_markers(marker_dict, CATEGORIES)
print('{}%'.format(round(len(taboo_list) / len(marker_words) * 100, 2)))

28.38%


In [11]:
# Per-category marker counts: total, then markers not shared with any other
# selected category.
row = '{:20}  {:3}     {:3}'
for cat, markers in marker_dict.items():
    if cat not in CATEGORIES:
        continue
    exclusive_markers = [m for m in markers if m not in taboo_list]
    print(row.format(cat, len(markers), len(exclusive_markers)))

articles                3       3
negations              31      21
prepositions           43      32
numbers                29      28
pronouns               71      64
tentative              81      43
certainty              30      17
discrepancy            32       0
inclusive              16       9
exclusive              19      13
causation              50       0
insight               117       0
inhibition             64       0
communication         127      44
cognitive process     322      34
sensory process       112      58
motion                 73      72
positive emotion      264     222
negative emotion      345     309


Alternatively, let's try removing the less fine-grained categories: _positive emotion, negative emotion, affective process_.

In [12]:
# Alternative selection: keep the fine-grained stance categories and leave out
# 'positive emotion', 'negative emotion' and 'affective process'. The
# 'rhetoric' key is spelled as in the other meta-category mappings of this
# notebook.
META_CATEGORIES = {
    'stylistic': ['articles', 'negations', 'prepositions', 'numbers', 'pronouns'],
    'rhetoric': ['tentative', 'certainty', 'discrepancy', 'inclusive', 'exclusive'],
    'discursive': ['causation', 'insight', 'inhibition', 'communication',
                   'cognitive process', 'sensory process', 'motion'],
    'stance': ['optimism', 'anger', 'anxiety', 'sadness'],
}

# Flatten the groups into the list of selected category names.
CATEGORIES = [cat for cats in META_CATEGORIES.values() for cat in cats]

# Reload the markers for this selection and recompute the overlap.
marker_dict, marker_words = utils.read_liwc_markers('../../../data/liwc/', CATEGORIES)

taboo_list = overlapping_markers(marker_dict, CATEGORIES)
print('{}%'.format(round(len(taboo_list) / len(marker_words) * 100, 2)))

33.7%


In [13]:
# Table of total vs. non-overlapping marker counts for the selected categories.
for cat, markers in marker_dict.items():
    if cat in CATEGORIES:
        non_overlapping = sum(1 for m in markers if m not in taboo_list)
        print('{:20}  {:3}     {:3}'.format(cat, len(markers), non_overlapping))

articles                3       3
negations              31      22
prepositions           43      32
numbers                29      28
pronouns               71      64
tentative              81      46
certainty              30      19
discrepancy            32       0
inclusive              16       9
exclusive              19      13
causation              50       0
insight               117       0
inhibition             64       0
communication         127      52
cognitive process     322      37
sensory process       112      63
motion                 73      72
optimism               70      50
anger                 120     100
anxiety                61      48
sadness                72      68


The problematic categories are now: _discrepancy, inclusive, exclusive, causation, insight, inhibition_.

_Causation, discrepancy, insight, inhibition_ are subsumed by _cognitive process_.

In [14]:
# Overlap between 'cognitive process' and each candidate sub-category,
# expressed as a percentage of either category's marker list.
c2 = 'cognitive process'
broad_markers = set(marker_dict[c2])
candidates = ['discrepancy', 'inclusive', 'exclusive', 'causation', 'insight', 'inhibition', 'cognitive process']

for c in candidates:
    if c == c2:
        # A category is not compared with itself.
        continue
    overlap = broad_markers & set(marker_dict[c])
    print('Overlap portion of {}: {}%'.format(c2, round(100 * len(overlap) / len(marker_dict[c2]), 2)))
    print('Overlap portion of {}: {}%'.format(c, round(100 * len(overlap) / len(marker_dict[c]), 2)))
    print()



Overlap portion of cognitive process: 9.94%
Overlap portion of discrepancy: 100.0%

Overlap portion of cognitive process: 0.0%
Overlap portion of inclusive: 0.0%

Overlap portion of cognitive process: 0.31%
Overlap portion of exclusive: 5.26%

Overlap portion of cognitive process: 15.53%
Overlap portion of causation: 100.0%

Overlap portion of cognitive process: 36.34%
Overlap portion of insight: 100.0%

Overlap portion of cognitive process: 19.88%
Overlap portion of inhibition: 100.0%



We could keep these four categories, discarding their cross-overlaps, and reduce the _cognitive process_ category to the set difference between it and the union of the other four. This still leaves us with 64 markers.

In [15]:
# All markers covered by the four fine-grained cognitive categories.
union = set()
for sub_category in ('discrepancy', 'causation', 'insight', 'inhibition'):
    union.update(marker_dict[sub_category])

# Size of the unreduced 'cognitive process' list, for reference.
print(len(marker_dict['cognitive process']))

# Shallow copy: only the 'cognitive process' entry is rebound, so the lists
# held by marker_dict are left as they are.
new_marker_dict = copy(marker_dict)
cognitive = set(marker_dict['cognitive process']).difference(union)
new_marker_dict['cognitive process'] = list(cognitive)

322


In [16]:
# Recompute the overlap on the dictionary with the reduced 'cognitive process'.
taboo_list = overlapping_markers(new_marker_dict, CATEGORIES)
overlap_pct = round(len(taboo_list) / len(marker_words) * 100, 2)
print('{}%'.format(overlap_pct))
print()

# Columns: category, original size, original markers outside taboo_list,
# reduced markers outside taboo_list.
row_format = '{:20}  {:3}    {:3}   {:3}'
for cat, markers in new_marker_dict.items():
    if cat in CATEGORIES:
        original = marker_dict[cat]
        kept_original = [m for m in original if m not in taboo_list]
        kept_reduced = [m for m in markers if m not in taboo_list]
        print(row_format.format(cat, len(original), len(kept_original), len(kept_reduced)))

16.53%

articles                3      3     3
negations              31     22    22
prepositions           43     32    32
numbers                29     28    28
pronouns               71     64    64
tentative              81     46    46
certainty              30     19    19
discrepancy            32     22    22
inclusive              16      9     9
exclusive              19     13    13
causation              50     40    40
insight               117     76    76
inhibition             64     50    50
communication         127     52    52
cognitive process     322    225    37
sensory process       112     63    63
motion                 73     72    72
optimism               70     50    50
anger                 120    100   100
anxiety                61     48    48
sadness                72     68    68


Let's now decouple function words from stance and discursive markers.

In [17]:
# Union of the markers of the five stylistic (function-word) categories.
fn_words = set()
for fn_category in ('articles', 'negations', 'prepositions', 'numbers', 'pronouns'):
    fn_words |= set(marker_dict[fn_category])

# Shallow copy; entries of the non-stylistic categories are rebound below.
new_marker_dict2 = copy(new_marker_dict)
for meta in ['rhetoric', 'discursive', 'stance']:
    for cat in ALL_META_CATEGORIES[meta]:
        try:
            content_markers = set(new_marker_dict[cat]) - fn_words
        except KeyError:
            # ALL_META_CATEGORIES lists categories that may not have been
            # loaded into new_marker_dict; those are skipped.
            continue
        new_marker_dict2[cat] = list(content_markers)

In [18]:
# Overlap after function words were taken out of the other categories.
taboo_list = overlapping_markers(new_marker_dict2, CATEGORIES)
print('{}%'.format(round(len(taboo_list) / len(marker_words) * 100, 2)))
print()

# Columns: category, original size, non-taboo markers before the
# function-word removal, non-taboo markers after it.
table_row = '{:20}  {:3}     {:3}    {:3}'
for cat in new_marker_dict2:
    if cat in CATEGORIES:
        before_decoupling = [m for m in new_marker_dict[cat] if m not in taboo_list]
        after_decoupling = [m for m in new_marker_dict2[cat] if m not in taboo_list]
        print(table_row.format(cat, len(marker_dict[cat]),
                               len(before_decoupling), len(after_decoupling)))

14.7%

articles                3       3      3
negations              31      27     27
prepositions           43      42     42
numbers                29      28     28
pronouns               71      69     69
tentative              81      51     46
certainty              30      20     19
discrepancy            32      26     22
inclusive              16      15      9
exclusive              19      16     13
causation              50      41     40
insight               117      76     76
inhibition             64      50     50
communication         127      52     52
cognitive process     322      37     37
sensory process       112      63     63
motion                 73      72     72
optimism               70      50     50
anger                 120     100    100
anxiety                61      48     48
sadness                72      68     68


----

# Now let's do the whole thing once again to obtain the final lists of markers.

In [6]:
# Final category selection, grouped by meta-category. Dict insertion order
# and list order are significant for any flat list derived from this mapping.
RELEVANT_META_CATEGORIES = {
    'stylistic': [
        'articles', 'negations', 'prepositions', 'numbers', 'pronouns',
    ],
    'rhetoric': [
        'tentative', 'certainty', 'discrepancy', 'inclusive', 'exclusive',
    ],
    'discursive': [
        'causation', 'insight', 'inhibition', 'communication',
        'cognitive process', 'sensory process', 'motion',
    ],
    'stance': [
        'anger', 'anxiety', 'sadness', 'optimism',
    ],
}

> Note that the stance categories _positive emotion, negative emotion, affective process_ have been removed.

In [7]:
# Flat list of the final category names, in declaration order.
RELEVANT_CATEGORIES = [
    cat
    for cats in RELEVANT_META_CATEGORIES.values()
    for cat in cats
]

relevant_marker_dict, relevant_marker_words = utils.read_liwc_markers('../../../data/liwc/', RELEVANT_CATEGORIES)

# Shallow snapshot of the freshly loaded dictionary; it keeps the original
# entries when keys of relevant_marker_dict are rebound later on.
baseline_marker_dict = copy(relevant_marker_dict)

In [8]:
# Markers covered by the four fine-grained cognitive categories.
union = set()
for sub_category in ('discrepancy', 'causation', 'insight', 'inhibition'):
    union.update(relevant_marker_dict[sub_category])

# Keep in 'cognitive process' only what none of those four categories covers.
cognitive = set(relevant_marker_dict['cognitive process']).difference(union)
relevant_marker_dict['cognitive process'] = list(cognitive)


In [9]:
# Union of the markers of the five stylistic (function-word) categories.
fn_words = set()
for fn_category in ('articles', 'negations', 'prepositions', 'numbers', 'pronouns'):
    fn_words |= set(relevant_marker_dict[fn_category])

# Strip those function words from every non-stylistic category.
for meta in ['rhetoric', 'discursive', 'stance']:
    for cat in RELEVANT_META_CATEGORIES[meta]:
        try:
            content_markers = set(relevant_marker_dict[cat]) - fn_words
        except KeyError:
            # Category missing from the loaded dictionary: leave it out.
            continue
        relevant_marker_dict[cat] = list(content_markers)

In [12]:
# Final overview: original size of each category next to the number of
# markers that are not shared with any other final category.
taboo_list = overlapping_markers(relevant_marker_dict, RELEVANT_CATEGORIES)
summary_row = '{:20}  {:3}     {:3}'
for cat, markers in relevant_marker_dict.items():
    n_retained = sum(1 for m in markers if m not in taboo_list)
    print(summary_row.format(cat, len(baseline_marker_dict[cat]), n_retained))

articles                3       3
negations              31      27
prepositions           43      42
numbers                29      28
pronouns               71      69
tentative              81      46
certainty              30      19
discrepancy            32      22
inclusive              16       9
exclusive              19      13
causation              50      40
insight               117      76
inhibition             64      50
communication         127      52
cognitive process     322      37
sensory process       112      63
motion                 73      72
anger                 120     100
anxiety                61      48
sadness                72      68
optimism               70      50


In [13]:
# Persist the processed marker dictionary next to the LIWC data.
# NOTE(review): relevant_marker_dict is written as it stands; markers found in
# taboo_list were only left out of the printed counts, not removed from the
# lists themselves - confirm that this is the intended content of the file.
with open('../../../data/liwc/final.dict', 'wb') as dict_file:
    pickle.dump(relevant_marker_dict, dict_file)