# Topic Modelling for TimeSets
*Apply LDA to find topics in the CIA leak case.*

In [19]:
load_existing_model = False
input_file = '../data/cialeakcase.json'
model_path = 'lda_models/cia-8-topics'

In [20]:
import json

import pandas as pd
pd.set_option('display.max_colwidth', 1000)

from gensim.models.ldamodel import LdaModel
from gensim import corpora
from gensim.models import Phrases
from gensim.models.phrases import Phraser

from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string

from collections import Counter
from itertools import chain

## Data Preprocessing
#### Load data

In [21]:
def extract_text(e):
    return e.get('title', '') + '. ' + e.get('content', '')

def load_data(filename):
    with open(filename) as f:
        events = json.load(f)['events']
        return [extract_text(e) for e in events]

In [22]:
raw_docs = load_data(input_file)
len(raw_docs), raw_docs[0]

(50,
 'Wilson\'s factfinding mission to Niger. Former ambassador Joseph C. Wilson travels to Niger at the CIA\'s request to check for evidence that Iraq bought uranium "yellowcake" from the African country that could be used for production of a nuclear weapon.')

#### Clean the data

In [23]:
def clean(doc, stop, exclude, lemma):
    stop_free = ' '.join(w for w in doc.lower().split() if w not in stop)
    punc_free = ''.join(c for c in stop_free if c not in exclude)
    normalized = [lemma.lemmatize(w) for w in punc_free.split() if len(w) > 2]
    return normalized

def preprocess(docs):
    'Return a list of words for each preprocessed document.'
    stop = set(stopwords.words('english'))
    exclude = set(string.punctuation) 
    lemma = WordNetLemmatizer()
    return [clean(doc, stop, exclude, lemma) for doc in docs]

In [24]:
docs = preprocess(raw_docs)
docs[0]

['wilson',
 'factfinding',
 'mission',
 'niger',
 'former',
 'ambassador',
 'joseph',
 'wilson',
 'travel',
 'niger',
 'cia',
 'request',
 'check',
 'evidence',
 'iraq',
 'bought',
 'uranium',
 'yellowcake',
 'african',
 'country',
 'could',
 'used',
 'production',
 'nuclear',
 'weapon']

#### Look at word frequency

In [25]:
def get_most_common_terms(docs):
    counter = Counter(chain.from_iterable(docs))
    total_count = sum(counter.values())
    return [(w, c, c / total_count) for w, c in counter.most_common(10)]

In [26]:
get_most_common_terms(docs)

[('rove', 29, 0.022053231939163497),
 ('libby', 29, 0.022053231939163497),
 ('cooper', 23, 0.01749049429657795),
 ('jury', 23, 0.01749049429657795),
 ('time', 19, 0.014448669201520912),
 ('source', 19, 0.014448669201520912),
 ('grand', 19, 0.014448669201520912),
 ('president', 17, 0.012927756653992395),
 ('miller', 16, 0.012167300380228136),
 ('wilson', 15, 0.011406844106463879)]

There are no particular highly-frequent words, so no need to exclude any words.

## Topic Modelling

#### Find common bigrams

In [27]:
def get_phrases(bigram_model, docs):
    phrase_dict = {phrase:score for phrase, score in bigram_model.export_phrases(docs)}
    return sorted(phrase_dict.items(), key=lambda x: -x[1])

In [30]:
bigram_model = Phrases(docs, min_count=5)
get_phrases(bigram_model, docs)

[(b'white house', 71.85798816568048),
 (b'grand jury', 48.631578947368425),
 (b'new york', 36.14285714285714),
 (b'confidential source', 22.82706766917293),
 (b'judith miller', 15.8125)]

#### Use bigrams to find topics

In [31]:
bigram_phraser = Phraser(bigram_model)
bigram_docs = [bigram_phraser[doc] for doc in docs]

In [32]:
get_most_common_terms(bigram_docs)

[('rove', 29, 0.022943037974683545),
 ('libby', 29, 0.022943037974683545),
 ('cooper', 23, 0.01819620253164557),
 ('time', 19, 0.015031645569620253),
 ('grand_jury', 19, 0.015031645569620253),
 ('president', 17, 0.013449367088607595),
 ('wilson', 15, 0.011867088607594937),
 ('white_house', 13, 0.010284810126582278),
 ('say', 13, 0.010284810126582278),
 ('source', 12, 0.00949367088607595)]

In [42]:
def build_lda(docs, num_topics=10, passes=10, alpha='symmetric', eta=None):
    "Return an LDA model from the given list of documents."
    dictionary = corpora.Dictionary(docs)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in docs]    
    return LdaModel(doc_term_matrix, num_topics=num_topics, id2word=dictionary, passes=passes, alpha=alpha, eta=eta, random_state=0)

def get_topics(lda):
    return [[(lda.id2word[t], float(format(p, '.2f'))) for t, p in lda.get_topic_terms(i, topn=5)] for i in range(lda.num_topics)]

In [63]:
if load_existing_model:
    lda = LdaModel.load(model_path)
else:
    lda = build_lda(bigram_docs, num_topics=8, passes=20, alpha=0.5, eta=0.005)
    lda.save(model_path)
get_topics(lda)

[[('wilson', 0.06),
  ('rove', 0.05),
  ('cheney', 0.04),
  ('joe', 0.04),
  ('time', 0.03)],
 [('grand_jury', 0.04),
  ('rove', 0.03),
  ('time', 0.03),
  ('cooper', 0.02),
  ('libby', 0.02)],
 [('libby', 0.05),
  ('president', 0.03),
  ('bush', 0.02),
  ('white_house', 0.02),
  ('prison', 0.02)],
 [('rove', 0.06),
  ('libby', 0.04),
  ('source', 0.03),
  ('mcclellan', 0.03),
  ('say', 0.02)],
 [('testifies', 0.22),
  ('grand_jury', 0.2),
  ('federal', 0.16),
  ('gonzales', 0.11),
  ('white_house', 0.06)],
 [('wilson', 0.03),
  ('plame', 0.02),
  ('column', 0.02),
  ('africa', 0.02),
  ('official', 0.02)],
 [('cooper', 0.04),
  ('confidential_source', 0.03),
  ('jail', 0.02),
  ('judge', 0.02),
  ('miller', 0.02)],
 [('cooper', 0.1),
  ('plames', 0.05),
  ('rove', 0.05),
  ('identity', 0.04),
  ('source', 0.04)]]

#### Examine words in each topic and assign label

In [43]:
def print_topic_terms(lda):
    return [', '.join(lda.id2word[t] for t, p in lda.get_topic_terms(i, topn=5)) for i in range(lda.num_topics)]

In [81]:
print_topic_terms(lda)

['rove, mcclellan, white_house, administration, official',
 'wilson, time, rove, cooper, grand_jury',
 'libby, president, bush, state, white_house',
 'libby, rove, source, say, plame',
 'grand_jury, federal, testifies, gonzales, counsel',
 'plame, wilson, column, woodward, published',
 'cooper, confidential_source, judge, jail, miller',
 'cooper, plames, rove, source, identity']

In [82]:
topic_labels = [
    'white house',
    'wilson',
    'libby-bush',
    'libby-rove',
    'jury-testify',
    'plame-column',
    'courts-judges',
    'rove-source'
]

#### Examine documents with their topics

In [83]:
def get_document_topics(lda, dictionary, doc):
    return lda.get_document_topics(dictionary.doc2bow(doc), minimum_probability=0.1)

In [84]:
dictionary = corpora.Dictionary(docs)

In [85]:
for i in range(10):
    topics = get_document_topics(lda, dictionary, docs[i])
    print(raw_docs[i], [(t, topic_labels[t], format(p, '.2f')) for t, p in topics])
    print()

Wilson's factfinding mission to Niger. Former ambassador Joseph C. Wilson travels to Niger at the CIA's request to check for evidence that Iraq bought uranium "yellowcake" from the African country that could be used for production of a nuclear weapon. [(1, 'wilson', '0.87')]

Bush's State of the Union with "16 words". President Bush delivers his State of the Union address.   In the speech he includes the following sentence: "The British government has learned that Saddam Hussein recently sought significant quantities of uranium from Africa." Those 16 words contradicted what Wilson had reported upon his return from Niger to check out the claim. Months later they would be retracted by the White House. [(2, 'libby-bush', '0.35'), (5, 'plame-column', '0.48')]

Kristof's NYT Column disputes accuracy of 16 words. The New York Times publishes a column by Nicholas Kristof disputing the accuracy of the 16 words in the president's State of the Union address. The column reports that, following up

---
Get back to fb dataset
- POS noun only
- frequency per document
- remove both highly frequent and infrequent terms