# Topic Modelling for TimeSets
*Apply LDA to find topics in the facebook dataset.*

In [19]:
load_existing_model = False
input_file = '../data/detailedFbData.json'
model_path = 'lda_models/fb-8-topics'

In [20]:
import json

import pandas as pd
pd.set_option('display.max_colwidth', 1000)

from gensim.models.ldamodel import LdaModel
from gensim import corpora
from gensim.models import Phrases
from gensim.models.phrases import Phraser

from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string

from collections import Counter
from itertools import chain

## Data Preprocessing
#### Load data

In [21]:
def extract_text(post):
    return post.get('message') or post.get('description') or ''

def load_data(filename):
    with open(filename) as f:
        posts = json.load(f) # list of posts
        return [extract_text(post) for post in posts]

In [22]:
raw_docs = load_data(input_file)
len(raw_docs), raw_docs[0]

(536,
 'WATCH: "JEB EXCLAMATION POINT!" - Jeb Bush, now driving around Selina Meyer in the opening intro to the #Emmys with Jimmy Kimmel.')

#### Clean the data

In [24]:
def clean(doc, stop, exclude, lemma):
    stop_free = ' '.join(w for w in doc.lower().split() if w not in stop)
    punc_free = ''.join(c for c in stop_free if c not in exclude)
    normalized = [lemma.lemmatize(w) for w in punc_free.split() if len(w) > 2]
    return normalized

def preprocess(docs):
    'Return a list of words for each preprocessed document.'
    stop = set(stopwords.words('english'))
    exclude = set(string.punctuation) 
    lemma = WordNetLemmatizer()
    return [clean(doc, stop, exclude, lemma) for doc in docs]

In [25]:
docs = preprocess(raw_docs)
docs[0]

['watch',
 'jeb',
 'exclamation',
 'point',
 'jeb',
 'bush',
 'driving',
 'around',
 'selina',
 'meyer',
 'opening',
 'intro',
 'emmy',
 'jimmy',
 'kimmel']

#### Look at word frequency

In [7]:
def get_most_common_terms(docs):
    counter = Counter(chain.from_iterable(docs))
    total_count = sum(counter.values())
    return [(w, c, c / total_count) for w, c in counter.most_common(10)]

In [8]:
get_most_common_terms(docs)

[('trump', 205, 0.027777777777777776),
 ('donald', 169, 0.022899728997289974),
 ('clinton', 113, 0.015311653116531165),
 ('hillary', 98, 0.013279132791327914),
 ('say', 81, 0.01097560975609756),
 ('president', 76, 0.010298102981029811),
 ('debate', 74, 0.01002710027100271),
 ('presidential', 58, 0.007859078590785908),
 ('obama', 50, 0.006775067750677507),
 ('first', 45, 0.006097560975609756)]

#### The dataset seems about debate between Donald Trump and Hillary Clinton, so it's safe to remove those top four words and other presidential related one.

In [9]:
def remove_high_frequency_words(docs, excluded_words):
    return [[w for w in doc if w not in excluded_words] for doc in docs]

In [10]:
excluded_words = frozenset(['trump', 'donald', 'clinton', 'hillary', 'president', 'presidential'])
filtered_docs = remove_high_frequency_words(docs, excluded_words)
get_most_common_terms(filtered_docs)

[('say', 81, 0.012160336285842966),
 ('debate', 74, 0.011109443026572586),
 ('obama', 50, 0.007506380423359856),
 ('first', 45, 0.00675574238102387),
 ('new', 43, 0.006455487164089476),
 ('republican', 37, 0.005554721513286293),
 ('campaign', 35, 0.005254466296351899),
 ('election', 28, 0.00420357303708152),
 ('news', 26, 0.003903317820147125),
 ('people', 26, 0.003903317820147125)]

## Topic Modelling

#### Find common bigrams

In [11]:
def get_phrases(bigram_model, docs):
    phrase_dict = {phrase:score for phrase, score in bigram_model.export_phrases(docs)}
    return sorted(phrase_dict.items(), key=lambda x: -x[1])

In [12]:
bigram_model = Phrases(filtered_docs, min_count=5)
get_phrases(bigram_model, filtered_docs)

[(b'abc news', 230.53846153846155),
 (b'occupy democrat', 208.9411764705882),
 (b'north carolina', 166.5),
 (b'new york', 139.3953488372093),
 (b'like page', 106.91638795986621),
 (b'fighting isi', 95.14285714285714),
 (b'white house', 91.441647597254),
 (b'republican nominee', 86.4),
 (b'united nation', 61.47692307692308),
 (b'new jersey', 53.10299003322259),
 (b'police shooting', 42.7379679144385),
 (b'first debate', 38.4),
 (b'michelle obama', 26.639999999999997),
 (b'election day', 17.83928571428571)]

#### Use bigrams to find topics

In [13]:
bigram_phraser = Phraser(bigram_model)
bigram_docs = [bigram_phraser[doc] for doc in filtered_docs]

In [14]:
get_most_common_terms(bigram_docs)

[('say', 81, 0.012430939226519336),
 ('debate', 53, 0.008133824432166974),
 ('obama', 44, 0.006752608962553714),
 ('campaign', 35, 0.005371393492940454),
 ('people', 26, 0.003990178023327195),
 ('first', 24, 0.003683241252302026),
 ('would', 24, 0.003683241252302026),
 ('american', 24, 0.003683241252302026),
 ('state', 24, 0.003683241252302026),
 ('republican', 24, 0.003683241252302026)]

In [15]:
def build_lda(docs, num_topics=10, passes=10, alpha='symmetric', eta=None):
    "Return an LDA model from the given list of documents."
    dictionary = corpora.Dictionary(docs)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in docs]    
    return LdaModel(doc_term_matrix, num_topics=num_topics, id2word=dictionary, passes=passes, alpha=alpha, eta=eta, random_state=0)

def get_topics(lda):
    return [[(lda.id2word[t], float(format(p, '.2f'))) for t, p in lda.get_topic_terms(i)] for i in range(lda.num_topics)]

In [18]:
if load_existing_model:
    lda = LdaModel.load(model_path)
else:
    lda = build_lda(bigram_docs, num_topics=8, passes=20)
    lda.save(model_path)
get_topics(lda)

[[('say', 0.01),
  ('great', 0.01),
  ('debate', 0.01),
  ('republican', 0.01),
  ('first_debate', 0.01),
  ('week', 0.01),
  ('race', 0.01),
  ('city', 0.01),
  ('campaign', 0.01),
  ('advantage', 0.01)],
 [('obama', 0.01),
  ('like', 0.01),
  ('people', 0.01),
  ('say', 0.01),
  ('really', 0.01),
  ('win', 0.01),
  ('come', 0.01),
  ('refugee', 0.01),
  ('back', 0.0),
  ('world', 0.0)],
 [('debate', 0.01),
  ('there', 0.01),
  ('say', 0.01),
  ('obama', 0.01),
  ('voter', 0.0),
  ('people', 0.0),
  ('war', 0.0),
  ('candidate', 0.0),
  ('national', 0.0),
  ('tell', 0.0)],
 [('obama', 0.01),
  ('debate', 0.01),
  ('tonight', 0.01),
  ('live', 0.01),
  ('host', 0.0),
  ('detroit', 0.0),
  ('one', 0.0),
  ('special', 0.0),
  ('chris', 0.0),
  ('american', 0.0)],
 [('say', 0.01),
  ('would', 0.01),
  ('first_debate', 0.01),
  ('election', 0.01),
  ('take', 0.01),
  ('new_york', 0.01),
  ('first', 0.01),
  ('going', 0.01),
  ('people', 0.0),
  ('debate', 0.0)],
 [('support', 0.01),
  ('qu

#### Examine words in each topic and assign label

In [16]:
def print_topic_terms(lda):
    return [', '.join(lda.id2word[t] for t, p in lda.get_topic_terms(i)) for i in range(lda.num_topics)]

In [17]:
print_topic_terms(lda)

['week, live, republican, american, attack, question, host, time, tonight, detroit',
 'former, debate, national, republican, bush, excited, according, race, voter, first',
 'obama, more, occupy_democrat, like_page, new_york, country, city, people, show, cnn',
 'debate, national, election, republican_nominee, harry, reid, campaign, leader, year, senate',
 'people, obama, debate, could, war, state, one, like, woman, going',
 'likely, great, political, obama, were, american, johnson, need, saying, black',
 'obama, campaign, people, take, question, know, election, first, ever, new',
 'debate, tell, two, former, one, face, gop, bill, get, campaign']

In [19]:
topic_labels = [
    'republican live tonight',
    'former bush excited debate',
    'obama occupy_democrat',
    'debate republican_nominee',
    'obama war woman',
    'obama johnson black',
    'obama firt ever',
    'debate former bill cliton'
]

#### Examine documents with their topics

In [31]:
def get_document_topics(lda, dictionary, doc):
    return lda.get_document_topics(dictionary.doc2bow(doc), minimum_probability=0.1)

In [20]:
dictionary = corpora.Dictionary(filtered_docs)

In [40]:
for i in range(10):
    topics = get_document_topics(lda, dictionary, filtered_docs[i])
    print(raw_docs[i], [(t, topic_labels[t], format(p, '.2f')) for t, p in topics])
    print()

WATCH: "JEB EXCLAMATION POINT!" - Jeb Bush, now driving around Selina Meyer in the opening intro to the #Emmys with Jimmy Kimmel. [(2, 'obama occupy_democrat', '0.95')]

The Syrian military declared today that the U.S.-Russia brokered cease-fire is over, blaming rebel groups for violating the agreement. [(6, 'obama firt ever', '0.94')]

Rose Pak, an influential community activist who turned San Francisco's Asian-American population into a political power in the city, passes away at 68. [(2, 'obama occupy_democrat', '0.95')]

Pres. Barack Obama awards the 2015 National Medal of Arts and National Humanities Medal to distinguished recipients. The winners include Morgan Freeman, Audra McDonald and Mel Brooks. [(2, 'obama occupy_democrat', '0.96')]

Warplanes target the besieged Syrian city of Aleppo for the first time since the ceasefire went into effect last week. [(0, 'republican live tonight', '0.94')]

Using Skittles to make a point about “our Syrian refugee problem” didn’t go over too