In [1]:
# Imports
import pickle as pic
import numpy as np
import seaborn as sb
import nltk, string
import pandas as pd
from gensim import corpora, models, similarities
from gensim.models.coherencemodel import CoherenceModel
from collections import defaultdict
from nltk.corpus import stopwords
from sklearn.cross_validation import train_test_split
import time
%matplotlib inline



In [2]:
# Load Data
all_data = pic.load(open("../Data/all_data.p"))

## Prepare Text

In [3]:
# Get Text and Index
ann_text = all_data['ann_text'].values
ref_text = all_data['fragment'].values

# Define normalizer
stemmer = nltk.stem.porter.PorterStemmer()

stop = set(stopwords.words('english'))

def stem_tokens(tokens):
    return [stemmer.stem(item) for item in tokens if item not in stop]

def normalize(text):
    return stem_tokens(nltk.word_tokenize(
            text.lower().translate(None, string.punctuation)))

# Normalize our annotations and Referent Text
ann_text_bow = np.array(map(normalize, ann_text))
ref_text_bow = np.array(map(normalize, ref_text))

ann_dictionary = corpora.Dictionary(ann_text_bow)
ref_dictionary = corpora.Dictionary(ref_text_bow)

# Print Number of Words in the Dictionary
print len(ann_dictionary.values())
print len(ref_dictionary.values())


# Create our corpuses
ann_corpus_bow = np.array([ann_dictionary.doc2bow(text) for text in ann_text_bow])
ref_corpus_bow = np.array([ann_dictionary.doc2bow(text) for text in ref_text_bow])

73933
35603


## Evaluate Annotation LDA Models

In [5]:
topic_num_list = range(10, 101 ,10)
ann_results = test_topic_num(topic_num_list, ann_corpus_bow, ann_text_bow, ann_dictionary, random_state=40)

Number of topics being evaluated: 10
Model trained in 172.823943853 seconds.
Perplexity found in 142.275618076 seconds.
Coherence scores found in 63.4813780785 seconds.
[-7033183.6672986522, 820.46647954697676, 0.55857028354711791, -2.3726795047188407]
Number of topics being evaluated: 20
Model trained in 181.846566916 seconds.
Perplexity found in 143.11157012 seconds.
Coherence scores found in 99.7857139111 seconds.
[-7304996.6740683373, 1063.3621488539711, 0.55282878328002494, -2.4959181051983959]
Number of topics being evaluated: 30
Model trained in 193.702538013 seconds.
Perplexity found in 150.282443047 seconds.
Coherence scores found in 158.997165918 seconds.
[-7554470.3893681429, 1349.1048620147867, 0.56092496928770297, -2.6135371563231673]
Number of topics being evaluated: 40
Model trained in 206.005049944 seconds.
Perplexity found in 127.240170002 seconds.
Coherence scores found in 195.903263092 seconds.
[-7782340.1108094342, 1676.714149076671, 0.54172728302812756, -3.10657245

In [None]:
ann_results

## Evaluate Referent LDA Models

ref_results = test_topic_num(topic_num_list, ref_corpus_bow, ref_text_bow, ref_dictionary, random_state=40)

## Functions

def lda_evaluation(corpus, texts, dictionary, num_folds=3, topics=10, random_state=None):
    # Create train-test split
    train_corp, test_corp, train_texts, test_texts = train_test_split(corpus, texts, random_state=random_state)

    # Train LDA Model
    start = time.time()
    lda = models.LdaMulticore(train_corp, id2word=dictionary, 
                              num_topics=topics, iterations=50)
    train_time = time.time()
    print "Model trained in %s seconds." % (train_time - start)
    
    # Find perplexity on held out documents
    perplexity = lda.bound(test_corp)
    number_of_words = sum(len(doc) for doc in test_texts)
    per_word_perplex = np.exp2(-perplexity / number_of_words)
    perplexity_time = time.time()
    print "Perplexity found in %s seconds." % (perplexity_time - train_time)
    
    # Calculate coherence measures
    cm_cv = CoherenceModel(model=lda, texts=train_texts, 
                           dictionary=dictionary, coherence='c_v')
    cm_umass = CoherenceModel(model=lda, corpus=train_corp, 
                              dictionary=dictionary, coherence='u_mass')
    cv_coherence = cm_cv.get_coherence()
    umass_coherence = cm_umass.get_coherence()
    coherence_time = time.time()
    print "Coherence scores found in %s seconds." % (coherence_time - perplexity_time)
    
    # Return results
    result = [perplexity, per_word_perplex, cv_coherence, umass_coherence]
    return result

def test_topic_num(topic_num_list, corpus, texts, dictionary, random_state=None):
    # Create Reults DataFrame
    results = pd.DataFrame()
    
    # Evaluate topic model for each number of topics in list
    for num in topic_num_list:
        print "Number of topics being evaluated: %s" % num
        evaluation = lda_evaluation(corpus, texts, dictionary, 
                                    topics=num, random_state=random_state)
        print evaluation
        scores = pd.DataFrame(np.array([num] + evaluation, ndmin=2))
        results = results.append(scores)
    
    results.rename(columns={0: 'num_topics', 1: 'perplex', 2: 'per_word_perplex', 
                            3: 'cv', 4: 'umass'}, inplace=True)    
    return results

def best_topic_num(results):
    results_sorted = sorted(results.items(), key=lambda x: x[1][1])
    return results_sorted[0]
    