In [1]:
from sklearn.datasets import fetch_20newsgroups

In [2]:
newsgroups_train = fetch_20newsgroups(subset='train')

In [3]:
import re

text_corpus = newsgroups_train.data
text_corpus = [re.sub('\S*@\S*\s?', '', doc) for doc in text_corpus] #removing email addresses
text_corpus = [re.sub('\s+', ' ', doc) for doc in text_corpus] #removing newline characters
text_corpus = [re.sub("\'", "", doc) for doc in text_corpus] #removing single quote characters

print(text_corpus[1])

From: (Guy Kuo) Subject: SI Clock Poll - Final Call Summary: Final call for SI clock reports Keywords: SI,acceleration,clock,upgrade Article-I.D.: shelley.1qvfo9INNc3s Organization: University of Washington Lines: 11 NNTP-Posting-Host: carson.u.washington.edu A fair number of brave souls who upgraded their SI clock oscillator have shared their experiences for this poll. Please send a brief message detailing your experiences with the procedure. Top speed attained, CPU rated speed, add on cards and adapters, heat sinks, hour of usage per day, floppy disk functionality with 800 and 1.4 m floppies are especially requested. I will be summarizing in the next two days, so please add to the network knowledge base if you have done the clock upgrade and havent answered this poll. Thanks. Guy Kuo 


In [4]:
import gensim
import warnings
warnings.simplefilter("ignore", DeprecationWarning)

def doc_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

words = list(doc_to_words(text_corpus))

print(words[1])

['from', 'guy', 'kuo', 'subject', 'si', 'clock', 'poll', 'final', 'call', 'summary', 'final', 'call', 'for', 'si', 'clock', 'reports', 'keywords', 'si', 'acceleration', 'clock', 'upgrade', 'article', 'shelley', 'qvfo', 'innc', 'organization', 'university', 'of', 'washington', 'lines', 'nntp', 'posting', 'host', 'carson', 'washington', 'edu', 'fair', 'number', 'of', 'brave', 'souls', 'who', 'upgraded', 'their', 'si', 'clock', 'oscillator', 'have', 'shared', 'their', 'experiences', 'for', 'this', 'poll', 'please', 'send', 'brief', 'message', 'detailing', 'your', 'experiences', 'with', 'the', 'procedure', 'top', 'speed', 'attained', 'cpu', 'rated', 'speed', 'add', 'on', 'cards', 'and', 'adapters', 'heat', 'sinks', 'hour', 'of', 'usage', 'per', 'day', 'floppy', 'disk', 'functionality', 'with', 'and', 'floppies', 'are', 'especially', 'requested', 'will', 'be', 'summarizing', 'in', 'the', 'next', 'two', 'days', 'so', 'please', 'add', 'to', 'the', 'network', 'knowledge', 'base', 'if', 'you', 

In [5]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

def remove_stopwords(text):
    return [[word for word in gensim.utils.simple_preprocess(str(doc)) if word not in stop_words] for doc in text_corpus]

words = remove_stopwords(words)

print(words[1])

['guy', 'kuo', 'si', 'clock', 'poll', 'final', 'call', 'summary', 'final', 'call', 'si', 'clock', 'reports', 'keywords', 'si', 'acceleration', 'clock', 'upgrade', 'article', 'shelley', 'qvfo', 'innc', 'organization', 'university', 'washington', 'lines', 'nntp', 'posting', 'host', 'carson', 'washington', 'fair', 'number', 'brave', 'souls', 'upgraded', 'si', 'clock', 'oscillator', 'shared', 'experiences', 'poll', 'please', 'send', 'brief', 'message', 'detailing', 'experiences', 'procedure', 'top', 'speed', 'attained', 'cpu', 'rated', 'speed', 'add', 'cards', 'adapters', 'heat', 'sinks', 'hour', 'usage', 'per', 'day', 'floppy', 'disk', 'functionality', 'floppies', 'especially', 'requested', 'summarizing', 'next', 'two', 'days', 'please', 'add', 'network', 'knowledge', 'base', 'done', 'clock', 'upgrade', 'havent', 'answered', 'poll', 'thanks', 'guy', 'kuo']


In [6]:
import spacy

nlp = spacy.load('en', disable=['parser', 'ner'])

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for idx, sent in enumerate(texts):
        if (idx) % 500 == 0:
            print(str(idx) + ' documents lemmatised')
        doc = nlp(" ".join(sent)) 
#         texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
        texts_out.append([token.lemma_ for token in doc if token.pos_ not in {'SPACE', 'PUNCT'}])
    return texts_out

data_lemmatized = lemmatization(words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

0 documents lemmatised
500 documents lemmatised
1000 documents lemmatised
1500 documents lemmatised
2000 documents lemmatised
2500 documents lemmatised
3000 documents lemmatised
3500 documents lemmatised
4000 documents lemmatised
4500 documents lemmatised
5000 documents lemmatised
5500 documents lemmatised
6000 documents lemmatised
6500 documents lemmatised
7000 documents lemmatised
7500 documents lemmatised
8000 documents lemmatised
8500 documents lemmatised
9000 documents lemmatised
9500 documents lemmatised
10000 documents lemmatised
10500 documents lemmatised
11000 documents lemmatised


In [7]:
# Create Dictionary
import gensim.corpora as corpora
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
corpus = [id2word.doc2bow(text) for text in data_lemmatized]

# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           per_word_topics=True)

In [8]:
from pprint import pprint
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.664*"ax" + 0.049*"max" + 0.010*"ei" + 0.008*"di" + 0.007*"pl" + '
  '0.005*"um" + 0.004*"bhj" + 0.004*"ql" + 0.004*"tm" + 0.003*"wm"'),
 (1,
  '0.013*"-PRON-" + 0.010*"organization" + 0.010*"line" + 0.009*"article" + '
  '0.008*"write" + 0.008*"not" + 0.007*"would" + 0.006*"be" + 0.006*"year" + '
  '0.006*"like"'),
 (2,
  '0.015*"window" + 0.013*"line" + 0.010*"problem" + 0.010*"organization" + '
  '0.009*"do" + 0.008*"not" + 0.008*"-PRON-" + 0.008*"card" + 0.008*"run" + '
  '0.008*"use"'),
 (3,
  '0.012*"not" + 0.011*"would" + 0.010*"line" + 0.009*"-PRON-" + 0.008*"do" + '
  '0.008*"write" + 0.007*"organization" + 0.007*"article" + 0.006*"know" + '
  '0.006*"one"'),
 (4,
  '0.014*"-PRON-" + 0.010*"line" + 0.009*"organization" + 0.009*"write" + '
  '0.008*"get" + 0.007*"not" + 0.007*"article" + 0.007*"post" + 0.006*"be" + '
  '0.006*"one"'),
 (5,
  '0.009*"not" + 0.007*"say" + 0.007*"israel" + 0.007*"-PRON-" + '
  '0.006*"armenian" + 0.006*"go" + 0.005*"israeli" + 0.005*"woul

In [9]:
# Visualize the topics
import pyLDAvis
import pyLDAvis.gensim  
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [10]:
print('Perplexity: ', lda_model.log_perplexity(corpus))

Perplexity:  -9.14470391659023


In [11]:
from gensim.models import CoherenceModel
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Coherence Score:  0.5581679561824157


In [12]:
import numpy as np

def get_theta(lda_model):
    doc_topic_list = [lda_model.get_document_topics(corpus[doc]) for doc in range(len(corpus))]
    num_topics = lda_model.get_topics().shape[0]
    theta = np.zeros((len(corpus), num_topics))
    for ind,r in enumerate(doc_topic_list):
        for c in r:
            theta[ind][c[0]] = c[1]
    return theta

In [13]:
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

def hellinger(theta):
    return euclidean_distances(np.sqrt(theta))

def dot(theta):
    return 1 - np.dot(theta, theta.transpose())

In [14]:
from numba import jit, prange, autojit

In [15]:
@jit(nopython = True)
def kl(p, q):
    return np.dot(p, np.log(q / p))

In [16]:
@jit(nopython = True)
def sym_kl(p, q):
    return kl(p, q) + kl(q, p)

In [17]:
@jit(nopython = True)
def js(p, q):
    m = (p + q) / 2
    return kl(p, m) / 2 + kl(q, m) / 2

In [18]:
@jit(nopython = True)
def overlap(p, q):
    return 1 - np.sum(np.minimum(p,q))

In [19]:
@jit(nopython = True)
def l1_norm(p, q):
    return np.sum(np.abs(p - q))

In [20]:
@jit(nopython = True)
def bc(p, q):
    return -np.log(np.sum(np.sqrt(p * q)))

In [21]:
@jit(nopython = True, parallel = True)
def outer(X, fn):
    dist = np.empty((X.shape[0], X.shape[0]), dtype = np.float32)
    for i in prange(X.shape[0]):
        for j in prange(i):
            dist[j,i] = dist[i,j] = fn(X[i,:], X[j,:])           
    return dist

In [22]:
dist_fns = {'cosine' : cosine_distances,
            'eucl' : euclidean_distances,
            'hellinger' : hellinger,
            'dot' : dot,
            'sym_kl' : lambda x: outer(x, sym_kl),
            'js' : lambda x: outer(x, js),
            'overlap': lambda x: outer(x, overlap),
            'l1' : lambda x: outer(x, l1_norm),
            'bc' : lambda x: outer(x, bc)}

In [23]:
def average_precision(theta, labels, k, dist_fn):
    dist = dist_fn(theta)
    np.fill_diagonal(dist, np.inf)
    ind = np.argpartition(dist, k - 1)[:k, :]
    return np.mean(labels[ind] == labels[None,:])

In [24]:
def eval_topic_model(model, labels, k = 100, dist_fns = dist_fns):
    theta = get_theta(model)
    best_score = 0
    best_dist = None
    for (dist, f) in dist_fns.items():
        score = average_precision(theta, labels, 100, f)
        if score > best_score:
            best_score, best_dist = score, dist
    return best_score, best_dist

In [25]:
seed = 123
for num_topics in range(5, 60 + 1,5):
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics, 
                                           per_word_topics=True)
    score, dist = eval_topic_model(lda_model, newsgroups_train.target)
    print("%2i topics AP@100 = %.3f (%s distance)" % (num_topics, score, dist))

 5 topics AP@100 = 0.511 (eucl distance)
10 topics AP@100 = 0.529 (l1 distance)
15 topics AP@100 = 0.502 (hellinger distance)
20 topics AP@100 = 0.492 (eucl distance)
25 topics AP@100 = 0.549 (eucl distance)
30 topics AP@100 = 0.568 (eucl distance)
35 topics AP@100 = 0.432 (l1 distance)
40 topics AP@100 = 0.513 (l1 distance)
45 topics AP@100 = 0.540 (eucl distance)
50 topics AP@100 = 0.647 (eucl distance)
55 topics AP@100 = 0.515 (hellinger distance)
60 topics AP@100 = 0.415 (l1 distance)
