# Hierachical LDA (sort of)

1. Clustering in a graph by topic
2. Finding key words within a topic

[Data source](https://www.kaggle.com/snapcrack/all-the-news#articles1.csv)  
[LDA Tutorial](https://medium.com/ml2vec/topic-modeling-is-an-unsupervised-learning-approach-to-clustering-documents-to-discover-topics-fdfbf30e27df)

NOTE: maybe take a look at this for help:
https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24


In [4]:
import pandas as pd
dataset = pd.read_csv("data/articles1.csv")
dataset.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


In [26]:
import nltk
from nltk import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import string

texts = list(dataset.content)
cleaned_texts = []

tokenizer = RegexpTokenizer(r'\w+')
stopword_set = set(stopwords.words('english'))

stemmer = PorterStemmer()

for text in texts:
    clean = text.lower()
    
    # https://stackoverflow.com/questions/15547409/how-to-get-rid-of-punctuation-using-nltk-tokenizer
    #translate_table = dict((ord(char), None) for char in string.punctuation)
    
    #clean = clean.translate(translate_table)
    clean = tokenizer.tokenize(clean)
    clean = set(clean).difference(stopword_set)
    stemmed = []
    for token in clean:
        stemmed.append(stemmer.stem(token))
    cleaned_texts.append(stemmed)

In [27]:
import pickle

pickle_out = open("cleaned_texts.pickle","wb")
pickle.dump(cleaned_texts, pickle_out)
pickle_out.close()

#pickle_in = open("cleaned_texts.pickle","rb")
#cleaned_texts = pickle.load(pickle_in)

In [28]:
from gensim.models import ldamodel
import gensim.corpora

num_topics = 10

id2word = gensim.corpora.Dictionary(cleaned_texts)

corpus = [id2word.doc2bow(text) for text in cleaned_texts]

lda = ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics)

In [18]:
import numpy as np

def get_lda_topics(model, num_topics):
    #word_dict = {};
    all_words = []
    for i in range(num_topics):
        words = model.show_topic(i,);
        all_words.extend(words)
        print(words)
        #word_dict['Topic # ' + '{:02d}'.format(i+1)] = [i[0] for i in words];
    #return pd.DataFrame(word_dict);
    np_words = np.ndarray(all_words)
    return np_words

In [65]:
#get_lda_topics(lda, num_topics)

lda.print_topics()

[(0,
  '0.005*"use" + 0.005*"like" + 0.004*"year" + 0.004*"make" + 0.004*"get" + 0.003*"one" + 0.003*"time" + 0.003*"work" + 0.003*"need" + 0.003*"go"'),
 (1,
  '0.004*"year" + 0.004*"show" + 0.004*"time" + 0.004*"star" + 0.003*"new" + 0.003*"one" + 0.003*"look" + 0.003*"work" + 0.003*"first" + 0.003*"cnn"'),
 (2,
  '0.006*"kill" + 0.005*"report" + 0.005*"said" + 0.005*"polic" + 0.004*"attack" + 0.004*"offic" + 0.004*"cnn" + 0.004*"one" + 0.004*"citi" + 0.004*"peopl"'),
 (3,
  '0.003*"live" + 0.003*"like" + 0.003*"say" + 0.003*"want" + 0.003*"one" + 0.003*"call" + 0.003*"feel" + 0.003*"thing" + 0.003*"time" + 0.003*"peopl"'),
 (4,
  '0.005*"state" + 0.004*"report" + 0.004*"presid" + 0.004*"offici" + 0.004*"nation" + 0.004*"investig" + 0.004*"said" + 0.004*"call" + 0.003*"countri" + 0.003*"say"'),
 (5,
  '0.006*"law" + 0.006*"state" + 0.004*"court" + 0.004*"year" + 0.004*"case" + 0.004*"offic" + 0.004*"feder" + 0.004*"legal" + 0.003*"report" + 0.003*"attorney"'),
 (6,
  '0.008*"compani"

In [37]:
lda.get_document_topics(id2word.doc2bow(cleaned_texts[0]))

[(4, 0.07458261), (5, 0.35726357), (6, 0.30371642), (8, 0.25716272)]

need to get all documents of a particular topic (note: already have BoW for cleaned_texts, called `corpus`)

In [38]:
def get_relevant_docs(topic_num):
    relevant_docs = []
    
    i = 0
    for bow in corpus:
        topics = dict(lda.get_document_topics(bow))
        if topic_num in topics:
            relevant_docs.append((i, topics[topic_num]))           
        i += 1
    return relevant_docs

In [47]:
things = get_relevant_docs(8)

In [54]:
ranked_things = sorted(things, key=lambda tup: tup[1], reverse=True)

relevant_ranked_things = [tup[0] for tup in ranked_things if tup[1] > .5]

#dthings = dict(things)
#relevant_articles = [cleaned_texts[i] for i in ]

In [60]:
#texts[ranked_things[0][0]]

print(len(cleaned_texts))
print(len(things))
print(len(relevant_ranked_things))
relevant_ranked_things

50000
30323
10301


[30886,
 23793,
 46775,
 36112,
 15472,
 27085,
 28285,
 33634,
 29991,
 25407,
 43798,
 33231,
 25006,
 39299,
 44120,
 35653,
 36233,
 21416,
 46847,
 21580,
 30902,
 20848,
 34386,
 21089,
 16741,
 36050,
 20427,
 19170,
 18728,
 35833,
 38159,
 8406,
 27264,
 11366,
 33429,
 28887,
 21279,
 31021,
 37274,
 12257,
 33192,
 38054,
 46828,
 46116,
 14311,
 26607,
 19396,
 28068,
 15765,
 23280,
 48830,
 38614,
 17164,
 37628,
 26571,
 24364,
 6648,
 16405,
 26391,
 13296,
 33658,
 27879,
 29574,
 46623,
 30262,
 27219,
 29977,
 46053,
 26845,
 2928,
 19002,
 36069,
 18497,
 22805,
 17797,
 15245,
 49380,
 17666,
 9420,
 36664,
 37723,
 34443,
 31559,
 27861,
 29422,
 27888,
 16050,
 45996,
 21649,
 43445,
 19489,
 25023,
 32016,
 29511,
 30319,
 46219,
 38684,
 49387,
 35688,
 45579,
 20984,
 45686,
 46117,
 38648,
 26804,
 28360,
 21341,
 27209,
 33241,
 30833,
 29175,
 28896,
 22356,
 20915,
 47337,
 35650,
 19974,
 17457,
 23454,
 16539,
 18612,
 21291,
 38633,
 26269,
 18104,
 468

In [61]:
relevant_corpus = [corpus[i] for i in relevant_ranked_things]
relevant_lda = ldamodel.LdaModel(corpus=relevant_corpus, id2word=id2word, num_topics=num_topics)

In [64]:
relevant_lda.print_topics(num_words=10)

[(0,
  '0.003*"showcas" + 0.002*"obscen" + 0.001*"wgn" + 0.001*"bakari" + 0.000*"seller" + 0.000*"republican" + 0.000*"said" + 0.000*"presid" + 0.000*"twitter" + 0.000*"follow"'),
 (1,
  '0.005*"presid" + 0.005*"said" + 0.005*"go" + 0.004*"republican" + 0.004*"say" + 0.004*"american" + 0.004*"would" + 0.004*"get" + 0.004*"trump" + 0.004*"care"'),
 (2,
  '0.004*"support" + 0.004*"state" + 0.004*"democrat" + 0.004*"republican" + 0.004*"senat" + 0.004*"elect" + 0.004*"vote" + 0.003*"presid" + 0.003*"nation" + 0.003*"polit"'),
 (3,
  '0.013*"follow" + 0.012*"twitter" + 0.008*"said" + 0.007*"trump" + 0.006*"donald" + 0.006*"presid" + 0.006*"russian" + 0.006*"news" + 0.005*"investig" + 0.005*"think"'),
 (4,
  '0.003*"abedin" + 0.003*"huma" + 0.003*"libya" + 0.002*"trump" + 0.002*"republican" + 0.002*"donald" + 0.002*"campaign" + 0.002*"report" + 0.002*"endors" + 0.002*"also"'),
 (5,
  '0.008*"poll" + 0.006*"vote" + 0.006*"show" + 0.005*"voter" + 0.004*"elect" + 0.004*"percent" + 0.004*"point