# Hierachical LDA (sort of)

1. Clustering in a graph by topic
2. Finding key words within a topic

[Data source](https://www.kaggle.com/snapcrack/all-the-news#articles1.csv)  
[LDA Tutorial](https://medium.com/ml2vec/topic-modeling-is-an-unsupervised-learning-approach-to-clustering-documents-to-discover-topics-fdfbf30e27df)

In [1]:
import pandas as pd
dataset = pd.read_csv("data/articles1.csv")
dataset.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import string

texts = list(dataset.content)
cleaned_texts = []

stopword_set = set(stopwords.words('english'))

stemmer = PorterStemmer()

for text in texts:
    clean = text.lower()
    
    # https://stackoverflow.com/questions/15547409/how-to-get-rid-of-punctuation-using-nltk-tokenizer
    translate_table = dict((ord(char), None) for char in string.punctuation)
    
    clean = clean.translate(translate_table)
    clean = nltk.word_tokenize(clean)
    clean = set(clean).difference(stopword_set)
    stemmed = []
    for token in clean:
        stemmed.append(stemmer.stem(token))
    cleaned_texts.append(stemmed)

In [7]:
from gensim.models import ldamodel
import gensim.corpora

num_topics = 10

id2word = gensim.corpora.Dictionary(cleaned_texts)

corpus = [id2word.doc2bow(text) for text in cleaned_texts]

lda = ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics)

In [8]:
def get_lda_topics(model, num_topics):
    word_dict = {};
    for i in range(num_topics):
        words = model.show_topic(i, topn = 20);
        word_dict['Topic # ' + '{:02d}'.format(i+1)] = [i[0] for i in words];
    return pd.DataFrame(word_dict);

In [9]:
get_lda_topics(lda, num_topics)

Unnamed: 0,Topic # 01,Topic # 02,Topic # 03,Topic # 04,Topic # 05,Topic # 06,Topic # 07,Topic # 08,Topic # 09,Topic # 10
0,year,kill,”,look,republican,state,make,game,compani,countri
1,’,attack,’,day,trump,report,work,play,’,european
2,”,report,say,time,donald,presid,like,win,”,minist
3,accord,’,show,work,’,”,use,’,busi,leader
4,1,”,call,get,democrat,’,need,team,—,europ
5,increas,polic,time,one,”,offici,way,”,ceo,govern
6,report,said,one,go,presid,investig,get,player,new,prime
7,said,offic,said,’,elect,nation,chang,sport,like,year
8,000,one,like,like,campaign,said,year,year,use,nation
9,state,investig,year,year,said,call,thing,first,also,’
