# Topic Modeling and Latent Dirichlet Allocation (LDA) in Python

In [None]:
# Reference tutorial: https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

In [None]:
import pandas as pd
import numpy as np

# Load the raw headlines CSV (path is relative to the working directory).
data = pd.read_csv('abcnews-date-text1.csv')
# Take an explicit copy of the headline column so that adding the 'index'
# column below modifies an independent frame rather than a slice of `data`
# (assigning into the bare slice triggers pandas' SettingWithCopyWarning).
data_text = data[['headline_text']].copy()
# Keep the row label as a regular column so documents can be looked up by it.
data_text['index'] = data_text.index
documents = data_text
documents

Data Pre-processing
We will perform the following steps:
Tokenization: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.
Words that have 3 or fewer characters are removed.
All stopwords are removed.
Words are lemmatized — words in third person are changed to first person and verbs in past and future tenses are changed into present.
Words are stemmed — words are reduced to their root form.

In [None]:
# Text-processing dependencies: gensim for tokenization/stopwords, nltk for
# lemmatization and stemming.
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
# NOTE(review): wildcard import — it is unclear which names from
# nltk.stem.porter the notebook actually needs; consider importing them explicitly.
from nltk.stem.porter import *
import numpy as np
# Seed numpy's global random generator so repeated runs start from the same state.
np.random.seed(2018)
import nltk
# Download the WordNet corpus used by WordNetLemmatizer.
nltk.download('wordnet')

# English Snowball stemmer, followed by a quick check on a sample word.
stemmer = SnowballStemmer("english")
sample_stem = stemmer.stem("runable")
print(sample_stem)

### Lemmatizer

Here, we've got a bunch of examples of the lemma for the words that we use. The only major thing to note is that lemmatize takes a part of speech parameter, "pos." If not supplied, the default is "noun." This means that an attempt will be made to find the closest noun, which can create trouble for you. Keep this in mind if you use lemmatizing!


lemmatizer = WordNetLemmatizer()
adjective_lemma = lemmatizer.lemmatize("better", pos="a")  # pos="a": lemmatize the word as an adjective
print(adjective_lemma)


### gensim.utils.simple_preprocess
Convert a document into a list of tokens.

This lowercases, tokenizes, and (optionally) de-accents the text. The output is the final list of tokens (unicode strings) that won’t be processed any further.

In [None]:
def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb, then stem the resulting lemma.

    A new ``WordNetLemmatizer`` is created on each call; its output is passed
    to the module-level ``stemmer`` and the stemmed string is returned.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)
def preprocess(text):
    """Tokenize ``text`` and return the lemmatized, stemmed tokens that pass filtering.

    Tokens come from ``gensim.utils.simple_preprocess``. A token is kept only
    when it is longer than 3 characters and is not in gensim's ``STOPWORDS``;
    each kept token is passed through ``lemmatize_stemming``.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in stopwords
    ]

The statement below extracts the headline text of the document whose index is 1.

In [None]:
# Select the row whose 'index' column equals 1 and take the first cell of that row.
row_mask = documents['index'] == 1
doc_sample = documents[row_mask].values[0][0]
type(doc_sample)

In [None]:
# Show the raw headline split on single spaces, then the same headline after preprocessing.
print('original document: ')
words = doc_sample.split(' ')
print(words)
print('\n\n tokenized and lemmatized document: ')
processed_words = preprocess(doc_sample)
print(processed_words)


In [None]:
# Run the preprocessing pipeline on every headline, then preview the first ten results.
headlines = documents['headline_text']
processed_docs = headlines.map(preprocess)
processed_docs[:10]


In [None]:
# Shape of the mapped result, i.e. how many headlines were preprocessed.
processed_docs.shape

In [None]:
# Preview the first 15 preprocessed headlines.
processed_docs[:15]

In [None]:
# Build the token <-> integer-id mapping from the preprocessed headlines.
# This has to happen before any doc2bow call, because doc2bow is a method of
# the dictionary (the original order raised NameError on a fresh kernel).
dictionary = gensim.corpora.Dictionary(processed_docs)
# Bag-of-words vectors built from the full, unfiltered vocabulary. The name is
# kept from the original cell; `bow_corpus`, which the rest of the notebook
# uses, is built in a later cell after the vocabulary has been filtered.
dictbow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
dictionary

In [None]:
# Print the first 12 (id, token) pairs of the dictionary as a quick sanity check.
for position, (token_id, token) in enumerate(dictionary.iteritems()):
    print(token_id, token)
    if position >= 11:
        break

Filter out tokens that appear in
fewer than 15 documents (absolute number) or
more than 50% of the documents (a fraction of the total corpus size, not an absolute number).
After the above two steps, keep only the 100000 most frequent tokens.

In [None]:
# Prune the vocabulary using the document-frequency thresholds and size cap below.
# NOTE(review): this modifies `dictionary` itself, so cells that use it behave
# differently before and after this cell runs — confirm it is executed exactly once.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [None]:
# Convert every preprocessed headline into its bag-of-words vector, then inspect one of them.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))
bow_corpus[43]

In [None]:
# Preview a handful of documents as [term, frequency] pairs. Printing the whole
# corpus would flood the cell output, so only the first few are shown.
PREVIEW_DOCS = 5
for doc in bow_corpus[:PREVIEW_DOCS]:
    print([[dictionary[term_id], freq] for term_id, freq in doc])

# Map each term of document 4310 to its frequency. `mylist` and `new_dict` are
# created here so this cell runs on a fresh kernel without relying on variables
# from another cell.
mylist = bow_corpus[4310]
new_dict = {dictionary[term_id]: freq for term_id, freq in mylist}
new_dict

In [None]:
# Inspect the bag-of-words vector of document 1100.
bow_corpus[1100]

In [None]:
# Preprocessed tokens of the headline labelled 4310.
processed_docs.loc[4310]

In [None]:
# Look up the term stored under id 162 in the dictionary.
dictionary[162]

In [None]:
# Start from an empty term -> frequency mapping and pick the bag-of-words
# vector of document 4310 for inspection.
new_dict = {}
mylist= bow_corpus[4310]
mylist

In [None]:
from gensim import corpora, models
from pprint import pprint

# Fit TF-IDF weights on the bag-of-words corpus, then apply the model to the
# same corpus to obtain (term id, tf-idf weight) pairs per document.
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

# Show only the first re-weighted document; iterating without stopping would
# print one entry for every headline in the corpus.
for doc in corpus_tfidf:
    pprint(doc)
    break
    

### Tfidf Model Working
`models.TfidfModel` takes `bow_corpus` as input (recall that `bow_corpus` is a numerical representation of each
document's headline text in the form of (termID, termFreq) pairs).
`TfidfModel` then creates a similar numerical representation of each document in the form of
(termID, term-TFIDF) pairs.

In [None]:
# Display the dictionary object.
dictionary

In [None]:
# Display the TF-IDF corpus object.
corpus_tfidf

### Running LDA using Bag of Words
Train our lda model using gensim.models.LdaMulticore and save it to ‘lda_model’

In [None]:
# Train a 10-topic LDA model on the bag-of-words corpus.
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=2,
)

For each topic, we will explore the words occurring in that topic and their relative weights.


In [None]:
# Print each topic returned by the bag-of-words LDA model together with its word listing.
for topic_id, topic_terms in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(topic_id, topic_terms))

In [None]:
# Train a second 10-topic LDA model, this time on the TF-IDF corpus, and print its topics.
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=4,
)
for topic_id, topic_terms in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(topic_id, topic_terms))

In [None]:
# Preprocessed tokens of headline 4310, the document scored against the LDA model in the next cell.
processed_docs[4310]


In [None]:
# Rank the topics assigned to document 4310 from highest to lowest score and print each one.
topic_scores = lda_model[bow_corpus[4310]]
ranked_topics = sorted(topic_scores, key=lambda pair: pair[1], reverse=True)
for index, score in ranked_topics:
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))