### NB: This notebook is created based on [this one](https://github.com/sanketg10/nlp-portfolio/blob/master/2.2-topic-modeling/Latent_dirichlet_allocation.ipynb). Any edit was done on my part.

## Loading Dataset

In [46]:
import pandas as pd

df = pd.read_csv('abcnews-date-text.csv')

## Data Preprocessing

In [14]:
'''
The following steps will be performed:
- Tokenization
- Stopwords removal
- Lemmatization
- Stemming
'''

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS as sw
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np

np.random.seed(12)
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [15]:
stemmer = SnowballStemmer('english')

In [68]:
def lemmatize_stem(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in simple_preprocess(text):
        if token not in sw:
            result.append(lemmatize_stem(token))
    return result

In [79]:
preprocessed_docs = df['headline_text'][:500000].apply(preprocess)

## BoWs on the dataset

In [82]:
'''
Gensim has a tool that can be used to create a dictionary of indexed words
with their number of occurrences
'''

dictionary = gensim.corpora.Dictionary(preprocessed_docs)

'''
Remove extremes (i.e. very common/very rare words)

- words appearing less than 15 times
- words appearing in more than 10% of all documents

After this, keep the top k=100.000 words 
'''

dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n=100000)

In [85]:
'''
Create a BoW model for each document
'''

bow_corpus = [dictionary.doc2bow(doc) for doc in preprocessed_docs]

## TF-IDF on the documents

In [105]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

## Running basic LDA on BoW

In [100]:
lda_model = gensim.models.LdaModel(bow_corpus,
                                  num_topics = 10,
                                  id2word = dictionary,
                                  passes = 2)

In [101]:
for idx, topic in lda_model.print_topics(-1):
    print("Topic {}: {}".format(idx, topic))
    print("\n")

Topic 0: 0.039*"say" + 0.023*"abc" + 0.015*"report" + 0.013*"bodi" + 0.013*"cost" + 0.012*"rule" + 0.011*"target" + 0.011*"oil" + 0.011*"work" + 0.010*"fin"


Topic 1: 0.035*"kill" + 0.033*"water" + 0.029*"attack" + 0.027*"home" + 0.018*"qld" + 0.015*"need" + 0.014*"fall" + 0.013*"price" + 0.012*"bushfir" + 0.012*"tiger"


Topic 2: 0.021*"school" + 0.017*"fear" + 0.016*"market" + 0.016*"road" + 0.014*"driver" + 0.013*"law" + 0.013*"death" + 0.012*"blaze" + 0.012*"storm" + 0.011*"rise"


Topic 3: 0.022*"day" + 0.020*"case" + 0.019*"worker" + 0.019*"pay" + 0.017*"test" + 0.016*"china" + 0.015*"end" + 0.015*"lose" + 0.012*"brisban" + 0.012*"dead"


Topic 4: 0.032*"crash" + 0.024*"talk" + 0.022*"miss" + 0.017*"hospit" + 0.014*"polic" + 0.014*"victim" + 0.014*"car" + 0.013*"north" + 0.013*"rudd" + 0.013*"die"


Topic 5: 0.036*"council" + 0.018*"plan" + 0.015*"servic" + 0.015*"busi" + 0.015*"act" + 0.014*"take" + 0.013*"time" + 0.013*"cut" + 0.011*"new" + 0.011*"budget"


Topic 6: 0.029*"new

## Running LDA using tf-df

In [110]:
lda_model_tfidf = gensim.models.LdaModel(corpus_tfidf,
                                        num_topics = 10,
                                        id2word = dictionary,
                                        passes=2)

In [113]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print("Topic {}:{}".format(idx, topic))
    print("\n")

Topic 0:0.014*"woman" + 0.014*"rise" + 0.012*"storm" + 0.010*"guilti" + 0.010*"tiger" + 0.010*"price" + 0.008*"rat" + 0.008*"perth" + 0.008*"rate" + 0.008*"boy"


Topic 1:0.011*"chang" + 0.009*"climat" + 0.009*"student" + 0.009*"asylum" + 0.008*"reveal" + 0.008*"river" + 0.007*"delay" + 0.007*"afl" + 0.007*"murray" + 0.007*"stimulus"


Topic 2:0.016*"market" + 0.012*"gold" + 0.010*"coast" + 0.010*"turnbul" + 0.010*"budget" + 0.010*"rescu" + 0.009*"sign" + 0.009*"australian" + 0.009*"babi" + 0.008*"share"


Topic 3:0.021*"man" + 0.019*"polic" + 0.017*"charg" + 0.015*"murder" + 0.014*"court" + 0.011*"death" + 0.010*"shoot" + 0.009*"accus" + 0.009*"sex" + 0.009*"bushfir"


Topic 4:0.014*"job" + 0.011*"health" + 0.009*"school" + 0.008*"flood" + 0.008*"worker" + 0.008*"fund" + 0.008*"fear" + 0.008*"boost" + 0.007*"bodi" + 0.007*"teacher"


Topic 5:0.062*"interview" + 0.021*"flu" + 0.020*"swine" + 0.012*"power" + 0.010*"obama" + 0.007*"station" + 0.007*"tour" + 0.007*"adelaid" + 0.007*"darwi

## Document classification

In [119]:
document_num = 700

#random test document

print("Document: {}".format(df['headline_text'][document_num]))
for index, score in sorted(lda_model_tfidf[bow_corpus[document_num]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))

Document: adelaide break port hearts

Score: 0.4200214445590973	 
Topic: 0.036*"council" + 0.018*"plan" + 0.015*"servic" + 0.015*"busi" + 0.015*"act" + 0.014*"take" + 0.013*"time" + 0.013*"cut" + 0.011*"new" + 0.011*"budget"

Score: 0.21999533474445343	 
Topic: 0.029*"new" + 0.022*"job" + 0.014*"power" + 0.014*"set" + 0.014*"get" + 0.013*"open" + 0.013*"world" + 0.012*"want" + 0.012*"centr" + 0.011*"cup"

Score: 0.21996290981769562	 
Topic: 0.021*"school" + 0.017*"fear" + 0.016*"market" + 0.016*"road" + 0.014*"driver" + 0.013*"law" + 0.013*"death" + 0.012*"blaze" + 0.012*"storm" + 0.011*"rise"

Score: 0.02000472880899906	 
Topic: 0.022*"day" + 0.020*"case" + 0.019*"worker" + 0.019*"pay" + 0.017*"test" + 0.016*"china" + 0.015*"end" + 0.015*"lose" + 0.012*"brisban" + 0.012*"dead"

Score: 0.020004095509648323	 
Topic: 0.032*"crash" + 0.024*"talk" + 0.022*"miss" + 0.017*"hospit" + 0.014*"polic" + 0.014*"victim" + 0.014*"car" + 0.013*"north" + 0.013*"rudd" + 0.013*"die"

Score: 0.0200026500

## Classification of unseen document

In [129]:
unseen_document = "I love swimming."

bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for idx, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("score: {}\t Topic: {}".format(score, lda_model_tfidf.print_topic(idx, 5)))

score: 0.3666568398475647	 Topic: 0.016*"market" + 0.012*"gold" + 0.010*"coast" + 0.010*"turnbul" + 0.010*"budget"
score: 0.36662834882736206	 Topic: 0.062*"interview" + 0.021*"flu" + 0.020*"swine" + 0.012*"power" + 0.010*"obama"
score: 0.033339351415634155	 Topic: 0.014*"woman" + 0.014*"rise" + 0.012*"storm" + 0.010*"guilti" + 0.010*"tiger"
score: 0.033339351415634155	 Topic: 0.011*"chang" + 0.009*"climat" + 0.009*"student" + 0.009*"asylum" + 0.008*"reveal"
score: 0.033339351415634155	 Topic: 0.021*"man" + 0.019*"polic" + 0.017*"charg" + 0.015*"murder" + 0.014*"court"
score: 0.033339351415634155	 Topic: 0.014*"job" + 0.011*"health" + 0.009*"school" + 0.008*"flood" + 0.008*"worker"
score: 0.033339351415634155	 Topic: 0.016*"dead" + 0.011*"kill" + 0.010*"blaze" + 0.010*"blast" + 0.009*"alleg"
score: 0.033339351415634155	 Topic: 0.017*"car" + 0.014*"crash" + 0.013*"final" + 0.011*"die" + 0.011*"threat"
score: 0.033339351415634155	 Topic: 0.020*"abc" + 0.012*"child" + 0.010*"run" + 0.009*