# Import Libs

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import nltk
# Download the NLTK 'stopwords' corpus; the cleaning cell further down
# reads it through nltk.corpus.stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
!pip install pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-2.0 pyLDAvis-3.4.1


# Import and Clean Newsgroup Dataset

In [None]:
from string import punctuation
from nltk import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.datasets import fetch_20newsgroups


# Load the 20 Newsgroups corpus using fetch_20newsgroups' default arguments.
newsgroups = fetch_20newsgroups()

# English stop words, stored in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# gaps=True makes the pattern describe the separators, so the text is split
# on runs of whitespace.
tokenizer = RegexpTokenizer(r'\s+', gaps=True)

stemmer = PorterStemmer()

# Translation table for str.translate: each character of string.punctuation
# is mapped to a single space.
translet_tab = dict.fromkeys(map(ord, punctuation), " ")


def text2token(raw_text):
    """Turn one raw document into a list of cleaned, stemmed tokens.

    Processing order: lower-case the text, replace every character listed in
    ``translet_tab`` with a space, split with ``tokenizer``, discard tokens
    present in ``stop_words``, stem the survivors with ``stemmer``, and keep
    only stems longer than two characters.

    Args:
        raw_text: The document text as a string.

    Returns:
        A list of stemmed token strings, in their original order.
    """
    normalized = raw_text.lower().translate(translet_tab)

    stems = []
    for piece in tokenizer.tokenize(normalized):
        word = piece.strip()
        # Stop words are matched on the unstemmed form.
        if word in stop_words:
            continue
        stem = stemmer.stem(word)
        # The length filter is applied to the stem, not the original word.
        if len(stem) > 2:
            stems.append(stem)
    return stems

# Tokenise every document in the corpus; one token list per document.
dataset = list(map(text2token, newsgroups['data']))

In [None]:
from gensim.corpora import Dictionary

# Build the token <-> id mapping from all tokenised documents.
dictionary = Dictionary(dataset, prune_at=None)

# Trim the vocabulary using document-frequency thresholds
# (no_below=5, no_above=0.3); keep_n=None is passed so no size cap is applied.
dictionary.filter_extremes(no_below=5, no_above=0.3, keep_n=None)
dictionary.compactify()

# Convert each token list into its bag-of-words representation.
bow_dataset = list(map(dictionary.doc2bow, dataset))

# Train model

In [None]:
from gensim.models import LdaMulticore

num_topics = 15

# Baseline LDA model: no alpha/eta are passed, so gensim's defaults are used.
# random_state seeds the model's random number generator so that a fresh
# "Restart & Run All" starts training from the same state.
# NOTE(review): with several workers the result may still differ slightly
# between runs — confirm if exact reproducibility is required.
lda1 = LdaMulticore(
    corpus=bow_dataset, num_topics=num_topics, id2word=dictionary,
    workers=4, eval_every=None, passes=10, batch=True,
    random_state=42,
)

In [None]:
# Second LDA model on the same corpus, this time with explicit scalar priors
# passed as alpha and eta.
# random_state seeds the model's random number generator so that a fresh
# "Restart & Run All" starts training from the same state.
# NOTE(review): with several workers the result may still differ slightly
# between runs — confirm if exact reproducibility is required.
lda2 = LdaMulticore(
    corpus=bow_dataset, num_topics=num_topics, id2word=dictionary,
    workers=4, eval_every=None, passes=10, batch=True,
    alpha=(5.0 / num_topics),
    eta=0.01,
    random_state=42,
)

# Analyzing the discovered LDA topics

In [None]:
import pyLDAvis.gensim_models
import pyLDAvis
import warnings

# Switch pyLDAvis to notebook mode so the result renders in the cell output.
pyLDAvis.enable_notebook()

# Prepare the visualisation data for lda2; leaving the object as the last
# expression of the cell displays it.
LDAvis_prepered = pyLDAvis.gensim_models.prepare(
    topic_model=lda2,
    corpus=bow_dataset,
    dictionary=dictionary,
)
LDAvis_prepered