# Table of Contents

* [Setting up the Environment](#section1)
* [Creating Input](#section2)
* [Model Creation](#section3)
* [Explore Model](#section4)
    * [Printing Topics](#section4_1)
    * [Looking for Topic Evolution](#section4_2)
    * [Visualising Dynamic Topic Models](#section4_3)
    * [Topic Coherence](#section4_4)

# Setting up the Environment <a class=anchor id=section1></a>

In [6]:
%%capture
!pip install spacy
!pip install gensim
!pip install pyLDAvis
!python -m spacy download en_core_web_sm

In [None]:
import re
import spacy
import pickle
import gensim
import logging
import warnings
import numpy as np
import pandas as pd
import gensim.corpora as corpora

from gensim.models import LdaSeqModel
from gensim.corpora import Dictionary
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt

from pprint import pprint
from nltk.corpus import stopwords
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess
from nltk.corpus import PlaintextCorpusReader
from gensim.parsing.preprocessing import preprocess_string


# Notebook-wide setup shared by the cells below.

# Show matplotlib figures inline in the notebook output.
%matplotlib inline
# Switch pyLDAvis to notebook rendering.
pyLDAvis.enable_notebook()
# NLTK's English stop words, extended with a few extra tokens to drop.
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
# Small English spaCy pipeline with the parser and NER components disabled.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# Hide FutureWarning / DeprecationWarning messages.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
# Only log records at ERROR level or above, with a timestamped format.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

# Creating Input <a class=anchor id=section2></a>

In [7]:
!rm -rf `find -type d -name .ipynb_checkpoints`

In [8]:
# Define functions for stop-word removal and lemmatization
def remove_stopwords(texts):
    """Return ``texts`` re-tokenized with every stop word dropped.

    Each document is converted with ``str()`` and tokenized again by
    ``simple_preprocess``, so a document may be a raw string or an
    already-tokenized list of words.

    Args:
        texts: Iterable of documents.

    Returns:
        A list with one list of tokens per document, excluding every token
        found in the module-level ``stop_words``.
    """
    # Snapshot the stop-word list as a set once per call: membership tests
    # become O(1) instead of a linear scan of the list for every token.
    blocked = frozenset(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in blocked]
        for doc in texts
    ]

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Args:
        texts: Iterable of documents, each an iterable of string tokens.
        allowed_postags: Coarse part-of-speech tags (``token.pos_`` values)
            to keep. The default is an immutable tuple so it cannot be
            altered between calls; any container of tags is accepted.

    Returns:
        A list with one list of lemmas per document, in input order.
    """
    # Each document is re-joined into one string because the module-level
    # spaCy pipeline ``nlp`` works on raw text.
    joined = (" ".join(sent) for sent in texts)
    # nlp.pipe processes the texts as a batched stream, which avoids the
    # per-call overhead of invoking nlp() once per document.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined)
    ]

In [9]:
# Sentence to Words
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences``.

    Every item is converted with ``str()`` and passed to gensim's
    ``simple_preprocess`` with ``deacc=True`` (accent marks are stripped).

    Yields:
        One list of tokens per input item, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(raw), deacc=True)
        for raw in sentences
    )

In [11]:
# One corpus directory per decade, from 1950-1959 through 2020-2029, in
# chronological order.
corpus_topics = [
    'TRANSFORMED/{}-{}'.format(decade, decade + 9)
    for decade in range(1950, 2030, 10)
]

In [13]:
# Build the model inputs: read every decade directory in corpus_topics, clean
# and lemmatize its documents, and collect them (in that order) into `texts`,
# from which the gensim dictionary and bag-of-words corpus are derived.

# Decades whose file names carry the document id as the FIRST "_"-separated
# field; for all other decades it is the LAST field, minus the ".txt" suffix.
ID_FIRST_ROOTS = {'TRANSFORMED/1950-1959', 'TRANSFORMED/1960-1969', 'TRANSFORMED/1970-1979'}

# Raw-string patterns (the unprefixed originals were invalid escape
# sequences), compiled once instead of once per decade.
EMAIL_RE = re.compile(r'\S*@\S*\s?')
WHITESPACE_RE = re.compile(r'\s+')

# Lemmatized token lists of every document, accumulated across decades.
texts = []

for corpus_root in corpus_topics:
    # The dot is escaped so only real ".txt" files match, which is what the
    # [:-4] suffix strip below assumes.
    reader = PlaintextCorpusReader(corpus_root, r'.*\.txt')
    rows = []
    for text in reader.fileids():
        if corpus_root in ID_FIRST_ROOTS:
            identifier = text.split("_")[0]
        else:
            identifier = text.split("_")[-1][:-4]
        rows.append([identifier, " ".join(reader.words(text))])
    # CREATE DATAFRAME
    df = pd.DataFrame.from_records(rows, columns=['id', 'content'])

    # CLEANING
    # Convert to list
    data = df['content'].values.tolist()
    # Remove emails
    data = [EMAIL_RE.sub('', sent) for sent in data]
    # Collapse whitespace runs (including new lines) into a single space
    data = [WHITESPACE_RE.sub(' ', sent) for sent in data]
    # Remove distracting single quotes
    data = [sent.replace("'", "") for sent in data]
    # Sentence to words
    data_words = list(sent_to_words(data))

    # NOTE: the bigram Phrases/Phraser models that used to be trained here
    # were never applied to the tokens, so that work has been dropped.

    # Remove stop words
    data_words_nostops = remove_stopwords(data_words)
    # Do lemmatization keeping only noun, adj, vb, adv
    data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    # INPUT FOR CREATING THE DICTIONARY AND CORPUS NEEDED FOR TOPIC MODELING
    texts.extend(data_lemmatized)

dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
print("*"*20+"FINISHED"+"*"*20)

********************FINISHED********************


In [14]:
# with open('news_texts.pkl', 'wb') as f:
#     pickle.dump(texts, f)
# with open('news_corpus.pkl', 'wb') as f:
#     pickle.dump(corpus, f)
# with open('news_dictionary.pkl', 'wb') as f:
#     pickle.dump(dictionary, f)

# Model Creation <a class=anchor id=section3></a>

In [4]:
# Restore the bag-of-words corpus and the dictionary from their pickle files.
print("Started loading..")
with open('news_corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
with open('news_dictionary.pkl', 'rb') as dictionary_file:
    dictionary = pickle.load(dictionary_file)
print("Finished loading..")

Started loading..
Finished loading..


In [None]:
# Train the dynamic topic model (20 topics) over the time-sliced corpus.
print("Started training..")
# Number of documents in each consecutive time slice, in corpus order.
time_slice = [1051, 2900, 1258, 1771, 1520, 2010, 2852, 412]
# Fail fast before the long training run: LdaSeqModel assumes the slices
# account for every document, i.e. sum(time_slice) == number of documents.
if sum(time_slice) != len(corpus):
    raise ValueError(
        f"time_slice covers {sum(time_slice)} documents "
        f"but the corpus holds {len(corpus)}"
    )
lda_seq = LdaSeqModel(corpus=corpus, time_slice=time_slice, id2word=dictionary, num_topics=20, chunksize=200, passes=1)
print("Finished training..")

Started training..


  convergence = np.fabs((bound - old_bound) / old_bound)


In [None]:
# print("Saving model..")
# with open('lda_seq_model.pkl', 'wb') as f:
#     pickle.dump(lda_seq, f)
# print("Saved model!")

# Explore Model <a class=anchor id=section4></a>

In [8]:
# Restore the trained model together with the corpus, dictionary and texts.
# NOTE: pickle.load can execute code embedded in the file -- only load
# artifacts that were produced by this notebook.
# The start message is printed before the first load, so it also covers the
# model file.
print("Started loading..")
with open('lda_seq_model.pkl', 'rb') as f:
    ldaseq = pickle.load(f)
with open('news_corpus.pkl', 'rb') as f:
    corpus = pickle.load(f)
with open('news_dictionary.pkl', 'rb') as f:
    dictionary = pickle.load(f)
with open('news_texts.pkl', 'rb') as f:
    texts = pickle.load(f)
print("Finished loading..")

Started loading..
Finished loading..


## Printing Topics <a class=anchor id=section4_1></a>

To print all topics from a particular time period, use `print_topics`. Its input parameter selects the time slice: passing 0 shows the topics in the 1st time slice. The result is a list of lists, one per topic, where each inner list contains tuples of the most probable words in that topic, i.e. `(word, word_probability)`.

In [9]:
# 20 topics per time slice. Each topic is made up of keywords,
# shown as (word, probability) pairs.
ldaseq.print_topics(time=0)

[[('exchange', 0.032585327213301735),
  ('stock', 0.019525168953572793),
  ('permission', 0.013779895045716464),
  ('security', 0.012904726996567844),
  ('market', 0.012717460121291923),
  ('system', 0.011966344322286184),
  ('trading', 0.011877605199272076),
  ('volume', 0.011071054074381099),
  ('board', 0.00983614316524415),
  ('firm', 0.009088387644313996),
  ('order', 0.008521025431274625),
  ('floor', 0.008180032181665842),
  ('member', 0.008046198309664397),
  ('trade', 0.007470811077206533),
  ('broker', 0.00668577235927363),
  ('brokerage', 0.006471767578013388),
  ('street', 0.0062318701803079765),
  ('automation', 0.005983172342001948),
  ('continue', 0.005849581878421414),
  ('copyright', 0.005694449381308857)],
 [('newspaper', 0.09606789463087527),
  ('advertising', 0.05312090521009613),
  ('publisher', 0.039837194670586655),
  ('paper', 0.03074020067403551),
  ('printer', 0.026955371687537154),
  ('power', 0.020145897414895017),
  ('press', 0.014176352632787617),
  ('dail

## Looking for Topic Evolution <a class=anchor id=section4_2></a>

To fix a topic and see it evolve, use `print_topic_times`. The input parameter is the topic id. In this case, we are looking at the evolution of topic 0, whose top words (exchange, stock, market, trading) relate to the stock exchange.

In [10]:
# Show how topic 0 changes over time: the output is a sequence of
# (word, probability) lists for that one topic.
ldaseq.print_topic_times(topic=0) # evolution of 1st topic

[[('exchange', 0.032585327213301735),
  ('stock', 0.019525168953572793),
  ('permission', 0.013779895045716464),
  ('security', 0.012904726996567844),
  ('market', 0.012717460121291923),
  ('system', 0.011966344322286184),
  ('trading', 0.011877605199272076),
  ('volume', 0.011071054074381099),
  ('board', 0.00983614316524415),
  ('firm', 0.009088387644313996),
  ('order', 0.008521025431274625),
  ('floor', 0.008180032181665842),
  ('member', 0.008046198309664397),
  ('trade', 0.007470811077206533),
  ('broker', 0.00668577235927363),
  ('brokerage', 0.006471767578013388),
  ('street', 0.0062318701803079765),
  ('automation', 0.005983172342001948),
  ('continue', 0.005849581878421414),
  ('copyright', 0.005694449381308857)],
 [('exchange', 0.0331196315128335),
  ('stock', 0.02092678261073136),
  ('market', 0.014609282235584152),
  ('security', 0.013194687781034328),
  ('trading', 0.012613570037657717),
  ('system', 0.012467251676093596),
  ('permission', 0.011636224244246431),
  ('volum

In [15]:
# Same call as the preceding cell (a re-run): the output is a sequence of
# (word, probability) lists for topic 0.
ldaseq.print_topic_times(topic=0) # evolution of 1st topic

[[('exchange', 0.032585327213301735),
  ('stock', 0.019525168953572793),
  ('permission', 0.013779895045716464),
  ('security', 0.012904726996567844),
  ('market', 0.012717460121291923),
  ('system', 0.011966344322286184),
  ('trading', 0.011877605199272076),
  ('volume', 0.011071054074381099),
  ('board', 0.00983614316524415),
  ('firm', 0.009088387644313996),
  ('order', 0.008521025431274625),
  ('floor', 0.008180032181665842),
  ('member', 0.008046198309664397),
  ('trade', 0.007470811077206533),
  ('broker', 0.00668577235927363),
  ('brokerage', 0.006471767578013388),
  ('street', 0.0062318701803079765),
  ('automation', 0.005983172342001948),
  ('continue', 0.005849581878421414),
  ('copyright', 0.005694449381308857)],
 [('exchange', 0.0331196315128335),
  ('stock', 0.02092678261073136),
  ('market', 0.014609282235584152),
  ('security', 0.013194687781034328),
  ('trading', 0.012613570037657717),
  ('system', 0.012467251676093596),
  ('permission', 0.011636224244246431),
  ('volum

## Visualising Dynamic Topic Models <a class=anchor id=section4_3></a>

In [28]:
# Interactive pyLDAvis view of the topics at time slice 0.
doc_topic, topic_term, doc_lengths, term_frequency, vocab = ldaseq.dtm_vis(corpus=corpus, time=0)
vis_wrapper = pyLDAvis.prepare(
    doc_topic_dists=doc_topic,
    topic_term_dists=topic_term,
    doc_lengths=doc_lengths,
    term_frequency=term_frequency,
    vocab=vocab,
)
pyLDAvis.display(vis_wrapper)

In [29]:
# Interactive pyLDAvis view of the topics at time slice 1.
doc_topic, topic_term, doc_lengths, term_frequency, vocab = ldaseq.dtm_vis(corpus=corpus, time=1)
vis_wrapper = pyLDAvis.prepare(
    doc_topic_dists=doc_topic,
    topic_term_dists=topic_term,
    doc_lengths=doc_lengths,
    term_frequency=term_frequency,
    vocab=vocab,
)
pyLDAvis.display(vis_wrapper)

In [30]:
# Interactive pyLDAvis view of the topics at time slice 2.
doc_topic, topic_term, doc_lengths, term_frequency, vocab = ldaseq.dtm_vis(corpus=corpus, time=2)
vis_wrapper = pyLDAvis.prepare(
    doc_topic_dists=doc_topic,
    topic_term_dists=topic_term,
    doc_lengths=doc_lengths,
    term_frequency=term_frequency,
    vocab=vocab,
)
pyLDAvis.display(vis_wrapper)

In [31]:
# Interactive pyLDAvis view of the topics at time slice 3.
doc_topic, topic_term, doc_lengths, term_frequency, vocab = ldaseq.dtm_vis(corpus=corpus, time=3)
vis_wrapper = pyLDAvis.prepare(
    doc_topic_dists=doc_topic,
    topic_term_dists=topic_term,
    doc_lengths=doc_lengths,
    term_frequency=term_frequency,
    vocab=vocab,
)
pyLDAvis.display(vis_wrapper)

In [36]:
# Interactive pyLDAvis view of the topics at time slice 4.
doc_topic, topic_term, doc_lengths, term_frequency, vocab = ldaseq.dtm_vis(corpus=corpus, time=4)
vis_wrapper = pyLDAvis.prepare(
    doc_topic_dists=doc_topic,
    topic_term_dists=topic_term,
    doc_lengths=doc_lengths,
    term_frequency=term_frequency,
    vocab=vocab,
)
pyLDAvis.display(vis_wrapper)

In [37]:
# Interactive pyLDAvis view of the topics at time slice 5.
doc_topic, topic_term, doc_lengths, term_frequency, vocab = ldaseq.dtm_vis(corpus=corpus, time=5)
vis_wrapper = pyLDAvis.prepare(
    doc_topic_dists=doc_topic,
    topic_term_dists=topic_term,
    doc_lengths=doc_lengths,
    term_frequency=term_frequency,
    vocab=vocab,
)
pyLDAvis.display(vis_wrapper)

In [38]:
# Interactive pyLDAvis view of the topics at time slice 6.
doc_topic, topic_term, doc_lengths, term_frequency, vocab = ldaseq.dtm_vis(corpus=corpus, time=6)
vis_wrapper = pyLDAvis.prepare(
    doc_topic_dists=doc_topic,
    topic_term_dists=topic_term,
    doc_lengths=doc_lengths,
    term_frequency=term_frequency,
    vocab=vocab,
)
pyLDAvis.display(vis_wrapper)

In [40]:
# Interactive pyLDAvis view of the topics at time slice 7.
doc_topic, topic_term, doc_lengths, term_frequency, vocab = ldaseq.dtm_vis(corpus=corpus, time=7)
vis_wrapper = pyLDAvis.prepare(
    doc_topic_dists=doc_topic,
    topic_term_dists=topic_term,
    doc_lengths=doc_lengths,
    term_frequency=term_frequency,
    vocab=vocab,
)
pyLDAvis.display(vis_wrapper)

## Topic Coherence <a class=anchor id=section4_4></a>

In [18]:
# Report u_mass and c_v topic coherence for every time slice of the model.
# The slice count is read from the model's own time_slice list instead of a
# hard-coded 8, so the loop stays correct if the model is retrained with a
# different number of slices.
for time in range(len(ldaseq.time_slice)):
    # Topic word lists of this time slice, in the form CoherenceModel expects.
    topics_dtm = ldaseq.dtm_coherence(time=time)
    # u_mass is computed from the bag-of-words corpus.
    cm_DTM = CoherenceModel(topics=topics_dtm, corpus=corpus, dictionary=dictionary, coherence='u_mass')
    print(f"U_mass topic coherence for time slice {time} is {cm_DTM.get_coherence()}")
    # c_v is computed from the tokenized texts.
    cm_DTM = CoherenceModel(topics=topics_dtm, texts=texts, dictionary=dictionary, coherence='c_v')
    print(f"C_v topic coherence for time slice {time} is {cm_DTM.get_coherence()}\n")

U_mass topic coherence for time slice 0 is -1.5541729439538832
C_v topic coherence for time slice 0 is 0.5107585530532012

U_mass topic coherence for time slice 1 is -1.5637246263545326
C_v topic coherence for time slice 1 is 0.51689783972042

U_mass topic coherence for time slice 2 is -1.5503575266253393
C_v topic coherence for time slice 2 is 0.5130503285375594

U_mass topic coherence for time slice 3 is -1.556059172869139
C_v topic coherence for time slice 3 is 0.5000063761521739

U_mass topic coherence for time slice 4 is -1.574721515162128
C_v topic coherence for time slice 4 is 0.4947732734552862

U_mass topic coherence for time slice 5 is -1.5449613886087166
C_v topic coherence for time slice 5 is 0.4977619376575218

U_mass topic coherence for time slice 6 is -1.5703996916354193
C_v topic coherence for time slice 6 is 0.5144505701987887

U_mass topic coherence for time slice 7 is -1.6126666979742343
C_v topic coherence for time slice 7 is 0.510858620949502

