# Dynamic Topic Model

- Construct the input data from the preprocessed text data to apply to gensim
- Inputs for gensim: time_slice, dictionary, corpus
- Set the range of candidate topic counts and then run the Dynamic Topic Model for each of them
- Determine the number of topics based on the coherence scores
- Visualization and analysis of the results

## 1. Constructing the input data to apply to gensim

In [None]:
# Load the preprocessed news data.
import pandas as pd
import pickle

# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only load "data/common.pk" when it comes from a trusted source.
with open("data/common.pk", "rb") as f:
    input_data = pickle.load(f)

# Renumber the rows 0..n-1 so later lookups by row number line up with labels.
input_data.reset_index(drop = True, inplace = True)
print(input_data.head())
print(input_data.tail())
# info() writes its report itself and returns None, so wrapping it in print()
# would only append a stray "None" line to the output.
input_data.info()

In [None]:
# Display the loaded data as the cell's output.
input_data

In [None]:
# Build time_slice: one entry per row of input_data, holding the length of
# that row's 'title' value (presumably the number of articles in that period).
time_slice = list(map(len, input_data['title']))
time_slice

In [None]:
# Flatten the per-row tokenized contents into one list of documents,
# keeping the row order.
tokenized_data = []

for row_idx, n_docs in enumerate(time_slice):
    row_docs = input_data['contents'][row_idx]
    # Take exactly the first n_docs entries of this row, one at a time.
    tokenized_data.extend(row_docs[pos] for pos in range(n_docs))

tokenized_data

In [None]:
def cumulative_sum(arr, a):
    """Return the running total of `arr` at position `a`.

    The running totals are arr[0], arr[0] + arr[1], ... and `a` indexes that
    list, so a negative `a` counts from the end. `arr` is not modified.
    Raises IndexError when `a` is out of range (including an empty `arr`).
    """
    running_totals = []
    total = 0

    for value in arr:
        total += value
        running_totals.append(total)

    return running_totals[a]

In [None]:
# Sanity check: the running total at position len(time_slice) - 2 is the
# number of documents in all slices but the last, i.e. the position of the
# first document of the most recent slice.
k = len(time_slice) - 2
boundary = cumulative_sum(time_slice, k)
tokenized_data[boundary]

In [None]:
# Install gensim.
!pip install gensim

In [None]:
# Import the gensim classes used to build the dictionary and the corpus.
from gensim import corpora
from gensim.corpora import Dictionary, bleicorpus
import os

# Warning: when running another model, change the two file names below first;
# otherwise the existing dictionary and corpus files are reused or overwritten.
dict_path = 'common(DTM)_dict'
corpus_path = 'common(DTM)_corpus'

# Dictionary: reuse the saved file when present, otherwise build and save it.
if os.path.exists(dict_path):
    dictionary = Dictionary.load(dict_path)
else:
    dictionary = corpora.Dictionary(tokenized_data)
    # Optional: prune tokens by frequency with dictionary.filter_extremes(...)
    # at this point, before the dictionary is saved.
    dictionary.save(dict_path)
    print(dictionary)

# Corpus: reuse the saved file when present, otherwise convert every document
# to bag-of-words and serialize the result.
if os.path.exists(corpus_path):
    corpus = bleicorpus.BleiCorpus(corpus_path)
else:
    corpus = [dictionary.doc2bow(doc) for doc in tokenized_data]
    corpora.BleiCorpus.serialize(corpus_path, corpus)

In [None]:
# Quick look at the dictionary: its summary, then every token it contains.
print(dictionary)
for token_id in dictionary.keys():
    print(dictionary[token_id])

In [None]:
# Corpus at a glance
# Rebuild the corpus in a human-readable form: (token, frequency) pairs per document.
[[(dictionary[token_id], count) for token_id, count in bow] for bow in corpus[:]]

## 2. Setting the range for the hypothetical number of topics

In [None]:
# Candidate topic counts: start, start + step, ... ("end" itself is excluded).
start = 6
end = 10
step = 1

passes = 50

# Define `times`, one index per entry of time_slice, so the two stay consistent.
import numpy as np
times = np.arange(len(time_slice))

In [None]:
# Load the class for running Topic model.
from gensim.models import ldaseqmodel
import os

## 3. Run Dynamic Topic Model while calculating coherence scores

In [None]:
# Install tqdm, then load the classes used to calculate coherence scores
# and to time the model runs.
!pip install tqdm
from gensim.models import CoherenceModel
from gensim.matutils import hellinger
from tqdm import tqdm_notebook
from time import time

In [None]:
# Constructing the functions to calculate DTM and coherence scores
def compute_coherence(dictionary, corpus, passes, texts, times, start, end, step):
    """Fit (or reload) one Dynamic Topic Model per candidate topic count and score it.

    For every num_topics in range(start, end, step) the model is loaded from
    the file "common(DTM)_<num_topics>" when that file exists; otherwise an
    LdaSeqModel is trained and saved under that name. Each model is then
    scored with a 'c_v' CoherenceModel once per time slot.

    Args:
        dictionary: id2word mapping passed to the model and to CoherenceModel.
        corpus: corpus passed to LdaSeqModel.
        passes: value forwarded to LdaSeqModel as `passes`.
        texts: tokenized documents passed to CoherenceModel.
        times: sized object; only len(times) is used, as the number of time
            slots to score.
        start, end, step: define the candidate topic counts
            (range(start, end, step), so `end` is excluded).

    Returns:
        (model_list, coherence_score_list): the models in candidate order and,
        for each model, the list of coherence scores per time slot.

    Note:
        `time_slice` is not a parameter; it is read from the enclosing
        (notebook-global) namespace and must be defined before the call.
    """
    coherence_score_list = []
    model_list = []
    for num_topics in tqdm_notebook(range(start, end, step)):
        ###################
        start_dtm = time()

        dtm_name = "common(DTM)_" + str(num_topics)

        # Reuse a previously saved model for this num_topics; otherwise train
        # it and save it right away so an interrupted run loses nothing.
        if os.path.exists(dtm_name):
            dtm_model = ldaseqmodel.LdaSeqModel.load(dtm_name)
        else:
            dtm_model = ldaseqmodel.LdaSeqModel(corpus = corpus, id2word = dictionary,
                                                time_slice = time_slice, num_topics = num_topics, passes = passes)
            dtm_model.save(dtm_name)

        end_dtm = time()
        ###################
        print("Elapsed Time for DTM in %d topics: %.2f sec." % (num_topics, (end_dtm - start_dtm)))

        topic_cs_list = []
        for time_slot in range(len(times)):
            topics_dtm = dtm_model.dtm_coherence(time = time_slot)
            print(topics_dtm)
            cs = CoherenceModel(topics = topics_dtm, texts = texts,
                                dictionary = dictionary, coherence = 'c_v')
            print(cs)
            # Compute the score once and reuse it for printing and storing;
            # calling get_coherence() twice repeats the whole computation.
            score = cs.get_coherence()
            print(score)
            topic_cs_list.append(score)

        model_list.append(dtm_model)
        coherence_score_list.append(topic_cs_list)

    return model_list, coherence_score_list

In [None]:
# Run the Dynamic Topic Model for every candidate topic count and collect
# the fitted models together with their coherence scores.
coherence_args = dict(dictionary = dictionary,
                      corpus = corpus,
                      passes = passes,
                      texts = tokenized_data,
                      times = times,
                      start = start, end = end, step = step)
model_list, coherence_scores = compute_coherence(**coherence_args)

In [None]:
# Persist the results so the model fitting does not have to be repeated.
import pickle

# (file name, object) pairs: the fitted models and the coherence scores,
# each covering every candidate num_topics.
for out_path, payload in (("common(DTM)_ml.pk", model_list),
                          ("common(DTM)_cs.pk", coherence_scores)):
    with open(out_path, 'wb') as f:
        pickle.dump(payload, f)

# To load previously saved results instead (adjust the file names to the
# ones that were actually written):
# with open("tech(DTM)_ml.pk", 'rb') as f:
#     model_list = pickle.load(f)
# with open("tech(DTM)_cs.pk", 'rb') as f:
#     coherence_scores = pickle.load(f)

## 4. Determine the optimal number of topics based on coherence scores

In [None]:
# Print the fitted models (one per candidate num_topics) together with
# their coherence scores.
print(model_list, coherence_scores)

In [None]:
# Print only the coherence scores: one list per candidate num_topics,
# holding one score per time slot.
print(coherence_scores)

In [None]:
# Convert the nested score list to a NumPy array and average across time
# slots, leaving one mean coherence value per candidate num_topics.
import numpy as np

np_coherence_scores = np.asarray(coherence_scores)
np_cs_avg_list = np.mean(np_coherence_scores, axis = 1)

In [None]:
# Display the average coherence score of each candidate num_topics.
np_cs_avg_list

In [None]:
# Coherence Scores Graph
# Set the fonts in Korean to print the coherence scores graph.
import matplotlib.pyplot as plt
import platform
from matplotlib import font_manager, rc
%matplotlib inline

path = "c:/Windows/Fonts/malgun.ttf"
if platform.system() == 'Darwin':
    rc('font', family = 'AppleGothic')
elif platform.system() == 'Windows':
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family = font_name)
elif platform.system() == 'Linux':
    path = '/usr/share/fonts/truetype/malgun/malgun.ttf'
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family = font_name)
else:
    print('Unknown system... sorry~~~~')
    
x = range(start, end, step)

plt.figure(figsize = (12, 10)) 
plt.plot(x, np_cs_avg_list, '-b')
plt.xlabel("# of Topics")
plt.ylabel("AVG of Coherence Scores")
plt.xticks(x)
plt.show()

In [None]:
# Pick the model whose average coherence score is highest, then show how
# many topics that model has.
best_idx = np.argmax(np_cs_avg_list)
dtm_model = model_list[best_idx]
len(dtm_model.print_topics())

In [None]:
# Topics of the selected model at one time slice, top 5 terms per topic.
# NOTE(review): time = 22 is hard-coded; presumably it must be a valid time
# slice index for this data set — confirm when the number of periods changes.
print(dtm_model.print_topics(time = 22, top_terms = 5))

In [None]:
# How one topic (topic 1) of the selected model looks across the time slices,
# top 20 terms each.
dtm_model.print_topic_times(topic = 1, top_terms = 20)

## 5. Calculating the probability that a news article belongs to each topic in each period

In [None]:
# Add the temporal information: doc_time[d] is the time slot in which
# document d of tokenized_data was released.
# Slot i simply contributes time_slice[i] consecutive entries, so each slot
# index is repeated that many times instead of scanning every document once
# per slot.
doc_time = [slot for slot, n_docs in enumerate(time_slice) for _ in range(n_docs)]

print(doc_time)
print(len(doc_time))

In [None]:
def cumulative_sum(arr, a):
    """Return the running total of `arr` at position `a`.

    The running totals are arr[0], arr[0] + arr[1], ... and `a` indexes that
    list, so a negative `a` counts from the end. `arr` is not modified.
    Raises IndexError when `a` is out of range (including an empty `arr`).
    """
    running_totals = []
    total = 0

    for value in arr:
        total += value
        running_totals.append(total)

    return running_totals[a]

In [None]:
# Select a certain sample size for each time_slot
# (this line was previously not a comment, which made the cell a SyntaxError).
sample_idx = []
sample_size = 500

# slice_ends[i] is one past the last document position of time slot i and
# slice_starts[i] is its first document position; both are computed once.
slice_ends = np.cumsum(time_slice)
slice_starts = slice_ends - np.asarray(time_slice)

for low, high in zip(slice_starts, slice_ends):
    # Draw sample_size document positions from [low, high), with replacement.
    # NOTE: randint raises ValueError for an empty slot (low == high).
    item = np.random.randint(low, high, size = sample_size)
    sample_idx.append(item)

print(sample_idx)

In [None]:
# Merge the per-slot samples into one index array, then look up the sampled
# documents and their time slots in that order.
sample_indices = np.concatenate(sample_idx)

processing_docs = [tokenized_data[idx] for idx in sample_indices]
processing_time = [doc_time[idx] for idx in sample_indices]

print(sample_indices)

In [None]:
# Number of sampled documents, followed by the documents themselves.
print(len(processing_docs))
processing_docs

In [None]:
# Number of sampled documents, followed by the time slot of each of them.
print(len(processing_time))
processing_time

In [None]:
# For every sampled document, ask the selected model for the probabilities
# that the document belongs to each topic (dtm_model.doc_topics), keeping
# the sampling order.
doc_dist = [dtm_model.doc_topics(doc_id) for doc_id in sample_indices]

In [None]:
# Print the per-document topic probabilities collected above.
print(doc_dist)

In [None]:
# Prepare doc_dist for a DataFrame: convert it to an array and transpose it,
# so each row holds one topic and each column one sampled document.
doc_dist = np.array(doc_dist)
doc_topic_dist = np.transpose(doc_dist)
doc_topic_dist.shape

In [None]:
NUM_TOPICS = dtm_model.num_topics

# Topic probabilities of the first sampled document, read from doc_dist ...
for prob in doc_dist[0][:NUM_TOPICS]:
    print("%f" % prob, end = ", ")

print()

# ... and the same values read from the transposed array, for comparison.
for prob in doc_topic_dist[:NUM_TOPICS, 0]:
    print("%f" % prob, end = ", ")

In [None]:
# Shape of the first row of doc_topic_dist (one value per sampled document).
doc_topic_dist[0].shape

In [None]:
# Build the DataFrame: the time slot of every sampled document plus one
# "Topic<k>" column per row of doc_topic_dist. The columns are generated
# from the data, so nothing has to be edited by hand when the selected model
# has a different number of topics.
import pandas as pd

dtm_columns = {"Time" : processing_time}
for topic_no, topic_probs in enumerate(doc_topic_dist):
    dtm_columns["Topic%d" % topic_no] = topic_probs

dtm_df = pd.DataFrame(dtm_columns)

print(dtm_df.head(10))

In [None]:
# Group the sampled documents by their time slot and show which rows
# fall into each group.
doctopic_timeslot = dtm_df.groupby('Time')
doctopic_timeslot.groups

In [None]:
# Average the topic probabilities within each time slot, then turn the
# 'Time' group key back into an ordinary column.
timeslot_avg = doctopic_timeslot.mean().reset_index()
timeslot_avg

In [None]:
# Sanity check: the topic probabilities of each period should sum to 1.
# 'Time' is an ordinary column after reset_index(), so it is dropped first;
# otherwise every row total would also include its time index.
timeslot_avg.drop(columns = 'Time').sum(axis = 1)

## 6. Visualization

In [None]:
# Plot, for every topic, its average share in each time slot.
# The topic columns are taken from timeslot_avg itself, so the plotted and
# the annotated series always match whatever number of topics was selected.
topic_columns = [col for col in timeslot_avg.columns if col != 'Time']

# Marker/colour styles, reused cyclically when there are more topics than styles.
line_styles = ['b*--', 'rs--', 'g^--', 'y*--', 'bs--', 'm*--', 'k*--', 'c*--']

plt.figure(figsize = (16, 12))

for topic_no, topic_id in enumerate(topic_columns):
    plt.plot(timeslot_avg['Time'], timeslot_avg[topic_id],
             line_styles[topic_no % len(line_styles)], label = 'Topic %d' % topic_no)

    # Write each value just above its point.
    for x, y in zip(timeslot_avg['Time'], timeslot_avg[topic_id]):
        plt.annotate("%.4f"%y, (x, y), textcoords = "offset points", xytext=(0,10), ha = 'center')

plt.title("# News Topics", fontsize = 14)
plt.xlabel("Time", fontsize = 13)
plt.ylabel("The Portion of Each Topic (%)", fontsize = 13)
# One tick per time slot, labelled with the slot number, so the labels follow
# the analysed period instead of a fixed list.
plt.xticks(timeslot_avg['Time'], [str(slot) for slot in timeslot_avg['Time']])
# NOTE(review): the y-range is fixed at 0.25; values above it are cut off.
plt.ylim([0.0, 0.25])
plt.legend(loc = "best")
plt.savefig("news_topic_distribution_graph.png")
plt.show()