## 1. Import packages

In [None]:
from pyhanlp import *
import pandas as pd
import numpy as np
from gensim import *
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from gensim.models import CoherenceModel
from googletrans import Translator

## 2. Prepare data

In [None]:
# Read the news articles from the CSV file.
path = 'environmentalnews.csv'
data = pd.read_csv(path)

# Work on the article text only.
content = data['content']

# Drop ASCII spaces from every article (presumably so that the ', '
# separator split on below can only come from HanLP's printed list format).
content_tight = []
for article in content:
    content_tight.append(article.replace(' ', ''))

# Segment each article with HanLP and break the printed term list into
# individual 'word/tag' strings.
content_list = [
    HanLP.segment(article).toString().split(', ')
    for article in content_tight
]

# Strip every '[' character from the tokens (the printed list opens with one).
content_clean = []
for tokens in content_list:
    content_clean.append([token.replace('[', '') for token in tokens])

# Keep the word part of each token whose text contains '/n', skipping tokens
# that contain '/nr', '/ns' or '/nx'.
excluded_tags = ('/nr', '/ns', '/nx')
noun_list = []
for tokens in content_clean:
    nouns = []
    for token in tokens:
        if '/n' not in token:
            continue
        if any(tag in token for tag in excluded_tags):
            continue
        nouns.append(token.split('/')[0])
    noun_list.append(nouns)

# Hold out 20% of the documents as a test set, using a fixed seed.
train, test = train_test_split(noun_list, test_size=0.2, random_state=100)

# Build the vocabulary and the bag-of-words corpus from the training documents.
dict_train = corpora.Dictionary(train)
corpus_train = []
for doc in train:
    corpus_train.append(dict_train.doc2bow(doc))

## 3. Train topic model

In [None]:
# Train one LDA model per topic count, smallest first, all on the same
# training corpus and dictionary.
topic_counts = (10, 20, 30, 50, 75, 100, 200, 300)
lda_by_size = {}
for n_topic in topic_counts:
    lda_by_size[n_topic] = models.LdaModel(
        corpus_train, id2word=dict_train, num_topics=n_topic
    )

# Expose each trained model under its own name for the cells that follow.
topic_10 = lda_by_size[10]
topic_20 = lda_by_size[20]
topic_30 = lda_by_size[30]
topic_50 = lda_by_size[50]
topic_75 = lda_by_size[75]
topic_100 = lda_by_size[100]
topic_200 = lda_by_size[200]
topic_300 = lda_by_size[300]

## 4. Get coherence

In [None]:
# Compute the c_v coherence of every trained model against the test documents.
def _build_cv_model(lda):
    """Wrap one LDA model in a c_v CoherenceModel over the test texts."""
    return CoherenceModel(model=lda, texts=test, dictionary=dict_train, coherence='c_v')


# One coherence evaluator per model, in increasing topic-count order.
cv_models = [
    _build_cv_model(lda)
    for lda in (topic_10, topic_20, topic_30, topic_50,
                topic_75, topic_100, topic_200, topic_300)
]
(coherence_model_lda10, coherence_model_lda20, coherence_model_lda30,
 coherence_model_lda50, coherence_model_lda75, coherence_model_lda100,
 coherence_model_lda200, coherence_model_lda300) = cv_models

# Overall coherence score of each model.
(coherence_lda10, coherence_lda20, coherence_lda30, coherence_lda50,
 coherence_lda75, coherence_lda100, coherence_lda200, coherence_lda300) = [
    cv_model.get_coherence() for cv_model in cv_models
]

# Per-topic coherence scores of each model.
(coherence_lda10_detail, coherence_lda20_detail, coherence_lda30_detail,
 coherence_lda50_detail, coherence_lda75_detail, coherence_lda100_detail,
 coherence_lda200_detail, coherence_lda300_detail) = [
    cv_model.get_coherence_per_topic() for cv_model in cv_models
]

## 5. Trend visualization preparation

In [None]:
# Build a dense topic vector for every document of a corpus
def topicvec(n_topic, model, corpus=None):
    """Return one topic-weight list per document.

    Args:
        n_topic: Length of each returned vector. It must be larger than every
            topic id the model reports, otherwise an IndexError is raised.
        model: Object that can be indexed with a document and yields
            ``(topic_id, weight)`` pairs for it.
        corpus: Iterable of documents to score. When omitted, the
            module-level ``corpus_train`` is used, which keeps existing
            two-argument calls working unchanged.

    Returns:
        A list holding one ``n_topic``-long list per document. Positions of
        topics the model did not report for a document stay at 0.
    """
    if corpus is None:
        # Fall back to the training corpus the function was written around.
        corpus = corpus_train
    result = []
    for doc in corpus:
        weights = [0] * n_topic
        for topic_id, weight in model[doc]:
            weights[topic_id] = weight
        result.append(weights)
    return result

# Dense topic vectors, one list of per-document vectors for each model size.
(result10, result20, result30, result50,
 result75, result100, result200, result300) = [
    topicvec(size, lda)
    for size, lda in (
        (10, topic_10),
        (20, topic_20),
        (30, topic_30),
        (50, topic_50),
        (75, topic_75),
        (100, topic_100),
        (200, topic_200),
        (300, topic_300),
    )
]

# Number of segmented tokens in each document.
content_length = np.array(list(map(len, content_list)))

# Split lengths and dates with the same test size and seed as the document
# split (presumably so that each row lines up with its training document).
split_options = {'test_size': 0.2, 'random_state': 100}
train_length, test_length = train_test_split(content_length, **split_options)
train_date, test_date = train_test_split(data['date'], **split_options)

## 6. Save everything for future use

## 7. Load in pre-saved lists and models

## 8. Load in coherence scores, dictionaries and corpus

## 9. Find higher score topics

In [None]:
### Collect the ids of topics whose coherence passes the threshold:
### above 0.5 for the models with up to 75 topics, above 0.55 for the larger ones.
# enumerate() yields each topic's own position; looking the score up with
# list.index() would return the first match and so repeat an id when two
# topics share the same score.
good10 = [topic_id for topic_id, score in enumerate(coherence_lda10_detail) if score > 0.5]
good20 = [topic_id for topic_id, score in enumerate(coherence_lda20_detail) if score > 0.5]
good30 = [topic_id for topic_id, score in enumerate(coherence_lda30_detail) if score > 0.5]
good50 = [topic_id for topic_id, score in enumerate(coherence_lda50_detail) if score > 0.5]
good75 = [topic_id for topic_id, score in enumerate(coherence_lda75_detail) if score > 0.5]
good100 = [topic_id for topic_id, score in enumerate(coherence_lda100_detail) if score > 0.55]
good200 = [topic_id for topic_id, score in enumerate(coherence_lda200_detail) if score > 0.55]
good300 = [topic_id for topic_id, score in enumerate(coherence_lda300_detail) if score > 0.55]

In [None]:
# Plot how strongly one topic is present in the training documents per month
def topicplot(n, topic_vector):
    """Draw the monthly summed score of topic ``n``.

    Args:
        n: Index of the topic inside each document's topic vector.
        topic_vector: One topic-weight sequence per training document.

    Reads the module-level ``train_length`` and ``train_date``. Dates are
    sliced to their first seven characters and parsed as ``%Y-%m``.
    """
    # Weight each document's share of topic n by the document's length.
    proportions = np.array([doc_topics[n] for doc_topics in topic_vector])
    weighted = proportions * train_length

    # Pair every score with its document date, truncated to year and month.
    frame = pd.DataFrame({'date': train_date, 'score': weighted})
    frame['date'] = [stamp[:7] for stamp in frame['date']]

    # Sum the scores of all documents that fall in the same month.
    monthly = frame.groupby(['date'], as_index=False).sum()
    months = [datetime.strptime(month, '%Y-%m') for month in monthly['date']]

    plt.plot(months, monthly['score'])
    plt.show()

In [None]:
# Create the googletrans Translator client; the plotting cells call
# translate.translate(...) on each topic description.
translate = Translator()

In [None]:
# plot topic trend for good topics of topic_10
# Format the topic listing once instead of twice per iteration. All 10 topics
# are requested so that list position i is expected to match topic id i.
topics_10 = topic_10.print_topics(num_topics=10, num_words=10)
for i in good10:
    topic = topics_10[i]
    print(topic)
    # Translate the printed (id, words) pair and show the translated text.
    translation = translate.translate(str(list(topic)))
    print(translation.text)
    topicplot(i, result10)

In [None]:
# plot topic trend for good topics of topic_20
# Format the topic listing once instead of twice per iteration. All 20 topics
# are requested so that list position i is expected to match topic id i.
topics_20 = topic_20.print_topics(num_topics=20, num_words=10)
for i in good20:
    topic = topics_20[i]
    print(topic)
    # Translate the printed (id, words) pair and show the translated text.
    translation = translate.translate(str(list(topic)))
    print(translation.text)
    topicplot(i, result20)

In [None]:
# plot topic trend for good topics of topic_30
# Format the topic listing once instead of twice per iteration. All 30 topics
# are requested so that list position i is expected to match topic id i.
topics_30 = topic_30.print_topics(num_topics=30, num_words=10)
for i in good30:
    topic = topics_30[i]
    print(topic)
    # Translate the printed (id, words) pair and show the translated text.
    translation = translate.translate(str(list(topic)))
    print(translation.text)
    topicplot(i, result30)

In [None]:
# plot topic trend for good topics of topic_50
# Format the topic listing once instead of twice per iteration. All 50 topics
# are requested so that list position i is expected to match topic id i.
topics_50 = topic_50.print_topics(num_topics=50, num_words=10)
# Iterate good50: the previous name, good40, is never defined in this file.
for i in good50:
    topic = topics_50[i]
    print(topic)
    # Translate the printed (id, words) pair and show the translated text.
    translation = translate.translate(str(list(topic)))
    print(translation.text)
    topicplot(i, result50)

In [None]:
# plot topic trend for good topics of topic_75
# Format the topic listing once instead of twice per iteration. All 75 topics
# are requested so that list position i is expected to match topic id i.
topics_75 = topic_75.print_topics(num_topics=75, num_words=10)
for i in good75:
    topic = topics_75[i]
    print(topic)
    # Translate the printed (id, words) pair and show the translated text.
    translation = translate.translate(str(list(topic)))
    print(translation.text)
    topicplot(i, result75)

In [None]:
# plot topic trend for good topics of topic_100
# Format the topic listing once instead of twice per iteration. All 100 topics
# are requested so that list position i is expected to match topic id i.
topics_100 = topic_100.print_topics(num_topics=100, num_words=10)
for i in good100:
    topic = topics_100[i]
    print(topic)
    # Translate the printed (id, words) pair and show the translated text.
    translation = translate.translate(str(list(topic)))
    print(translation.text)
    topicplot(i, result100)

In [None]:
# plot topic trend for good topics of topic_200
# Format the topic listing once instead of twice per iteration. All 200 topics
# are requested so that list position i is expected to match topic id i.
topics_200 = topic_200.print_topics(num_topics=200, num_words=10)
for i in good200:
    topic = topics_200[i]
    print(topic)
    # Translate the printed (id, words) pair and show the translated text.
    translation = translate.translate(str(list(topic)))
    print(translation.text)
    topicplot(i, result200)

In [None]:
# plot topic trend for good topics of topic_300
# Format the topic listing once instead of twice per iteration. All 300 topics
# are requested so that list position i is expected to match topic id i.
topics_300 = topic_300.print_topics(num_topics=300, num_words=10)
for i in good300:
    topic = topics_300[i]
    print(topic)
    # Translate the printed (id, words) pair and show the translated text.
    translation = translate.translate(str(list(topic)))
    print(translation.text)
    topicplot(i, result300)