In [82]:
import pandas as pd
import pprint
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from HanziNLP import lda_model, print_topics, word_tokenize

In [83]:
STOP_WORDS = set(
    [
        "中国",
        "发展",
        "",
        " ",
        "_",
        "  ",
        "人民",
        "新华",
        "\u2003",
        "新",
        "　",
        "—",
        "新华社",
        "国家",
        "同志们",
        "同志",
        "各位",
        "朋友们",
        "女士们",
        "先生们",
        "大家",
        "年",
        "月",
        "日",
        "坚持",
        "推进",
        "推动",
        "加强",
        "不断",
        "持续",
        "深入",
        "社会主义",
        "党的领导",
        "党的建设",
        "党校",
        "理论",
        "思想",
        "精神",
        "学习",
        "贯彻",
        "落实",
        "指导",
        "路线",
        "方针",
        "政策全面",
        "进一步",
        "扎实",
        "有效",
        "积极",
        "努力",
        "重要",
        "重大",
        "关键",
        "基本",
        "根本",
        "主要",
        "新时代",
        "新时期",
        "新阶段",
        "新发展",
        "高质量",
        "高水平",
        "共同",
        "共同体",
        "事业",
        "工作",
        "方面",
        "领域",
        "机制",
        "10",
        "记者",
        "二",
        "零",
        "2022",
        "2023",
        "2024",
        "2025",
        "1",
        "2",
        "3",
        "4",
        "5",
        "6",
        "7",
        "8",
        "9",
        "10",
        "11",
        "中",
    ]
)

In [84]:
speeches = pd.read_csv(
    "/home/lburton12/side_projects/xijinpin_talks/corpus/speeches.csv"
)

In [85]:

def tokenization(speeches):

    # Initializing tokens list
    tokens = []

    for text in speeches["text"]:
        token_pre = word_tokenize(text)

        #Ensuring additional stop words are removed
        tokens_clean = []

        for token in token_pre:
            if token not in STOP_WORDS:
                tokens_clean.append(token)

        tokens.append(tokens_clean)

    return tokens


all_speeches_tokenized = tokenization(speeches)

In [86]:
#Creating our LDA model
lda_model_all_years, corpus, dictionary = lda_model(all_speeches_tokenized, 
                                                    num_topics=20)

In [87]:

print_topics(lda_model_all_years, num_words=10)

Topic: 0 
Words: 0.000*"合作" + 0.000*"建设" + 0.000*"党" + 0.000*"经济" + 0.000*"世界" + 0.000*"现代化" + 0.000*"全面" + 0.000*"全球" + 0.000*"时代" + 0.000*"特色"
Topic: 1 
Words: 0.042*"中亚" + 0.038*"妇女" + 0.019*"中亚国家" + 0.006*"国家机关" + 0.005*"克服" + 0.004*"西安" + 0.004*"女性" + 0.004*"全国人民代表大会" + 0.004*"女童" + 0.004*"两年"
Topic: 2 
Words: 0.039*"合作" + 0.023*"中方" + 0.018*"组织" + 0.017*"非洲" + 0.015*"一带" + 0.014*"一路" + 0.013*"上海" + 0.013*"中非" + 0.010*"共建" + 0.009*"地区"
Topic: 3 
Words: 0.005*"人道主义" + 0.003*"平民" + 0.002*"止战" + 0.002*"巴以" + 0.002*"安理会" + 0.002*"非常" + 0.002*"扩员后" + 0.002*"所作" + 0.002*"当务之急" + 0.002*"南非政府"
Topic: 4 
Words: 0.003*"辽阔" + 0.003*"各族群众" + 0.003*"面貌" + 0.002*"血缘" + 0.002*"相融" + 0.002*"情感" + 0.002*"充分证明" + 0.002*"表彰大会" + 0.002*"共有" + 0.002*"疆域"
Topic: 5 
Words: 0.018*"法国" + 0.010*"中法" + 0.009*"法" + 0.007*"马克" + 0.006*"巴黎" + 0.006*"中法关系" + 0.006*"龙" + 0.006*"60" + 0.005*"巴西" + 0.004*"中哈"
Topic: 6 
Words: 0.020*"全球" + 0.019*"国际" + 0.019*"开放" + 0.017*"保护" + 0.014*"世界" + 0.013*"各国" + 0.013*"建设" 

In [88]:
speech_topics = gensimvis.prepare(lda_model_all_years, corpus, dictionary)

In [89]:
pyLDAvis.display(speech_topics)