In [186]:
import jieba
import gensim
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
from gensim.models import CoherenceModel

data = ["人工智能时代，大数据浪潮下的企业数字化转型",
        "大数据与人工智能",
        "人工智能技术在智能客服中的应用",
        "人工智能驱动的智能制造",
        "智能制造和大数据",
        "人工智能技术的应用"]

# 加载停用词表
stop_words = []
with open('stopwords.txt', 'r', encoding='utf-8') as f:
    for line in f.readlines():
        stop_words.append(line.strip())

# 对文本进行预处理
processed_docs = []
for doc in data:
    # 分词
    tokens = list(jieba.cut(doc))
    # 去除停用词
    tokens = [token for token in tokens if len(token) > 1 and token not in stop_words]
    processed_docs.append(tokens)

# 构建词典
dictionary = gensim.corpora.Dictionary(processed_docs)

# 构建语料库
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# 训练LDA模型
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=dictionary,
                                            num_topics=3,
                                            random_state=100,
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha='auto',
                                            per_word_topics=True)

# 计算主题聚类的一致性得分
coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('一致性得分：', coherence_lda)

# 可视化LDA模型
vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis_data)

# 查看每个主题下的关键词
for idx, topic in lda_model.show_topics(formatted=True, num_topics=3, num_words=10):
    print('主题 {}: '.format(idx))
    print('  关键词: {}'.format(topic))


一致性得分： 0.3035054268717763


  default_term_info = default_term_info.sort_values(


主题 0: 
  关键词: 0.083*"智能" + 0.083*"人工智能" + 0.083*"数据" + 0.083*"技术" + 0.083*"制造" + 0.083*"客服" + 0.083*"驱动" + 0.083*"数字化" + 0.083*"转型" + 0.083*"浪潮"
主题 1: 
  关键词: 0.124*"企业" + 0.124*"数字化" + 0.124*"浪潮" + 0.124*"时代" + 0.124*"转型" + 0.115*"数据" + 0.108*"人工智能" + 0.032*"技术" + 0.031*"智能" + 0.031*"客服"
主题 2: 
  关键词: 0.234*"人工智能" + 0.173*"智能" + 0.126*"数据" + 0.121*"制造" + 0.120*"技术" + 0.069*"驱动" + 0.069*"客服" + 0.017*"转型" + 0.017*"时代" + 0.017*"浪潮"


In [78]:
pyLDAvis.enable_notebook()
# vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)   # 根据版本信息选择
vis
# pyLDAvis.save_html(vis, 'lda.html')

  default_term_info = default_term_info.sort_values(


In [79]:
import pyecharts.options as opts
from pyecharts.charts import Sankey
from pyecharts.commons.utils import JsCode

def draw_sankey(lda_model, dictionary):
    # 获取主题和词的权重
    nodes = []
    for topic_id in range(lda_model.num_topics):
        topic_words = lda_model.show_topic(topic_id)
        for word, weight in topic_words:
            if word not in nodes:
                nodes.append(word)

    links = []
    for topic_id in range(lda_model.num_topics):
        topic_words = lda_model.show_topic(topic_id)
        for word, weight in topic_words:
            source = nodes.index(word)
            target = topic_id
            value = weight
            links.append({"source": source, "target": target, "value": value})

    # 绘制桑基图
    c = (
        Sankey()
        .add(
            "LDA Topics",
            nodes,
            links,
            linestyle_opt=opts.LineStyleOpts(opacity=0.2, curve=0.5, color="source"),
            label_opts=opts.LabelOpts(position="right", font_size=10),
        )
        .set_global_opts(
            title_opts=opts.TitleOpts(title="LDA Topic Sankey Diagram"),
            tooltip_opts=opts.TooltipOpts(trigger="item", trigger_on="mousemove|click"),
        )
    )

    return c.render_notebook()

# 用法示例
vis = draw_sankey(lda_model, dictionary)
vis


In [146]:
nodes = []
for topic_id in range(lda_model.num_topics):
    topic_words = lda_model.show_topic(topic_id)
    for word, weight in topic_words:
        if word not in nodes:
            nodes.append(word)

links = []
for topic_id in range(lda_model.num_topics):
    topic_words = lda_model.show_topic(topic_id)
    topic_words_sorted = sorted(topic_words, key=lambda x: x[1], reverse=True)
    for i in range(len(topic_words_sorted) - 1):
        source_word, source_weight = topic_words_sorted[i]
        target_word, target_weight = topic_words_sorted[i + 1]
        source_id = nodes.index(source_word)
        target_id = nodes.index(target_word)
        links.append({"source": source_id, "target": target_id, "value": source_weight})


In [148]:
for i in range(len(links)):
    if links[i]['source']==links[i]['target']:
        del links[i]


In [96]:
links

[{'source': 0, 'target': 1, 'value': 0.08343017},
 {'source': 1, 'target': 2, 'value': 0.08327714},
 {'source': 2, 'target': 3, 'value': 0.08327714},
 {'source': 4, 'target': 5, 'value': 0.08327714}]

In [112]:
from pyecharts import options as opts
from pyecharts.charts import Sankey

nodes = [
    {"name": "category1"},
    {"name": "category2"},
    {"name": "category3"},
    {"name": "category4"},
    {"name": "category5"},
    {"name": "category6"},
]


c = (
    Sankey()
    .add(
        "sankey",
        nodes,
        links[0:4:],
        linestyle_opt=opts.LineStyleOpts(opacity=0.2, curve=0.5, color="source"),
        label_opts=opts.LabelOpts(position="right"),
    )
.set_global_opts(
            title_opts=opts.TitleOpts(title="LDA Topic Sankey Diagram"),
            tooltip_opts=opts.TooltipOpts(trigger="item", trigger_on="mousemove|click"),
        )
    
)
c.render_notebook()

In [174]:
nodes = []
for topic_id in range(lda_model.num_topics):
    topic_words = lda_model.show_topic(topic_id)
    for word, weight in topic_words:
        if word not in nodes:
            nodes.append(word)


links = []
for topic_id in range(lda_model.num_topics):
    topic_words = lda_model.show_topic(topic_id)
    for word, weight in topic_words:
        if word in nodes:
            source = word
            target = f"Topic {topic_id+1}"
            value = weight
            links.append({"source": source, "target": target, "value": value})
links

[{'source': '智能', 'target': 'Topic 1', 'value': 0.0834436},
 {'source': '人工智能', 'target': 'Topic 1', 'value': 0.08343017},
 {'source': '数据', 'target': 'Topic 1', 'value': 0.08341379},
 {'source': '技术', 'target': 'Topic 1', 'value': 0.083395906},
 {'source': '制造', 'target': 'Topic 1', 'value': 0.083352566},
 {'source': '客服', 'target': 'Topic 1', 'value': 0.0832997},
 {'source': '驱动', 'target': 'Topic 1', 'value': 0.083283685},
 {'source': '数字化', 'target': 'Topic 1', 'value': 0.083279744},
 {'source': '转型', 'target': 'Topic 1', 'value': 0.08327714},
 {'source': '浪潮', 'target': 'Topic 1', 'value': 0.08327666},
 {'source': '企业', 'target': 'Topic 2', 'value': 0.12409593},
 {'source': '数字化', 'target': 'Topic 2', 'value': 0.12409071},
 {'source': '浪潮', 'target': 'Topic 2', 'value': 0.12409028},
 {'source': '时代', 'target': 'Topic 2', 'value': 0.12408992},
 {'source': '转型', 'target': 'Topic 2', 'value': 0.12408651},
 {'source': '数据', 'target': 'Topic 2', 'value': 0.115197994},
 {'source': '人工智能

In [175]:
from pyecharts.charts import Sankey
from pyecharts import options as opts

def draw_sankey(lda_model, dictionary):
    # 获取主题和词的权重



    # 绘制桑基图
    c = (
        Sankey()
        .add(
            "LDA Topics",
            nodes=nodes + [f"Topic {i+1}" for i in range(lda_model.num_topics)],  # 添加主题节点
            links=links,
            linestyle_opt=opts.LineStyleOpts(opacity=0.2, curve=0.5, color="source"),
            label_opts=opts.LabelOpts(position="right", font_size=15),
        )
        .set_global_opts(
            title_opts=opts.TitleOpts(title="LDA Topic Sankey Diagram"),
            tooltip_opts=opts.TooltipOpts(trigger="item", trigger_on="mousemove|click"),
        )
    )

    return c.render_notebook()

# 用法示例
vis = draw_sankey(lda_model, dictionary)
vis


In [187]:
nodes1 = []
nodes = []
for topic_id in range(lda_model.num_topics):
    topic_words = lda_model.show_topic(topic_id)
    for word, weight in topic_words:
        if word not in nodes:
            nodes.append(word)
            nodes1.append({"name": word})

links = []
for topic_id in range(lda_model.num_topics):
    topic_words = lda_model.show_topic(topic_id)
    for word, weight in topic_words:
        if word in nodes:
            source = word
            target = f"Topic {topic_id+1}"
            value = weight
            links.append({"source": source, "target": target, "value": value})
links

[{'source': '智能', 'target': 'Topic 1', 'value': 0.0834436},
 {'source': '人工智能', 'target': 'Topic 1', 'value': 0.08343017},
 {'source': '数据', 'target': 'Topic 1', 'value': 0.08341379},
 {'source': '技术', 'target': 'Topic 1', 'value': 0.083395906},
 {'source': '制造', 'target': 'Topic 1', 'value': 0.083352566},
 {'source': '客服', 'target': 'Topic 1', 'value': 0.0832997},
 {'source': '驱动', 'target': 'Topic 1', 'value': 0.083283685},
 {'source': '数字化', 'target': 'Topic 1', 'value': 0.083279744},
 {'source': '转型', 'target': 'Topic 1', 'value': 0.08327714},
 {'source': '浪潮', 'target': 'Topic 1', 'value': 0.08327666},
 {'source': '企业', 'target': 'Topic 2', 'value': 0.12409593},
 {'source': '数字化', 'target': 'Topic 2', 'value': 0.12409071},
 {'source': '浪潮', 'target': 'Topic 2', 'value': 0.12409028},
 {'source': '时代', 'target': 'Topic 2', 'value': 0.12408992},
 {'source': '转型', 'target': 'Topic 2', 'value': 0.12408651},
 {'source': '数据', 'target': 'Topic 2', 'value': 0.115197994},
 {'source': '人工智能

In [232]:
def draw_sankey(lda_model, dictionary):
    # 获取主题和词的权重
    nodes1 = []
    nodes = []
    for topic_id in range(lda_model.num_topics):
        topic_words = lda_model.show_topic(topic_id)
        for word, weight in topic_words:
            if word not in nodes:
                nodes.append(word)
                nodes1.append({"name": word})

    links = []
    for topic_id in range(lda_model.num_topics):
        topic_words = lda_model.show_topic(topic_id)
        for word, weight in topic_words:
            if word in nodes:
                source = word
                target = f"第 {topic_id+1}名"
                value = weight*100
                links.append({"source": source, "target": target, "value": value})
    nodes=[]
    nodes=nodes1
    kkk=[f"第 {i+1}名" for i in range(lda_model.num_topics)]

    for i in kkk:
        nodes.append({"name": i})
    # 绘制桑基图
    c = (
        Sankey()
        .add(
            "LDA Topics",
            nodes,  # 添加主题节点
            links,
            linestyle_opt=opts.LineStyleOpts(opacity=0.2, curve=0.5, color="source"),
            label_opts=opts.LabelOpts(position="right", font_size=10),
        )
        .set_global_opts(
            title_opts=opts.TitleOpts(title="LDA主题词聚类"),
            tooltip_opts=opts.TooltipOpts(trigger="item", trigger_on="mousemove|click"),
        )
    )

    return c.render_notebook()

# 用法示例
vis = draw_sankey(lda_model, dictionary)
vis


In [229]:
import networkx as nx
from pyecharts import options as opts
from pyecharts.charts import Graph

def draw_graph(lda_model, dictionary):
    # 创建 NetworkX 图
    G = nx.DiGraph()

    # 添加主题和词节点
    for topic_id in range(lda_model.num_topics):
        G.add_node(f"Topic {topic_id+1}")
        topic_words = lda_model.show_topic(topic_id)
        for word, weight in topic_words:
            if word not in G.nodes:
                G.add_node(word)

    # 添加节点之间的链接
    for topic_id in range(lda_model.num_topics):
        topic_words = lda_model.show_topic(topic_id)
        for word, weight in topic_words:
            if word in G.nodes:
                G.add_edge(word, f"Topic {topic_id+1}", weight=weight)

    # 绘制图形
    node_data = [{"name": node} for node in G.nodes]
    link_data = [
        {"source": u, "target": v, "value": d["weight"]}
        for u, v, d in G.edges(data=True)
    ]
    c = (
        Graph()
        .add(
            "",
            node_data,
            link_data,
            layout="force",
            label_opts=opts.LabelOpts(position="right", font_size=10),
        )
        .set_global_opts(title_opts=opts.TitleOpts(title="LDA Topic Graph"))
    )

    return c.render_notebook()

# 用法示例
vis = draw_graph(lda_model, dictionary)
vis


In [228]:
import networkx as nx
from pyecharts import options as opts
from pyecharts.charts import Graph

def draw_graph(lda_model, dictionary):
    # 创建 NetworkX 图
    G = nx.DiGraph()

    # 添加主题和词节点
    for topic_id in range(lda_model.num_topics):
        G.add_node(f"Topic {topic_id+1}", type="topic", symbol_size=20)
        topic_words = lda_model.show_topic(topic_id)
        for word, weight in topic_words:
            if word not in G.nodes:
                G.add_node(word, type="word", symbol_size=10)

    # 添加节点之间的链接
    for topic_id in range(lda_model.num_topics):
        topic_words = lda_model.show_topic(topic_id)
        for word, weight in topic_words:
            if word in G.nodes:
                G.add_edge(word, f"Topic {topic_id+1}", weight=weight)

    # 绘制图形
    node_data = [{"name": node, "category": G.nodes[node]["type"]} for node in G.nodes]
    link_data = [
        {"source": u, "target": v, "value": d["weight"]}
        for u, v, d in G.edges(data=True)
    ]
    import pandas as pd

    # 将 link_data 转换为 Pandas DataFrame 格式
    df = pd.DataFrame(link_data, columns=['source', 'target', 'value'])
    # 合并重复的边并求和
    df = df.groupby(['source', 'target']).sum().reset_index()
    # 将 DataFrame 转换回原来的格式
    link_data = df.values.tolist()

    categories = [{"name": "Topic", "itemStyle": {"color": "#C23531"}},
                  {"name": "Word", "itemStyle": {"color": "#2F4554"}}]
    print(link_data)
    c = (
        Graph()
        .add(
            "",
            node_data,
            link_data,
            categories=categories,
            layout="circular",
            label_opts=opts.LabelOpts(position="right", font_size=10),
        )
        .set_global_opts(title_opts=opts.TitleOpts(title="LDA Topic Graph"))
    )

    return c.render_notebook()

# 用法示例
vis = draw_graph(lda_model, dictionary)
vis


[['人工智能', 'Topic 1', 8.343017101287842], ['人工智能', 'Topic 2', 10.772180557250977], ['人工智能', 'Topic 3', 23.40536266565323], ['企业', 'Topic 2', 12.409593164920807], ['制造', 'Topic 1', 8.335256576538086], ['制造', 'Topic 3', 12.102148681879044], ['客服', 'Topic 1', 8.329969644546509], ['客服', 'Topic 2', 3.114352375268936], ['客服', 'Topic 3', 6.913919001817703], ['技术', 'Topic 1', 8.339590579271317], ['技术', 'Topic 2', 3.2068267464637756], ['技术', 'Topic 3', 12.04921081662178], ['数字化', 'Topic 1', 8.327974379062653], ['数字化', 'Topic 2', 12.409070879220963], ['数据', 'Topic 1', 8.341378718614578], ['数据', 'Topic 2', 11.519799381494522], ['数据', 'Topic 3', 12.61223703622818], ['时代', 'Topic 2', 12.408991903066635], ['时代', 'Topic 3', 1.7428714781999588], ['智能', 'Topic 1', 8.344359695911407], ['智能', 'Topic 2', 3.117041662335396], ['智能', 'Topic 3', 17.28692203760147], ['浪潮', 'Topic 1', 8.32766592502594], ['浪潮', 'Topic 2', 12.4090276658535], ['浪潮', 'Topic 3', 1.7428457736968994], ['转型', 'Topic 1', 8.32771435379982

 * Serving Flask app '__main__'
 * Debug mode: on


  super().__setattr__(attr, val)
 * Running on http://127.0.0.1:5000
[33mPress CTRL+C to quit[0m
 * Restarting with stat
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/traitlets/config/application.py", line 1042, in launch_instance
    app.initialize(argv)
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/traitlets/config/application.py", line 113, in inner
    return method(app, *args, **kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/ipykernel/kernelapp.py", line 678, in initialize
    self.init_sockets()
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/ipykernel/kernelapp.py", line 317, in init_sockets
    self.shell_port = self._bin

SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
