In [None]:
from hanlp_restful import HanLPClient
import json
import os
import time

HanLP = HanLPClient('https://www.hanlp.com/api', auth="", language='zh')
HanLP.parse(["商品和服务。晓美焰来到北京立方庭参观自然语义科技公司。", "我爸是李刚"], tasks='pos/ctb')

{'tok/fine': [['商品',
   '和',
   '服务',
   '。',
   '晓美焰',
   '来到',
   '北京',
   '立方庭',
   '参观',
   '自然',
   '语义',
   '科技',
   '公司',
   '。'],
  ['我', '爸', '是', '李刚']],
 'pos/ctb': [['NN',
   'CC',
   'NN',
   'PU',
   'NR',
   'VV',
   'NR',
   'NR',
   'VV',
   'NN',
   'NN',
   'NN',
   'NN',
   'PU'],
  ['PN', 'NN', 'VC', 'NR']]}

In [2]:
def plot_token_freq(tokens, topk=100):
    from collections import Counter
    matplotlib.rc("font",family='YouYuan')
    counter = Counter(tokens)
    temp = list(counter.items())
    temp.sort(key=lambda var: var[1], reverse=True)
    temp = temp[:topk]
#     plt.bar([var[0] for var in temp], [var[1] for var in temp])
#     plt.show()
    print(temp)

In [3]:

def split_into_blocks(text, segment_size=150):
    # split text into a list of segments
    segment_list = []
    i = -1
    while True:
        i += 1
        left, right = i*segment_size, (i+1)*segment_size
        segment = text[left: right]
        segment_list.append(segment)
        if right >= len(text):
            break
    return segment_list

In [4]:
block_size = 150
big_block_size = 14000

results_dict = dict()

In [5]:
def join(ll):
    return_l = []
    for l in ll:
        return_l = return_l + l
    return return_l

In [None]:
texts = []
query_timestamps = []
root = "all_trans"
for file_name in os.listdir(root):
    print("currently processing", file_name)
    if file_name in results_dict:
        continue
    with open(os.path.join(root, file_name), 'r', encoding='utf-8') as rf:
        text = rf.read()
#         def read_in_blocks(text, block_size=15000):
#             while True:
#                 block = text.read(block_size)
#             if not block:
#                 return  # 文件读取完毕
#         read_in_blocks(text)
    
    all_tokens = []
    all_pos_tags = []
    
    big_block_list = split_into_blocks(text, 14000)
    
    for big_block in big_block_list:
        block_list = split_into_blocks(big_block, 150)

        if (len(query_timestamps) < 60):
            gap = 10000
        else:
            current_time = time.time()
            gap = current_time - query_timestamps[-60]
        if gap < 70:
            print("\tsleeping for", 70-gap)
            time.sleep(70-gap)
        query_timestamps.append(time.time())
        
        # 分词
        data = HanLP.parse(block_list, tasks='pos/ctb')
        tokens = data['tok/fine']
        pos_tags = data['pos/ctb']

        all_tokens = all_tokens + join(tokens)
        all_pos_tags = all_pos_tags + join(pos_tags)

    results_dict[file_name] = {"all_tokens": all_tokens, "all_pos_tags": all_pos_tags}
#     print (all_tokens)

In [6]:
import json

#with open("results_dict_all_trans.json", "w", encoding='utf-8') as f:
    #json.dump(results_dict, f)

with open("results_dict_all_trans.json", "r", encoding='utf-8') as f:
     results_dict = json.load(f)

In [7]:
for file_name in results_dict:
    all_tokens, all_pos_tags = results_dict[file_name]['all_tokens'], results_dict[file_name]['all_pos_tags']
    noun_tokens = []
    for i in range(len(all_pos_tags)):
        if all_pos_tags[i].startswith("N"):
            noun_tokens.append(all_tokens[i])
    results_dict[file_name]["noun_tokens"] = noun_tokens

In [None]:
def merge_stopwords(*stopwords_lists):
    merged_stopwords = set()

    for stopwords_file in stopwords_lists:
        with open(stopwords_file, 'r', encoding='utf-8') as file:
            stopwords = set(file.read().split())
            merged_stopwords.update(stopwords)

    return list(merged_stopwords)

stopwords_list1 = 'scu_stopwords.txt'
stopwords_list2 = 'cn_stopwords.txt'
stopwords_list3 = 'baidu_stopwords.txt'
stopwords_list4 = 'hit_stopwords_trans.txt'
stopwords_list5 = 'cust_stopwords.txt'

merged_stopwords = merge_stopwords(stopwords_list1, stopwords_list2, stopwords_list3, stopwords_list4, stopwords_list5)

In [9]:
for filename in results_dict:
    token_list = results_dict[filename]['noun_tokens']
    all_words = [word for word in token_list if word not in merged_stopwords]
    results_dict[filename]['cleaned_noun_tokens'] = all_words
#     print(all_words)

In [10]:
texts = []
numbers = []
textid2name = dict()
for key, item in results_dict.items():
    texts.append(item['cleaned_noun_tokens'])
    textid2name[len(texts)-1] = key
    numbers.append(len(item['cleaned_noun_tokens']))
numbers.sort()
print(numbers)

def split_list(l, length=800):
    target = len(l)//length + 1
    segment_length = round(len(l)/target)
    return split_into_blocks(l, segment_length)
new_texts = []

chunk2text = dict()
for i in range(len(texts)):
    text = texts[i]
    splitted_text = split_list(text)
    for j in range(len(splitted_text)):
        new_texts.append(splitted_text[j])
        chunk2text[len(new_texts)-1] = {"id of original doc": i, "name of original doc": textid2name[i], "chunk index": j, "total # chunks": len(splitted_text)}
print(len(new_texts))
texts = new_texts

[27, 28, 80, 109, 115, 179, 219, 226, 233, 290, 295, 301, 311, 342, 342, 370, 431, 446, 469, 500, 502, 503, 520, 521, 523, 530, 564, 574, 577, 581, 584, 612, 620, 650, 674, 682, 705, 723, 726, 753, 755, 763, 774, 791, 814, 823, 854, 887, 891, 894, 911, 957, 1005, 1012, 1064, 1134, 1156, 1174, 1202, 1206, 1207, 1210, 1233, 1248, 1250, 1294, 1362, 1372, 1438, 1463, 1509, 1531, 1802, 1851, 1878, 1984, 2026, 2030, 2085, 2117, 2185, 2248, 2383, 2567, 2606, 2682, 2841, 2917, 2965, 3105, 3127, 3139, 3226, 3329, 3408, 3488, 3712, 3775, 3869, 4051, 4231, 4281, 4290, 4421, 4431, 4484, 4514, 4548, 4699, 4700, 4905, 4907, 5217, 5307, 5418, 5502, 5515, 5588, 5662, 5668, 5741, 5748, 5907, 6127, 6135, 6241, 6251, 6298, 6317, 6372, 6545, 6693, 6779, 6830, 6894, 7386, 7429, 7466, 8101, 8159, 8335, 8407, 8518, 8559, 8788, 9149, 9235, 9333, 9468, 9746, 9968, 10668, 10907, 10962, 10976, 11010, 11030, 11528, 12064, 12125, 12359, 12397, 14191, 14309, 14378, 14414, 14430, 14922, 15676, 15698, 15725, 16268, 1

In [11]:
import gensim
from gensim import corpora
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import warnings
warnings.filterwarnings('ignore')  # To ignore all warnings that arise here to enhance clarity
 
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel

In [12]:
dictionary = gensim.corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(abc) for abc in texts]

In [None]:
def coherence(num_topics):
    ldamodel = LdaModel(corpus, num_topics=num_topics, id2word = dictionary, passes=100,random_state = 100, iterations=300, chunksize = 2200,eval_every = None)
    print(ldamodel.print_topics(num_topics=num_topics, num_words=10))
    ldacm = CoherenceModel(model=ldamodel, texts=texts, dictionary=dictionary, coherence='c_v')
    print(ldacm.get_coherence())
    return ldacm.get_coherence()

In [None]:
x = range(5,16)
# z = [perplexity(i) for i in x]  #如果想用困惑度就选这个
y = [coherence(i) for i in x]
plt.plot(x, y)
plt.xlabel('主题数目')
plt.ylabel('coherence大小')
plt.rcParams['font.sans-serif']=['SimHei']
matplotlib.rcParams['axes.unicode_minus']=False
plt.title('主题-coherence变化情况')
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import table

# Sample DataFrame
df = pd.DataFrame({
    'Name': x,
    'Age': y,
    
})

# Create a subplot without frame
fig, ax = plt.subplots(figsize=(8, 3))  # Adjust the figsize to fit your needs
ax.axis('tight')
ax.axis('off')

# Create the table and adjust styling
table(ax, df, loc='center', cellLoc='center', rowLoc='center')

plt.show()


In [None]:
word_frequency = dict()
for text in texts:
    for word in text:
        if word not in word_frequency:
            word_frequency[word] = 0
        word_frequency[word] += 1
word_freq_pairs = [(key, val) for key, val in word_frequency.items()]
word_freq_pairs.sort(key=lambda var: var[1], reverse=True)
word_freq_pairs

In [None]:
def perplexity(num_topics):
    ldamodel = LdaModel(corpus, num_topics=num_topics, id2word = dictionary, passes=30)
    print(ldamodel.print_topics(num_topics=num_topics, num_words=10))
    print(ldamodel.log_perplexity(corpus))
    return ldamodel.log_perplexity(corpus)


In [None]:
x = range(1,31)
z = [perplexity(i) for i in x]  #如果想用困惑度就选这个
#y = [coherence(i) for i in x]
plt.plot(x, z)
plt.xlabel('主题数目')
plt.zlabel('coherence大小')
plt.rcParams['font.sans-serif']=['SimHei']
matplotlib.rcParams['axes.unicode_minus']=False
plt.title('主题-coherence变化情况')
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import table

# Sample DataFrame
df = pd.DataFrame({
    'Name': x,
    'Age': z,
    
})

# Create a subplot without frame
fig, ax = plt.subplots(figsize=(8, 3))  # Adjust the figsize to fit your needs
ax.axis('tight')
ax.axis('off')

# Create the table and adjust styling
table(ax, df, loc='center', cellLoc='center', rowLoc='center')

plt.show()

In [None]:
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=6, passes=100,random_state = 100, iterations=300, chunksize = 1000,eval_every = None)
topic_list=lda.print_topics()
print(topic_list)
 
for i in lda.get_document_topics(corpus)[:]:
    listj=[]
    for j in i:
        listj.append(j[1])
    bz=listj.index(max(listj))
    print(i[bz][0])

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda, corpus, dictionary)
vis

In [None]:
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=6, passes=100,random_state = 100, iterations=300, chunksize = 2200,eval_every = None)
topic_list=lda.print_topics()
print(topic_list)
 
for i in lda.get_document_topics(corpus)[:]:
    listj=[]
    for j in i:
        listj.append(j[1])
    bz=listj.index(max(listj))
    print(i[bz][0])

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda, corpus, dictionary)
vis

In [17]:
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=7, passes=100,random_state = 100, iterations=300, chunksize = 1000,eval_every = None)
topic_list=lda.print_topics()
print(topic_list)
 
for i in lda.get_document_topics(corpus)[:]:
    listj=[]
    for j in i:
        listj.append(j[1])
    bz=listj.index(max(listj))
    print(i[bz][0])

[(0, '0.026*"船" + 0.010*"夫人" + 0.007*"地方" + 0.007*"船主" + 0.006*"船上" + 0.006*"水" + 0.006*"馬" + 0.006*"衆人" + 0.005*"天" + 0.005*"東西"'), (1, '0.015*"夫人" + 0.014*"王" + 0.013*"主教" + 0.009*"王后" + 0.009*"兵" + 0.008*"女主" + 0.007*"魔" + 0.007*"男爵" + 0.006*"國" + 0.006*"女人"'), (2, '0.035*"王" + 0.024*"神" + 0.013*"石" + 0.012*"衆" + 0.009*"女王" + 0.008*"狀" + 0.008*"輩" + 0.008*"屍" + 0.006*"國" + 0.006*"公主"'), (3, '0.005*"室" + 0.005*"車" + 0.004*"面" + 0.004*"球" + 0.004*"力" + 0.004*"物" + 0.004*"門" + 0.004*"風" + 0.003*"水" + 0.003*"氣球"'), (4, '0.020*"舟" + 0.019*"物" + 0.014*"鳥" + 0.014*"樹" + 0.011*"妻" + 0.009*"水" + 0.008*"石" + 0.008*"牛" + 0.008*"狗" + 0.008*"象"'), (5, '0.018*"女" + 0.010*"狀" + 0.009*"書" + 0.008*"衆" + 0.006*"父" + 0.006*"力" + 0.005*"馬" + 0.005*"奴" + 0.005*"目" + 0.004*"衣"'), (6, '0.027*"船" + 0.025*"舟" + 0.011*"船主" + 0.009*"海" + 0.006*"船長" + 0.006*"島" + 0.006*"國" + 0.004*"風" + 0.004*"艦" + 0.004*"衆"')]
3
5
3
3
3
3
3
5
5
5
5
5
4
5
5
5
5
2
5
2
2
2
2
2
5
2
2
2
5
0
0
0
0
0
0
6
0
0
0
0
0
0
0
1
3
3
6
6
1
1


In [18]:
import pyLDAvis
import pyLDAvis.gensim_models

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda, corpus, dictionary)
vis

In [19]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
# Assuming vis_data is already prepared
# vis_data = gensimvis.prepare(lda_model, corpus, id2word)

# Save to a file
pyLDAvis.save_html(vis, 'lda_result_trans.html')