## Load data

In [1]:
import pandas as pd

# Read the dataset.csv file and store it in a dataframe
data_fan = pd.read_csv('dataset.csv')
data = pd.read_csv('dataset.csv')

# Print the first 5 rows of the dataframe
print(data_fan.head())

   Unnamed: 0      book chapter  \
0           0  analects      學而   
1           1  analects      為政   
2           2  analects      八佾   
3           3  analects      里仁   
4           4  analects     公冶長   

                                             content  
0  子曰：「學而時習之，不亦說乎？有朋自遠方來，不亦樂乎？人不知而不慍，不亦君子乎？」\n\n有...  
1  子曰：「為政以德，譬如北辰，居其所而眾星共之。」\n\n子曰：「詩三百，一言以蔽之，曰『思無...  
2  孔子謂季氏：「八佾舞於庭，是可忍也，孰不可忍也？」\n\n三家者以雍徹。子曰：「『相維辟公，...  
3  子曰：「里仁為美。擇不處仁，焉得知？」\n\n子曰：「不仁者不可以久處約，不可以長處樂。仁者...  
4  子謂公冶長，「可妻也。雖在縲絏之中，非其罪也」。以其子妻之。\n\n子謂南容，「邦有道，不廢...  


# Simplify Chinese characters

In [2]:
# Split the 'content' into separate sentences using '\n\n', then explode into new rows
data_fan = data_fan.assign(sentences=data_fan['content'].str.split('\n\n')).explode('sentences')

# Create 'book', 'chapter', 'sentences' structure
data_fan = data_fan[['book', 'chapter', 'sentences']]

# Show the modified DataFrame
data_fan.head()

# set new index
data_fan = data_fan.reset_index(drop=True)

In [3]:
data_fan.head()

Unnamed: 0,book,chapter,sentences
0,analects,學而,子曰：「學而時習之，不亦說乎？有朋自遠方來，不亦樂乎？人不知而不慍，不亦君子乎？」
1,analects,學而,有子曰：「其為人也孝弟，而好犯上者，鮮矣；不好犯上，而好作亂者，未之有也。君子務本，本立而道...
2,analects,學而,子曰：「巧言令色，鮮矣仁！」
3,analects,學而,曾子曰：「吾日三省吾身：為人謀而不忠乎？與朋友交而不信乎？傳不習乎？」
4,analects,學而,子曰：「道千乘之國：敬事而信，節用而愛人，使民以時。」


In [4]:
from zhconv import convert
# Convert the content to Simplified Chinese
data['chapter'] = data['chapter'].apply(lambda x: convert(x, 'zh-cn'))
data['content'] = data['content'].apply(lambda x: convert(x, 'zh-cn'))

# Split the 'content' into separate sentences using '\n\n', then explode into new rows
data = data.assign(sentences=data['content'].str.split('\n\n')).explode('sentences')

# Create 'book', 'chapter', 'sentences' structure
data = data[['book', 'chapter', 'sentences']]

# Show the modified DataFrame
data.head()

Unnamed: 0,book,chapter,sentences
0,analects,学而,子曰：“学而时习之，不亦说乎？有朋自远方来，不亦乐乎？人不知而不愠，不亦君子乎？”
0,analects,学而,有子曰：“其为人也孝弟，而好犯上者，鲜矣；不好犯上，而好作乱者，未之有也。君子务本，本立而道...
0,analects,学而,子曰：“巧言令色，鲜矣仁！”
0,analects,学而,曾子曰：“吾日三省吾身：为人谋而不忠乎？与朋友交而不信乎？传不习乎？”
0,analects,学而,子曰：“道千乘之国：敬事而信，节用而爱人，使民以时。”


# Calculate Ren occurence

In [3]:
# Add a new column 'contain_ren' to check if the sentence contains the character '仁'
data['contain_ren'] = data['sentences'].apply(lambda x: 1 if '仁' in x else 0)

# Show the modified DataFrame
data.head()

Unnamed: 0,book,chapter,sentences,contain_ren
0,analects,学而,子曰：“学而时习之，不亦说乎？有朋自远方来，不亦乐乎？人不知而不愠，不亦君子乎？”,0
0,analects,学而,有子曰：“其为人也孝弟，而好犯上者，鲜矣；不好犯上，而好作乱者，未之有也。君子务本，本立而道...,1
0,analects,学而,子曰：“巧言令色，鲜矣仁！”,1
0,analects,学而,曾子曰：“吾日三省吾身：为人谋而不忠乎？与朋友交而不信乎？传不习乎？”,0
0,analects,学而,子曰：“道千乘之国：敬事而信，节用而爱人，使民以时。”,0


In [5]:
print(f'The length of the DataFrame is {len(data)}', 
      f'The number of sentences that contain the character "仁" is {len(data[data["contain_ren"] == 1])}', sep='\n')

The length of the DataFrame is 12604
The number of sentences that contain the character "仁" is 921


In [31]:
# Save the modified DataFrame to a new CSV file
data.to_csv('modified_dataset.csv', index=False)

# Add time attribute

In [5]:
list_bookname = list(data_fan["book"].unique())
list_book_time = [
    ['analects',-480, -350],
    ['mengzi',  -340, -250],
    ['liji',    -475, -221],
    ['xunzi',   -475, -221],
    ['xiao-jing',-475, -221],
    ['shuo-yuan',-206, 9],
    ['chun-qiu-fan-lu',-206, 9],
    ['han-shi-wai-zhuan',-180, -120],
    ['da-dai-li-ji',100, 200],
    ['bai-hu-tong',79, 92],
    ['xin-shu', -206, 9],
    ['xin-xu',-206, 9],
    ['yangzi-fayan',-33, 18],
    ['zhong-lun',25, 220],
    ['kongzi-jiayu',-206, 220],
    ['qian-fu-lun',102, 167],
    ['lunheng', 80, 80],
    ['taixuanjing',-33, 18],
    ['fengsutongyi',190, 200],
    ['kongcongzi',25, 265],
    ['shenjian',196, 220],
    ['zhong-jing',100,166],
    ['su-shu',-250,-186],
    ['xinyu',-196, -196],
    ['duduan',167, 258],
    ['caizhong-langji', 152, 192]
    ]
data_fan["Start"] = None
data_fan["End"] = None
num = list_bookname.index(data_fan.iloc[0]["book"])
for i in range(data_fan.shape[0]):
    num = list_bookname.index(data_fan.iloc[i]["book"])
    list_book_time[num]
    data_fan.loc[i, 'Start'] = list_book_time[num][1]
    data_fan.loc[i, 'End'] = list_book_time[num][2]
data_fan['average'] = (data_fan['Start'] + data_fan['End'])/2

data_fan.head()

Unnamed: 0,book,chapter,sentences,Start,End,average
0,analects,學而,子曰：「學而時習之，不亦說乎？有朋自遠方來，不亦樂乎？人不知而不慍，不亦君子乎？」,-480,-350,-415.0
1,analects,學而,有子曰：「其為人也孝弟，而好犯上者，鮮矣；不好犯上，而好作亂者，未之有也。君子務本，本立而道...,-480,-350,-415.0
2,analects,學而,子曰：「巧言令色，鮮矣仁！」,-480,-350,-415.0
3,analects,學而,曾子曰：「吾日三省吾身：為人謀而不忠乎？與朋友交而不信乎？傳不習乎？」,-480,-350,-415.0
4,analects,學而,子曰：「道千乘之國：敬事而信，節用而愛人，使民以時。」,-480,-350,-415.0


# Bert-ancient-chiense to do word segmentation

In [6]:
import torch

In [7]:
print(torch.cuda.is_available())

True


In [8]:
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("Jihuai/bert-ancient-chinese")

model = AutoModel.from_pretrained("Jihuai/bert-ancient-chinese")


Some weights of BertModel were not initialized from the model checkpoint at Jihuai/bert-ancient-chinese and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
data_fan['sentences'][0]

'子曰：「學而時習之，不亦說乎？有朋自遠方來，不亦樂乎？人不知而不慍，不亦君子乎？」'

In [36]:
text = data_fan['sentences'][0]
encoded_input = tokenizer.encode(text)
print(encoded_input)
encode_tensor = (torch.tensor([encoded_input]))
decoded_string = tokenizer.decode(encoded_input)
print(decoded_string)

[101, 2094, 3288, 8038, 519, 2119, 5445, 3229, 5424, 722, 8024, 679, 771, 6303, 725, 8043, 3300, 3301, 5632, 6895, 3175, 889, 8024, 679, 771, 3556, 725, 8043, 782, 679, 4761, 5445, 679, 21628, 8024, 679, 771, 1409, 2094, 725, 8043, 520, 102]
[CLS] 子 曰 ： 「 學 而 時 習 之 ， 不 亦 說 乎 ？ 有 朋 自 遠 方 來 ， 不 亦 樂 乎 ？ 人 不 知 而 不 慍 ， 不 亦 君 子 乎 ？ 」 [SEP]


## roberta-segmentation

In [8]:
from transformers import AutoTokenizer,AutoModelForTokenClassification
rob_tokenizer = AutoTokenizer.from_pretrained("KoichiYasuoka/roberta-classical-chinese-base-sentence-segmentation")
rob_model = AutoModelForTokenClassification.from_pretrained("KoichiYasuoka/roberta-classical-chinese-base-sentence-segmentation")


In [12]:
s = "子曰學而時習之不亦説乎有朋自遠方來不亦樂乎人不知而不慍不亦君子乎"
p = [rob_model.config.id2label[q] for q in torch.argmax(rob_model(rob_tokenizer.encode(s,return_tensors="pt"))["logits"],dim=2)[0].tolist()[1:-1]]
print("".join(c+"。" if q=="E" or q=="S" else c for c,q in zip(s,p)))


子曰。學而時習之。不亦説乎。有朋自遠方來。不亦樂乎。人不知而不慍。不亦君子乎。


In [14]:
example = data_fan['sentences'][0]
tokenized_input = rob_tokenizer(example, is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

TypeError: PreTokenizedEncodeInput must be Union[PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence]]

In [23]:
import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# 数据预处理：分词和去除停用词
def chinese_tokenizer(text):
    tokens = jieba.lcut(text)
    return [token for token in tokens if token not in stop_words and len(token) > 1]

# 加载数据
# 假设 data['sentences'] 是你要分析的文本数据列
corpus = data['sentences'].tolist()

# 停用词列表，需要你根据实际情况扩展
stop_words = set(['之', '乎', '者', '也', '曰', '子', ])

# 文档-词频矩阵
vectorizer = CountVectorizer(tokenizer=chinese_tokenizer, max_features=1000)  # 可调整max_features根据数据大小
dtm = vectorizer.fit_transform(corpus)

# LDA模型
lda = LatentDirichletAllocation(n_components=5, random_state=0)  # 可调整n_components为主题数
lda.fit(dtm)

# 查看主题
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

# 显示主题
no_top_words = 10
display_topics(lda, vectorizer.get_feature_names_out(), no_top_words)

# Visualize the topics
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(lda, dtm, vectorizer)


KeyboardInterrupt: 

In [37]:
chinese_tokenizer('仁者爱人，人恒爱之。')

{'input_ids': [101, 785, 5442, 4263, 782, 8024, 782, 2608, 4263, 722, 511, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [40]:
import torch
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("SIKU-BERT/sikubert")
model = AutoModel.from_pretrained("SIKU-BERT/sikubert")
# 示例文本
text = "你好，世界！"

# 使用分词器处理文本
encoded_input = tokenizer(text, return_tensors='pt')

# 使用模型获取文本的特征表示
with torch.no_grad():
    output = model(**encoded_input)

# 输出结果，例如输出第一个隐藏状态的向量
print(output.last_hidden_state)


pytorch_model.bin: 100%|██████████| 438M/438M [00:44<00:00, 9.90MB/s] 
  return self.fget.__get__(instance, owner)()


tensor([[[-0.8366, -0.4070,  0.5772,  ...,  1.3666, -0.6914, -0.6586],
         [-0.3661, -0.2313,  0.4520,  ...,  0.5482, -0.5783, -0.2224],
         [-0.3812,  0.3538,  0.3265,  ...,  0.3139, -0.4686, -0.6361],
         ...,
         [ 0.4973,  0.2635, -0.1911,  ...,  0.7610,  0.1643, -0.0059],
         [-0.2748,  0.4787,  0.7579,  ...,  0.6090, -0.5992, -1.0056],
         [-0.8366, -0.4070,  0.5772,  ...,  1.3666, -0.6914, -0.6586]]])


In [33]:
import re

def clean_text(text):
    # 中文标点和英文标点
    punctuations = r'[，。！？、；：“”‘’（）《》【】『』「」\s+\.\!\/_,$%^*()+\"\']+|[+——！，;:。？、~@#￥%……&*（）]+'
    text = re.sub(punctuations, '', text)
    return text

# 停用词列表（示例，可扩展）
stop_words = set(['的', '是', '在', '有', '和', '不'])

def remove_stopwords(text):
    return ''.join([char for char in text if char not in stop_words])

def generate_ngrams(text, n=1):
    text = clean_text(text)  # 清除标点
    text = remove_stopwords(text)  # 去除停用词
    ngrams = [text[i:i+n] for i in range(len(text)-n+1)]
    return ngrams

# 应用到DataFrame中
data['unigrams'] = data['sentences'].apply(lambda x: generate_ngrams(x, n=1))
data['bigrams'] = data['sentences'].apply(lambda x: generate_ngrams(x, n=2))

# 展示部分结果
print(data[['sentences', 'unigrams', 'bigrams']].head())


                                           sentences  \
0          子曰：“学而时习之，不亦说乎？有朋自远方来，不亦乐乎？人不知而不愠，不亦君子乎？”   
0  有子曰：“其为人也孝弟，而好犯上者，鲜矣；不好犯上，而好作乱者，未之有也。君子务本，本立而道...   
0                                     子曰：“巧言令色，鲜矣仁！”   
0                曾子曰：“吾日三省吾身：为人谋而不忠乎？与朋友交而不信乎？传不习乎？”   
0                        子曰：“道千乘之国：敬事而信，节用而爱人，使民以时。”   

                                            unigrams  \
0  [子, 曰, 学, 而, 时, 习, 之, 亦, 说, 乎, 朋, 自, 远, 方, 来, ...   
0  [子, 曰, 其, 为, 人, 也, 孝, 弟, 而, 好, 犯, 上, 者, 鲜, 矣, ...   
0                        [子, 曰, 巧, 言, 令, 色, 鲜, 矣, 仁]   
0  [曾, 子, 曰, 吾, 日, 三, 省, 吾, 身, 为, 人, 谋, 而, 忠, 乎, ...   
0  [子, 曰, 道, 千, 乘, 之, 国, 敬, 事, 而, 信, 节, 用, 而, 爱, ...   

                                             bigrams  
0  [子曰, 曰学, 学而, 而时, 时习, 习之, 之亦, 亦说, 说乎, 乎朋, 朋自, 自...  
0  [子曰, 曰其, 其为, 为人, 人也, 也孝, 孝弟, 弟而, 而好, 好犯, 犯上, 上...  
0                   [子曰, 曰巧, 巧言, 言令, 令色, 色鲜, 鲜矣, 矣仁]  
0  [曾子, 子曰, 曰吾, 吾日, 日三, 三省, 省吾, 吾身, 身为, 为人, 人谋, 谋...  
0  [子曰, 曰道, 道千, 千乘, 乘之, 之国, 国敬, 敬事, 事而, 而信, 信节, 节..

In [34]:
from collections import Counter

# 合并所有n-gram列表并计算频率
def get_ngram_frequencies(column):
    ngram_list = data[column].sum()  # 将所有行的n-gram列表合并
    ngram_counts = Counter(ngram_list)  # 计算每个n-gram的频率
    # 返回按频率降序排列的n-gram和它们的计数
    return sorted(ngram_counts.items(), key=lambda x: x[1], reverse=True)

# 获取unigram和bigram的频率
unigram_frequencies = get_ngram_frequencies('unigrams')
bigram_frequencies = get_ngram_frequencies('bigrams')

# 打印前10个最频繁的unigrams和bigrams
print("Top 10 unigrams:", unigram_frequencies[:10])
print("Top 10 bigrams:", bigram_frequencies[:10])


Top 10 unigrams: [('之', 50709), ('也', 30599), ('而', 22435), ('以', 21000), ('子', 17866), ('者', 17428), ('曰', 17223), ('其', 16821), ('人', 14860), ('于', 13304)]
Top 10 bigrams: [('子曰', 2924), ('孔子', 2578), ('天下', 2373), ('君子', 2201), ('人之', 1988), ('所以', 1933), ('以为', 1854), ('之所', 1636), ('子之', 1572), ('诸侯', 1466)]


In [35]:
unigram_frequencies

[('之', 50709),
 ('也', 30599),
 ('而', 22435),
 ('以', 21000),
 ('子', 17866),
 ('者', 17428),
 ('曰', 17223),
 ('其', 16821),
 ('人', 14860),
 ('于', 13304),
 ('为', 13089),
 ('则', 10001),
 ('天', 8324),
 ('君', 7557),
 ('无', 7555),
 ('故', 7457),
 ('所', 7288),
 ('王', 6226),
 ('夫', 6029),
 ('大', 5665),
 ('下', 5475),
 ('可', 5237),
 ('能', 5190),
 ('言', 5148),
 ('矣', 5138),
 ('何', 4957),
 ('公', 4709),
 ('行', 4668),
 ('知', 4634),
 ('乎', 4551),
 ('谓', 4496),
 ('与', 4222),
 ('道', 4084),
 ('一', 4071),
 ('三', 4039),
 ('得', 4034),
 ('此', 4024),
 ('事', 3862),
 ('国', 3828),
 ('非', 3820),
 ('民', 3799),
 ('如', 3798),
 ('见', 3689),
 ('然', 3664),
 ('生', 3603),
 ('礼', 3560),
 ('臣', 3535),
 ('后', 3480),
 ('上', 3336),
 ('必', 3164),
 ('至', 3056),
 ('使', 2973),
 ('自', 2966),
 ('中', 2897),
 ('时', 2881),
 ('明', 2862),
 ('文', 2811),
 ('孔', 2770),
 ('死', 2756),
 ('日', 2731),
 ('德', 2729),
 ('五', 2637),
 ('相', 2570),
 ('焉', 2542),
 ('义', 2536),
 ('诸', 2444),
 ('善', 2437),
 ('十', 2432),
 ('用', 2406),
 ('若', 2374),
 ('世', 2

In [42]:
from transformers import BertTokenizer

# 加载分词器
tokenizer = BertTokenizer.from_pretrained('SIKU-BERT/sikubert')

# 示例文本
text = "你好，世界！这是一个示例文本。"

# 使用分词器
tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)


Tokens: ['你', '好', '，', '世', '界', '！', '这', '是', '一', '个', '示', '例', '文', '本', '。']


In [5]:
from bertopic import BERTopic
topic_model = BERTopic(language="multilingual", calculate_probabilities=True, verbose=True)
topics, probs = topic_model.fit_transform(data['sentences'])

  from .autonotebook import tqdm as notebook_tqdm
2024-04-16 10:26:16,546 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 394/394 [03:48<00:00,  1.72it/s]
2024-04-16 10:30:08,782 - BERTopic - Embedding - Completed ✓
2024-04-16 10:30:08,783 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-04-16 10:30:26,289 - BERTopic - Dimensionality - Completed ✓
2024-04-16 10:30:26,290 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-04-16 10:30:30,053 - BERTopic - Cluster - Completed ✓
2024-04-16 10:30:30,060 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-04-16 10:30:30,808 - BERTopic - Representation - Completed ✓


In [9]:
len(topic_model.get_topics())

95

In [44]:
topic_model.get_topic_info().head(10)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,6634,-1_孔子曰_子曰_春秋_故曰,"[孔子曰, 子曰, 春秋, 故曰, 或曰, 何也, 此之谓也, 对曰, 公曰, 尚书]",[穆公问于子思曰：“立太子有常乎？”荅曰：“有之。在周公之典。”公曰：“昔文王舍适而立其次，...
1,0,868,0_对曰_王曰_公曰_孔子曰,"[对曰, 王曰, 公曰, 孔子曰, 简子曰, 何也, 文侯曰, 桓公曰, 春秋, 管仲曰]",[他日，简子出，有人当道，辟之不去，从者将刃，当道者曰：“吾有欲谒于主君。”从者以闻。简子召...
2,1,697,1____,"[, , , , , , , , , ]","[, , ]"
3,2,434,2_测曰_次八_次六_次七,"[测曰, 次八, 次六, 次七, 上九, 次五, 还于丧, 崔嵬不崩, 角解豸, 奔鹿怀鼷]","[次七，竦萃于丘冢。测曰，“竦萃丘冢”、礼不废也。, 次八，大敛大巅。测曰，“大敛”之“巅”..."
4,3,268,3_孟子曰_孟子对曰_人皆有之_吾闻西伯善养老者,"[孟子曰, 孟子对曰, 人皆有之, 吾闻西伯善养老者, 予何为不受, 盍归乎来, 未之有也,...","[孟子曰：“何以言之？”, 孟子曰：“然。”, 孟子曰：“礼也。”]"
5,4,203,4_经礼也_有世妇_有妾_有妻,"[经礼也, 有世妇, 有妾, 有妻, 春秋传, 大夫曰孺人, 士曰妇人, 昏礼经, 为之赐,...",[《春秋》有经礼，有变礼。为如安性平心者，经礼也。至有于性，虽不安，于心，虽不平，于道，无以...
6,5,182,5_孔子对曰_孔子曰_公曰_哀公曰,"[孔子对曰, 孔子曰, 公曰, 哀公曰, 哀公问于孔子曰, 子路曰, 服使然也, 而君不悟,...",[哀公问于孔子曰：“大礼何如？子之言礼，何其尊？”孔子对曰：“丘也鄙人，不足以知大礼。”公曰...
7,6,159,6_次二_初一_测曰_冲冲儿遇,"[次二, 初一, 测曰, 冲冲儿遇, 孚其肉, 赫河, 争不争, 将造邪, 家无壶, 谨于媐㐜]","[初一，丸钻钻于内隙，厉。测曰，“丸钻于内”、转丸非也。, 次二，𫔵无间。测曰，“无间”之“..."
8,7,141,7_春秋_则不得其正_其言之成理_足以欺惑愚众,"[春秋, 则不得其正, 其言之成理, 足以欺惑愚众, 然而其持之有故, 无之有也, 如此者,...",[彦曰：“人之性分气度不同，有体貌亢䟽，色厉矜庄，仪容冰栗，似若能断，而当事少决，不遂其为者...
9,8,127,8_如雨_尔雅_星𫕥如雨_何以验之,"[如雨, 尔雅, 星𫕥如雨, 何以验之, 春秋, 昏而中, 夜中, 日月之行, 下布之民, 不修]",[礼者，继天地，体阴阳，而慎主客，序尊卑、贵贱、大小之位，而差外内、远近、新故之级者也，以德...


In [11]:
topic_model.reduce_topics(data['sentences'], nr_topics=10)

2024-04-16 10:32:49,157 - BERTopic - Topic reduction - Reducing number of topics
2024-04-16 10:32:49,903 - BERTopic - Topic reduction - Reduced number of topics from 95 to 10


<bertopic._bertopic.BERTopic at 0x7fa54a744730>

In [12]:
topic_model.get_topic_info().head(10)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,6799,-1_孔子曰_子曰_春秋_对曰,"[孔子曰, 子曰, 春秋, 对曰, 故曰, 何也, 或曰, 公曰, 此之谓也, 尚书]",[樊迟问仁。子曰：“爱人。”问知。子曰：“知人。”樊迟未达。子曰：“举直错诸枉，能使枉者直。...
1,0,4521,0_测曰_孔子曰_孟子曰_子曰,"[测曰, 孔子曰, 孟子曰, 子曰, 春秋, 对曰, 公曰, 或问, 何也, 次八]","[上九，不终其德，二岁见代。测曰，“不终”之“代”、不可长也。, 次七，乘火寒泉至。测曰，“..."
2,1,937,1_散也_字子从_戟也_善也,"[散也, 字子从, 戟也, 善也, 升也, 取也, 数也, 长也, 燕伋, 狄黑]","[取荼。荼也者，以为君荐蒋也。, 跬，一举足也。倍跬，谓之歩。, 薛邦，字子从。]"
3,2,146,2_长三尺_非也_左画苍龙_两臂前后刻金,"[长三尺, 非也, 左画苍龙, 两臂前后刻金, 金鍐形如缇, 系轴头, 金燧, 金鍐方𨰿, ...",[谷之始熟曰粟，舂之于臼，簸其秕糠，蒸之于甑，爨之以火，成熟为饭，乃甘可食。可食而食之，味生...
4,3,80,3_行夏令_行春令_行秋令_行冬令,"[行夏令, 行春令, 行秋令, 行冬令, 则其国乃旱, 四鄙入保, 风从虎, 云从龙, 则其...",[然夫虫之生也，必依温湿。温湿之气常在春夏，秋冬之气寒而干燥，虫未曾生。若以虫生，罪乡部吏，...
5,4,50,4_鲁人_少孔子五十岁_名山大泽不以封_其余,"[鲁人, 少孔子五十岁, 名山大泽不以封, 其余, 三月不从政, 其余以为附庸间田, 一坐再...",[方千里者，为方百里者百。封方百里者三十国，其余，方百里者七十。又封方七十里者六十，为方百里...
6,5,36,5_太尉沛国刘矩_弘农太守河内吴匡_韩安国_城阳景王祠,"[太尉沛国刘矩, 弘农太守河内吴匡, 韩安国, 城阳景王祠, 九江太守武陵陈子威, 公车征士...","[*太尉沛国刘矩, 鼓：○□○○□□○□○○□。半：○□○□○○○□□○□○。鲁鼓。○□○○..."
7,6,13,6_九月_王狩_十月_十二月也,"[九月, 王狩, 十月, 十二月也, 十一月, 四月, 六月, 七月, 二月, 三月]","[九月：, 或说《春秋》、十二月也。, 十一月：王狩。]"
8,7,11,7_衣青衣_载白旗_服仓玉_驾仓龙,"[衣青衣, 载白旗, 服仓玉, 驾仓龙, 服白玉, 服赤玉, 衣朱衣, 乘戎路, 食麻与犬,...","[天子居青阳大庙，乘鸾路，驾仓龙，载青旗，衣青衣，服仓玉，食麦与羊，其器疏以达。, 天子居明..."
9,8,11,8_此虚言也_殆虚言也_如实论之_则政不核,"[此虚言也, 殆虚言也, 如实论之, 则政不核, 载以为是, 如真不真, 原省其实, 此虚言...","[曰：此虚言也。, 曰：此虚言也。, 此虚言也。]"


In [13]:
topic_model.get_topic(0)

[('测曰', 0.02865606933188256),
 ('孔子曰', 0.014110274406217654),
 ('孟子曰', 0.012987624795505987),
 ('子曰', 0.01121131113435812),
 ('春秋', 0.007015365708228161),
 ('对曰', 0.006602798744358283),
 ('公曰', 0.006395234385509569),
 ('或问', 0.005959937313216265),
 ('何也', 0.005644846168066878),
 ('次八', 0.005518285563469141)]

In [14]:
topic_model.get_document_info(data['sentences'])

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,子曰：“学而时习之，不亦说乎？有朋自远方来，不亦乐乎？人不知而不愠，不亦君子乎？”,0,0_测曰_孔子曰_孟子曰_子曰,"[测曰, 孔子曰, 孟子曰, 子曰, 春秋, 对曰, 公曰, 或问, 何也, 次八]","[上九，不终其德，二岁见代。测曰，“不终”之“代”、不可长也。, 次七，乘火寒泉至。测曰，“...",测曰 - 孔子曰 - 孟子曰 - 子曰 - 春秋 - 对曰 - 公曰 - 或问 - 何也 - 次八,0.641458,False
1,有子曰：“其为人也孝弟，而好犯上者，鲜矣；不好犯上，而好作乱者，未之有也。君子务本，本立而道...,-1,-1_孔子曰_子曰_春秋_对曰,"[孔子曰, 子曰, 春秋, 对曰, 故曰, 何也, 或曰, 公曰, 此之谓也, 尚书]",[樊迟问仁。子曰：“爱人。”问知。子曰：“知人。”樊迟未达。子曰：“举直错诸枉，能使枉者直。...,孔子曰 - 子曰 - 春秋 - 对曰 - 故曰 - 何也 - 或曰 - 公曰 - 此之谓也 ...,0.197736,False
2,子曰：“巧言令色，鲜矣仁！”,0,0_测曰_孔子曰_孟子曰_子曰,"[测曰, 孔子曰, 孟子曰, 子曰, 春秋, 对曰, 公曰, 或问, 何也, 次八]","[上九，不终其德，二岁见代。测曰，“不终”之“代”、不可长也。, 次七，乘火寒泉至。测曰，“...",测曰 - 孔子曰 - 孟子曰 - 子曰 - 春秋 - 对曰 - 公曰 - 或问 - 何也 - 次八,1.000000,False
3,曾子曰：“吾日三省吾身：为人谋而不忠乎？与朋友交而不信乎？传不习乎？”,-1,-1_孔子曰_子曰_春秋_对曰,"[孔子曰, 子曰, 春秋, 对曰, 故曰, 何也, 或曰, 公曰, 此之谓也, 尚书]",[樊迟问仁。子曰：“爱人。”问知。子曰：“知人。”樊迟未达。子曰：“举直错诸枉，能使枉者直。...,孔子曰 - 子曰 - 春秋 - 对曰 - 故曰 - 何也 - 或曰 - 公曰 - 此之谓也 ...,0.420131,False
4,子曰：“道千乘之国：敬事而信，节用而爱人，使民以时。”,-1,-1_孔子曰_子曰_春秋_对曰,"[孔子曰, 子曰, 春秋, 对曰, 故曰, 何也, 或曰, 公曰, 此之谓也, 尚书]",[樊迟问仁。子曰：“爱人。”问知。子曰：“知人。”樊迟未达。子曰：“举直错诸枉，能使枉者直。...,孔子曰 - 子曰 - 春秋 - 对曰 - 故曰 - 何也 - 或曰 - 公曰 - 此之谓也 ...,0.134986,False
...,...,...,...,...,...,...,...,...
12599,樊哙冠，汉将军樊哙造次所冠，以入项籍营，广七寸，前出四寸，司马殿门大护卫士服之。,-1,-1_孔子曰_子曰_春秋_对曰,"[孔子曰, 子曰, 春秋, 对曰, 故曰, 何也, 或曰, 公曰, 此之谓也, 尚书]",[樊迟问仁。子曰：“爱人。”问知。子曰：“知人。”樊迟未达。子曰：“举直错诸枉，能使枉者直。...,孔子曰 - 子曰 - 春秋 - 对曰 - 故曰 - 何也 - 或曰 - 公曰 - 此之谓也 ...,0.892590,False
12600,却敌冠、前高四寸，通长四寸，后高三寸，监门卫士服之，《礼》无文。,-1,-1_孔子曰_子曰_春秋_对曰,"[孔子曰, 子曰, 春秋, 对曰, 故曰, 何也, 或曰, 公曰, 此之谓也, 尚书]",[樊迟问仁。子曰：“爱人。”问知。子曰：“知人。”樊迟未达。子曰：“举直错诸枉，能使枉者直。...,孔子曰 - 子曰 - 春秋 - 对曰 - 故曰 - 何也 - 或曰 - 公曰 - 此之谓也 ...,0.893724,False
12601,珠冕、爵冔收、通天冠、进贤冠、长冠、缁布冠、委貌冠、皮弁、惠文冠，古者天子冠所加者，其次在汉礼。,-1,-1_孔子曰_子曰_春秋_对曰,"[孔子曰, 子曰, 春秋, 对曰, 故曰, 何也, 或曰, 公曰, 此之谓也, 尚书]",[樊迟问仁。子曰：“爱人。”问知。子曰：“知人。”樊迟未达。子曰：“举直错诸枉，能使枉者直。...,孔子曰 - 子曰 - 春秋 - 对曰 - 故曰 - 何也 - 或曰 - 公曰 - 此之谓也 ...,0.091955,False
12602,帝谥：违拂不成曰隐，靖民则法曰黄，翼善传圣曰尧，仁圣盛明曰舜，残人多垒曰桀，残义损善曰纣，慈...,0,0_测曰_孔子曰_孟子曰_子曰,"[测曰, 孔子曰, 孟子曰, 子曰, 春秋, 对曰, 公曰, 或问, 何也, 次八]","[上九，不终其德，二岁见代。测曰，“不终”之“代”、不可长也。, 次七，乘火寒泉至。测曰，“...",测曰 - 孔子曰 - 孟子曰 - 子曰 - 春秋 - 对曰 - 公曰 - 或问 - 何也 - 次八,0.929501,False


In [15]:
from bertopic.representation import KeyBERTInspired

# Fine-tune your topic representations
representation_model = KeyBERTInspired()
topic_model = BERTopic(representation_model=representation_model)


In [17]:
topic_model.fit(data['sentences'], y=data['contain_ren'])

modules.json: 100%|██████████| 349/349 [00:00<00:00, 1.48MB/s]
config_sentence_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 500kB/s]
README.md: 100%|██████████| 10.7k/10.7k [00:00<00:00, 8.24MB/s]
sentence_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 193kB/s]
config.json: 100%|██████████| 612/612 [00:00<00:00, 1.95MB/s]
model.safetensors: 100%|██████████| 90.9M/90.9M [00:06<00:00, 14.8MB/s]
tokenizer_config.json: 100%|██████████| 350/350 [00:00<00:00, 2.40MB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 2.61MB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 1.75MB/s]
special_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 283kB/s]
1_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 596kB/s]


ValueError: empty vocabulary; perhaps the documents only contain stop words

In [18]:
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer

ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
topic_model = BERTopic(ctfidf_model=ctfidf_model)


In [21]:
len(data['sentences'])

12604

In [19]:
topic_model.fit(data['sentences'])

ValueError: empty vocabulary; perhaps the documents only contain stop words