In [1]:
"""隐马尔可夫模型观测序列生成"""
import random

"""定义模型参数"""
states = ["Rainy", "Sunny"]  # 隐藏状态
observations = ["Walk", "Shop", "Clean"]  # 观测状态

"""初始化概率分布"""
start_prob = {"Rainy": 0.6, "Sunny": 0.4}  # 初始状态概率π

"""状态转移矩阵A"""
trans_mat = {
    "Rainy": {"Rainy": 0.7, "Sunny": 0.3},
    "Sunny": {"Rainy": 0.4, "Sunny": 0.6}
}

"""观测概率矩阵B"""
emit_prob = {
    "Rainy": {"Walk": 0.1, "Shop": 0.4, "Clean": 0.5},
    "Sunny": {"Walk": 0.6, "Shop": 0.3, "Clean": 0.1}
}

"""根据概率分布随机选择状态/观测值"""
def sample_from(prob_dict):
    rand = random.random()  # 生成 [0, 1) 的随机数
    cumulative_prob = 0
    for key, prob in prob_dict.items():
        cumulative_prob += prob
        if rand < cumulative_prob:
            return key
    return list(prob_dict.keys())[-1]  # 防止浮点误差

"""生成序列"""
def generate_sequence(length):
    hidden_sequence = []
    obs_sequence = []
    
    # 初始状态
    current_state = sample_from(start_prob)
    # print(current_state)
    hidden_sequence.append(current_state)
    # print(hidden_sequence)
    
    # 生成后续状态和观测
    for _ in range(length - 1):
        # 生成观测值
        obs = sample_from(emit_prob[current_state])
        # print(obs)
        obs_sequence.append(obs)
        
        # 转移到下一个状态
        current_state = sample_from(trans_mat[current_state])
        hidden_sequence.append(current_state)
    
    return hidden_sequence, obs_sequence

"""生成并输出序列"""
hidden_seq, obs_seq = generate_sequence(5)
print("状态序列:", hidden_seq)
print("观测序列:", obs_seq)

状态序列: ['Rainy', 'Sunny', 'Rainy', 'Sunny', 'Rainy']
观测序列: ['Walk', 'Walk', 'Clean', 'Walk']


In [5]:
import jieba
import jieba.analyse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

documents = [
    "人工智能是未来科技发展的核心方向之一。",
    "深度学习在计算机视觉和自然语言处理中应用广泛。",
    "气候变化对全球经济和社会结构产生深远影响。",
    "可再生能源如太阳能和风能正在取代传统能源。",
    "区块链技术为金融行业带来了去中心化的解决方案。"
]

# 分词与去停用词
def chinese_text_preprocess(texts):
    with open('../data/hit_stopwords.txt', 'r', encoding='utf-8') as f:
        stopwords = [line.rstrip('\n') for line in f]
    processed_texts = []
    for text in texts:
        words = jieba.lcut(text)
        words = [word for word in words if word not in stopwords and len(word) > 1]
        processed_texts.append(" ".join(words))  # 转为空格分隔的字符串
    return processed_texts

processed_docs = chinese_text_preprocess(documents)

# 构建词频矩阵
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(processed_docs)

# 训练LDA模型
lda = LatentDirichletAllocation(
    n_components=2,      # 主题数
    random_state=42,
    learning_method="online"
)
lda.fit(X)

# 打印每个主题的关键词
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    top_words = [feature_names[i] for i in topic.argsort()[:-6:-1]]  # 取权重最高的5个词
    print(f"主题 {topic_idx}: {', '.join(top_words)}")

主题 0: 全球, 带来, 行业, 气候变化, 金融
主题 1: 风能, 太阳能, 再生能源, 自然语言, 应用
