In [59]:
# !pip install python-docx

In [2]:
import math
from collections import Counter, defaultdict
from docx import Document

In [10]:
def read_docx(file_path):
    """读取DOCX文件并返回其中的文本内容"""
    doc = Document(file_path)
    text = []
    for para in doc.paragraphs:
        text.append(para.text)
    return ''.join(text)

def collect_statistics(text):
    """统计字母、标点符号和空格的频率"""
    char_count = Counter(text)
    total_chars = sum(char_count.values())
    return char_count, total_chars

In [18]:
def calculate_entropy(filtered_text, order=0):
    """计算文本的熵值，使用不同阶的马尔可夫模型"""
    
    if order == 0:
        # 0阶马尔可夫模型：每个字符独立选择
        char_count = Counter(filtered_text)
        total_chars = sum(char_count.values())
        entropy = 0
        for count in char_count.values():
            probability = count / total_chars
            entropy -= probability * math.log2(probability)
        return entropy
    else:
        # 高阶马尔可夫模型
        storage = defaultdict(Counter)
        total_ngrams = 0

        for i in range(len(filtered_text) - order):
            prefix = filtered_text[i:i + order]
            next_char = filtered_text[i + order]
            storage[prefix][next_char] += 1
            total_ngrams += 1

        entropy = 0
        for prefix, suffix_counts in storage.items():
            prefix_total = sum(suffix_counts.values())
            for count in suffix_counts.values():
                probability = count / prefix_total
                entropy -= (prefix_total / total_ngrams) * probability * math.log2(probability)

        return entropy

In [27]:
def markov_model(text, order):
    """构建马尔可夫模型并输出模型状态和转移概率"""
    filtered_text = text.lower()
    ngrams = defaultdict(Counter)

    for i in range(len(filtered_text) - order):
        prefix = filtered_text[i:i + order]
        next_char = filtered_text[i + order]
        ngrams[prefix][next_char] += 1

    total_ngrams = sum(sum(suffix_counts.values()) for suffix_counts in ngrams.values())
    alphabet = sorted(set(filtered_text))
    alphabet_size = len(alphabet)
    model_matrix = np.zeros((alphabet_size ** order, alphabet_size))

    char_to_index = {char: index for index, char in enumerate(alphabet)}

    for prefix, suffix_counts in ngrams.items():
        prefix_index = sum(char_to_index[char] * (alphabet_size ** (order - i - 1)) for i, char in enumerate(prefix))
        prefix_total = sum(suffix_counts.values())
        for char, count in suffix_counts.items():
            char_index = char_to_index[char]
            model_matrix[prefix_index, char_index] = count / prefix_total

    model = {}
    for prefix, suffix_counts in ngrams.items():
        prefix_total = sum(suffix_counts.values())
        model[prefix] = {char: count / prefix_total for char, count in suffix_counts.items()}

    return model, total_ngrams, model_matrix

In [28]:
def print_markov_model(model):
    """打印马尔可夫模型的状态和转移概率"""
    print(model)
    for prefix, transitions in model.items():
        print(f"State: {prefix}")
        for char, prob in transitions.items():
            print(f"  {char}: {prob:.4f}")

In [29]:
Text_file = 'Elon_Musk_Speech.docx'
Text = read_docx(Text_file)

Text_stats, elon_total = collect_statistics(Text)
print("Elon Musk Speech Statistics:")
print(Text_stats)

Elon Musk Speech Statistics:
Counter({' ': 1412, 'e': 697, 't': 660, 'a': 525, 'o': 500, 'n': 418, 'i': 382, 's': 349, 'h': 345, 'r': 284, 'l': 258, 'd': 214, 'c': 201, 'u': 183, 'w': 157, 'g': 149, 'y': 137, 'f': 133, 'm': 128, 'p': 103, ',': 91, '.': 91, 'b': 79, 'k': 63, '’': 55, 'I': 54, 'v': 53, 'A': 24, 'S': 22, 'B': 16, 'T': 16, 'P': 13, '-': 9, 'C': 7, '?': 6, 'M': 5, 'x': 5, '1': 4, 'W': 4, 'q': 4, '0': 4, '–': 4, 'O': 4, '—': 3, '2': 3, 'z': 3, '‘': 3, '9': 3, 'F': 3, 'Y': 3, 'X': 3, 'R': 3, 'E': 2, 'L': 2, 'N': 2, 'G': 2, 'D': 2, '“': 1, '3': 1, '5': 1, '8': 1, 'j': 1, '4': 1, ':': 1, '”': 1})


In [30]:
Text_entropy_0 = calculate_entropy(Text, 0)
Text_entropy_3 = calculate_entropy(Text, 3)
Text_entropy_5 = calculate_entropy(Text, 5)
print(f"\nText Entropy (0th order): {Text_entropy_0}")
print(f"Text Entropy (3rd order): {Text_entropy_3}")
print(f"Text Entropy (5th order): {Text_entropy_5}")


Text Entropy (0th order): 4.385606450470013
Text Entropy (3rd order): 1.188728794461539
Text Entropy (5th order): 0.4271118482058984


对于这个结果，可以认为0-1阶的Markov模型近似满足真实的英文熵值。
而由于所提供的英文文本内容有限，所以对于更高阶的Markov模型，得到的熵值不会趋于值约为1.4的极限。

In [38]:
import numpy as np
def calculate_stationary_distribution(P):
    """计算马尔可夫链的平稳分布"""
    eigvals, eigvecs = np.linalg.eig(P.T)
    stationary_index = np.argmin(np.abs(eigvals - 1.0))
    stationary_distribution = np.real(eigvecs[:, stationary_index])
    stationary_distribution /= np.sum(stationary_distribution)
    return stationary_distribution

def calculate_entropy_rate(P, pi):
    """计算马尔可夫链的熵率"""
    H = 0
    for i in range(P.shape[0]):
        for j in range(P.shape[1]):
            if P[i, j] > 0:
                H -= pi[i] * P[i, j] * np.log2(P[i, j])
    return H

In [42]:
orders = [5]
for order in orders:
    print(f"\nElon Musk Speech Markov Model (Order {order}):")
    elon_model, elon_total_ngrams, model_matrix = markov_model(Text, order)
    print_markov_model(elon_model)
    pi = calculate_stationary_distribution(model_matrix)
    entropy_rate = calculate_entropy_rate(model_matrix, pi)

    print("平稳分布:")
    print(pi)
    print("\n熵率:")
    print(entropy_rate)

    with open(f'Elon Musk Speech Markov Model (Order {order}).txt', 'w', encoding='utf-8',) as f:
        f.write(str(elon_model))
        f.close()


Elon Musk Speech Markov Model (Order 5):


MemoryError: Unable to allocate 70.6 GiB for an array with shape (205962976, 46) and data type float64

下面是喂了10亿单词的语料库

In [68]:
import os
import glob
def read_text_files(folder_path):
    english_texts = []
    # 遍历文件夹下所有的.txt文件
    for file_path in glob.glob(os.path.join(folder_path, '*.txt')):
        with open(file_path, 'r', encoding='utf-8') as file:
            # 读取文件内容
            text = file.read()
            # 提取英文内容（假设只包含字母、空格和换行符）
            english_text = ''.join(filter(lambda x: x.isalpha() or x.isspace(), text))
            english_texts.append(english_text)
    return str(english_texts)

In [69]:
english_texts = read_text_files('text0/')

In [70]:
for order in [0,3,5]:  # 计算0阶到5阶马尔可夫模型的熵值
    entropy = calculate_entropy(english_texts, order)
    print(f'Order {order} Markov Model, Entropy: {entropy}')

Order 0 Markov Model, Entropy: 4.254037364372333
Order 3 Markov Model, Entropy: 2.3822194443288214
Order 5 Markov Model, Entropy: 1.7797140240443352
