#词频统计实验

#LLR分数选出高频词

In [None]:
#LLR分数
import pandas as pd
from collections import Counter
import jieba.posseg as pseg  # 用于分词和 POS 标注
import re  # 用于处理正则表达式
from math import log  # 用于计算对数

# Custom stop-word set (can be extended). Tokens equal to one of these strings
# are skipped when words are counted: picture, video, link, original image,
# full text, web-page link.
CUSTOM_STOPWORDS = {"图片", "视频", "链接", "原图", "全文", "网页链接"}

# Load data: read labelled .txt files and parse them into a DataFrame.
def load_data(file_paths):
    """Read ``text<sep>label`` lines from several files into one DataFrame.

    Each non-blank line is split at its last tab or, when the line contains
    no tab, at its last space.  The right-hand part must parse as an integer
    label.  Lines that cannot be split or whose label is not an integer are
    reported on stdout and skipped.

    Args:
        file_paths: Iterable of paths to UTF-8 encoded text files.

    Returns:
        A ``pandas.DataFrame`` with the columns ``text`` and ``label``.  Both
        columns exist even when no valid line was found, so callers can
        index ``df['label']`` without a ``KeyError``.
    """
    records = []
    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line:
                    continue
                separator = '\t' if '\t' in line else ' '
                try:
                    text, label = line.rsplit(separator, 1)
                    records.append({'text': text.strip(), 'label': int(label)})
                except ValueError:
                    print(f"Skipping invalid line: {line}")
    # Explicit columns keep the frame usable when ``records`` is empty.
    return pd.DataFrame(records, columns=['text', 'label'])


# Text cleaning: drop "#..." / "@..." tokens and "xxx的微博视频" fragments.
def clean_text(text):
    """Remove hashtag, mention and "…的微博视频" fragments from ``text``.

    Non-string input yields an empty string.  Three substitutions run in
    order, each deleting every match:

    * ``#`` followed by a run of non-whitespace characters,
    * ``@`` followed by a run of non-whitespace characters,
    * a run of non-whitespace characters ending in ``的微博视频``.
    """
    if not isinstance(text, str):
        return ""

    for pattern in (r"#\S+", r"@\S+", r"\S+的微博视频"):
        text = re.sub(pattern, "", text)
    return text


# Per-group statistics: count words by part-of-speech category.
def compute_pos_statistics(df, group_column, text_column):
    """Count words per POS category for every group found in ``df``.

    For each distinct value of ``df[group_column]`` the texts of that group
    are cleaned with ``clean_text`` and tagged with ``pseg.cut``.  A token is
    tallied (once per occurrence) when it is non-blank, is not in
    ``CUSTOM_STOPWORDS`` and its tag belongs to one of the target categories.

    Args:
        df: DataFrame holding the texts and their group labels.
        group_column: Name of the column with the group label.
        text_column: Name of the column with the text.

    Returns:
        Dict mapping each group value to
        ``{'pos_counts': {category: Counter}, 'total_texts': int}`` where
        ``total_texts`` is the number of rows belonging to the group.
    """
    # Category name -> POS tags that belong to it.
    target_pos_tags = {
        'Noun': ['n', 'nr', 'ns', 'nt', 'nz'],   # nouns and sub-classes (person names, place names, ...)
        'Verb': ['v', 'vd', 'vn'],               # verbs and sub-classes (e.g. verbal nouns)
        'Pronouns': ['r'],                       # pronouns, e.g. “我”, “你”
        'Adjectives': ['a', 'ad', 'an'],         # adjectives and sub-classes
        'Adverbs': ['d'],                        # adverbs, e.g. “很”, “非常”
        'Prepositions': ['p'],                   # prepositions, e.g. “在”, “从”
        'Conjunctions': ['c'],                   # conjunctions, e.g. “和”, “但是”
    }
    # The tag lists above are disjoint, so a reverse lookup replaces the scan
    # over all categories for every token.
    category_of_tag = {
        tag: category
        for category, tags in target_pos_tags.items()
        for tag in tags
    }

    rows_per_group = df[group_column].value_counts().to_dict()

    pos_statistics = {}
    for group in df[group_column].unique():
        pos_counts = {category: Counter() for category in target_pos_tags}

        for text in df[df[group_column] == group][text_column]:
            if not isinstance(text, str):
                continue
            for word, tag in pseg.cut(clean_text(text)):
                if word in CUSTOM_STOPWORDS or not word.strip():
                    continue
                category = category_of_tag.get(tag)
                if category is not None:
                    pos_counts[category][word] += 1

        pos_statistics[group] = {
            'pos_counts': pos_counts,
            'total_texts': rows_per_group[group],
        }

    return pos_statistics


# Score the nouns and verbs of one group against the other and keep the best.
def _llr_top_words(own_counts, other_counts, own_total, other_total, limit=100):
    """Return up to ``limit`` ``(word, score, count)`` tuples, best score first."""
    scored = []
    for word, n11 in own_counts.items():
        n01 = other_counts[word]
        n10 = own_total - n11
        n00 = other_total - n01
        # The ratio is only defined when all four cells are positive.
        if n11 > 0 and n01 > 0 and n10 > 0 and n00 > 0:
            scored.append((word, log((n11 * n00) / (n10 * n01)), n11))
    scored.sort(key=lambda item: item[1], reverse=True)
    return scored[:limit]


# Rank the nouns and verbs of both groups and return the two rankings.
def compute_llr(pos_statistics):
    """Rank the nouns and verbs of two groups by an association score.

    For every word of a group the score is
    ``log((N11 * N00) / (N10 * N01))`` with

    * ``N11``: count of the word in this group,
    * ``N01``: count of the word in the other group,
    * ``N10``: ``total_texts`` of this group minus ``N11``,
    * ``N00``: ``total_texts`` of the other group minus ``N01``.

    The formula is a log odds ratio, even though the surrounding code calls
    it LLR.  Words for which any of the four cells is not positive (for
    example words missing from the other group) are skipped.

    NOTE(review): the counts are whatever the caller stored in
    ``pos_counts``.  If they are token occurrences rather than per-text
    counts, ``N10`` can become non-positive for frequent words, which are
    then dropped.  Confirm this is intended.

    Args:
        pos_statistics: Dict with exactly two groups; each value provides
            ``'pos_counts'`` (category -> ``Counter``) and ``'total_texts'``.

    Returns:
        ``(results_a, results_b)`` where ``results_a`` belongs to the smaller
        group key and ``results_b`` to the larger one (label 0 before
        label 1), independent of the insertion order of ``pos_statistics``.
        Each result maps ``'Noun'`` and ``'Verb'`` to a list of at most 100
        ``(word, score, count)`` tuples, highest score first.

    Raises:
        ValueError: If ``pos_statistics`` does not hold exactly two groups.
    """
    if len(pos_statistics) != 2:
        raise ValueError("This function expects exactly two groups to compute LLR.")

    # Sorting pins which group comes first; dict order depends on which label
    # happens to appear first in the input data.
    first_key, second_key = sorted(pos_statistics)
    group_0_stats = pos_statistics[first_key]
    group_1_stats = pos_statistics[second_key]
    total_texts_0 = group_0_stats['total_texts']
    total_texts_1 = group_1_stats['total_texts']

    llr_results_group_0 = {}
    llr_results_group_1 = {}
    for pos_category in ['Noun', 'Verb']:
        counter_0 = group_0_stats['pos_counts'][pos_category]
        counter_1 = group_1_stats['pos_counts'][pos_category]
        llr_results_group_0[pos_category] = _llr_top_words(
            counter_0, counter_1, total_texts_0, total_texts_1)
        llr_results_group_1[pos_category] = _llr_top_words(
            counter_1, counter_0, total_texts_1, total_texts_0)

    return llr_results_group_0, llr_results_group_1


# Entry point: load the data, score the words and print both rankings.
def main():
    """Load the labelled texts, compute the word scores and print them.

    Prints the head of the loaded data, then the ranked nouns and verbs
    returned by ``compute_llr``: first result under the "Suicide Risk"
    heading, second result under the "No Suicide Risk" heading.
    """
    file_paths = ['/kaggle/input/cipintongji/dev.txt', '/kaggle/input/cipintongji/train.txt']  # replace with the actual paths

    df = load_data(file_paths)

    print("\nLoaded Data:")
    print(df.head())

    pos_statistics = compute_pos_statistics(df, group_column='label', text_column='text')

    print("\nComputing LLR Scores...")
    ranking_first, ranking_second = compute_llr(pos_statistics)

    sections = (
        ("\nTop Words by LLR (Group: Suicide Risk):", ranking_first),
        ("\nTop Words by LLR (Group: No Suicide Risk):", ranking_second),
    )
    for heading, ranking in sections:
        print(heading)
        for pos_category, words_info in ranking.items():
            print(f"\nTop {pos_category}:")
            for word, score, freq in words_info:
                print(f"Word: {word}, LLR Score: {score:.4f}, Frequency: {freq}")


# Run the analysis only when executed as a script.
if __name__ == '__main__':
    main()

#词频统计

In [1]:
import pandas as pd
from collections import Counter
import jieba.posseg as pseg  # 用于分词和 POS 标注
import re  # 用于处理正则表达式
from prettytable import PrettyTable  # 用于绘制表格

# Custom stop-word set (can be extended). Tokens equal to one of these strings
# are skipped when words are counted: picture, video, link, original image,
# full text, web-page link.
CUSTOM_STOPWORDS = {"图片", "视频", "链接", "原图", "全文", "网页链接"}

# Load data: read labelled .txt files and parse them into a DataFrame.
def load_data(file_paths):
    """Read ``text<sep>label`` lines from several files into one DataFrame.

    Each non-blank line is split at its last tab or, when the line contains
    no tab, at its last space.  The right-hand part must parse as an integer
    label.  Lines that cannot be split or whose label is not an integer are
    reported on stdout and skipped.

    Args:
        file_paths: Iterable of paths to UTF-8 encoded text files.

    Returns:
        A ``pandas.DataFrame`` with the columns ``text`` and ``label``.  Both
        columns exist even when no valid line was found, so callers can
        index ``df['label']`` without a ``KeyError``.
    """
    records = []
    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line:
                    continue
                separator = '\t' if '\t' in line else ' '
                try:
                    text, label = line.rsplit(separator, 1)
                    records.append({'text': text.strip(), 'label': int(label)})
                except ValueError:
                    print(f"Skipping invalid line: {line}")
    # Explicit columns keep the frame usable when ``records`` is empty.
    return pd.DataFrame(records, columns=['text', 'label'])


# Text cleaning: drop "#..." / "@..." tokens and "xxx的微博视频" fragments.
def clean_text(text):
    """Remove hashtag, mention and "…的微博视频" fragments from ``text``.

    Returns ``""`` for non-string input.  Otherwise every match of the
    following patterns is deleted, in this order: ``#`` plus non-whitespace
    characters, ``@`` plus non-whitespace characters, and a non-whitespace
    run that ends in ``的微博视频``.
    """
    if not isinstance(text, str):
        return ""

    removal_patterns = (r"#\S+", r"@\S+", r"\S+的微博视频")
    cleaned = text
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned


# Per-group statistics: count words by part-of-speech category and compute shares.
def compute_pos_statistics(df, group_column, text_column):
    """Count words per POS category for every group and derive percentages.

    For each distinct value of ``df[group_column]`` the texts of that group
    are cleaned with ``clean_text`` and tagged with ``pseg.cut``.  A token is
    tallied (once per occurrence) when it is non-blank, is not in
    ``CUSTOM_STOPWORDS`` and its tag belongs to one of the target categories.

    Args:
        df: DataFrame holding the texts and their group labels.
        group_column: Name of the column with the group label.
        text_column: Name of the column with the text.

    Returns:
        Dict mapping each group value to a dict with

        * ``'pos_counts'``: category -> ``Counter`` of words,
        * ``'total_count'``: sum of all counts over all categories,
        * ``'percentage'``: category -> {word: count / total_count * 100}.
    """
    # Category name -> POS tags that belong to it.
    target_pos_tags = {
        'Noun': ['n', 'nr', 'ns', 'nt', 'nz'],   # nouns and sub-classes (person names, place names, ...)
        'Verb': ['v', 'vd', 'vn'],               # verbs and sub-classes (e.g. verbal nouns)
        'Pronouns': ['r'],                       # pronouns, e.g. “我”, “你”
        'Adjectives': ['a', 'ad', 'an'],         # adjectives and sub-classes
        'Adverbs': ['d'],                        # adverbs, e.g. “很”, “非常”
        'Prepositions': ['p'],                   # prepositions, e.g. “在”, “从”
        'Conjunctions': ['c'],                   # conjunctions, e.g. “和”, “但是”
    }
    # The tag lists above are disjoint, so each tag maps to one category.
    category_of_tag = {
        tag: category
        for category, tags in target_pos_tags.items()
        for tag in tags
    }

    pos_statistics = {}
    for group in df[group_column].unique():
        pos_counts = {category: Counter() for category in target_pos_tags}

        for text in df[df[group_column] == group][text_column]:
            if not isinstance(text, str):
                continue
            for word, tag in pseg.cut(clean_text(text)):
                if word in CUSTOM_STOPWORDS or not word.strip():
                    continue
                category = category_of_tag.get(tag)
                if category is not None:
                    pos_counts[category][word] += 1

        total_count = sum(sum(counter.values()) for counter in pos_counts.values())

        # Shares are relative to the total over ALL categories of the group.
        percentage = {}
        for category, counter in pos_counts.items():
            percentage[category] = {
                word: (count / total_count * 100) for word, count in counter.items()
            }

        pos_statistics[group] = {
            'pos_counts': pos_counts,
            'total_count': total_count,
            'percentage': percentage,
        }

    return pos_statistics


# Build a table with the total frequency and share of each POS category per group.
def generate_table(pos_statistics):
    """Return a ``PrettyTable`` summarising every POS category of every group.

    One row is added per (group, category) with the category's summed
    frequency and its share of the group's ``total_count`` in percent.
    Group ``0`` is labelled "Suicide Risk", every other group
    "No Suicide Risk".

    Args:
        pos_statistics: Dict mapping a group to a dict providing
            ``'total_count'`` and ``'pos_counts'`` (category -> ``Counter``).

    Returns:
        The populated ``PrettyTable``.
    """
    table = PrettyTable()
    table.field_names = ["POS Category", "Group", "Frequency", "Percentage (%)"]

    for group, stats in pos_statistics.items():
        total_count = stats['total_count']
        group_label = "Suicide Risk" if group == 0 else "No Suicide Risk"

        for pos_category, counter in stats['pos_counts'].items():
            category_total = sum(counter.values())
            # A group without any counted word has total_count == 0; report
            # 0% instead of raising ZeroDivisionError.
            share = category_total / total_count * 100 if total_count else 0.0
            table.add_row([pos_category, group_label, category_total, f"{share:.2f}"])

    return table


# Collect the top-N most frequent words (with their share) per POS category.
def get_top_n_words(pos_statistics, top_n=200):
    """Return the ``top_n`` most common words per group and POS category.

    Args:
        pos_statistics: Dict mapping a group to a dict providing
            ``'pos_counts'`` (category -> ``Counter``) and ``'percentage'``
            (category -> {word: percentage}).
        top_n: Maximum number of words kept per category.

    Returns:
        ``{group: {category: [(word, count, percentage), ...]}}`` with the
        words ordered as returned by ``Counter.most_common``.
    """
    result = {}
    for group, stats in pos_statistics.items():
        shares = stats['percentage']
        result[group] = {
            pos_category: [
                (word, count, shares[pos_category][word])
                for word, count in counter.most_common(top_n)
            ]
            for pos_category, counter in stats['pos_counts'].items()
        }
    return result


# Entry point: load the data, then print the POS table and the top words.
def main():
    """Load the labelled texts and print the word-frequency statistics.

    Prints the head of the loaded data, the per-group POS table built by
    ``generate_table`` and the most frequent words per group and category
    returned by ``get_top_n_words``.

    The previous version called ``compute_llr`` (not defined in this cell)
    and unpacked four values per entry; it now uses the helpers this cell
    actually defines.
    """
    # Input files (replace with the actual paths).
    file_paths = ['/kaggle/input/cipintongji/dev.txt', '/kaggle/input/cipintongji/train.txt']

    df = load_data(file_paths)

    print("\nLoaded Data:")
    print(df.head())

    # Per-group POS counts, totals and percentages.
    pos_statistics = compute_pos_statistics(df, group_column='label', text_column='text')

    print("\nPOS Statistics Table:")
    print(generate_table(pos_statistics))

    print("\nTop Words by Group:")
    top_words_by_group = get_top_n_words(pos_statistics)

    for group, words_info in top_words_by_group.items():
        print(f"\nGroup {'Suicide Risk' if group == 0 else 'No Suicide Risk'}:")

        for pos_category, words_list in words_info.items():
            print(f"\nTop {pos_category}:")
            for word, count, percentage in words_list:
                print(f"Word: {word}, Count: {count}, Percentage: {percentage:.2f}%")


# Run the analysis only when executed as a script (guard must sit at column 0).
if __name__ == '__main__':
    main()

# 人称统计 (person / pronoun statistics)

In [None]:
#统计第一人称 第二人称 第三人称
import pandas as pd
from collections import Counter
import jieba.posseg as pseg  # 用于分词和 POS 标注
import re  # 用于处理正则表达式
from prettytable import PrettyTable  # 用于绘制表格

# Custom stop-word set (can be extended). Tokens equal to one of these strings
# are skipped when words are counted: picture, video, link, original image,
# full text, web-page link.
CUSTOM_STOPWORDS = {"图片", "视频", "链接", "原图", "全文", "网页链接"}

# Pronoun categories and the words counted under each of them.
# Note: 'Third Person' also lists kinship terms (爸爸, 妈妈, 哥哥, 姐姐, 叔叔, 大伯).
PRONOUN_CATEGORIES = {
    'First Person Singular': {"我", "自己","俺","咱"},
    'First Person Plural': {"我们", "咱们"},
    'Second Person': {"你", "你们","您","您们"},
    'Third Person': {"他", "她", "他们", "她们", "爸爸" ,"妈妈","哥哥","姐姐","叔叔","大伯"}
}

# Load data: read labelled .txt files and parse them into a DataFrame.
def load_data(file_paths):
    """Read ``text<sep>label`` lines from several files into one DataFrame.

    Each non-blank line is split at its last tab or, when the line contains
    no tab, at its last space.  The right-hand part must parse as an integer
    label.  Lines that cannot be split or whose label is not an integer are
    reported on stdout and skipped.

    Args:
        file_paths: Iterable of paths to UTF-8 encoded text files.

    Returns:
        A ``pandas.DataFrame`` with the columns ``text`` and ``label``.  Both
        columns exist even when no valid line was found, so callers can
        index ``df['label']`` without a ``KeyError``.
    """
    records = []
    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line:
                    continue
                separator = '\t' if '\t' in line else ' '
                try:
                    text, label = line.rsplit(separator, 1)
                    records.append({'text': text.strip(), 'label': int(label)})
                except ValueError:
                    print(f"Skipping invalid line: {line}")
    # Explicit columns keep the frame usable when ``records`` is empty.
    return pd.DataFrame(records, columns=['text', 'label'])


# Text cleaning: drop "#..." / "@..." tokens and "xxx的微博视频" fragments.
def clean_text(text):
    """Remove hashtag, mention and "…的微博视频" fragments from ``text``.

    Non-string input yields an empty string.  Three substitutions run in
    order, each deleting every match: ``#`` plus non-whitespace characters,
    ``@`` plus non-whitespace characters, and a non-whitespace run ending in
    ``的微博视频``.
    """
    if not isinstance(text, str):
        return ""

    for pattern in (r"#\S+", r"@\S+", r"\S+的微博视频"):
        text = re.sub(pattern, "", text)
    return text


# Per-group statistics: count the pronouns of every category and compute shares.
def compute_pronoun_statistics(df, group_column, text_column):
    """Count the words of ``PRONOUN_CATEGORIES`` for every group in ``df``.

    For each distinct value of ``df[group_column]`` the texts of that group
    are cleaned with ``clean_text`` and tagged with ``pseg.cut``.  A token is
    counted only when it is non-blank, is not in ``CUSTOM_STOPWORDS``, carries
    the tag ``'r'`` and appears in one of the category word sets.

    NOTE(review): because of the ``tag == 'r'`` condition, listed words are
    only counted when the tagger labels them ``'r'``.  Verify that this also
    holds for the kinship terms listed under 'Third Person'.

    Args:
        df: DataFrame holding the texts and their group labels.
        group_column: Name of the column with the group label.
        text_column: Name of the column with the text.

    Returns:
        Dict mapping each group value to a dict with

        * ``'pronoun_counts'``: category -> ``Counter`` of words,
        * ``'total_count'``: sum of all counts over all categories,
        * ``'percentage'``: category -> {word: count / total_count * 100}.
    """
    pronoun_statistics = {}

    for group in df[group_column].unique():
        texts = df[df[group_column] == group][text_column]
        pronoun_counts = {category: Counter() for category in PRONOUN_CATEGORIES}

        for text in texts:
            if not isinstance(text, str):
                continue
            for word, tag in pseg.cut(clean_text(text)):
                # Keep only non-blank, non-stop-word tokens tagged as pronoun.
                if word in CUSTOM_STOPWORDS or not word.strip() or tag != 'r':
                    continue
                for category, pronouns in PRONOUN_CATEGORIES.items():
                    if word in pronouns:
                        pronoun_counts[category][word] += 1

        total_count = sum(sum(counter.values()) for counter in pronoun_counts.values())

        percentage = {}
        for category, counter in pronoun_counts.items():
            percentage[category] = {
                word: (count / total_count * 100) for word, count in counter.items()
            }

        pronoun_statistics[group] = {
            'pronoun_counts': pronoun_counts,
            'total_count': total_count,
            'percentage': percentage,
        }

    return pronoun_statistics


# Build a table with the total frequency and share of each pronoun category per group.
def generate_table(pronoun_statistics):
    """Return a ``PrettyTable`` summarising every pronoun category per group.

    One row is added per (group, category) with the category's summed
    frequency and its share of the group's ``total_count`` in percent.
    Group ``0`` is labelled "Suicide Risk", every other group
    "No Suicide Risk".

    Args:
        pronoun_statistics: Dict mapping a group to a dict providing
            ``'total_count'`` and ``'pronoun_counts'``
            (category -> ``Counter``).

    Returns:
        The populated ``PrettyTable``.
    """
    table = PrettyTable()
    table.field_names = ["Pronoun Category", "Group", "Frequency", "Percentage (%)"]

    for group, stats in pronoun_statistics.items():
        total_count = stats['total_count']
        group_label = "Suicide Risk" if group == 0 else "No Suicide Risk"

        for category, counter in stats['pronoun_counts'].items():
            category_total = sum(counter.values())
            # A group in which no listed pronoun occurred has total_count == 0;
            # report 0% instead of raising ZeroDivisionError.
            share = category_total / total_count * 100 if total_count else 0.0
            table.add_row([category, group_label, category_total, f"{share:.2f}"])

    return table


# Collect the top-N most frequent words (with their share) per pronoun category.
def get_top_n_pronouns(pronoun_statistics, top_n=20):
    """Return the ``top_n`` most common words per group and pronoun category.

    Args:
        pronoun_statistics: Dict mapping a group to a dict providing
            ``'pronoun_counts'`` (category -> ``Counter``) and
            ``'percentage'`` (category -> {word: percentage}).
        top_n: Maximum number of words kept per category.

    Returns:
        ``{group: {category: [(word, count, percentage), ...]}}`` with the
        words ordered as returned by ``Counter.most_common``.
    """
    result = {}
    for group, stats in pronoun_statistics.items():
        shares = stats['percentage']
        result[group] = {
            category: [
                (word, count, shares[category][word])
                for word, count in counter.most_common(top_n)
            ]
            for category, counter in stats['pronoun_counts'].items()
        }
    return result


# Entry point: load the data, then print the pronoun table and the top pronouns.
def main():
    """Load the labelled texts and print the pronoun statistics.

    Prints the head of the loaded data, the per-group table built by
    ``generate_table`` and the most frequent pronouns per group and category
    returned by ``get_top_n_pronouns``.
    """
    file_paths = ['/kaggle/input/cipintongji/dev.txt', '/kaggle/input/cipintongji/train.txt']  # replace with the actual paths

    # Each line of the input holds a piece of text followed by its 0/1 label.
    df = load_data(file_paths)

    print("\nLoaded Data:")
    print(df.head())

    pronoun_statistics = compute_pronoun_statistics(df, group_column='label', text_column='text')

    print("\nPronoun Statistics Table:")
    print(generate_table(pronoun_statistics))

    print("\nTop Pronouns by Group:")
    for group, words_info in get_top_n_pronouns(pronoun_statistics).items():
        group_label = 'Suicide Risk' if group == 0 else 'No Suicide Risk'
        print(f"\nGroup {group_label}:")

        for category, words_list in words_info.items():
            print(f"\nTop {category}:")
            for word, count, percentage in words_list:
                print(f"Word: {word}, Count: {count}, Percentage: {percentage:.2f}%")


# Run the analysis only when executed as a script.
if __name__ == '__main__':
    main()

In [2]:
import matplotlib.pyplot as plt

# Category labels for the two panels (control and depression use the same words).
nouns_control = [
    'depression', 'friend', 'thing', 'time', 'get',
    'boyfriend', 'relationship', 'day', 'people', 'anxiety',
]
nouns_depression = [
    'depression', 'friend', 'thing', 'time', 'get',
    'boyfriend', 'relationship', 'day', 'people', 'anxiety',
]
verbs_control = ['be', 'feel', 'have', 'want', 'go', 'do', 'talk', 'know', 'thank', 'love']
verbs_depression = ['be', 'feel', 'have', 'want', 'go', 'do', 'talk', 'know', 'thank', 'love']

# Hypothetical (made-up) frequency data for the bars.
noun_freq_control = [4000, 3500, 3000, 2500, 2000, 1800, 1600, 1400, 1200, 1000]
noun_freq_depression = [3800, 3300, 2800, 2300, 1800, 1600, 1400, 1200, 1000, 900]
verb_freq_control = [3000, 2800, 2600, 2400, 2200, 2000, 1800, 1600, 1400, 1200]
verb_freq_depression = [2900, 2700, 2500, 2300, 2100, 1900, 1700, 1500, 1300, 1100]

# One figure with two side-by-side axes: nouns on the left, verbs on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

# Each panel: (axis, title, control series, depression series).
panels = (
    (ax1, 'Keyness Nouns',
     (nouns_control, noun_freq_control), (nouns_depression, noun_freq_depression)),
    (ax2, 'Keyness Verbs',
     (verbs_control, verb_freq_control), (verbs_depression, verb_freq_depression)),
)
for axis, title, control_series, depression_series in panels:
    # Both series use the same labels; the depression bars are drawn after
    # the control bars.
    axis.barh(*control_series, color='green', label='Control')
    axis.barh(*depression_series, color='blue', label='Depression')
    axis.set_title(title)
    axis.set_xlabel('Frequency')
    axis.legend()

# Show the figure.
plt.tight_layout()
plt.show()

# Recorded output of the cell above: KeyError: 'PolyCollection:kwdoc'

In [None]:
#IG
import pandas as pd
import re
from math import log2

# Load data: read labelled .txt files and parse them into a DataFrame.
def load_data(file_paths):
    """Read ``text<sep>label`` lines from several files into one DataFrame.

    Each non-blank line is split at its last tab or, when the line contains
    no tab, at its last space.  The right-hand part must parse as an integer
    label.  Lines that cannot be split or whose label is not an integer are
    reported on stdout and skipped.

    Args:
        file_paths: Iterable of paths to UTF-8 encoded text files.

    Returns:
        A ``pandas.DataFrame`` with the columns ``text`` and ``label``.  Both
        columns exist even when no valid line was found, so callers can
        index ``df['label']`` without a ``KeyError``.
    """
    records = []
    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line:
                    continue
                separator = '\t' if '\t' in line else ' '
                try:
                    text, label = line.rsplit(separator, 1)
                    records.append({'text': text.strip(), 'label': int(label)})
                except ValueError:
                    print(f"Skipping invalid line: {line}")
    # Explicit columns keep the frame usable when ``records`` is empty.
    return pd.DataFrame(records, columns=['text', 'label'])

# Text cleaning: drop "#..." and "@..." tokens.
def clean_text(text):
    """Remove hashtag and mention fragments from ``text``.

    Non-string input yields an empty string.  Otherwise every ``#`` or ``@``
    followed by a run of non-whitespace characters is deleted (``#`` matches
    first, then ``@``).
    """
    if not isinstance(text, str):
        return ""
    for pattern in (r"#\S+", r"@\S+"):
        text = re.sub(pattern, "", text)
    return text

# Load the high-frequency words from a "Word: xxx, ..." listing.
def load_words(word_file):
    """Extract the words from lines of the form ``Word: <word>, ...``.

    Args:
        word_file: Path to a UTF-8 text file.

    Returns:
        List with the text captured after ``Word:`` (up to the comma) for
        every line that matches, in file order.  Lines without a match are
        ignored.
    """
    word_pattern = re.compile(r'Word:\s*(\S+),')
    with open(word_file, 'r', encoding='utf-8') as f:
        found = (word_pattern.search(line) for line in f)
        return [match.group(1) for match in found if match]

# Entropy (base 2) of a probability distribution.
def entropy(probabilities):
    """Return ``-sum(p * log2(p))`` over the strictly positive entries.

    Entries that are zero or negative are left out of the sum.
    """
    terms = [p * log2(p) for p in probabilities if p > 0]
    return -sum(terms)

# Information gain of a word with respect to the two groups.
def compute_information_gain(word, group_0_texts, group_1_texts, total_texts):
    """Compute the information gain of ``word`` over the two text groups.

    A text "contains" the word when ``word in text`` is true (substring
    test).  The gain is the entropy of the group distribution minus the
    weighted entropies of the group distribution among the texts that
    contain the word and among those that do not.

    Args:
        word: The word to evaluate.
        group_0_texts: Texts of the first group (suicide group).
        group_1_texts: Texts of the second group (non-suicide group).
        total_texts: Number used as the denominator for all probabilities.

    Returns:
        ``(information_gain, n_group_0, n_group_1)`` where the last two are
        the numbers of texts containing the word in each group.
    """
    size_0, size_1 = len(group_0_texts), len(group_1_texts)
    with_word_0 = sum(1 for doc in group_0_texts if word in doc)
    with_word_1 = sum(1 for doc in group_1_texts if word in doc)

    # H(T): entropy of the prior group distribution.
    class_entropy = entropy([size_0 / total_texts, size_1 / total_texts])

    # Weighted conditional entropies: first the texts containing the word,
    # then the texts not containing it.
    weighted_entropies = []
    for in_group_0, in_group_1 in (
        (with_word_0, with_word_1),
        (size_0 - with_word_0, size_1 - with_word_1),
    ):
        subset_size = in_group_0 + in_group_1
        subset_weight = subset_size / total_texts
        weighted = 0
        if subset_weight > 0:
            weighted += subset_weight * entropy(
                [in_group_0 / subset_size, in_group_1 / subset_size])
        weighted_entropies.append(weighted)

    information_gain = class_entropy - (weighted_entropies[0] + weighted_entropies[1])

    return information_gain, with_word_0, with_word_1

# Entry point: load the data, compute the information gain per word and save it.
def main():
    """Compute the information gain of every listed word and write a CSV.

    Loads and merges the train and dev files, cleans the texts, splits them
    by label, evaluates every word from the word file with
    ``compute_information_gain`` and prints one line per word.  The results
    are written to ``word_information_gain4.csv``; the final message now
    reports that same file name (it used to name a different file).
    """
    # Input and output paths.
    train_file_path = '/kaggle/input/shyan3-4/train.txt'
    dev_file_path = '/kaggle/input/shyan3-4/dev.txt'
    word_file_path = '/kaggle/input/high-freq-word/1-LLR-V.txt'
    # Single source of truth for the output name: used for writing AND for
    # the confirmation message.
    output_file_path = 'word_information_gain4.csv'

    # Load and merge the train and dev sets, then clean the texts.
    df = load_data([train_file_path, dev_file_path])
    df['text'] = df['text'].apply(clean_text)

    # Split the texts by label.
    group_0_texts = df[df['label'] == 0]['text'].tolist()  # suicide group
    group_1_texts = df[df['label'] == 1]['text'].tolist()  # non-suicide group

    total_texts = len(df)

    words = load_words(word_file_path)

    results = []

    print("\nCalculating Information Gain for each word...\n")

    for word in words:
        ig_value, count_group_0, count_group_1 = compute_information_gain(
            word, group_0_texts, group_1_texts, total_texts)

        # Share of the containing texts that fall into each group, in percent.
        total_count = count_group_0 + count_group_1
        if total_count > 0:
            percentage_group_0 = (count_group_0 / total_count) * 100
            percentage_group_1 = (count_group_1 / total_count) * 100
        else:
            percentage_group_0 = percentage_group_1 = 0

        results.append({
            'word': word,
            'information_gain': ig_value,
            'percentage_suicide': f"{percentage_group_0:.2f}%",
            'percentage_non_suicide': f"{percentage_group_1:.2f}%"
        })

        print(f"Word: {word}, IG: {ig_value:.4f}, Suicide: {percentage_group_0:.2f}%, Non-Suicide: {percentage_group_1:.2f}%")

    results_df = pd.DataFrame(results)
    results_df.to_csv(output_file_path, index=False, encoding='utf-8')

    print(f"\nResults saved to '{output_file_path}'.")


# Run the analysis only when executed as a script.
if __name__ == '__main__':
    main()

In [None]:
import pandas as pd
from collections import Counter
import jieba.posseg as pseg  # 用于分词和 POS 标注
import re  # 用于处理正则表达式

# Custom stop-word set (can be extended). Tokens equal to one of these strings
# are skipped when words are counted: picture, video, link, original image,
# full text, web-page link.
CUSTOM_STOPWORDS = {"图片", "视频", "链接", "原图", "全文", "网页链接"}

# Load data: read labelled .txt files and parse them into a DataFrame.
def load_data(file_paths):
    """Read ``text<sep>label`` lines from several files into one DataFrame.

    Each non-blank line is split at its last tab or, when the line contains
    no tab, at its last space.  The right-hand part must parse as an integer
    label.  Lines that cannot be split or whose label is not an integer are
    reported on stdout and skipped.

    Args:
        file_paths: Iterable of paths to UTF-8 encoded text files.

    Returns:
        A ``pandas.DataFrame`` with the columns ``text`` and ``label``.  Both
        columns exist even when no valid line was found, so callers can
        index ``df['label']`` without a ``KeyError``.
    """
    records = []
    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line:
                    continue
                separator = '\t' if '\t' in line else ' '
                try:
                    text, label = line.rsplit(separator, 1)
                    records.append({'text': text.strip(), 'label': int(label)})
                except ValueError:
                    print(f"Skipping invalid line: {line}")
    # Explicit columns keep the frame usable when ``records`` is empty.
    return pd.DataFrame(records, columns=['text', 'label'])


# Text cleaning: drop "#..." / "@..." tokens and "xxx的微博视频" fragments.
def clean_text(text):
    """Remove hashtag, mention and "…的微博视频" fragments from ``text``.

    Returns ``""`` for non-string input.  Otherwise every match of the
    following patterns is deleted, in this order: ``#`` plus non-whitespace
    characters, ``@`` plus non-whitespace characters, and a non-whitespace
    run that ends in ``的微博视频``.
    """
    if not isinstance(text, str):
        return ""

    removal_patterns = (r"#\S+", r"@\S+", r"\S+的微博视频")
    cleaned = text
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned


# Per-group statistics: count words by part-of-speech category and compute shares.
def compute_pos_statistics(df, group_column, text_column):
    """Count words per POS category for every group and derive percentages.

    For each distinct value of ``df[group_column]`` the texts of that group
    are cleaned with ``clean_text`` and tagged with ``pseg.cut``.  A token is
    tallied (once per occurrence) when it is non-blank, is not in
    ``CUSTOM_STOPWORDS`` and its tag belongs to one of the target categories.

    Args:
        df: DataFrame holding the texts and their group labels.
        group_column: Name of the column with the group label.
        text_column: Name of the column with the text.

    Returns:
        Dict mapping each group value to a dict with

        * ``'pos_counts'``: category -> ``Counter`` of words,
        * ``'total_count'``: sum of all counts over all categories,
        * ``'percentage'``: category -> {word: count / total_count * 100}.
    """
    # Category name -> POS tags that belong to it.
    target_pos_tags = {
        'Noun': ['n', 'nr', 'ns', 'nt', 'nz'],   # nouns and sub-classes (person names, place names, ...)
        'Verb': ['v', 'vd', 'vn'],               # verbs and sub-classes (e.g. verbal nouns)
        'Pronouns': ['r'],                       # pronouns, e.g. “我”, “你”
        'Adjectives': ['a', 'ad', 'an'],         # adjectives and sub-classes
        'Adverbs': ['d'],                        # adverbs, e.g. “很”, “非常”
        'Prepositions': ['p'],                   # prepositions, e.g. “在”, “从”
        'Conjunctions': ['c'],                   # conjunctions, e.g. “和”, “但是”
    }
    # The tag lists above are disjoint, so each tag maps to one category.
    category_of_tag = {
        tag: category
        for category, tags in target_pos_tags.items()
        for tag in tags
    }

    pos_statistics = {}
    for group in df[group_column].unique():
        pos_counts = {category: Counter() for category in target_pos_tags}

        for text in df[df[group_column] == group][text_column]:
            if not isinstance(text, str):
                continue
            for word, tag in pseg.cut(clean_text(text)):
                if word in CUSTOM_STOPWORDS or not word.strip():
                    continue
                category = category_of_tag.get(tag)
                if category is not None:
                    pos_counts[category][word] += 1

        total_count = sum(sum(counter.values()) for counter in pos_counts.values())

        # Shares are relative to the total over ALL categories of the group.
        percentage = {}
        for category, counter in pos_counts.items():
            percentage[category] = {
                word: (count / total_count * 100) for word, count in counter.items()
            }

        pos_statistics[group] = {
            'pos_counts': pos_counts,
            'total_count': total_count,
            'percentage': percentage,
        }

    return pos_statistics


# Save the statistics to CSV: one file per group and POS category.
def save_to_csv_by_group_and_category(pos_statistics, output_dir="output"):
    """Write one CSV file per (group, POS category) into ``output_dir``.

    Every file has the columns ``Word``, ``Frequency`` and
    ``Percentage (%)`` (four decimals) and is encoded as ``utf-8-sig``.  The
    header row is written even when a category has no words.  Files are
    named ``<group>_<category>_Statistics.csv`` where ``<group>`` is
    ``Suicide_Risk`` for group ``0`` and ``No_Suicide_Risk`` otherwise.  The
    path of every written file is printed.

    Args:
        pos_statistics: Dict mapping a group to a dict providing
            ``'pos_counts'`` (category -> ``Counter``) and ``'percentage'``
            (category -> {word: percentage}).
        output_dir: Target directory; created (with parents) if missing.
    """
    import os

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    columns = ["Word", "Frequency", "Percentage (%)"]

    for group, stats in pos_statistics.items():
        group_name = "Suicide_Risk" if group == 0 else "No_Suicide_Risk"

        for pos_category, counter in stats['pos_counts'].items():
            rows = [
                {
                    "Word": word,
                    "Frequency": freq,
                    "Percentage (%)": f"{stats['percentage'][pos_category][word]:.4f}",
                }
                for word, freq in counter.items()
            ]
            # Explicit columns so that an empty category still gets a header.
            df_output = pd.DataFrame(rows, columns=columns)

            category_file_name = f"{group_name}_{pos_category}_Statistics.csv"
            output_file_path = os.path.join(output_dir, category_file_name)

            df_output.to_csv(output_file_path, index=False, encoding='utf-8-sig')

            print(f"Saved results to {output_file_path}")


# Entry point: load the data, compute the POS statistics and write them to CSV.
def main():
    """Load the labelled texts and export the POS statistics as CSV files.

    Prints the head of the loaded data, computes the statistics with
    ``compute_pos_statistics`` and hands them to
    ``save_to_csv_by_group_and_category`` (default output directory).
    """
    # Input files (replace with the actual paths).
    input_files = ['/kaggle/input/cipintongji/dev.txt', '/kaggle/input/cipintongji/train.txt']

    frame = load_data(input_files)

    print("\nLoaded Data:")
    print(frame.head())

    statistics = compute_pos_statistics(frame, group_column='label', text_column='text')

    print("\nSaving POS Statistics to CSV files...")
    save_to_csv_by_group_and_category(statistics)


# Run the export only when executed as a script.
if __name__ == '__main__':
    main()

In [None]:
# 人称代词统计：将各组各类代词的统计结果存入 CSV 文件 (pronoun statistics, saved to one CSV file per group and category)
import pandas as pd
from collections import Counter
import jieba.posseg as pseg  # 用于分词和 POS 标注
import re  # 用于处理正则表达式

# Custom stop-word set (can be extended). Tokens equal to one of these strings
# are skipped when words are counted: picture, video, link, original image,
# full text, web-page link.
CUSTOM_STOPWORDS = {"图片", "视频", "链接", "原图", "全文", "网页链接"}

# Pronoun categories and the words counted under each of them.
# Note: 'Third Person' also lists kinship terms (爸爸, 妈妈, 哥哥, 姐姐, 叔叔, 大伯).
PRONOUN_CATEGORIES = {
    'First Person Singular': {"我", "自己", "俺", "咱"},
    'First Person Plural': {"我们", "咱们"},
    'Second Person': {"你", "你们", "您", "您们"},
    'Third Person': {"他", "她", "他们", "她们", "爸爸", "妈妈", "哥哥", "姐姐", "叔叔", "大伯"}
}

# Load data: read labelled .txt files and parse them into a DataFrame.
def load_data(file_paths):
    """Read ``text<sep>label`` lines from several files into one DataFrame.

    Each non-blank line is split at its last tab or, when the line contains
    no tab, at its last space.  The right-hand part must parse as an integer
    label.  Lines that cannot be split or whose label is not an integer are
    reported on stdout and skipped.

    Args:
        file_paths: Iterable of paths to UTF-8 encoded text files.

    Returns:
        A ``pandas.DataFrame`` with the columns ``text`` and ``label``.  Both
        columns exist even when no valid line was found, so callers can
        index ``df['label']`` without a ``KeyError``.
    """
    records = []
    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line:
                    continue
                separator = '\t' if '\t' in line else ' '
                try:
                    text, label = line.rsplit(separator, 1)
                    records.append({'text': text.strip(), 'label': int(label)})
                except ValueError:
                    print(f"Skipping invalid line: {line}")
    # Explicit columns keep the frame usable when ``records`` is empty.
    return pd.DataFrame(records, columns=['text', 'label'])


# Text cleaning: drop "#..." / "@..." tokens and "xxx的微博视频" fragments.
def clean_text(text):
    """Remove hashtag, mention and "…的微博视频" fragments from ``text``.

    Non-string input yields an empty string.  Three substitutions run in
    order, each deleting every match: ``#`` plus non-whitespace characters,
    ``@`` plus non-whitespace characters, and a non-whitespace run ending in
    ``的微博视频``.
    """
    if not isinstance(text, str):
        return ""

    for pattern in (r"#\S+", r"@\S+", r"\S+的微博视频"):
        text = re.sub(pattern, "", text)
    return text


# Per-group statistics: count the pronouns of every category and compute shares.
def compute_pronoun_statistics(df, group_column, text_column):
    """Count the words of ``PRONOUN_CATEGORIES`` for every group in ``df``.

    For each distinct value of ``df[group_column]`` the texts of that group
    are cleaned with ``clean_text`` and tagged with ``pseg.cut``.  A token is
    counted only when it is non-blank, is not in ``CUSTOM_STOPWORDS``, carries
    the tag ``'r'`` and appears in one of the category word sets.

    NOTE(review): because of the ``tag == 'r'`` condition, listed words are
    only counted when the tagger labels them ``'r'``.  Verify that this also
    holds for the kinship terms listed under 'Third Person'.

    Args:
        df: DataFrame holding the texts and their group labels.
        group_column: Name of the column with the group label.
        text_column: Name of the column with the text.

    Returns:
        Dict mapping each group value to a dict with

        * ``'pronoun_counts'``: category -> ``Counter`` of words,
        * ``'total_count'``: sum of all counts over all categories,
        * ``'percentage'``: category -> {word: count / total_count * 100}.
    """
    pronoun_statistics = {}

    for group in df[group_column].unique():
        texts = df[df[group_column] == group][text_column]
        pronoun_counts = {category: Counter() for category in PRONOUN_CATEGORIES}

        for text in texts:
            if not isinstance(text, str):
                continue
            for word, tag in pseg.cut(clean_text(text)):
                # Keep only non-blank, non-stop-word tokens tagged as pronoun.
                if word in CUSTOM_STOPWORDS or not word.strip() or tag != 'r':
                    continue
                for category, pronouns in PRONOUN_CATEGORIES.items():
                    if word in pronouns:
                        pronoun_counts[category][word] += 1

        total_count = sum(sum(counter.values()) for counter in pronoun_counts.values())

        percentage = {}
        for category, counter in pronoun_counts.items():
            percentage[category] = {
                word: (count / total_count * 100) for word, count in counter.items()
            }

        pronoun_statistics[group] = {
            'pronoun_counts': pronoun_counts,
            'total_count': total_count,
            'percentage': percentage,
        }

    return pronoun_statistics


# Save the statistics to CSV: one file per group and pronoun category.
def save_to_csv_by_group_and_category(pronoun_statistics, output_dir="output1"):
    """Write one CSV file per (group, pronoun category) into ``output_dir``.

    Every file has the columns ``Word``, ``Frequency`` and
    ``Percentage (%)`` (four decimals) and is encoded as ``utf-8-sig``.  The
    header row is written even when a category has no words.  Files are
    named ``<group>_<category>_Statistics.csv`` where ``<group>`` is
    ``Suicide_Risk`` for group ``0`` and ``No_Suicide_Risk`` otherwise, and
    spaces in the category name are replaced by underscores.  The path of
    every written file is printed.

    Args:
        pronoun_statistics: Dict mapping a group to a dict providing
            ``'pronoun_counts'`` (category -> ``Counter``) and
            ``'percentage'`` (category -> {word: percentage}).
        output_dir: Target directory; created (with parents) if missing.
    """
    import os

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    columns = ["Word", "Frequency", "Percentage (%)"]

    for group, stats in pronoun_statistics.items():
        group_name = "Suicide_Risk" if group == 0 else "No_Suicide_Risk"

        for category, counter in stats['pronoun_counts'].items():
            rows = [
                {
                    "Word": word,
                    "Frequency": freq,
                    "Percentage (%)": f"{stats['percentage'][category][word]:.4f}",
                }
                for word, freq in counter.items()
            ]
            # Explicit columns so that an empty category still gets a header.
            df_output = pd.DataFrame(rows, columns=columns)

            category_file_name = f"{group_name}_{category.replace(' ', '_')}_Statistics.csv"
            output_file_path = os.path.join(output_dir, category_file_name)

            df_output.to_csv(output_file_path, index=False, encoding='utf-8-sig')

            print(f"Saved results to {output_file_path}")


# Entry point: load the data, compute the pronoun statistics and write them to CSV.
def main():
    """Load the labelled texts and export the pronoun statistics as CSV files.

    Prints the head of the loaded data, computes the statistics with
    ``compute_pronoun_statistics`` and hands them to
    ``save_to_csv_by_group_and_category`` (default output directory).
    """
    input_files = ['/kaggle/input/cipintongji/dev.txt', '/kaggle/input/cipintongji/train.txt']  # replace with the actual paths

    # Each line of the input holds a piece of text followed by its 0/1 label.
    frame = load_data(input_files)

    print("\nLoaded Data:")
    print(frame.head())

    statistics = compute_pronoun_statistics(frame, group_column='label', text_column='text')

    print("\nSaving Pronoun Statistics to CSV files...")
    save_to_csv_by_group_and_category(statistics)


# Run the export only when executed as a script.
if __name__ == '__main__':
    main()