In [1]:
import os
import torch
import numpy as np
import gensim.downloader
from gensim.models import Word2Vec
import jieba
from transformers import AutoTokenizer,AutoModelForMaskedLM,BertTokenizer
from utilz import *

In [None]:
import os
import torch
import numpy as np
from utilz import *

# ============ Configuration ============
# (csv path, fold name) pairs; the fold name becomes the prefix of every
# output file written for that fold.
FOLD_FILES = [
    ('/home/user2_4/code/shujuji_jupyter/fold0_train.csv', 'fold0'),
    ('/home/user2_4/code/shujuji_jupyter/fold1_train.csv', 'fold1'),
    ('/home/user2_4/code/shujuji_jupyter/fold2_train.csv', 'fold2'),
]
OUTPUT_DIR = './data2025/'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Emotion name -> integer class id used in the saved label lists.
label_dict = {'无聊': 0, '快乐': 1, '感兴趣': 2, '疲倦': 3, '困惑': 4}

# ============ Process every fold ============
print("=" * 60)
print("开始处理3个fold...")

# One record per fold; the feature-extraction cells below read from this list.
all_fold_data = []

for fold_file, fold_name in FOLD_FILES:
    print(f"\n加载 {fold_file}...")

    # load_data comes from the utilz star-import; it is assumed to return five
    # parallel sequences (one entry per sample) -- TODO confirm against utilz.
    ids, paths, nlps, classes, modes = load_data(fold_file)

    n_samples = len(ids)
    n_train = modes.count('train')
    n_test = modes.count('test')
    print(f"  样本数: {n_samples}, train: {n_train}, test: {n_test}")

    # Group the integer labels by split. A split other than 'train'/'test' or
    # an emotion missing from label_dict raises KeyError here.
    labels = {'train': [], 'test': []}
    for emotion, split in zip(classes, modes):
        labels[split].append(label_dict[emotion])

    # Persist the labels of this fold.
    label_file = os.path.join(OUTPUT_DIR, f'{fold_name}_labels.pkl')
    save_features(labels, label_file)
    print(f"  ✓ 标签已保存: {label_file}")
    print(f"    train={len(labels['train'])}, test={len(labels['test'])}")

    # Keep the raw columns around for the feature-extraction cells.
    all_fold_data.append(dict(
        fold_name=fold_name,
        ids=ids,
        paths=paths,
        nlps=nlps,
        classes=classes,
        modes=modes,
    ))

print("\n" + "=" * 60)
print("✅ 标签文件生成完成！")

In [None]:
import jieba
from gensim.models import Word2Vec
from transformers import AutoTokenizer, AutoModelForMaskedLM

# ============ Collect every text to train Word2Vec ============
print("\n" + "="*60)
print("收集所有文本用于训练Word2Vec模型...")

# One token list per text, across all folds. jieba.lcut already returns a
# list, so no extra list() copy is needed.
all_vocabs = [
    jieba.lcut(text)
    for fold_data in all_fold_data
    for text in fold_data['nlps']
]
# Length (in jieba tokens) of the longest text; the extraction cells pad or
# truncate every sample to this many rows. default=0 covers an empty corpus.
max_len = max((len(tokens) for tokens in all_vocabs), default=0)

print(f"总文本数: {len(all_vocabs)}, 最大序列长度: {max_len}")

# ============ Train the Word2Vec model ============
print("\n训练Word2Vec模型...")
# Passing `sentences` to the constructor builds the vocabulary and runs a
# first training pass; train() then continues for 10 more epochs.
model_own = Word2Vec(sentences=all_vocabs, vector_size=100, sg=1, min_count=1)
model_own.train(all_vocabs, total_examples=len(all_vocabs), epochs=10)
# Save only after all training is done, so the file on disk is the same model
# that the feature extraction below uses (it used to be saved before train()).
model_own.save('word2vec_3fold.model')
print("✓ Word2Vec模型训练完成")

# ============ Load the BERT model ============
print("\n加载BERT模型...")

# Local directory holding the pretrained checkpoint and its vocabulary.
model_path = './bert-base-chinese'
tokenizer = AutoTokenizer.from_pretrained(model_path)
# output_hidden_states=True makes the forward pass also return the hidden
# states, which the feature-extraction loop reads.
bert_model = AutoModelForMaskedLM.from_pretrained(
    model_path,
    output_hidden_states=True,
)

print("✓ BERT模型加载完成")

# ============ Per-fold feature extraction ============
print("\n" + "="*60)
print("开始提取特征...")

# Which entry of the returned hidden-state tuple is used as the BERT feature.
# In transformers, index 0 is the output of the embedding layer and -1 is the
# output of the last encoder layer. 0 reproduces what this notebook has always
# saved; NOTE(review): confirm that the embedding layer is really intended.
BERT_HIDDEN_LAYER = 0


def _pad_or_truncate(vectors, length, dim):
    """Return a (length, dim) ndarray holding the first `length` rows of `vectors`.

    Missing rows are zero-filled. `vectors` may be empty (a text that yields no
    tokens), in which case the result is all zeros. The result uses numpy's
    default float dtype, i.e. the dtype the zero-padded samples already had.
    """
    fixed = np.zeros((length, dim))
    rows = np.asarray(vectors).reshape(-1, dim)[:length]
    fixed[:len(rows)] = rows
    return fixed


def _first_shape(samples):
    """Shape of the first sample, or None when the split has no samples."""
    return samples[0].shape if samples else None


w2v_dim = model_own.wv.vector_size
bert_dim = bert_model.config.hidden_size

for fold_data in all_fold_data:
    fold_name = fold_data['fold_name']
    print(f"\n{'='*60}")
    print(f"处理 {fold_name}...")

    # ---- Word2Vec features ----
    texts_embs_w2v = {'train': [], 'test': []}

    for nlp, mode in zip(fold_data['nlps'], fold_data['modes']):
        word_vectors = [model_own.wv[w] for w in jieba.lcut(nlp)]
        # Every sample is stored as a (max_len, w2v_dim) ndarray, including
        # empty texts and texts that are exactly max_len tokens long.
        texts_embs_w2v[mode].append(_pad_or_truncate(word_vectors, max_len, w2v_dim))

    # Save the Word2Vec features.
    w2v_file = os.path.join(OUTPUT_DIR, f'{fold_name}_textual_wav2vec.pkl')
    save_features(texts_embs_w2v, w2v_file)
    print(f"  ✓ Word2Vec特征已保存: {w2v_file}")
    print(f"    train={len(texts_embs_w2v['train'])}, test={len(texts_embs_w2v['test'])}")

    # ---- BERT features ----
    texts_embs_bert = {'train': [], 'test': []}

    for idx, (nlp, mode) in enumerate(zip(fold_data['nlps'], fold_data['modes'])):
        marked_text = '[CLS]' + nlp + '[SEP]'
        tokenized_text = tokenizer.tokenize(marked_text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        tokens_tensor = torch.tensor([indexed_tokens])
        # The previous code passed an all-ones "segment ids" tensor as the
        # second positional argument, which transformers' forward() binds to
        # attention_mask. Passing it by keyword keeps the outputs identical
        # and makes the call say what it does; token_type_ids stay at their
        # default.
        attention_mask = torch.ones_like(tokens_tensor)

        with torch.no_grad():
            outputs = bert_model(input_ids=tokens_tensor, attention_mask=attention_mask)
        # outputs[-1] is the tuple of hidden states (output_hidden_states=True);
        # [0] drops the batch dimension -> (num_tokens, bert_dim).
        token_embs = outputs[-1][BERT_HIDDEN_LAYER][0].cpu().numpy()

        # NOTE(review): max_len is measured in jieba words, while BERT rows are
        # wordpieces plus [CLS]/[SEP], so longer sequences lose their tail here.
        # Kept as-is so the saved shape stays (max_len, bert_dim).
        texts_embs_bert[mode].append(_pad_or_truncate(token_embs, max_len, bert_dim))

        if (idx + 1) % 100 == 0:
            print(f"    已处理 {idx + 1}/{len(fold_data['nlps'])} 个样本")

    # Save the BERT features.
    bert_file = os.path.join(OUTPUT_DIR, f'{fold_name}_textual_bert.pkl')
    save_features(texts_embs_bert, bert_file)
    print(f"  ✓ BERT特征已保存: {bert_file}")
    # _first_shape avoids an IndexError when a split has no samples.
    print(f"    train={len(texts_embs_bert['train'])} (shape: {_first_shape(texts_embs_bert['train'])})")
    print(f"    test={len(texts_embs_bert['test'])} (shape: {_first_shape(texts_embs_bert['test'])})")

print("\n" + "="*60)
print("✅ 所有特征提取完成！")
print("\n生成的文件列表:")
# Build the listing from the same fold names and output directory that were
# used to write the files, so it cannot drift from the configuration.
for fold_data in all_fold_data:
    for suffix in ('labels', 'textual_wav2vec', 'textual_bert'):
        produced_file = os.path.join(OUTPUT_DIR, f"{fold_data['fold_name']}_{suffix}.pkl")
        print(f"  {produced_file}")

In [None]:
# ============ Load the pretrained Baidu Baike word vectors ============
print("\n" + "="*60)
print("加载百度百科预训练词向量...")

from gensim.models.keyedvectors import KeyedVectors

# Text-format (binary=False) word2vec file, resolved relative to the working
# directory.
BAIDU_WORD2VEC_PATH = 'sgns.baidubaike.bigram-char'
word2vec_model = KeyedVectors.load_word2vec_format(BAIDU_WORD2VEC_PATH, binary=False)

vocab_size = len(word2vec_model.index_to_key)
# Width of one word vector (the original author notes it is usually 300).
embedding_dim = word2vec_model.vector_size

print(f"✓ 词向量加载完成，词汇量: {vocab_size}")
print(f"✓ 词向量维度: {embedding_dim}")

# ============ Extract Baidu Baike word-vector features ============
print("\n" + "="*60)
print("开始提取百度百科词向量特征...")

for fold_data in all_fold_data:
    fold_name = fold_data['fold_name']
    print(f"\n{'='*60}")
    print(f"处理 {fold_name}...")

    texts_embs_baidu = {'train': [], 'test': []}

    for idx, (nlp, mode) in enumerate(zip(fold_data['nlps'], fold_data['modes'])):
        # Word segmentation.
        tokens = jieba.lcut(nlp)

        # Start from an all-zero (max_len, embedding_dim) matrix and fill in the
        # rows of known words. Out-of-vocabulary words and padding rows stay
        # zero, and a text with no tokens yields an all-zero sample instead of
        # failing in np.concatenate. Every sample gets the same dtype (numpy's
        # default float), which is what padded samples already had.
        sample = np.zeros((max_len, embedding_dim))
        for row, word in enumerate(tokens[:max_len]):
            if word in word2vec_model:
                sample[row] = word2vec_model[word]

        texts_embs_baidu[mode].append(sample)

        if (idx + 1) % 100 == 0:
            print(f"  已处理 {idx + 1}/{len(fold_data['nlps'])} 个样本")

    # Save the features.
    baidu_file = os.path.join(OUTPUT_DIR, f'{fold_name}_textual_baidu.pkl')
    save_features(texts_embs_baidu, baidu_file)
    print(f"  ✓ 百度百科特征已保存: {baidu_file}")
    # Guard the shape lookup so an empty split does not raise IndexError.
    train_shape = texts_embs_baidu['train'][0].shape if texts_embs_baidu['train'] else None
    test_shape = texts_embs_baidu['test'][0].shape if texts_embs_baidu['test'] else None
    print(f"    train={len(texts_embs_baidu['train'])} (shape: {train_shape})")
    print(f"    test={len(texts_embs_baidu['test'])} (shape: {test_shape})")

print("\n" + "="*60)
print("✅ 所有百度百科特征提取完成！")
print("\n生成的文件列表:")
# Build the listing from the fold names and output directory actually used,
# so it stays correct when the configuration changes.
for fold_data in all_fold_data:
    baidu_path = os.path.join(OUTPUT_DIR, f"{fold_data['fold_name']}_textual_baidu.pkl")
    print(f"  {baidu_path}")
print(f"\n特征格式: {{'train': [样本列表], 'test': [样本列表]}}")
print(f"每个样本的shape: ({max_len}, {embedding_dim})")