# MIND 数据集预处理 (基于新划分)

## 说明
本脚本基于重新划分的数据 (7:2:1) 生成训练样本,输出文件名保持不变以兼容现有配置。

**输出文件**:
- `./data/processed/mind_train.json` (覆盖)
- `./data/processed/mind_val.json` (覆盖)
- `./data/processed/mind_test.json` (覆盖)

In [16]:
import pandas as pd
import json
import os
from tqdm import tqdm

In [17]:
# ===== Sample-size limit configuration =====
MAX_TRAIN_SAMPLES = 120000  # Max training samples (None = no limit, e.g. 1000000)
MAX_VAL_SAMPLES = 20000    # Max validation samples (None = no limit, e.g. 100000)
MAX_TEST_SAMPLES = 10000   # Max test samples (None = no limit, e.g. 50000)
RANDOM_SEED = 42          # Random seed (keeps the sampling reproducible)
# ===========================================

# Only None means "no limit" (sample_data tests `max_samples is None`), so the
# summary compares against None explicitly; a limit of 0 is then reported as 0
# instead of being mislabelled as unlimited.
print("样本数量限制配置:")
print(f"  训练集: {MAX_TRAIN_SAMPLES if MAX_TRAIN_SAMPLES is not None else '不限制'}")
print(f"  验证集: {MAX_VAL_SAMPLES if MAX_VAL_SAMPLES is not None else '不限制'}")
print(f"  测试集: {MAX_TEST_SAMPLES if MAX_TEST_SAMPLES is not None else '不限制'}")
print(f"  随机种子: {RANDOM_SEED}")

样本数量限制配置:
  训练集: 120000
  验证集: 20000
  测试集: 10000
  随机种子: 42


## 第一步: 构建统一的新闻字典

In [18]:
# Build one unified lookup table from the merged news.tsv
print("正在读取合并后的 news.tsv...")
news_merged = pd.read_csv('./data/MIND/merged/news.tsv', sep='\t', header=None)
news_merged.columns = ["ID", "类别", "子类别", "标题", "摘要", "链接", "标题实体", "摘要实体"]

# Map every news ID to its title; if an ID repeats, the last row wins
news_dict = {
    news_id: title
    for news_id, title in zip(news_merged["ID"], news_merged["标题"])
}

print(f"新闻字典大小: {len(news_dict):,} 条")
print("\n示例:")
# Show the first three entries as a quick sanity check
preview_entries = list(news_dict.items())[:3]
for news_id, title in preview_entries:
    print(f"  {news_id}: {title}")

正在读取合并后的 news.tsv...
新闻字典大小: 104,151 条

示例:
  N88753: The Brands Queen Elizabeth, Prince Charles, and Prince Philip Swear By
  N45436: Walmart Slashes Prices on Last-Generation iPads
  N23144: 50 Worst Habits For Belly Fat


## 第二步: 读取划分后的 behaviors 数据

In [19]:
# Load behaviors.tsv for each of the three splits
print("正在读取 behaviors 数据...")

_behavior_columns = ["曝光ID", "用户ID", "曝光时间", "曝光前的新闻点击历史", "曝光明细 (1表示点击；0表示非点击)"]


def _load_behaviors(tsv_path):
    """Read one headerless behaviors.tsv, blank out NaN cells and name the columns."""
    frame = pd.read_csv(tsv_path, sep='\t', header=None)
    # Replace NaN with empty strings so downstream string handling works
    frame = frame.fillna('')
    frame.columns = list(_behavior_columns)
    return frame


behaviors_train = _load_behaviors('./data/MIND/new_split/train/behaviors.tsv')
behaviors_val = _load_behaviors('./data/MIND/new_split/val/behaviors.tsv')
behaviors_test = _load_behaviors('./data/MIND/new_split/test/behaviors.tsv')

print(f"训练集: {len(behaviors_train):,} 条")
print(f"验证集: {len(behaviors_val):,} 条")
print(f"测试集: {len(behaviors_test):,} 条")

正在读取 behaviors 数据...
训练集: 1,823,973 条
验证集: 523,103 条
测试集: 262,143 条


## 第三步: 构建样本的通用函数

In [20]:
def sample_data(samples, max_samples, dataset_name, random_seed=42):
    """
    Cap a sample list at ``max_samples`` items by uniform random sampling.

    Args:
        samples: Original list of samples.
        max_samples: Maximum number of samples to keep (None means no limit).
        dataset_name: Name of the dataset (used only in the log line).
        random_seed: Seed for the sampler, so the selection is reproducible.

    Returns:
        The original ``samples`` object itself when no sampling is needed
        (no limit, or the list is already small enough); otherwise a new
        list of ``max_samples`` items drawn without replacement.
    """
    if max_samples is None or len(samples) <= max_samples:
        print(f"{dataset_name}: 保留全部 {len(samples):,} 个样本")
        return samples

    import random

    # A private generator gives the same draw as seeding the module-level RNG
    # with this seed, but does not reseed the process-wide random state that
    # other code may rely on.
    rng = random.Random(random_seed)
    sampled = rng.sample(samples, max_samples)
    print(f"{dataset_name}: 从 {len(samples):,} 个样本中随机采样 {max_samples:,} 个 ({max_samples/len(samples)*100:.1f}%)")
    return sampled

In [21]:
# Instruction template: the fixed prompt text stored in the "instruction"
# field of every generated sample.
instruction = "You are a news recommendation expert. Given the news click history (if the user has no news click history, indicate it as no click history) of the user and the news they like and dislike to watch, please decide whether the user likes to watch the target news by outputting \"Yes.\" or \"No.\""

def _parse_impression(token):
    """Split one ``NEWSID-LABEL`` impression token into ``(news_id, clicked)``.

    Only the last hyphen is treated as the separator. Raises ``ValueError``
    when the token has no hyphen or the label is not an integer.
    """
    news_id, label = token.rsplit("-", 1)
    return news_id, int(label) == 1


def _quote_title(news_id):
    """Return the ``news_dict`` title of ``news_id`` wrapped in double quotes."""
    # Plain concatenation (not an f-string) so that a non-string title raises
    # TypeError instead of being silently rendered into the prompt.
    return '"' + news_dict[news_id] + '"'


def build_samples(behaviors_df, dataset_name):
    """
    Build prompt samples from a behaviors DataFrame.

    For every row, the last three history news IDs and all impressions except
    the final one are turned into context (history / likes / dislikes), and
    the final impression becomes the prediction target. News IDs missing from
    the module-level ``news_dict`` are left out of the context; a row whose
    target news is missing is skipped and counted. A row that cannot be
    parsed is counted as an error and dropped (only the first five errors are
    printed), so one malformed row never aborts the run.

    Args:
        behaviors_df: Behaviors DataFrame with the columns "曝光ID", "用户ID",
            "曝光前的新闻点击历史" and "曝光明细 (1表示点击；0表示非点击)".
        dataset_name: Name of the dataset (used in log and progress output).

    Returns:
        A list of dicts with the keys "instruction", "input", "output"
        ("Yes." or "No.") and "用户ID".

    Raises:
        KeyError: If one of the required columns is missing from
            ``behaviors_df`` (raised once, before any row is processed).
    """
    history_col = "曝光前的新闻点击历史"
    impressions_col = "曝光明细 (1表示点击；0表示非点击)"

    samples = []
    error_count = 0
    skipped_count = 0

    print(f"\n正在处理 {dataset_name} 数据集...")

    # Iterate the needed columns in lockstep instead of iterrows(), which
    # builds a new Series object for every row.
    rows = zip(
        behaviors_df["曝光ID"],
        behaviors_df["用户ID"],
        behaviors_df[history_col],
        behaviors_df[impressions_col],
    )

    for impression_id, user_id, history, impressions_raw in tqdm(
        rows, total=len(behaviors_df), desc=f"处理 {dataset_name}"
    ):
        try:
            # Click history: a missing (non-string) or blank value means the
            # user has no history. split() without arguments ignores repeated
            # or trailing whitespace.
            history_ids = history.split() if isinstance(history, str) else []
            if history_ids:
                # Keep at most the three most recent clicks
                click_news_histories = [
                    _quote_title(news_id)
                    for news_id in history_ids[-3:]
                    if news_id in news_dict
                ]
            else:
                click_news_histories = ['the user do not has click history.']

            impressions = impressions_raw.split()
            if not impressions:
                raise ValueError("empty impression list")

            # All impressions but the last one are context; the last one is
            # the prediction target.
            likes = []
            dislikes = []
            for token in impressions[:-1]:
                news_id, clicked = _parse_impression(token)
                if news_id in news_dict:
                    (likes if clicked else dislikes).append(_quote_title(news_id))

            target_id, target_clicked = _parse_impression(impressions[-1])
            if target_id not in news_dict:
                # Without a title the question cannot be asked; count the
                # drop so it shows up in the summary.
                skipped_count += 1
                continue

            sample_input = (
                "User click histories: " + ", ".join(click_news_histories) + "\n"
                + "User likes: " + ", ".join(likes) + "\n"
                + "User dislikes: " + ", ".join(dislikes) + "\n"
                + "Whether the user will like the target news "
                + _quote_title(target_id) + "?"
            )

            samples.append({
                "instruction": instruction,
                "input": sample_input,
                "output": "Yes." if target_clicked else "No.",
                "用户ID": user_id,
            })

        except Exception as e:
            # Deliberately broad: preprocessing is best-effort per row, and
            # every failure is counted and reported rather than hidden.
            error_count += 1
            if error_count <= 5:  # only print the first five errors
                print(f"\n解析曝光ID {impression_id} 时出错: {e}")

    print(
        f"\n{dataset_name} 处理完成: {len(samples):,} 个样本, {error_count} 个错误, "
        f"{skipped_count:,} 个因目标新闻不在字典中被跳过"
    )
    return samples

## 第四步: 生成训练集样本

In [22]:
# Turn the training-split behaviors into prompt samples
samples_train = build_samples(behaviors_df=behaviors_train, dataset_name="训练集")


正在处理 训练集 数据集...


处理 训练集: 100%|██████████| 1823973/1823973 [02:36<00:00, 11681.63it/s]



训练集 处理完成: 1,823,973 个样本, 0 个错误


In [23]:
# Inspect the first training sample as pretty-printed JSON
first_train_sample = samples_train[0]
print("\n训练集样本示例:")
print(json.dumps(first_train_sample, ensure_ascii=False, indent=2))


训练集样本示例:
{
  "instruction": "You are a news recommendation expert. Given the news click history (if the user has no news click history, indicate it as no click history) of the user and the news they like and dislike to watch, please decide whether the user likes to watch the target news by outputting \"Yes.\" or \"No.\"",
  "input": "User click histories: \"A couple's attempt to re-create a picture-perfect engagement photo with a bottle of Champagne totally backfired, but the result is going viral\", \"Which Royal Wore It Best?\", \"Meghan King Edmonds and Jim Edmonds Split After 5 Years of Marriage\"\nUser likes: \"John Travolta Shares Rare Photo of Son Ben, 8, in Plane Cockpit: He's 'Taking My Place!'\", \"Rep. Ilhan Omar is accused of 'dog whistle' anti-Semitism after she posts tweet implying billionaire businessman Leon Cooperman is only supporting Michael Bloomberg's presidential run because he is Jewish\", \"Missing California hiker found dead at top of glacier just weeks before

## 第五步: 生成验证集样本

In [24]:
# Turn the validation-split behaviors into prompt samples
samples_val = build_samples(behaviors_df=behaviors_val, dataset_name="验证集")


正在处理 验证集 数据集...


处理 验证集: 100%|██████████| 523103/523103 [00:43<00:00, 12062.37it/s]


验证集 处理完成: 523,103 个样本, 0 个错误





## 第六步: 生成测试集样本

In [25]:
# Turn the test-split behaviors into prompt samples
samples_test = build_samples(behaviors_df=behaviors_test, dataset_name="测试集")


正在处理 测试集 数据集...


处理 测试集: 100%|██████████| 262143/262143 [00:21<00:00, 12330.02it/s]


测试集 处理完成: 262,143 个样本, 0 个错误





## 第七步: 保存样本 (覆盖原文件)

In [26]:
# Apply the configured per-split sample caps
banner = "=" * 60
print("\n" + banner)
print("应用样本数量限制")
print(banner)

# Each split is reduced independently, all with the same seed
samples_train = sample_data(
    samples_train, max_samples=MAX_TRAIN_SAMPLES, dataset_name="训练集", random_seed=RANDOM_SEED
)
samples_val = sample_data(
    samples_val, max_samples=MAX_VAL_SAMPLES, dataset_name="验证集", random_seed=RANDOM_SEED
)
samples_test = sample_data(
    samples_test, max_samples=MAX_TEST_SAMPLES, dataset_name="测试集", random_seed=RANDOM_SEED
)

# Report the sizes after sampling
n_train = len(samples_train)
n_val = len(samples_val)
n_test = len(samples_test)
print("\n采样后的数据集大小:")
print(f"  - 训练集: {n_train:,} 样本")
print(f"  - 验证集: {n_val:,} 样本")
print(f"  - 测试集: {n_test:,} 样本")


应用样本数量限制
训练集: 从 1,823,973 个样本中随机采样 120,000 个 (6.6%)
验证集: 从 523,103 个样本中随机采样 20,000 个 (3.8%)
测试集: 从 262,143 个样本中随机采样 10,000 个 (3.8%)

采样后的数据集大小:
  - 训练集: 120,000 样本
  - 验证集: 20,000 样本
  - 测试集: 10,000 样本


In [27]:
# Make sure the output directory exists before writing
os.makedirs('./data/processed', exist_ok=True)

n_train = len(samples_train)
n_val = len(samples_val)
n_test = len(samples_test)

print("\n正在保存样本文件...")
print(f"总样本数: {n_train + n_val + n_test:,}")
print(f"  - 训练集: {n_train:,}")
print(f"  - 验证集: {n_val:,}")
print(f"  - 测试集: {n_test:,}")

# (output path, samples to dump, confirmation line) for train / val / test
save_targets = (
    ("./data/processed/mind_train.json", samples_train, "\n✓ 已保存: ./data/processed/mind_train.json"),
    ("./data/processed/mind_val.json", samples_val, "✓ 已保存: ./data/processed/mind_val.json"),
    ("./data/processed/mind_test.json", samples_test, "✓ 已保存: ./data/processed/mind_test.json"),
)

for out_path, payload, done_message in save_targets:
    # Existing files are overwritten; non-ASCII text is written as-is
    with open(out_path, "w", encoding='utf-8') as out_handle:
        json.dump(payload, out_handle, indent=4, ensure_ascii=False)
    print(done_message)


正在保存样本文件...
总样本数: 150,000
  - 训练集: 120,000
  - 验证集: 20,000
  - 测试集: 10,000

✓ 已保存: ./data/processed/mind_train.json
✓ 已保存: ./data/processed/mind_val.json
✓ 已保存: ./data/processed/mind_test.json


## 第八步: 数据统计

In [28]:
# Report the Yes/No label distribution
def count_labels(samples, dataset_name):
    """Print how many samples are labelled "Yes." and "No.", with percentages.

    Args:
        samples: List of sample dicts; each must have an "output" key.
        dataset_name: Name of the dataset (used in the printed header).

    Returns:
        None. The distribution is only printed.
    """
    total = len(samples)
    yes_count = sum(1 for s in samples if s['output'] == 'Yes.')
    no_count = sum(1 for s in samples if s['output'] == 'No.')

    # An empty dataset would otherwise divide by zero; report 0.0% instead.
    yes_pct = yes_count / total * 100 if total else 0.0
    no_pct = no_count / total * 100 if total else 0.0

    print(f"\n{dataset_name} 标签分布:")
    print(f"  Yes: {yes_count:,} ({yes_pct:.1f}%)")
    print(f"  No: {no_count:,} ({no_pct:.1f}%)")

# Print the label distribution of every split, in train / val / test order
for split_samples, split_label in (
    (samples_train, "训练集"),
    (samples_val, "验证集"),
    (samples_test, "测试集"),
):
    count_labels(split_samples, split_label)


训练集 标签分布:
  Yes: 12,889 (10.7%)
  No: 107,111 (89.3%)

验证集 标签分布:
  Yes: 2,152 (10.8%)
  No: 17,848 (89.2%)

测试集 标签分布:
  Yes: 1,076 (10.8%)
  No: 8,924 (89.2%)


In [29]:
# Closing banner with the follow-up training command
banner = "=" * 60
print("\n" + banner)
print("数据预处理完成!")
print(banner)

print("\n下一步: 使用以下命令开始训练")
print("llamafactory-cli train new_train.yaml")

print("\n注意: 配置文件自动加载新的 mind_train.json")


数据预处理完成!

下一步: 使用以下命令开始训练
llamafactory-cli train new_train.yaml

注意: 配置文件自动加载新的 mind_train.json
