In [1]:
!pip install bitsandbytes -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m40.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.5/207.5 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━

In [2]:
%%writefile train.py

import os
import platform
import time
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.multiprocessing as mp
from torch.distributed import init_process_group, destroy_process_group
from torch.amp import GradScaler

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

from transformers import AutoModel, AutoTokenizer, AutoConfig, get_cosine_schedule_with_warmup
from peft import get_peft_model, LoraConfig, TaskType
from transformers import BitsAndBytesConfig

# Turn off the tokenizers library's own thread parallelism so it does not
# conflict with (or warn about) the multi-process training launched below.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# ========= Model and training constants =========
model_path = '/kaggle/input/qwen2.5/transformers/14b/1'  # Path to the Qwen2.5-14B checkpoint
num_folds = 3                 # 3-fold cross-validation
num_epochs = 3                # Epochs trained per fold
batch_size = 2                # Samples per GPU per micro-step
grad_accum_steps = 8          # Micro-steps per optimizer step: 2 x 8 = 16 samples per GPU per update

# Load the tokenizer once at import time and pad on the left, so the last
# position of every padded sequence is a real token (Net.forward reads it).
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.padding_side = 'left'


class MathDataset(Dataset):
    """Map-style dataset pairing each prompt with its class label.

    `prompts` and `targets` are parallel, index-aligned sequences; item `i`
    is the tuple `(prompts[i], targets[i])`.
    """

    def __init__(self, prompts, targets):
        # Stored as given; no copying or validation is performed.
        self.prompts, self.targets = prompts, targets

    def __len__(self):
        # The label sequence defines the dataset size.
        return len(self.targets)

    def __getitem__(self, idx):
        prompt = self.prompts[idx]
        label = self.targets[idx]
        return prompt, label


class Net(nn.Module):
    """Eight-way classifier: 4-bit quantized backbone + LoRA adapters + linear head.

    The backbone is loaded without a language-model head, wrapped with LoRA,
    and followed by a bias-free projection from the hidden state of the last
    sequence position to 8 class logits.
    """

    def __init__(self, model_path, rank):
        """Load the quantized backbone onto GPU `rank`, then attach LoRA and the head.

        Args:
            model_path: Checkpoint location handed to `from_pretrained`.
            rank: GPU index, passed as the backbone's `device_map`.
        """
        super().__init__()

        # Kept on the module because the head needs `hidden_size`.
        self.config = AutoConfig.from_pretrained(model_path)

        # 4-bit NF4 weights with double quantization; matmuls computed in fp16.
        quant_cfg = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.float16,
        )

        # Rank-8 LoRA on every linear layer, used for feature extraction
        # (no generation head), with no extra bias terms trained.
        lora_cfg = LoraConfig(
            task_type=TaskType.FEATURE_EXTRACTION,
            target_modules='all-linear',
            r=8,
            lora_alpha=16,
            lora_dropout=0.05,
            bias='none',
            inference_mode=False,
        )

        # Order matters from here on: backbone load, LoRA injection, then head
        # creation, so random initialisation happens in the same sequence.
        base_model = AutoModel.from_pretrained(
            model_path,
            quantization_config=quant_cfg,
            torch_dtype=torch.float16,
            device_map=rank,
            use_cache=False,
        )
        self.backbone = get_peft_model(base_model, lora_cfg)

        self.head = nn.Linear(self.config.hidden_size, 8, bias=False)

    def forward(self, x):
        """Return class logits of shape (batch, 8) for tokenizer output `x`.

        `x` is unpacked as keyword arguments to the backbone. Only the hidden
        state at the last sequence position is used; the module-level
        tokenizer pads on the left, so that position is the end of the prompt.
        """
        hidden_states = self.backbone(**x).last_hidden_state
        final_token = hidden_states[:, -1, :]
        return self.head(final_token)


def ddp_setup(rank, world_size):
    """Join the default process group and bind this process to GPU `rank`.

    Args:
        rank: Index of this process; also used as its CUDA device index.
        world_size: Total number of participating processes.

    The rendezvous address is fixed to localhost:12355. Windows uses the
    gloo backend (with USE_LIBUV=0); every other platform uses nccl.
    """
    os.environ.update({'MASTER_ADDR': 'localhost', 'MASTER_PORT': '12355'})

    on_windows = platform.system() == 'Windows'
    if on_windows:
        # Original author's note: avoids a conflict with libuv.
        os.environ['USE_LIBUV'] = '0'

    backend = 'gloo' if on_windows else 'nccl'
    init_process_group(backend=backend, rank=rank, world_size=world_size)

    # One GPU per process: rank N drives cuda:N.
    torch.cuda.set_device(rank)


def get_optimizer(model, learning_rate=0.0001, diff_lr=0.00001, weight_decay=0.01):
    """Build an AdamW optimizer with four parameter groups.

    Parameters are split along two axes by substring match on their names:
    whether the name contains 'backbone' (uses `diff_lr` instead of
    `learning_rate`) and whether it contains 'bias' or 'LayerNorm.weight'
    (weight decay forced to 0).

    Group order is fixed:
        0. non-backbone, decayed      -> lr=learning_rate, wd=weight_decay
        1. non-backbone, not decayed  -> lr=learning_rate, wd=0
        2. backbone, decayed          -> lr=diff_lr,       wd=weight_decay
        3. backbone, not decayed      -> lr=diff_lr,       wd=0

    Args:
        model: Module whose `named_parameters()` are grouped.
        learning_rate: Learning rate for parameters outside the backbone.
        diff_lr: Learning rate for backbone parameters.
        weight_decay: Decay applied to the decayed groups.

    Returns:
        A `torch.optim.AdamW` instance over the four groups.
    """
    no_decay_markers = ('bias', 'LayerNorm.weight')
    backbone_markers = ('backbone',)

    # Buckets keyed by (in_backbone, skip_decay); one pass keeps each bucket
    # in `named_parameters()` order.
    buckets = {
        (False, False): [],
        (False, True): [],
        (True, False): [],
        (True, True): [],
    }
    for name, param in model.named_parameters():
        in_backbone = any(marker in name for marker in backbone_markers)
        skip_decay = any(marker in name for marker in no_decay_markers)
        buckets[(in_backbone, skip_decay)].append(param)

    param_groups = []
    for in_backbone in (False, True):
        for skip_decay in (False, True):
            param_groups.append({
                "params": buckets[(in_backbone, skip_decay)],
                "lr": diff_lr if in_backbone else learning_rate,
                "weight_decay": 0 if skip_decay else weight_decay,
            })

    # The top-level lr / weight_decay are only defaults; every group above
    # sets both explicitly.
    return torch.optim.AdamW(param_groups, lr=learning_rate, weight_decay=weight_decay)


def _encode_batch(batch_prompts, device, max_length):
    """Tokenize a batch of prompts in a single pass and move it to `device`.

    Sequences longer than `max_length` tokens are truncated, then the batch is
    padded to its longest remaining sequence, so the padded width is
    min(longest prompt, max_length).
    """
    return tokenizer(batch_prompts,
                     return_tensors='pt',
                     padding='longest',
                     truncation=True,
                     max_length=max_length).to(device)


def train_model(rank, world_size, num_epochs, fold, train_index, val_index, all_prompts, all_targets):
    """Train and validate one cross-validation fold on GPU `rank` under DDP.

    Args:
        rank: Process index, also used as the CUDA device index.
        world_size: Number of DDP processes.
        num_epochs: Epochs to train.
        fold: Zero-based fold index; used in log lines and checkpoint names.
        train_index: Indices into `all_prompts` / `all_targets` for training.
        val_index: Indices into `all_prompts` / `all_targets` for validation.
        all_prompts: Full list of prompt strings.
        all_targets: Full list of integer class labels.

    Side effects:
        On rank 0, whenever validation micro-F1 improves, writes the LoRA
        backbone to `backbone_fold_{fold}_best` and the head weights to
        `head_fold_{fold}_best.pt` in the working directory.

    Notes:
        * The validation loader has no distributed sampler, so every rank
          scores the whole validation split.
        * Micro-batches left over at the end of an epoch (when the loader
          length is not a multiple of `grad_accum_steps`) never trigger an
          optimizer step; their gradients are cleared by the `zero_grad()`
          at the start of the next epoch.
    """
    ddp_setup(rank, world_size)

    train_prompts = [all_prompts[i] for i in train_index]
    val_prompts = [all_prompts[i] for i in val_index]
    train_targets = [all_targets[i] for i in train_index]
    val_targets = [all_targets[i] for i in val_index]

    # Inverse class frequency: the rarer a class, the larger its loss weight.
    # np.unique only reports labels that occur, so this relies on every class
    # being present in the training split.
    class_weights = 1 / (np.unique(train_targets, return_counts=True)[1] / len(train_targets))
    # Moved to the device once here instead of on every training step.
    class_weights = torch.tensor(class_weights, dtype=torch.half).to(rank)

    train_dataset = MathDataset(train_prompts, train_targets)
    val_dataset = MathDataset(val_prompts, val_targets)

    # Each rank draws a different shard of the training data.
    train_sampler = DistributedSampler(train_dataset)
    train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, sampler=train_sampler, pin_memory=True, shuffle=False, drop_last=True)
    val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size*2, shuffle=False, drop_last=False)

    model = Net(model_path, rank).to(rank)
    model = DDP(model, device_ids=[rank])

    optimizer = get_optimizer(model, learning_rate=2e-4, diff_lr=2e-4, weight_decay=0.01)

    # One scheduler step per optimizer step, i.e. per full accumulation window.
    scheduler = get_cosine_schedule_with_warmup(optimizer=optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=(len(train_loader) // grad_accum_steps) * num_epochs)
    scaler = GradScaler()

    best_f1 = 0.0
    MAX_LEN = 400  # token cap applied to every batch

    for epoch in range(num_epochs):
        # Reseed the sampler so each epoch sees a different shuffle.
        train_loader.sampler.set_epoch(epoch)
        model.train()
        optimizer.zero_grad()

        for step, (batch_prompts, batch_targets) in enumerate(tqdm(train_loader)):
            encodings = _encode_batch(batch_prompts, rank, MAX_LEN)
            batch_targets = batch_targets.long().to(rank)

            with torch.autocast(device_type='cuda', dtype=torch.float16):
                logits = model(encodings)
                loss = F.cross_entropy(logits, batch_targets, weight=class_weights)
                # Scale so the accumulated gradient is the window average.
                loss = loss / grad_accum_steps

            scaler.scale(loss).backward()

            if (step + 1) % grad_accum_steps == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()

        # Validation
        model.eval()
        all_preds, all_labels = [], []
        with torch.no_grad():
            for batch_prompts, batch_targets in tqdm(val_loader, total=len(val_loader)):
                encodings = _encode_batch(batch_prompts, rank, MAX_LEN)

                with torch.autocast(device_type='cuda', dtype=torch.float16):
                    logits = model(encodings)
                    preds = torch.argmax(logits, dim=1).cpu().tolist()

                all_preds.extend(preds)
                # Store plain ints rather than 0-d tensors.
                all_labels.extend(batch_targets.tolist())

        f1 = f1_score(all_labels, all_preds, average='micro')
        print(f'[GPU {rank}] Fold {fold+1} | Epoch {epoch+1}/{num_epochs} | Val F1-micro: {f1:.4f}')

        # Only rank 0 tracks the best score and writes checkpoints.
        if rank == 0 and f1 > best_f1:
            best_f1 = f1
            model.module.backbone.save_pretrained(f'backbone_fold_{fold}_best')
            torch.save(model.module.head.state_dict(), f'head_fold_{fold}_best.pt')

    destroy_process_group()

def run_ddp(rank, world_size, num_epochs, splits, fold, all_prompts, all_targets):
    """Per-process entry point: pick fold `fold` out of `splits` and train it.

    `splits[fold]` must be a `(train_index, val_index)` pair. `rank` is the
    first positional argument so the function can be used as a spawn target.
    """
    fold_train_idx, fold_val_idx = splits[fold]
    train_model(
        rank,
        world_size,
        num_epochs,
        fold,
        fold_train_idx,
        fold_val_idx,
        all_prompts,
        all_targets,
    )

def _truncate_problem(problem, max_tokens=300):
    """Return `problem` stripped and cut to at most `max_tokens` tokens.

    The text is tokenized with truncation and decoded straight back; no
    padding or tensor conversion is needed just to shorten a string.
    """
    token_ids = tokenizer(problem.strip(), max_length=max_tokens, truncation=True).input_ids
    return tokenizer.decode(token_ids, skip_special_tokens=True)


def _build_prompt(problem):
    """Wrap one (truncated) math problem in the classification prompt."""
    # NOTE(review): the template opens with a literal apostrophe before
    # <|im_start|> and never closes the turn. It is reproduced unchanged
    # because anything trained or served with it depends on the exact text;
    # confirm whether that is intended.
    return f"""'<|im_start|>user
Your task is to classify each Math problem into one of these eight topics using a machine learning or NLP-based approach.
0: Algebra
1: Geometry and Trigonometry
2: Calculus and Analysis
3: Probability and Statistics
4: Number Theory
5: Combinatorics and Discrete Math
6: Linear Algebra
7: Abstract Algebra and Topology

Your answer should be an integer that assigns the most appropriate topic category to the given Math problem based on its content and required reasoning.

Math Problem: {_truncate_problem(problem)}

Answer: """


if __name__ == '__main__':
    print("PyTorch version:", torch.__version__)
    print("CUDA available:", torch.cuda.is_available())
    print("Number of GPUs available:", torch.cuda.device_count())

    # One process per visible GPU. Fail loudly before any data work: with
    # zero processes, spawning would return without training anything.
    world_size = torch.cuda.device_count()
    if world_size < 1:
        raise RuntimeError('No CUDA devices found; DDP training requires at least one GPU.')

    seed = 252
    torch.manual_seed(seed)

    df = pd.read_csv('/kaggle/input/classification-of-math-problems-by-kasut-academy/train.csv')
    # Rename the columns positionally to the names used below.
    df.columns = ['problem', 'target']

    prompts = [_build_prompt(p) for p in df['problem']]
    targets = df['target'].tolist()

    # Materialise the fold indices once so every spawned process receives
    # the same splits.
    skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=seed)
    splits = list(skf.split(prompts, targets))

    # Folds run sequentially; each one spawns a fresh set of worker processes.
    for fold in range(num_folds):
        mp.spawn(run_ddp, args=(world_size, num_epochs, splits, fold, prompts, targets), nprocs=world_size)


Writing train.py


In [3]:
!python train.py

2025-05-11 10:23:48.906993: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746959029.139072     180 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746959029.207550     180 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
PyTorch version: 2.5.1+cu124
CUDA available: True
Number of GPUs available: 4
2025-05-11 10:24:12.469980: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746959052.492010     248 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one ha