In [1]:
# Install necessary packages.
# The extras specifier is quoted so the shell cannot interpret "[...]" as a glob
# pattern (an unquoted bracket expression aborts the command under zsh).
%pip install -q torch numpy pandas transformers peft pyarrow pybind11 pylatexenc datasets tiktoken wandb tqdm matplotlib "math-verify[antlr4_9_3]"
# -q runs pip in quiet mode; scikit-learn is upgraded and force-reinstalled.
%pip install -q --upgrade --force-reinstall scikit-learn

[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
# Configure a mirror site before downloading any Hugging Face model, so that
# instances which cannot reach the official Hugging Face site (e.g. AutoDL)
# can still download.
import os  # built-in module for environment variables, file paths, etc.

# HF_ENDPOINT is the Hugging Face environment variable that selects the
# download endpoint (server address) for models / datasets. Here it is switched
# from the default https://huggingface.co to https://hf-mirror.com.
# This is a process-level variable: it is valid while the program runs, vanishes
# when the program exits, and does not modify the OS-level configuration.
os.environ.update({'HF_ENDPOINT': 'https://hf-mirror.com'})

In [3]:
# Print the nvidia-smi report (GPU model, driver / CUDA version, memory usage).
!nvidia-smi # GPU configuration

Wed Dec  3 16:00:28 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.76.05              Driver Version: 580.76.05      CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 5090        On  |   00000000:B8:00.0 Off |                  N/A |
| 42%   28C    P8             13W /  575W |       0MiB /  32607MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+----------------------------------------------

# prepare data

In [4]:
# Run prepare.py to build the data used for fine-tuning.
# Alternative for quick debugging on small-scale data (uncomment to use):
# !python prepare.py --debug # Debug with small-scale data
!python prepare.py # formal training using the full dataset
# The tokenizer model can be changed inside prepare.py (it must match the model used for fine-tuning).

Loading tokenizer: 'Qwen/Qwen2.5-Math-1.5B'
tokenizer_config.json: 7.32kB [00:00, 12.3MB/s]
vocab.json: 2.78MB [00:01, 2.04MB/s]
merges.txt: 1.67MB [00:00, 3.04MB/s]
tokenizer.json: 7.03MB [00:00, 30.4MB/s]
Loading dataset: 'ricdomolm/MATH-500'
README.md: 1.80kB [00:00, 5.27MB/s]
train-00000-of-00001.parquet: 100%|████████| 4.72M/4.72M [00:02<00:00, 2.24MB/s]
test-00000-of-00001.parquet: 100%|████████████| 199k/199k [00:00<00:00, 627kB/s]
Generating train split: 100%|██| 12000/12000 [00:00<00:00, 240464.61 examples/s]
Generating test split: 100%|████████| 500/500 [00:00<00:00, 59920.34 examples/s]
Dataset split into 10800 training and 1200 validation examples.

Tokenizing and formatting datasets...
Map (num_proc=208): 100%|█████████| 10800/10800 [00:26<00:00, 409.46 examples/s]
Map (num_proc=208): 100%|████████████| 1200/1200 [00:25<00:00, 47.01 examples/s]
Filter (num_proc=208): 100%|█████| 10800/10800 [00:03<00:00, 2964.90 examples/s]
Filter (num_proc=208): 100%|████████| 1200/1200 [

# Finetune the baseline model with different hyperparameter configurations

In [None]:
# 默认模式（最小日志） 不设置verbose
# 详细模式（显示进度条）--verbose 2
# 静默模式 --verbose 0

# 在 finetune.py 中可修改所使用的基座模型（base model）

由于个人 GPU 算力极其有限，我们只能使用 Qwen3-0.6B-Base、Qwen2.5-Math-1.5B（本 notebook 训练日志中实际加载的模型）等参数量较小的模型进行微调
大模型训练需要的算力极其巨大，个人或学校几乎无法完成，只有 Google、微软、Meta 等科技巨头拥有足够的算力资源

在绝大多数情况下，AdamW + LoRA的理论和实践表现都优于其他配置

## SGD

In [None]:
# Fine-tune with the "sgd" method: learning rate 1e-5, 2 epochs.
!python finetune.py \
  --optimization_method "sgd" \
  --learning_rate 1e-5 \
  --num_epochs 2 \
  --output_dir "saves/sgd_lr1e-5_epoch2" \
  --experiment_name "SGD_lr1e-5_epoch2" \
  --verbose 1

In [None]:
# Fine-tune with the "sgd" method: learning rate 2e-5, 2 epochs.
!python finetune.py \
  --optimization_method "sgd" \
  --learning_rate 2e-5 \
  --num_epochs 2 \
  --output_dir "saves/sgd_lr2e-5_epoch2" \
  --experiment_name "SGD_lr2e-5_epoch2" \
  --verbose 1

In [None]:
# Fine-tune with the "sgd" method: learning rate 5e-5, 2 epochs.
!python finetune.py \
  --optimization_method "sgd" \
  --learning_rate 5e-5 \
  --num_epochs 2 \
  --output_dir "saves/sgd_lr5e-5_epoch2" \
  --experiment_name "SGD_lr5e-5_epoch2" \
  --verbose 1

In [6]:
# Fine-tune with the "sgd" method: learning rate 1e-5, 1 epoch.
# This is the SGD run used in the cross-optimizer comparison.
!python finetune.py \
  --optimization_method "sgd" \
  --learning_rate 1e-5 \
  --num_epochs 1 \
  --output_dir "saves/sgd_lr1e-5_epoch1" \
  --experiment_name "SGD_lr1e-5_epoch1" \
  --verbose 1

Loading model and tokenizer from Qwen/Qwen2.5-Math-1.5B...
Loaded 9010 examples from /root/data/train.pkl
Loaded 1018 examples from /root/data/val.pkl
Setting up optimizer: sgd
Starting training...
Total training steps: 1126

--- Epoch 1/1 ---
Step 50: Train Loss = 0.9826

Step 50: Train Loss = 0.9826, Val Loss = 0.8007
  ✓ New best! Saving model to /root/saves/sgd_lr1e-5_epoch1
Step 100: Train Loss = 0.5262

Step 100: Train Loss = 0.5262, Val Loss = 0.7998
  ✓ New best! Saving model to /root/saves/sgd_lr1e-5_epoch1
Step 150: Train Loss = 0.5815

Step 150: Train Loss = 0.5815, Val Loss = 0.7997
  ✓ New best! Saving model to /root/saves/sgd_lr1e-5_epoch1
Step 200: Train Loss = 0.7079

Step 200: Train Loss = 0.7079, Val Loss = 0.7991
  ✓ New best! Saving model to /root/saves/sgd_lr1e-5_epoch1
Step 250: Train Loss = 0.4779

Step 250: Train Loss = 0.4779, Val Loss = 0.7988
  ✓ New best! Saving model to /root/saves/sgd_lr1e-5_epoch1
Step 300: Train Loss = 0.6675

Step 300: Train Loss = 0.66

## Adam

In [7]:
# Fine-tune with the "adam" method: learning rate 1e-5, 1 epoch.
!python finetune.py \
  --optimization_method "adam" \
  --learning_rate 1e-5 \
  --num_epochs 1 \
  --output_dir "saves/adam_lr1e-5_epoch1" \
  --experiment_name "Adam_lr1e-5_epoch1" \
  --verbose 1

Loading model and tokenizer from Qwen/Qwen2.5-Math-1.5B...
Loaded 9010 examples from /root/data/train.pkl
Loaded 1018 examples from /root/data/val.pkl
Setting up optimizer: adam
Starting training...
Total training steps: 1126

--- Epoch 1/1 ---
Step 50: Train Loss = 0.9385

Step 50: Train Loss = 0.9385, Val Loss = 0.6868
  ✓ New best! Saving model to /root/saves/adam_lr1e-5_epoch1
Step 100: Train Loss = 0.3779

Step 100: Train Loss = 0.3779, Val Loss = 0.6765
  ✓ New best! Saving model to /root/saves/adam_lr1e-5_epoch1
Step 150: Train Loss = 0.5394

Step 150: Train Loss = 0.5394, Val Loss = 0.6700
  ✓ New best! Saving model to /root/saves/adam_lr1e-5_epoch1
Step 200: Train Loss = 0.5531

Step 200: Train Loss = 0.5531, Val Loss = 0.6648
  ✓ New best! Saving model to /root/saves/adam_lr1e-5_epoch1
Step 250: Train Loss = 0.4065

Step 250: Train Loss = 0.4065, Val Loss = 0.6607
  ✓ New best! Saving model to /root/saves/adam_lr1e-5_epoch1
Step 300: Train Loss = 0.5296

Step 300: Train Loss 

In [None]:
# Fine-tune with the "adam" method: learning rate 2e-5, 1 epoch.
!python finetune.py \
  --optimization_method "adam" \
  --learning_rate 2e-5 \
  --num_epochs 1 \
  --output_dir "saves/adam_lr2e-5_epoch1" \
  --experiment_name "Adam_lr2e-5_epoch1" \
  --verbose 1

In [None]:
# Fine-tune with the "adam" method: learning rate 5e-5, 1 epoch.
!python finetune.py \
  --optimization_method "adam" \
  --learning_rate 5e-5 \
  --num_epochs 1 \
  --output_dir "saves/adam_lr5e-5_epoch1" \
  --experiment_name "Adam_lr5e-5_epoch1" \
  --verbose 1

## AdamW with LoRA

In [5]:
# In finetune.py the "lora" method uses the AdamW optimizer by default.
# LoRA run: rank 8, learning rate 1e-5, 1 epoch.
!python finetune.py \
  --optimization_method "lora" \
  --learning_rate 1e-5 \
  --num_epochs 1 \
  --output_dir "saves/adamW_lr1e-5_epoch1" \
  --lora_rank 8 \
  --experiment_name "AdamW_lr1e-5_epoch1" \
  --verbose 1

Loading model and tokenizer from Qwen/Qwen2.5-Math-1.5B...
config.json: 676B [00:00, 2.13MB/s]
model.safetensors: 100%|███████████████████| 3.09G/3.09G [04:25<00:00, 11.7MB/s]
generation_config.json: 138B [00:00, 428kB/s]
Loaded 9010 examples from /root/data/train.pkl
Loaded 1018 examples from /root/data/val.pkl
Setting up optimizer: lora
Setting up LoRA with rank=8
Starting training...
Total training steps: 1126

--- Epoch 1/1 ---
Step 50: Train Loss = 0.7190

Step 50: Train Loss = 0.7190, Val Loss = 0.7631
  ✓ New best! Saving model to /root/saves/adamW_lr1e-5_epoch1
Step 100: Train Loss = 0.6921

Step 100: Train Loss = 0.6921, Val Loss = 0.7206
  ✓ New best! Saving model to /root/saves/adamW_lr1e-5_epoch1
Step 150: Train Loss = 0.5609

Step 150: Train Loss = 0.5609, Val Loss = 0.6974
  ✓ New best! Saving model to /root/saves/adamW_lr1e-5_epoch1
Step 200: Train Loss = 0.7454

Step 200: Train Loss = 0.7454, Val Loss = 0.6872
  ✓ New best! Saving model to /root/saves/adamW_lr1e-5_epoch

In [None]:
# LoRA run: rank 8, learning rate 2e-5, 1 epoch.
!python finetune.py \
  --optimization_method "lora" \
  --learning_rate 2e-5 \
  --num_epochs 1 \
  --output_dir "saves/adamW_lr2e-5_epoch1" \
  --lora_rank 8 \
  --experiment_name "AdamW_lr2e-5_epoch1" \
  --verbose 1

In [None]:
# LoRA run: rank 8, learning rate 5e-5, 1 epoch.
!python finetune.py \
  --optimization_method "lora" \
  --learning_rate 5e-5 \
  --num_epochs 1 \
  --output_dir "saves/adamW_lr5e-5_epoch1" \
  --lora_rank 8 \
  --experiment_name "AdamW_lr5e-5_epoch1" \
  --verbose 1

In [None]:
# Compare the SGD runs trained with different learning rates (2 epochs each).
# Results are written under comparison_results/, matching the other comparison
# cells, and the title states the epoch count like the other comparisons do.
!python compare_experiments.py \
  --experiment_dirs "saves/sgd_lr1e-5_epoch2" "saves/sgd_lr2e-5_epoch2" "saves/sgd_lr5e-5_epoch2" \
  --output_dir "comparison_results/sgd_lr_comparison" \
  --title_suffix " - SGD Learning Rate Comparison (epoch=2)"

In [None]:
# Compare the Adam runs trained with different learning rates (1 epoch each).
# Only the runs this notebook actually trains are listed (1e-5, 2e-5, 5e-5).
# The former lr=1e-4 and lr=3e-4 entries had no matching fine-tuning cell in
# this notebook; run finetune.py for them first if they should be compared.
!python compare_experiments.py \
  --experiment_dirs "saves/adam_lr1e-5_epoch1" "saves/adam_lr2e-5_epoch1" "saves/adam_lr5e-5_epoch1" \
  --output_dir "comparison_results/adam_lr_comparison" \
  --title_suffix " - Adam Learning Rate Comparison (epoch=1)"

In [None]:
# Compare the AdamW + LoRA runs trained with different learning rates.
!python compare_experiments.py \
  --experiment_dirs \
    "saves/adamW_lr1e-5_epoch1" \
    "saves/adamW_lr2e-5_epoch1" \
    "saves/adamW_lr5e-5_epoch1" \
  --output_dir "comparison_results/adamW_lr_comparison" \
  --title_suffix " - AdamW Learning Rate Comparison (epoch=1)"

In [8]:
# Compare sgd, adam and adamW (LoRA) at the same learning rate (1e-5, 1 epoch).
!python compare_experiments.py \
  --experiment_dirs \
    "saves/adamW_lr1e-5_epoch1" \
    "saves/adam_lr1e-5_epoch1" \
    "saves/sgd_lr1e-5_epoch1" \
  --output_dir "comparison_results/optimizer_comparison" \
  --title_suffix " - Optimizer Comparison (epoch=1)"

Comparing 3 experiments: ['saves/adamW_lr1e-5_epoch1', 'saves/adam_lr1e-5_epoch1', 'saves/sgd_lr1e-5_epoch1']
Loaded data from 3 experiments
Training loss comparison saved to comparison_results/optimizer_comparison/train_loss_comparison.png
Validation loss comparison saved to comparison_results/optimizer_comparison/val_loss_comparison.png

Comparison complete!
Training comparison saved to: comparison_results/optimizer_comparison/train_loss_comparison.png
Validation comparison saved to: comparison_results/optimizer_comparison/val_loss_comparison.png


# Rollout the finetuned model

使用模型对测试集进行推理，生成预测结果

功能：
加载测试数据集（MATH-500）
加载指定的模型（基础模型或LoRA微调模型）
对每个测试问题生成模型的回答
将原始预测结果保存为JSONL格式

In [None]:
# Reference invocation for a base model plus a separate LoRA adapter (kept commented out):
# !python rollout.py --model "Qwen/Qwen3-0.6B-Base" --lora_path "saves/lora-tuned" --output_file "output/qwen3_0.6b_base_nosys_it_lora_debug.jsonl"

# Roll out each 2-epoch SGD checkpoint; IPython expands {run_name} before the
# command is handed to the shell.
for run_name in ("sgd_lr1e-5_epoch2", "sgd_lr2e-5_epoch2", "sgd_lr5e-5_epoch2"):
    !python rollout.py --model "saves/{run_name}" --output_file "output/rolled/{run_name}.jsonl"

In [None]:
# Roll out each 1-epoch Adam checkpoint; IPython expands {run_name} before the
# command is handed to the shell.
for run_name in ("adam_lr1e-5_epoch1", "adam_lr2e-5_epoch1", "adam_lr5e-5_epoch1"):
    !python rollout.py --model "saves/{run_name}" --output_file "output/rolled/{run_name}.jsonl"

In [None]:
# Roll out each 1-epoch AdamW (LoRA) checkpoint; IPython expands {run_name}
# before the command is handed to the shell.
# NOTE(review): these directories come from the "lora" runs but are passed via
# --model with no --lora_path; confirm rollout.py loads them as intended.
for run_name in ("adamW_lr1e-5_epoch1", "adamW_lr2e-5_epoch1", "adamW_lr5e-5_epoch1"):
    !python rollout.py --model "saves/{run_name}" --output_file "output/rolled/{run_name}.jsonl"

In [9]:
# Roll out the lr=1e-5, 1-epoch checkpoint of every optimizer (sgd, adam, adamW)
# so the three can be evaluated side by side. IPython expands {run_name} before
# the command is handed to the shell.
for run_name in ("sgd_lr1e-5_epoch1", "adam_lr1e-5_epoch1", "adamW_lr1e-5_epoch1"):
    !python rollout.py --model "saves/{run_name}" --output_file "output/rolled/{run_name}.jsonl"


Loading base model: saves/sgd_lr1e-5_epoch1
No LoRA adapter specified, running the base model.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Processed batch 1/8
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Processed batch 2/8
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Processed batch 3/8
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Processed batch 4/8
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Processed batch 5/8
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Processed batch 6/8
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Processed batch 7/8
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Processed batch 8/8
Output will be saved to: /root/output/rolled/sgd_lr1e-5_epoch1.jsonl
Saved generations to /root/output/rolled/sgd_lr1e-5_epoch1.jsonl
Loading base model: saves/a

# Evaluate the rollout results

In [None]:
# Score the rollouts of the 2-epoch SGD checkpoints.
# Scored files go under output/evaluated/, the same directory the Adam and AdamW
# evaluation cells use (previously the SGD results landed directly in output/).
# IPython expands {run_name} before the command is handed to the shell.
for run_name in ("sgd_lr1e-5_epoch2", "sgd_lr2e-5_epoch2", "sgd_lr5e-5_epoch2"):
    !python evaluate.py --input_file "output/rolled/{run_name}.jsonl" --output_file "output/evaluated/{run_name}_evaled.jsonl"

In [None]:
# Score the rollouts of the 1-epoch Adam checkpoints; IPython expands {run_name}
# before the command is handed to the shell.
for run_name in ("adam_lr1e-5_epoch1", "adam_lr2e-5_epoch1", "adam_lr5e-5_epoch1"):
    !python evaluate.py --input_file "output/rolled/{run_name}.jsonl" --output_file "output/evaluated/{run_name}_evaled.jsonl"

In [None]:
# Score the rollouts of the 1-epoch AdamW (LoRA) checkpoints; IPython expands
# {run_name} before the command is handed to the shell.
for run_name in ("adamW_lr1e-5_epoch1", "adamW_lr2e-5_epoch1", "adamW_lr5e-5_epoch1"):
    !python evaluate.py --input_file "output/rolled/{run_name}.jsonl" --output_file "output/evaluated/{run_name}_evaled.jsonl"

In [10]:
# Score the lr=1e-5, 1-epoch rollouts of all three optimizers (sgd, adam, adamW).
# Every scored file goes under output/evaluated/; the SGD result used to be
# written directly into output/, unlike the other two. Re-run this cell to
# refresh any recorded output that still shows the old SGD path.
# IPython expands {run_name} before the command is handed to the shell.
for run_name in ("sgd_lr1e-5_epoch1", "adam_lr1e-5_epoch1", "adamW_lr1e-5_epoch1"):
    !python evaluate.py --input_file "output/rolled/{run_name}.jsonl" --output_file "output/evaluated/{run_name}_evaled.jsonl"


Evaluation complete.
Overall Accuracy: 11.80%
Scored results saved to output/sgd_lr1e-5_epoch1_evaled.jsonl

Evaluation complete.
Overall Accuracy: 24.60%
Scored results saved to output/evaluated/adam_lr1e-5_epoch1_evaled.jsonl

Evaluation complete.
Overall Accuracy: 26.20%
Scored results saved to output/evaluated/adamW_lr1e-5_epoch1_evaled.jsonl
