In [None]:
!pip install pandas openpyxl
!pip install bitsandbytes==0.38.1
!pip install torch transformers==4.31.0 peft==0.4.0 sentencepiece bitsandbytes accelerate

In [None]:
# 替换 Llama 的相关模块为 ChatGLM2
from transformers import AutoTokenizer, TrainingArguments, Trainer, DataCollatorForSeq2Seq, AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
import torch
import json
import gc
from datasets import Dataset

# Hyperparameters
# NOTE(review): BATCH_SIZE is not referenced anywhere in the visible notebook; the
# TrainingArguments call further down hard-codes per_device_train_batch_size=1.
BATCH_SIZE = 2
EPOCHS = 2
LR = 1e-4
MAX_LENGTH = 1024          # upper bound on tokens per training example
OUTPUT_DIR = "ChatGLM2-lora-out"
TRAIN_FILE = "train.json"
PHASE_SIZE = 100           # number of examples trained per phase

# Load the ChatGLM2 model and tokenizer.
# trust_remote_code=True executes the modeling/tokenizer code shipped in the model repo.
base_model_name = "THUDM/chatglm2-6b"
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    trust_remote_code=True,
    load_in_8bit=True,
    device_map="auto"
)
# Disable the generation KV cache for training. This was previously assigned twice
# (once here and once again right before get_peft_model); a single assignment suffices.
model.config.use_cache = False

# LoRA configuration
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["query_key_value"],  # module name targeted for LoRA in ChatGLM2
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
# NOTE(review): the base model is loaded in 8-bit; PEFT provides
# prepare_model_for_kbit_training for that setup — confirm whether it is wanted here.
model = get_peft_model(model, lora_config)

# 数据处理函数
def load_data(json_path):
    """Read the UTF-8 encoded JSON file at ``json_path`` and return its parsed content."""
    with open(json_path, mode="r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def format_prompt_and_labels(example):
    """
    Build the training prompt and target text for one example.

    Args:
        example: mapping with required keys "instruction" and "output" and an
            optional "input" key. A missing, None, or whitespace-only "input"
            is treated as "no extra input".

    Returns:
        (prompt, output_text): the prompt string ending in "答：" and the
        expected answer text.

    Raises:
        KeyError: if "instruction" or "output" is missing.
    """
    instruction = example["instruction"]
    # `or ""` also covers an explicit null in the JSON, for which
    # example.get("input", "") returns None and .strip() would raise.
    input_text = example.get("input") or ""
    output_text = example["output"]
    if input_text.strip():
        prompt = f"問題：{instruction}\n\n{input_text}\n\n答："
    else:
        prompt = f"問題：{instruction}\n\n答："
    return prompt, output_text

def tokenize_fn(example):
    """
    Tokenize one example into ``input_ids`` and loss ``labels``.

    The prompt and answer are concatenated and tokenized together, truncated to
    MAX_LENGTH tokens. Label positions covered by the prompt are set to -100 so
    only answer tokens contribute to the loss.

    No padding is applied here. Previously the sequence was padded to a fixed
    length and ``labels`` was a copy of the padded ids with only the first
    ``prompt_len`` positions masked, so pad ids stayed in the labels and no
    attention mask was produced for them. Returning unpadded sequences leaves
    batching-time padding to the data collator.

    Args:
        example: mapping accepted by ``format_prompt_and_labels``.

    Returns:
        dict with "input_ids" and "labels", two lists of equal length.
    """
    prompt, answer = format_prompt_and_labels(example)

    # Plain Python lists are enough to count prompt tokens; no tensors needed.
    prompt_ids = tokenizer(prompt, truncation=False)["input_ids"]

    encoding = tokenizer(
        prompt + answer,
        max_length=MAX_LENGTH,
        truncation=True,
    )
    input_ids = list(encoding["input_ids"])

    # Clamp the masked span: if truncation left fewer tokens than the prompt has,
    # an unclamped slice assignment would make labels longer than input_ids.
    masked_len = min(len(prompt_ids), len(input_ids))
    labels = [-100] * masked_len + input_ids[masked_len:]

    return {"input_ids": input_ids, "labels": labels}

# Load the raw examples once (the file was previously read twice) and work out
# how many PHASE_SIZE-sized phases are needed to cover them (ceiling division).
train_data_list = load_data(TRAIN_FILE)
num_phases = (len(train_data_list) + PHASE_SIZE - 1) // PHASE_SIZE

# Tokenize the whole training set a single time; each phase below trains on a
# contiguous slice of this dataset instead of re-tokenizing the same rows.
train_dataset = Dataset.from_list(train_data_list)
train_dataset = train_dataset.map(tokenize_fn)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,      # effective batch of 8 examples per optimizer step
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    bf16=True,
    logging_steps=5,
    save_steps=1000,
    save_total_limit=5,
    disable_tqdm=False,                 # show the tqdm progress bar
    logging_strategy="steps",           # log every `logging_steps` steps
    report_to=[],                       # no external experiment trackers (e.g. wandb)
    gradient_checkpointing=False,       # gradient checkpointing disabled
)

# --------------- G) Build the Trainer and fine-tune ---------------
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding="longest",
    pad_to_multiple_of=8,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

# A loop used to sit here that iterated the entire training dataloader without
# training, only to call empty_cache()/gc.collect() every 100 batches. It cost a
# full pass over the data and did no learning, so it has been removed.

# NOTE(review): every trainer.train() call below runs EPOCHS epochs over the
# current slice only. Whether optimizer / LR-scheduler state carries over between
# calls depends on the pinned transformers version — confirm that this matches
# the intended schedule.
for phase in range(num_phases):
    print(f"Phase {phase + 1}/{num_phases}")
    phase_start = phase * PHASE_SIZE
    phase_end = min(phase_start + PHASE_SIZE, len(train_dataset))
    partial_dataset = train_dataset.select(range(phase_start, phase_end))

    trainer.train_dataset = partial_dataset  # switch to this phase's slice
    trainer.train()

    # Release cached GPU memory between phases.
    torch.cuda.empty_cache()
    gc.collect()

trainer.save_model(OUTPUT_DIR)
print("ChatGLM2 LoRA fine-tuning completed.")


In [None]:
import pandas as pd
import csv
import torch
import re
from transformers import AutoTokenizer, AutoModel

# ---------------------------
# 1) Load the ChatGLM2 model
# ---------------------------
# NOTE(review): this loads the base checkpoint only; nothing in this cell loads the
# LoRA weights the training cell saves — confirm whether that is intended.
# NOTE(review): `trainer` from the training cell still references the training model,
# so rebinding `model` here does not release it — watch GPU memory if both cells run
# in the same kernel.
base_model_name = "THUDM/chatglm2-6b"
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(base_model_name, trust_remote_code=True)
# Cast to half precision, then move to the GPU.
model = model.half().cuda()
# Switch to evaluation mode for inference.
model.eval()

# ---------------------------
# 2) Prompt 和推理
# ---------------------------
def generate_prompt(instruction, input_text=""):
    """
    Build the Chain-of-Thought (CoT) prompt that asks the model to analyse the
    options step by step and finish with a single numeric choice.

    Every line after the first carries a four-space indent, and the blank
    separator lines are empty; the emitted text is kept exactly as before.
    """
    indent = "    "
    steps = [
        "請按照以下步驟進行：",
        "1. 閱讀文章，理解核心內容。",
        "2. 分析問題的含義，逐一比較選項。",
        "3. 排除不符合條件的選項。",
        '4. 清晰地給出 "最終答案：" 並僅返回一個數字選項 (1, 2, 3, 或 4)。',
    ]
    pieces = ["以下是一段文章和一個問題，請逐步分析文章和每個選項的內容，並選出最符合邏輯的正確答案。"]
    pieces.extend(indent + step for step in steps)
    pieces.extend([
        "",
        indent + "### Instruction:",
        indent + f"{instruction}",
        "",
        indent + "### Input:",
        indent + f"{input_text}",
        "",
        indent + "### Response:",
    ])
    return "\n".join(pieces)

def ask(instruction, input_text="", max_new_tokens=64):
    """
    Query the ChatGLM2 model once and return the chosen option.

    Args:
        instruction: task instruction inserted into the prompt.
        input_text: article / question / options text inserted into the prompt.
        max_new_tokens: budget for newly generated tokens (the prompt is not
            counted against it).

    Returns:
        One of "1", "2", "3", "4" as a string. The digit following a
        "最終答案：" marker is preferred; otherwise the first 1-4 digit found in
        the response is used; "1" is returned if the response has no such digit.
    """
    prompt = generate_prompt(instruction, input_text)
    # The budget must be passed as max_new_tokens. It used to be passed as
    # `max_length`, which in Hugging Face generation caps prompt + reply together,
    # so a long article prompt left no room for an answer.
    # NOTE(review): this relies on chat() forwarding extra keyword arguments to
    # generate(), as the THUDM/chatglm2-6b remote code does — confirm for the
    # revision in use.
    response, _ = model.chat(tokenizer, prompt, max_new_tokens=max_new_tokens, temperature=0.8, top_p=0.85)

    # Prefer an explicit final-answer marker. The pattern accepts the traditional
    # or simplified spelling, a full-width or ASCII colon, and optional spaces.
    match = re.search(r"最[終终]答案\s*[:：]\s*([1234])", response)
    if match:
        return match.group(1)

    # Fallback: first valid option digit anywhere in the response, defaulting to "1".
    numbers = re.findall(r"[1234]", response)
    return numbers[0] if numbers else "1"

# ---------------------------
# 3) Read the test set and run inference
# ---------------------------
if __name__ == "__main__":
    df = pd.read_excel("AI1000.xlsx")  # test set

    def _cell_text(value):
        """Return a spreadsheet cell as stripped text; empty (NaN) cells become ""."""
        # str(NaN) would otherwise inject the literal text "nan" into the prompt.
        return "" if pd.isna(value) else str(value).strip()

    # The instruction is the same for every question, so build it once.
    instruction = (
        "請仔細閱讀以下文章和問題，並比較每個選項的內容。"
        "根據文章中的信息，逐一排除不符合邏輯的選項，最後選擇最符合條件的正確答案（1、2、3 或 4）。"
        "請注意，只允許輸出一個數字答案，並避免任何解釋或多餘的字符。"
    )

    results = []

    for idx, row in df.iterrows():
        q_id = row["題號"]
        article = _cell_text(row["文章"])
        question = _cell_text(row["問題"])
        option1 = _cell_text(row["選項1"])
        option2 = _cell_text(row["選項2"])
        option3 = _cell_text(row["選項3"])
        option4 = _cell_text(row["選項4"])

        # Assemble the input_text block passed to the prompt.
        input_text = (
            f"【文章】{article}\n"
            f"【問題】{question}\n"
            f"【選項】\n1) {option1}\n2) {option2}\n3) {option3}\n4) {option4}"
        )

        # Sample the model three times and take a majority vote.
        answers = [ask(instruction, input_text, max_new_tokens=128) for _ in range(3)]
        # Scan the list rather than a set so ties resolve to the earliest sampled
        # answer; set iteration order for strings can differ between processes.
        final_answer = max(answers, key=answers.count)

        print(f"ID={q_id}, Answer={final_answer}")
        results.append([q_id, final_answer])

    # ---------------------------
    # 4) Write the CSV file
    # ---------------------------
    with open("Kaggle-sample.csv", "w", newline="", encoding="utf-8") as fw:
        writer = csv.writer(fw)
        writer.writerow(["ID", "Answer"])
        writer.writerows(results)

    print("推理完成，答案已寫入 Kaggle-sample.csv")