In [None]:
# Dependencies for the fine-tuning and inference cells.
# Version specifiers containing ">" or "<" must be quoted: unquoted, the shell
# treats ">=4.28.0" as an output redirection, silently dropping the constraint
# and writing pip's output to a file named "=4.28.0".
!pip install pandas openpyxl datasets
!pip install torch "transformers>=4.28.0" peft==0.4.0 sentencepiece accelerate bitsandbytes==0.38.1

In [None]:
import os
import json
import torch
from datasets import Dataset
from transformers import (
    LlamaTokenizer,
    LlamaForCausalLM,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, get_peft_model

# --------------- A) Hyperparameters ---------------
BATCH_SIZE = 2  # per-device training batch size passed to TrainingArguments
EPOCHS = 5  # number of training epochs
LR = 1e-4  # learning rate
MAX_LENGTH = 256  # token length each example is truncated/padded to in tokenize_fn
OUTPUT_DIR = "chinese-alpaca-pro-33b-out"  # output directory for checkpoints and the final save
BASE_MODEL = "minlik/chinese-alpaca-pro-33b-merged"  # identifier passed to from_pretrained for tokenizer and model

# --------------- B) Prompt 設計 ---------------
def generate_prompt(text):
    """
    Wrap ``text`` in the Alpaca-style instruction template.

    The returned prompt ends with the "### Response:" marker, so the caller
    appends (training) or generates (inference) the answer right after it.
    """
    template = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n"
        "\n"
        "### Instruction:\n"
        "{text}\n"
        "\n"
        "### Response:"
    )
    return template.format(text=text)

def format_prompt_and_labels(example):
    """
    Build the training prompt and the target text for one record.

    Args:
        example: mapping with an "instruction" and an "output" entry and an
            optional "input" entry holding extra context.

    Returns:
        Tuple ``(prompt, output_text)``. When "input" is missing, ``None`` or
        only whitespace, the prompt is built from the instruction alone;
        otherwise instruction and input are joined with a blank line.
    """
    instruction = example["instruction"]
    # "input" may be present but null (e.g. a JSON ``null``); ``.get`` with a
    # default does not cover that case, so normalise None to "" explicitly.
    input_text = example.get("input") or ""
    output_text = example["output"]

    if input_text.strip():
        prompt = generate_prompt(f"{instruction}\n\n{input_text}")
    else:
        prompt = generate_prompt(instruction)

    return prompt, output_text

# --------------- C) 加載數據集 ---------------
def load_data(json_path):
    """Read the UTF-8 encoded JSON file at ``json_path`` and return the parsed content."""
    with open(json_path, mode="r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def tokenize_fn(example):
    """
    Tokenize one training record into fixed-length inputs and loss labels.

    The prompt, the answer and an end-of-sequence token are concatenated and
    truncated/padded to MAX_LENGTH. ``labels`` mirrors ``input_ids`` except that
    every position that must not contribute to the loss is set to -100:
    the prompt tokens and all padding positions.

    Args:
        example: mapping accepted by ``format_prompt_and_labels``.

    Returns:
        The tokenizer encoding with an added "labels" list whose length equals
        that of "input_ids".
    """
    prompt, answer = format_prompt_and_labels(example)
    # Terminate the answer with EOS so the model is trained to stop after it.
    full_text = prompt + answer + (tokenizer.eos_token or "")

    encoding = tokenizer(
        full_text,
        max_length=MAX_LENGTH,
        truncation=True,
        padding="max_length",
    )
    prompt_len = len(tokenizer(prompt, truncation=True, max_length=MAX_LENGTH)["input_ids"])

    attention = encoding["attention_mask"]
    # The tokenizer may pad on either side; locate the first real token so the
    # prompt mask lands on the prompt rather than on leading padding.
    first_real = attention.index(1) if 1 in attention else len(attention)
    prompt_end = min(first_real + prompt_len, len(attention))

    # Keep a label only for real (attended) tokens that come after the prompt.
    encoding["labels"] = [
        token_id if (attended == 1 and position >= prompt_end) else -100
        for position, (token_id, attended) in enumerate(zip(encoding["input_ids"], attention))
    ]
    return encoding

# --------------- D) Model loading and fine-tuning ---------------
print("Loading base model and tokenizer...")
tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)
# tokenize_fn pads every example to MAX_LENGTH, so a pad token must exist.
tokenizer.add_special_tokens({"pad_token": "[PAD]"})
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids("[PAD]")

model = LlamaForCausalLM.from_pretrained(
    BASE_MODEL,
    load_in_8bit=True,
    device_map="auto",
)
# Keep the embedding matrix in sync with the tokenizer in case "[PAD]" was new.
model.resize_token_embeddings(len(tokenizer))

# LoRA configuration: low-rank adapters on the attention query/value projections.
# NOTE(review): the 8-bit base model is wrapped without
# peft.prepare_model_for_kbit_training — confirm that is intentional.
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)

# Prepare the dataset. The raw text columns are dropped after tokenization so
# only model inputs (input_ids / attention_mask / labels) reach the Trainer.
train_data_list = load_data("train.json")
train_dataset = Dataset.from_list(train_data_list)
train_dataset = train_dataset.map(
    tokenize_fn,
    remove_columns=train_dataset.column_names,
)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=4,
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    fp16=True,
    logging_steps=50,
    save_steps=200,
    save_total_limit=5,
    disable_tqdm=False,
    logging_strategy="steps",
    report_to=[],
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

print("Starting LoRA fine-tuning...")
trainer.train()
# Save through the PEFT model itself so that only the adapter weights and
# config are written, independent of how the installed transformers release
# handles PEFT-wrapped models inside Trainer.save_model.
model.save_pretrained(OUTPUT_DIR)
# Store the tokenizer (with its pad token) next to the adapter so inference can
# reload exactly the vocabulary used for training.
tokenizer.save_pretrained(OUTPUT_DIR)
print("LoRA fine-tuning is complete! The adapter is saved at:", OUTPUT_DIR)

In [None]:
import pandas as pd
import csv
import torch
import re
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig

# ---------------------------
# 1) Load the base model
# ---------------------------
base_model_name = "minlik/chinese-alpaca-pro-33b-merged"

# NOTE(review): only the base checkpoint is loaded in this cell; the LoRA
# adapter saved by the fine-tuning cell is never applied — confirm intended.
tokenizer = LlamaTokenizer.from_pretrained(base_model_name)

# Half-precision weights, placed across the available devices automatically;
# the model is switched to evaluation mode right away since this cell only
# runs inference.
model = LlamaForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)
model.eval()

def generate_prompt(instruction, input_text=""):
    """
    Build the inference prompt: a fixed Chinese preamble followed by the
    "### Instruction:", "### Input:" and "### Response:" sections.
    """
    preamble = (
        '以下是一個問題，請逐步分析並提供每個選項的比較，然後得出最符合邏輯的答案。'
        '請在最後明確給出 "最終選擇：" 和數字答案。'
    )
    layout = (
        "{preamble}\n"
        "\n"
        "### Instruction:\n"
        "{instruction}\n"
        "\n"
        "### Input:\n"
        "{input_text}\n"
        "\n"
        "### Response:"
    )
    return layout.format(
        preamble=preamble,
        instruction=instruction,
        input_text=input_text,
    )

def ask(instruction, input_text="", max_new_tokens=64):
    """
    Query the model once and return the option it selects.

    Sampling is enabled, so repeated calls with the same arguments can return
    different answers; this is what makes majority voting over several calls
    meaningful.

    Args:
        instruction: text for the "### Instruction:" section of the prompt.
        input_text: text for the "### Input:" section of the prompt.
        max_new_tokens: upper bound on the number of generated tokens.

    Returns:
        One of "1", "2", "3" or "4" as a string. "1" is returned as a fallback
        when no choice can be parsed from the generated text.
    """
    prompt = generate_prompt(instruction, input_text)
    # With device_map="auto" the first layer is not guaranteed to be on "cuda";
    # place the inputs wherever the model expects them.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    gen_config = GenerationConfig(
        # temperature / top_p are only honoured when sampling is switched on.
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        num_beams=4,
        max_new_tokens=max_new_tokens,
    )

    with torch.no_grad():
        output_ids = model.generate(**inputs, generation_config=gen_config)

    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    raw_answer = output.split("### Response:")[-1].strip()

    # Preferred form: the labelled final choice, tolerating optional whitespace
    # and either a full-width or a half-width colon.
    labelled = re.search(r"最終選擇\s*[：:]\s*([1234])", raw_answer)
    if labelled:
        return labelled.group(1)

    # The instruction also allows a bare digit as the whole answer; accept a
    # reply that starts with a single option number.
    bare = re.match(r"([1234])(?!\d)", raw_answer)
    if bare:
        return bare.group(1)

    return "1"  # default when nothing could be parsed

# ---------------------------
# 2) Read the test set and run inference
# ---------------------------
def _cell_text(value):
    """Return a spreadsheet cell as stripped text; empty cells (NaN/None) become ""."""
    return "" if pd.isna(value) else str(value).strip()


if __name__ == "__main__":
    from collections import Counter

    df = pd.read_excel("AI1000.xlsx")  # test-set file

    # The instruction is identical for every question, so build it once.
    instruction = (
        "請仔細閱讀以下文章和問題，並比較每個選項的內容。"
        "根據文章中的信息，逐一排除不符合邏輯的選項，最後選擇最符合條件的正確答案（1、2、3 或 4）。"
        "請注意，只允許輸出一個數字答案，並避免任何解釋或多餘的字符。"
    )

    results = []

    for idx, row in df.iterrows():
        q_id = row["題號"]
        # Empty cells are read as NaN; _cell_text keeps the literal text "nan"
        # out of the prompt.
        article = _cell_text(row["文章"])
        question = _cell_text(row["問題"])
        option1 = _cell_text(row["選項1"])
        option2 = _cell_text(row["選項2"])
        option3 = _cell_text(row["選項3"])
        option4 = _cell_text(row["選項4"])

        # Assemble the "### Input:" section: article, question and the four options.
        input_text = (
            f"【文章】{article}\n"
            f"【問題】{question}\n"
            f"【選項】\n1) {option1}\n2) {option2}\n3) {option3}\n4) {option4}"
        )

        # Ask the model three times and take the majority answer. Counter keeps
        # first-seen order, so ties resolve to the earliest answer instead of
        # depending on set iteration order.
        answers = [ask(instruction, input_text, max_new_tokens=64) for _ in range(3)]
        final_answer = Counter(answers).most_common(1)[0][0]

        print(f"ID={q_id}, Answer={final_answer}")
        results.append([q_id, final_answer])

    # ---------------------------
    # 3) Write the CSV file
    # ---------------------------
    with open("Kaggle-sample.csv", "w", newline="", encoding="utf-8") as fw:
        writer = csv.writer(fw)
        writer.writerow(["ID", "Answer"])
        writer.writerows(results)

    print("推理完成，答案已寫入 Kaggle-sample.csv")