In [1]:
import json
import pandas as pd
import torch
import os
from tqdm.notebook import tqdm
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
from huggingface_hub import login
from datasets import Dataset
from evaluate import load

# 載入數據
with open('train.json', 'r') as f:
    train_data = [json.loads(line) for line in f]
    
with open('test.json', 'r') as f:
    test_data = [json.loads(line) for line in f]
    
# 轉換為 DataFrame
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

# 登錄到 Hugging Face
login("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")

# 設置輸出目錄
output_dir = "./model_checkpoint"
os.makedirs(output_dir, exist_ok=True)

# 模型名稱
model_name = "meta-llama/Llama-3.1-8B-Instruct"

# 初始化標記器
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

In [2]:
# 設置量化配置
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True
)

# 加載模型
model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    quantization_config=quantization_config,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
)

# 設置 LoRA 配置
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,  # 降低 rank 以減少顯存需求
    lora_alpha=16,
    lora_dropout=0,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
)

# 獲取 PEFT 模型
model = get_peft_model(model, peft_config)

# 準備訓練提示
train_prompts = []
for intro, abstract in zip(train_df["introduction"], train_df["abstract"]):
    train_prompts.append({
        "text": f"Generate an abstract for the following paper introduction:\n\n{intro}\n\nAbstract: {abstract}"
    })

# 轉換為數據集格式
train_dataset = Dataset.from_list(train_prompts)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
# 數據處理函數
def preprocess_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=1536,
        padding="max_length",
    )

# 對數據集應用預處理
tokenized_train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["text"],
)

# 設置訓練參數
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-4,
    save_strategy="epoch",
    logging_steps=10,
    fp16=True,
    optim="adamw_torch",
)

# 初始化訓練器
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)

# 訓練模型
trainer.train()

# 保存模型
model.save_pretrained(os.path.join(output_dir, "final_model"))
tokenizer.save_pretrained(os.path.join(output_dir, "final_model"))


Map:   0%|          | 0/408 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mhanry5517[0m ([33mhanry5517-national-yang-ming-chiao-tung-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,2.0538
20,1.9891
30,1.9236
40,2.0211
50,1.9502
60,1.968
70,1.9808
80,1.9475
90,1.8984
100,1.9429


('./model_checkpoint/final_model/tokenizer_config.json',
 './model_checkpoint/final_model/special_tokens_map.json',
 './model_checkpoint/final_model/tokenizer.json')

In [4]:
# 釋放 CUDA 緩存
import gc

torch.cuda.empty_cache()
gc.collect()  # 這會強制執行 Python 的垃圾回收

# 打印當前 GPU 內存使用情況（確認清理成功）
print(f"內存使用情況: {torch.cuda.memory_allocated() / 1024**3:.2f} GB / {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

# 睡眠幾秒鐘，讓系統有時間完全釋放資源
import time
time.sleep(5)

內存使用情況: 8.58 GB / 23.64 GB


In [None]:
model_path = os.path.join(output_dir, "final_model")
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto"
)
model = PeftModel.from_pretrained(base_model, model_path)

with open('submit.json', 'w') as f:
    pass

# 生成預測
predictions = []

for i, intro in enumerate(tqdm(test_df["introduction"], desc="Generating abstracts")):
    # 準備提示文本
    prompt = f"Generate an abstract for the following paper introduction:\n\n{intro}\n\nAbstract:"
    
    # 編碼輸入
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1536).to(model.device)
    
    # 生成輸出
    output = model.generate(
        **inputs, 
        max_new_tokens=512,
        num_beams=2,
        no_repeat_ngram_size=3,
        early_stopping=False
    )
    
    # 解碼輸出
    pred_text = tokenizer.decode(output[0], skip_special_tokens=True)
    # 提取摘要部分
    abstract = pred_text.split("Abstract:")[-1].strip()
    predictions.append(abstract)
    
    # 寫入當前生成的摘要到文件
    with open('submit.json', 'a') as f:
        json_line = {
            "paper_id": int(test_df.iloc[i]["paper_id"]),
            "abstract": str(abstract)
        }
        f.write(json.dumps(json_line) + '\n')
    
    # 釋放內存
    torch.cuda.empty_cache()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Generating abstracts:   0%|          | 0/103 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for