In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import pandas as pd
import gc
from tqdm import tqdm

def load_model(model_path, base_model_name):
    """Load the tokenizer and a 4-bit base model, then merge the LoRA adapter.

    Args:
        model_path: Location of the fine-tuned PEFT adapter, passed to
            ``PeftModel.from_pretrained``.
        base_model_name: Identifier of the base checkpoint; used to load
            both the tokenizer and the base model.

    Returns:
        A ``(model, tokenizer)`` tuple, where ``model`` is the result of
        calling ``merge_and_unload()`` on the PEFT-wrapped base model.
    """
    print(f"Loading tokenizer from {base_model_name}")
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    # Reuse the EOS token for padding and pad on the right-hand side.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    print(f"Loading base model from {base_model_name}")
    # NF4 4-bit quantization with double quantization; compute in bfloat16.
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )
    load_kwargs = {
        "quantization_config": quantization,
        "device_map": "auto",
        "torch_dtype": torch.bfloat16,
        "low_cpu_mem_usage": True,
    }
    backbone = AutoModelForCausalLM.from_pretrained(base_model_name, **load_kwargs)

    print(f"Loading fine-tuned model from {model_path}")
    adapted = PeftModel.from_pretrained(backbone, model_path)

    # Fold the LoRA weights into the base model and drop the adapter wrapper.
    return adapted.merge_and_unload(), tokenizer

def prepare_prompt(dialogue):
    """Wrap a dialogue in a single-turn chat asking for a summary.

    Args:
        dialogue: The dialogue text to be summarized.

    Returns:
        A one-element list holding a ``{"role": "user", "content": ...}``
        message whose content is the Korean summarization instruction,
        a blank line, and then the dialogue.
    """
    user_turn = {
        "role": "user",
        "content": "다음 대화를 요약해주세요.\n\n{}".format(dialogue),
    }
    return [user_turn]

def generate_summary(model, tokenizer, dialogue, max_new_tokens=None):
    """Generate a summary of ``dialogue`` with greedy decoding.

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer paired with ``model``; must provide a chat
            template and the ``<|eot_id|>`` token.
        dialogue: The dialogue text to summarize.
        max_new_tokens: Optional cap on the number of generated tokens.
            When ``None`` (the default) the length limit comes from the
            model's own generation configuration, as before.

    Returns:
        The generated summary text, stripped of surrounding whitespace.
        Only the newly generated tokens are decoded, so the prompt is never
        part of the result.
    """
    prompt = prepare_prompt(dialogue)
    prompt_text = tokenizer.apply_chat_template(
        prompt, tokenize=False, add_generation_prompt=True
    )

    # The rendered chat template already carries its own special tokens
    # (Llama-3 style templates start with BOS). Tokenizing it again with the
    # default add_special_tokens=True would prepend a second BOS.
    encoded = tokenizer(
        prompt_text, return_tensors="pt", add_special_tokens=False
    ).to(model.device)
    prompt_length = encoded["input_ids"].shape[-1]

    # Stop on either the regular EOS token or the end-of-turn token.
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.encode("<|eot_id|>", add_special_tokens=False)[-1],
    ]

    generate_kwargs = {"do_sample": False, "eos_token_id": terminators}
    if tokenizer.pad_token_id is not None:
        # Passing the pad id explicitly avoids the per-call fallback warning.
        generate_kwargs["pad_token_id"] = tokenizer.pad_token_id
    if max_new_tokens is not None:
        generate_kwargs["max_new_tokens"] = max_new_tokens

    # **encoded forwards the attention mask alongside the input ids.
    outputs = model.generate(**encoded, **generate_kwargs)

    # Slice off the prompt tokens instead of splitting the decoded text on
    # the word "assistant", which would truncate any summary containing it.
    generated_ids = outputs[0][prompt_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

# Base checkpoint and the directory where the fine-tuned model was saved.
base_model_name = "MLP-KTLim/llama-3-Korean-Bllossom-8B"
model_path = "./fine_tuned_model"

model, tokenizer = load_model(model_path, base_model_name)

test_df = pd.read_csv("./data/test.csv")

# Summarize each test dialogue in turn, collecting results in input order.
summaries = []
print("추론 시작...")
progress = tqdm(test_df['dialogue'], desc="Generating summaries")
for dialogue in progress:
    summary = generate_summary(model, tokenizer, dialogue)
    summaries.append(summary)

    # Memory cleanup after every sample: release cached CUDA memory, then
    # run the garbage collector.
    torch.cuda.empty_cache()
    gc.collect()

# Save the results: one row per test file name with its generated summary.
output_df = pd.DataFrame({'fname': test_df['fname'], 'summary': summaries})
output_df.to_csv("output.csv", index=False)
print("추론 완료. 결과가 output.csv에 저장되었습니다.")