In [None]:
!pip install bitsandbytes

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

In [None]:
# Hugging Face Hub identifier of the checkpoint to load.
model_id = "google/gemma-2-2b"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# 4-bit NF4 weight quantization without double quantization.
# The compute dtype is kept identical to the `torch_dtype` passed to
# `from_pretrained` below, so the quantized matmuls run in the same precision
# as the rest of the model instead of mixing float16 and bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
)

# device_map="auto" lets the library decide where to place the weights.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
)

In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset

# Maximum number of examples to run inference on.
NUM_EVAL_SAMPLES = 100

# Load the *test* split of the RAG dataset.
datasets = load_dataset('neural-bridge/rag-dataset-12000', split='test')
print(f"Test dataset size: {len(datasets)} ")

# Keep only the first NUM_EVAL_SAMPLES rows; clamp to the split size so that
# `select` never receives an out-of-range index.
datasets = datasets.select(range(min(NUM_EVAL_SAMPLES, len(datasets))))
print(len(datasets))

In [None]:
from tqdm import tqdm
import json

# Inference: build a prompt per example, generate greedily, and collect the
# model's answer alongside the reference answer.
results = []
for dt in tqdm(datasets):
    context = dt["context"]
    question = dt["question"]
    answer = dt["answer"]

    prompt = f"""Given the following passage, answer the related question.\n### Passage\n{context}\n### Question\n{question}\n### Answer\n"""

    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
    # Number of prompt tokens; generate() returns the prompt followed by the
    # newly generated tokens, so this is where the answer starts.
    prompt_len = inputs["input_ids"].shape[1]

    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        use_cache=True,
        do_sample=False,  # deterministic greedy decoding
        repetition_penalty=1.2,
    )

    # Decode only the generated continuation. Splitting the full decoded text
    # on the "### Answer" marker would drop part of the prediction whenever the
    # model emits that marker again, and depends on the prompt text surviving
    # a tokenize/decode round trip unchanged.
    prediction = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

    results.append({"question": question, "output": prediction, "answer": answer})

In [None]:
# Persist the collected predictions as JSON Lines: one JSON object per line,
# with non-ASCII characters written as-is (UTF-8) rather than escaped.
output_path = "gemma-2-2b_output.jsonl"
with open(output_path, mode='w', encoding='utf-8') as out_file:
    out_file.writelines(
        json.dumps(record, ensure_ascii=False) + '\n'
        for record in results
    )