# Evaluating HF models fine-tuned on QA on RepLiQA

In [None]:
# Consider using the optional_requirements.txt file to set up an environment to run this notebook.

import gc
from typing import Any, Dict, Optional

import datasets
import torch
import tqdm
import transformers

## Get RepLiQA
Here we only use the `repliqa_0` split.

In [None]:
# Fetch the dataset from the Hugging Face Hub, then keep only the `repliqa_0` split.
repliqa_splits = datasets.load_dataset("ServiceNow/repliqa")
repliqa = repliqa_splits["repliqa_0"]

## Pre-processors

### QA processor

In [None]:
class BaselineQAEvaluator:
    """Answer QA examples with a Hugging Face causal language model.

    An instance is callable on a single example dict and returns a dict of
    new fields, so it can be passed directly to ``datasets.Dataset.map``.
    """

    def __init__(
        self,
        model_name: str = "nvidia/Llama3-ChatQA-1.5-8B",
        cache_path: Optional[str] = None,
        max_length: int = 10_000,
        max_new_tokens: int = 128,
    ) -> None:
        """Load the model and its tokenizer.

        Args:
            model_name: Hugging Face identifier of the model to evaluate.
            cache_path: Directory forwarded as ``cache_dir`` when loading the
                model and tokenizer; ``None`` uses the library default.
            max_length: Maximum number of prompt tokens passed to the model.
            max_new_tokens: Maximum number of tokens generated per answer.
        """
        self.model_name = model_name
        self.model = transformers.AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map="auto",
            cache_dir=cache_path,
        ).eval()
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(
            model_name,
            truncation=True,
            cache_dir=cache_path,
        )
        self.terminators = [
            self.tokenizer.eos_token_id,
            # As per 'How to use' in https://huggingface.co/nvidia/Llama3-ChatQA-1.5-8B
            self.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
        ]
        self.max_length = max_length
        self.max_new_tokens = max_new_tokens
        # Inputs are moved to the device that holds the model's first parameter.
        self.device = next(self.model.parameters()).device

    def __call__(self, example: Dict[str, Any]) -> Dict[str, Any]:
        """Generate an answer for one example.

        Args:
            example: Dict with a ``"question"`` entry and a context entry
                (see ``_make_inputs``).

        Returns:
            Dict with the decoded generation (``"answer_pred"``), its
            post-processed form (``"cleaned_answer_pred"``) and the model
            identifier (``"model_name"``).
        """
        tokenized_inputs = self._make_inputs(example)
        prompt_length = tokenized_inputs["input_ids"].shape[-1]

        # torch.autocast(device_type="cuda") replaces the deprecated
        # torch.cuda.amp.autocast() spelling of the same context manager.
        with torch.no_grad(), torch.autocast(device_type="cuda"):
            outputs = self.model.generate(
                **tokenized_inputs,
                max_new_tokens=self.max_new_tokens,
                eos_token_id=self.terminators,
            )

        # generate() returns prompt + continuation; keep only the continuation.
        response = outputs[0][prompt_length:]
        raw_response = self.tokenizer.decode(response, skip_special_tokens=True)

        return {
            "answer_pred": raw_response,
            "cleaned_answer_pred": self.process_answer(raw_response),
            "model_name": self.model_name,
        }

    def process_answer(self, raw_answer: str) -> str:
        """Return the text following the last ``":"`` in ``raw_answer``.

        The input is returned unchanged when it contains no colon. Whitespace
        around the result is not stripped.
        """
        return raw_answer.rpartition(":")[-1]

    def _make_inputs(self, example: Dict[str, Any]) -> Dict[str, Any]:
        """Build the prompt for ``example`` and tokenize it.

        Args:
            example: Dict holding ``"question"`` and either
                ``"document_extracted"`` or
                ``"entity_pages"`` (whose ``["wiki_context"][0]`` is used).

        Returns:
            The tokenizer output for the single prompt, moved to ``self.device``.

        Raises:
            KeyError: If neither context key, or ``"question"``, is present.
        """
        if "document_extracted" in example:
            context_str = example["document_extracted"]
        elif "entity_pages" in example:
            context_str = example["entity_pages"]["wiki_context"][0]
        else:
            raise KeyError(
                "example must contain either 'document_extracted' or 'entity_pages'"
            )

        question_str = example["question"]

        # We followed the prompt format recommended in:
        # https://huggingface.co/nvidia/Llama3-ChatQA-1.5-8B
        # NOTE(review): the third and fourth literals below join without a
        # space ("...in the context.concisely and precisely..."). The text is
        # left byte-identical so results stay comparable with earlier runs;
        # confirm whether the last fragment is intended.
        input_str = (
            "System: This is a chat between a user and an artificial intelligence assistant. "
            "The assistant gives helpful, detailed, and polite answers to the user's "
            "questions based on the context. The assistant should also indicate when the answer cannot be found in the context."
            "concisely and precisely, using information provided by the user."
            f"\n\n{context_str}"
            f"\n\nUser: Please give a full and complete answer for the question. {question_str}"
            "\n\nAssistant: "
        )

        # Truncation is requested explicitly here, at call time, so that
        # `max_length` is applied as an explicit setting.
        # NOTE(review): if the tokenizer truncates on the right, an over-long
        # document would push the question and the "Assistant:" cue out of the
        # prompt; confirm the truncation side is acceptable.
        tokenized_inputs = self.tokenizer(
            [self.tokenizer.bos_token + input_str],
            return_tensors="pt",
            truncation=True,
            max_length=self.max_length,
        ).to(self.device)

        return tokenized_inputs


## Evaluation

### List models to evaluate

In [None]:
# Each entry in `models` should be either 'nvidia/Llama3-ChatQA-1.5-8B'
# or 'nvidia/Llama3-ChatQA-1.5-70B' (see https://huggingface.co/nvidia/Llama3-ChatQA-1.5-70B).
models = [
    "nvidia/Llama3-ChatQA-1.5-8B",
]

### Actual evaluation

In [None]:
# Maps each model identifier to the dataset augmented with that model's predictions.
inference_datasets = {}

for model in tqdm.tqdm(models, desc="Models"):
    qa_pre_processor = BaselineQAEvaluator(model_name=model)
    # The evaluator is callable on one example, so `map` runs it on every row.
    prediction_dataset = repliqa.map(qa_pre_processor)

    inference_datasets[model] = prediction_dataset

    # Drop this evaluator before the next model is loaded so that two models
    # are not kept alive at once, then release cached GPU memory.
    del qa_pre_processor
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Display the mapping from model name to its prediction dataset.
inference_datasets