In [1]:
from ragna.core import Assistant, PackageRequirement, Source


class AiroborosAssistant(Assistant):
    """Ragna assistant that answers with a locally loaded, GPTQ-quantized model.

    The tokenizer and model are loaded in ``__init__`` using the string returned
    by :meth:`display_name` as the model identifier. Inputs are moved to CUDA
    before generation, so a CUDA device is required (see :meth:`is_available`).
    """

    @classmethod
    def display_name(cls) -> str:
        """Return the assistant's name.

        The same string is passed to ``from_pretrained`` / ``from_quantized``
        in ``__init__``, so it must remain a valid model identifier.
        """
        return "TheBloke/Airoboros-L2-7B-2.2-GPTQ"

    @classmethod
    def requirements(cls) -> list[PackageRequirement]:
        """Declare the packages this assistant depends on."""
        return [
            PackageRequirement("torch"),
            PackageRequirement("optimum"),
            PackageRequirement("auto-gptq"),
            # Imported directly in __init__, so declare it explicitly instead of
            # relying on it being pulled in by another package.
            PackageRequirement("transformers"),
        ]

    @classmethod
    def is_available(cls) -> bool:
        """Return whether the assistant can be used in this environment.

        True only if the base-class availability check passes and
        ``torch.cuda.is_available()`` reports a usable CUDA device.
        """
        if not super().is_available():
            return False

        # Deferred import: only reached after the base-class check succeeded,
        # which presumably verifies the declared requirements (torch included).
        import torch

        return torch.cuda.is_available()

    def __init__(self, config):
        """Load the tokenizer and the quantized model.

        Args:
            config: Passed through unchanged to the base-class constructor.
        """
        super().__init__(config)

        # Deferred imports so that merely defining the class does not require
        # these packages to be installed.
        from auto_gptq import AutoGPTQForCausalLM
        from transformers import AutoTokenizer

        # Use the display name explicitly rather than relying on how the base
        # class happens to implement ``str(self)``.
        model_name = self.display_name()

        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        self.model = AutoGPTQForCausalLM.from_quantized(
            model_name,
            device_map="auto",
            use_triton=False,
            use_safetensors=True,
            trust_remote_code=False,
            inject_fused_attention=False,
        )

    @property
    def max_input_size(self) -> int:
        """Maximum input size reported for this assistant.

        FIXME: 1024 is a placeholder; it is not derived from the loaded model
        and is not enforced anywhere in this class.
        """
        return 1024

    def answer(
        self, prompt: str, sources: list[Source], *, max_new_tokens: int = 256
    ) -> str:
        """Generate an answer to ``prompt`` grounded in ``sources``.

        Args:
            prompt: The user's question.
            sources: Sources whose ``content`` is listed in the model prompt.
                May be empty, in which case no source bullets are emitted.
            max_new_tokens: Upper bound on the number of generated tokens.

        Returns:
            The generated text with special tokens removed and surrounding
            whitespace stripped.
        """
        # Build the prompt line by line so that the source-code indentation of
        # this method does not leak into the text the model sees.
        source_list = "\n".join(f"- {source.content}" for source in sources)
        templated_prompt = "\n".join(
            [
                "A chat about the content of documents.",
                "Only use the content listed below to answer any questions from the user.",
                "Do not make up information.",
                "If you can't answer a question based on the information you are given, just say so.",
                "",
                source_list,
                "",
                f"USER: {prompt}",
                # NOTE(review): the single trailing space is kept from the
                # original template; confirm against the model's prompt format.
                "ASSISTANT: ",
            ]
        )

        input_ids = self.tokenizer(
            templated_prompt, return_tensors="pt"
        ).input_ids.cuda()
        output_ids = self.model.generate(
            inputs=input_ids,
            do_sample=False,  # greedy decoding for reproducible answers
            max_new_tokens=max_new_tokens,
        )

        # The generated sequence starts with the prompt tokens. Slice them off
        # instead of searching the decoded text for "ASSISTANT:", which would
        # cut the answer short if the model itself emits that marker.
        new_token_ids = output_ids[0, input_ids.shape[-1] :]
        return self.tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()


# Fail early, with an actionable message, if the assistant cannot run here.
assert AiroborosAssistant.is_available(), (
    "AiroborosAssistant.is_available() returned False; check that the packages "
    "listed in AiroborosAssistant.requirements() are installed and that "
    "torch.cuda.is_available() is True"
)

platform/c++/implementation/internal.cpp:205:reinit_singlethreaded(): Reinitialising as single-threaded.


In [2]:
from ragna.core import Rag
from ragna.source_storages import RagnaDemoSourceStorage

rag = Rag()

path = "ragna.txt"
with open(path, "w") as file:
    file.write(
        "Ragna is an OSS app for RAG workflows that offers a Python and REST API as well as web UI\n"
    )

async with rag.chat(
    documents=[path],
    source_storage=RagnaDemoSourceStorage,
    assistant=AiroborosAssistant,
) as chat:
    prompt = "What is Ragna?"
    message = await chat.answer(prompt)
    answer = message.content

print(f"User: {prompt}")
print(f"Assistant: {answer}")

skip module injection for FusedLlamaMLPForQuantizedModel not support integrate without triton yet.


User: What is Ragna?
Assistant: Ragna is an open-source application for RAG workflows. It offers a Python and REST API as well as a web UI.
