# Local Models playbook


In [1]:
!pip install -qU "torch" "python-dotenv" "chromadb" "sentence-transformers" "transformers" "psycopg2-binary" "rich" "pydantic"

## General


In [1]:
import abc
from enum import Enum
import logging
import os
from datetime import datetime, timedelta
from typing import List, Dict, Any, Literal
from pprint import pprint
import time

import chromadb
import psutil
import torch
from chromadb import Documents, EmbeddingFunction, Embeddings
from dotenv import load_dotenv, find_dotenv
from jinja2 import Environment, meta
from psycopg2 import connect
from rich import print as rprint
from rich.console import Console
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import get_json_schema
from pydantic import BaseModel, Field

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# load environment variables
load_dotenv(find_dotenv())
console = Console()

# Local models
local_models = {
    "llama-mini": "meta-llama/Llama-3.2-1B-Instruct",
    "llama": "meta-llama/Llama-3.2-3B-Instruct",
    "qwen-mini": "Qwen/Qwen2.5-3B-Instruct",
    "qwen": "Qwen/Qwen2.5-7B-Instruct",
    "gemma-mini": "google/gemma-2-2b-it",
    "gemma": "google/gemma-2-9b-it",
    "phi-mini": "microsoft/Phi-4-mini-instruct",
    "phi": "microsoft/Phi-4-multimodal-instruct",
}

### Checkpoint

In [3]:
# Let's loop through the models and see their chat templates
for model_name, model_id in local_models.items():
    if "mini" in model_name:
        continue
    console.print(f"Model: {model_name}", style="bold green")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    print(tokenizer.get_chat_template())
    console.print("*" * 50, style="bold black")
    # console.print(tokenizer.get_chat_template(), style="Red on White")

{{- bos_token }}
{%- if custom_tools is defined %}
    {%- set tools = custom_tools %}
{%- endif %}
{%- if not tools_in_user_message is defined %}
    {%- set tools_in_user_message = true %}
{%- endif %}
{%- if not date_string is defined %}
    {%- if strftime_now is defined %}
        {%- set date_string = strftime_now("%d %b %Y") %}
    {%- else %}
        {%- set date_string = "26 Jul 2024" %}
    {%- endif %}
{%- endif %}
{%- if not tools is defined %}
    {%- set tools = none %}
{%- endif %}

{#- This block extracts the system message, so we can slot it into the right place. #}
{%- if messages[0]['role'] == 'system' %}
    {%- set system_message = messages[0]['content']|trim %}
    {%- set messages = messages[1:] %}
{%- else %}
    {%- set system_message = "" %}
{%- endif %}

{#- System message #}
{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
{%- if tools is not none %}
    {{- "Environment: ipython\n" }}
{%- endif %}
{{- "Cutting Knowledge Date: December 2023\n" }}
{{- 

{%- if tools %}
    {{- '<|im_start|>system\n' }}
    {%- if messages[0]['role'] == 'system' %}
        {{- messages[0]['content'] }}
    {%- else %}
        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
    {%- endif %}
    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
    {%- for tool in tools %}
        {{- "\n" }}
        {{- tool | tojson }}
    {%- endfor %}
    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
    {%- if messages[0]['role'] == 'system' %}
        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
    {%- else %}
        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba C

{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '
' + message['content'] | trim + '<end_of_turn>
' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model
'}}{% endif %}


{% for message in messages %}{% if message['role'] == 'system' and 'tools' in message and message['tools'] is not none %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|tool|>' + message['tools'] + '<|/tool|>' + '<|end|>' }}{% else %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|end|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% else %}{{ eos_token }}{% endif %}


In [None]:
# Creating dummy functions for testing


def current_time():
    """Get the current local time as a string."""
    return str(datetime.now())


def multiply(a: float, b: float):
    """
    A function that multiplies two numbers

    Args:
        a: The first number to multiply
        b: The second number to multiply
    """
    return a * b


tools = [current_time, multiply]

messages = [
    {"role": "system", "content": "Your name is Iida, You are a helpful assistant."},
    {"role": "user", "content": "Tell me something about large language models."},
    {
        "role": "assistant",
        "content": "Large language models are powerful models that can generate human-like text.",
    },
    {
        "role": "user",
        "content": "Can you show me an example of a large language model?",
    },
]

fallback_messages = [
    {"role": "user", "content": "Hello, how are you?"},
    {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
    {"role": "user", "content": "I'd like to show off how chat templating works!"},
]

In [9]:
# Loop through the local models and see the templates
for model_name, model_id in local_models.items():
    if "mini" in model_name:
        continue
    # Print memory usage
    print(f"Memory usage: {psutil.virtual_memory().percent}%")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    console.print(f"Model: {model_name}", style="red on white")
    try:
        text = tokenizer.apply_chat_template(
            messages, tools=tools, tokenize=False, add_generation_prompt=True
        )
    except Exception as e:
        print(f"Error: {e}")
        text = tokenizer.apply_chat_template(
            fallback_messages, tools=tools, tokenize=False, add_generation_prompt=True
        )
    # Delete the tokenizer to free up memory
    del tokenizer
    console.print(text, style="italic magenta on yellow")
    print("\n\n")

Memory usage: 63.6%





Memory usage: 63.6%





Memory usage: 63.5%


Error: System role not supported





Memory usage: 63.5%







## Base classes


In [4]:
# Defining pydantic model for chat history

class RoleEnum(str, Enum):
    user = "user"
    assistant = "assistant"
    system = "system"

class ChatHistory(BaseModel):
    """
    Pydantic model for chat history.
    """
    role: RoleEnum = Field(..., description="The role of the speaker (user or assistant).")
    content: str = Field(..., description="The content of the message.")

    class Config:
        schema_extra = {
            "example": {
                "role": "user",
                "content": "Hello, how are you?",
            }
        }

* 'schema_extra' has been renamed to 'json_schema_extra'


In [5]:
# Creating a base class for the models, since we will be experimenting with different models which have different requirements
class BaseLLM(abc.ABC):
    """
    Abstract base class for LLM models, defining common functionality.
    """

    def __init__(
        self, model: str, max_history: int = 10, system_prompt: str = "", **kwargs
    ):
        self.logger = logging.getLogger(self.__class__.__name__)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.system_prompt = system_prompt
        self.max_history = max_history
        self.history: List[tuple] = []

        # Load model and tokenizer
        self.tokenizer = None
        self.model = None
        self.rag_prompt: str = None
        self.rag_prompt_template: str = None
        self.tools_prompt_template: str = None
        self.default_prompt_template: str = None
        self.system_template: str = None
        self.user_turn_template: str = None
        self.assistant_turn_template: str = None
        self.assistant_template: str = None
        self.non_sys_prompt_template: str = None
        self.load_model_and_tokenizer(model, **kwargs)
        self.load_rag_prompt()  # This is just defined as a seperate function for keeping the code clean
        self.load_prompt_templates()

    def load_model_and_tokenizer(self, model: str, **kwargs) -> None:
        """
        Loads the tokenizer and model.
        """
        self.logger.info("Initializing tokenizer and model...")
        self.tokenizer = AutoTokenizer.from_pretrained(
            model, torch_dtype=torch.bfloat16, **kwargs
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            model, torch_dtype=torch.bfloat16, **kwargs
        )
        self.model.to(self.device)

        self.logger.info("Loaded model: %s", model)
        self.logger.info("Model type: %s", type(self.model).__name__)
        self.logger.info("Number of parameters: %s", self.model.num_parameters())
        self.logger.info("Device: %s", self.device.type)

    def get_token_count(self, text: str) -> int:
        """
        Gets the token count of the given text.
        """
        return len(self.tokenizer(text)["input_ids"])

    def trim_conversation(self, conversation_history, token_limit) -> List:
        """
        Trims the conversation history to fit within the given token limit.
        """
        total_tokens = 0
        tokenized_history = []

        if not conversation_history:
            return []

        for user, assistant in conversation_history:
            user_tokens = self.get_token_count(user)
            assistant_tokens = self.get_token_count(assistant)
            total_tokens += user_tokens + assistant_tokens
            tokenized_history.append((user, assistant, user_tokens + assistant_tokens))

        while total_tokens > token_limit and tokenized_history:
            removed_entry = tokenized_history.pop(0)
            total_tokens -= removed_entry[2]

        return [(entry[0], entry[1]) for entry in tokenized_history]

    def clear_history(self) -> None:
        """Clears the stored conversation history."""
        self.history = []

    def add_to_history(self, user_input, model_response) -> None:
        """Adds an interaction to history and maintains max history size."""
        _user = {"role": "user", "content": user_input}
        _assistant = {"role": "assistant", "content": model_response}
        self.history.extend([_user, _assistant])
        if len(self.history) > self.max_history:
            self.history.pop(0)

    # Method for getting the templates
    def get_templates(self) -> Dict[str, str]:
        """
        Get the templates from the model.
        """
        self.logger.debug("User turn template: ", self.user_turn_template)
        self.logger.debug("Assistant turn template: ", self.assistant_turn_template)
        self.logger.debug("Assistant template: ", self.assistant_template)
        self.logger.debug("RAG prompt: ", self.rag_prompt)
        self.logger.debug("RAG prompt template: ", self.rag_prompt_template)
        self.logger.debug("Tools prompt template: ", self.tools_prompt_template)
        self.logger.debug("Default prompt template: ", self.default_prompt_template)
        self.logger.debug("Non system prompt template: ", self.non_sys_prompt_template)
        self.logger.debug("System prompt template: ", self.system_template)
        self.logger.debug("Tool calling prompt: ", self.tool_calling_prompt)

        return {
            "user_turn_template": self.user_turn_template,
            "assistant_turn_template": self.assistant_turn_template,
            "assistant_template": self.assistant_template,
            "rag_prompt": self.rag_prompt,
            "rag_prompt_template": self.rag_prompt_template,
            "tools_prompt_template": self.tools_prompt_template,
            "default_prompt_template": self.default_prompt_template,
            "non_sys_prompt_template": self.non_sys_prompt_template,
            "system_prompt_template": self.system_template,
            "tool_calling_prompt": self.tool_calling_prompt,
        }

    def generate_text(
        self,
        prompt: str,
        max_new_tokens: int = 120,
        skip_special_tokens: bool = False,
        **kwargs,
    ) -> str:
        """
        Generates text based on the given prompt.

        Parameters:
        ----------
        prompt : str
            The prompt text to generate text from.
        max_new_tokens : int, optional
            The maximum length of the generated text (default is 120).
        skip_special_tokens : bool, optional
            Flag to indicate if special tokens should be skipped (default is False).

        Returns:
        -------
        str
            The generated text.
        """

        self.logger.info("Generating response for prompt: %s", prompt)
        try:
            with torch.inference_mode():
                self.logger.debug("Tokenizing prompt...", prompt)
                print("Tokenizing prompt...", prompt)
                inputs = self.tokenizer(prompt, return_tensors="pt")
                inputs = {k: v.to(self.device) for k, v in inputs.items()}

                _start_time = time.time()
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    pad_token_id=self.tokenizer.eos_token_id,
                    **kwargs,
                )
                _end_time = time.time()
                self.logger.debug("Time taken: %.2f seconds", _end_time - _start_time)

        except Exception as e:
            self.logger.error("Error generating response: %s", e)
            return "Error generating response"

        decoded_output = self.tokenizer.decode(
            outputs[0], skip_special_tokens=skip_special_tokens
        )
        self.logger.debug("Generated response: %s", decoded_output)
        print("Generated response: ", decoded_output)

        return decoded_output

    @abc.abstractmethod
    def chat(
        self, prompt: str, clear_session: bool = False, **kwargs
    ) -> Dict[str, Any]:
        """
        Abstract method for chatting with the model.
        """
        pass

    def format_prompt(
        self,
        prompt: str,
        system_prompt: str = None,
        tools_schema: str = None,
        documents: List[Dict] = None,
        create_chat_session: bool = False,
        chat_history: List[Dict] = None,
    ) -> str:
        """
        Formats the prompt using the prompt template.
        """

        system_prompt = system_prompt or self.system_prompt
        final_prompt = prompt

        if chat_history:
            print("Formatting prompt with chat history")
            final_prompt = self.bos_token
            self.logger.debug("Formatting prompt with chat history")
            # Look for system prompt in chat history
            system_prompt = next(
                (msg.get('content') for msg in chat_history if msg.get('role') == "system"), None
            )
            if system_prompt:
                final_prompt += (
                    f"\n{self.system_template.format(system_prompt=system_prompt)}"
                )
            # Build the formatted prompt by looping over the chat history
            for msg in chat_history:
                if msg.get('role') == "user":
                    final_prompt += (
                        f"\n{self.user_turn_template.format(user_prompt=msg.get('content'))}"
                    )
                elif msg.get('role') == "assistant":
                    final_prompt += f"\n{self.assistant_turn_template.format(assistant_response=msg.get('content'))}"
            final_prompt += f"\n{self.user_turn_template.format(user_prompt=prompt)}"  
            final_prompt += f"\n{self.assistant_template}"  # Add the assistant template at the end so the model knows it's the assistant's turn
            return final_prompt
        
        if create_chat_session:
            print("Formatting prompt with chat history - create chat session")
            final_prompt = self.bos_token
            self.logger.debug("Formatting prompt with chat history")
            if system_prompt:
                final_prompt += (
                    f"\n{self.system_template.format(system_prompt=system_prompt)}"
                )
            final_prompt += f"\n{self.user_turn_template.format(user_prompt=prompt)}"
            final_prompt += f"\n{self.assistant_template}"
            return final_prompt

        if tools_schema:
            print("Formatting prompt with tool schema", tools_schema)
            self.logger.debug("Formatting prompt with tool schema")
            formatted_prompt = self.tool_calling_prompt.format(functions_definition=tools_schema)
            system_prompt = formatted_prompt
            final_prompt = self.tools_prompt_template.format(
                system_prompt=system_prompt, user_prompt=prompt
            )

            return final_prompt

        if documents:
            print("Formatting prompt with documents")
            required_keys = {"reference", "content"}
            assert all(
                required_keys.issubset(doc.keys()) for doc in documents
            ), "Documents must contain 'reference' and 'content' keys."

            self.logger.debug("Formatting prompt with documents")
            _documents = "\n".join(
                [
                    f"**Document {doc['reference']}**: {doc['content']}"
                    for doc in documents
                ]
            )
            formatted_prompt = self.rag_prompt.format(
                documents=_documents, question=prompt
            )
            system_prompt = formatted_prompt

            final_prompt = self.rag_prompt_template.format(
                system_prompt=system_prompt, user_prompt=prompt
            )

            return final_prompt

        if system_prompt:
            self.logger.debug("Formatting prompt with system prompt")
            final_prompt = self.default_prompt_template.format(
                system_prompt=system_prompt, user_prompt=prompt
            )
        else:
            final_prompt = self.non_sys_prompt_template.format(user_prompt=prompt)

        return final_prompt

    def __call__(self, prompt: str, **kwargs) -> str:
        """
        Enables direct inference by calling the model instance.
        """
        return self.generate_response(prompt, **kwargs)

    def __repr__(self):
        """
        Official string representation for debugging.
        """
        return f"{self.__class__.__name__}(model={self.model.name_or_path!r}, device={self.device})"

    def __str__(self):
        """
        User-friendly string representation.
        """
        return f"{self.__class__.__name__} running on {self.device.type}, max history: {self.max_history}"

    def __len__(self):
        """
        Returns the number of stored conversation history entries.
        """
        return len(self.history)

    def __getitem__(self, index):
        """
        Retrieves conversation history entries like an array.
        """
        return self.history[index]

    def load_rag_prompt(self):
        """
        Loads the RAG prompt from the model.
        """
        # Check for env variable
        if "RAG_PROMPT" in os.environ:
            self.rag_prompt = os.environ["RAG_PROMPT"]
            self.logger.info("Loaded RAG prompt from environment variable.")
        else:
            self.rag_prompt = (
                self.rag_prompt
            ) = """You are an advanced AI assistant with expertise in retrieving and synthesizing information from provided references. Your role is to analyze the given documents and accurately answer the question based on their content.

## Context:
You will be provided with multiple documents, each containing relevant information. Each document is referenced with a unique identifier. Your response should be derived strictly from the given documents while maintaining clarity and conciseness. If the documents do not contain sufficient information, indicate that explicitly.

## Instructions:
1. **Extract information** only from the provided documents.
2. **Cite references** where applicable by mentioning the document identifier.
3. **Maintain coherence** while summarizing details from multiple sources.
4. **Avoid speculation** or adding external knowledge.
5. **If unclear**, state that the answer is not available in the provided documents.

## Expected Output:
- A **concise and accurate** response based on the referenced documents.
- **Citations** to the corresponding documents where relevant.
- A disclaimer if the answer cannot be found within the given context.

## Documents:
{documents}


## User's Question:
{question}
"""
            self.logger.info("Loaded default RAG prompt.")

    @abc.abstractmethod
    def generate_response(self, prompt: str, **kwargs) -> str:
        """
        Generates a response to the given prompt.
        """
        return self.generate_text(prompt, **kwargs)

    @abc.abstractmethod
    def load_prompt_templates(self):
        """
        Loads the prompt templates from the model.
        """
        pass

## Llama


In [6]:
class LocalLLM(BaseLLM):
    """
    A class to represent a DOTLLM model for text generation.

    Attributes:
    ----------
    model : str
        The model name or path.
    max_history : int, optional
        The maximum number of history entries to keep (default is 5).
    local_files_only : bool, optional
        Flag to indicate if the model is local or remote (default is False).
    tokenizer : AutoTokenizer
        The tokenizer for the model.
    model : AutoModelForCausalLM
        The model for causal language modeling.
    history : list
        The history of text inputs.
    """

    def __init__(
        self,
        model: str = "",
        max_history: int = 10,
        system_prompt: str = None,
        **kwargs,
    ):
        """
        Constructs all the necessary attributes for the DOTLLM object.

        Parameters:
        ----------
        model : str
            The model name or path.
        max_history : int, optional
            The maximum number of history entries to keep (default is 100).
        system_prompt : str, optional
            The system prompt text (default is "You are a helpful AI assistant").
            Note: This is only used if prompt_template is provided.
        kwargs : dict,
            Additional keyword arguments for the model and tokenizer.
        """
        if not model:
            _model = "meta-llama/Llama-3.2-3B-Instruct"
        else:
            _model = model
        self.bos_token = "<|begin_of_text|>"
        self.tool_calling_prompt = """You are an expert in composing functions. You are given a question and a set of possible functions. 
Based on the question, you will need to make one or more function/tool calls to achieve the purpose. 
If none of the function can be used, point it out. If the given question lacks the parameters required by the function,
also point it out. You should only return the function call in tools call sections.

If you decide to invoke any of the function(s), you MUST put it in the format of [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)]\n
You SHOULD NOT include any other text in the response.

Here is a list of functions in JSON format that you can invoke.\n\n{functions_definition}\n"""
        super().__init__(_model, max_history, system_prompt, **kwargs)
        self.logger.debug("Default role of the AI assistant: %s", system_prompt)

    def generate_response(
        self,
        prompt: str,
        system_prompt: str = None,
        tools_schema: str = None,
        documents: List[Dict] = None,
        create_chat_session: bool = False,
        chat_history: List[Dict] = None,
        max_new_tokens: int = 120,
        skip_special_tokens: bool = False,
        **kwargs,
    ) -> Dict[str, Any]:
        """
        Generates text based on the given prompt.

        Parameters:
        ----------
        prompt : str
            The prompt text to generate text from.
        system_prompt : str, optional
            The system prompt text (default is None).
        tools_schema : str, optional
            The schema for the tools prompt (default is None).
        documents : list, optional
            The list of documents for the RAG prompt (default is None).
        create_chat_session : bool, optional
            Flag to indicate if a chat session should be created (default is False).
        chat_history : list, optional
            The chat history for the prompt (default is None).
        max_new_tokens : int, optional
            The maximum length of the generated text (default is 120).
        skip_special_tokens : bool, optional
            Flag to indicate if special tokens should be skipped (default is False).

        Returns:
        -------
        str
            The generated text.
        """
        _chat_history = []
        special_tokens = [
            "<|begin_of_text|>",
            "<|start_header_id|>",
            "<|end_header_id|>",
            "<|eot_id|>",
        ]
        # Check if the chat history aligns with the pydantic model
        if chat_history:
            try:
                _ = [ChatHistory(**msg) for msg in chat_history]
                _chat_history.extend(chat_history)
            except Exception as e:
                self.logger.error("Error validating chat history: %s", e)
        input_prompt = self.format_prompt(
            prompt,
            system_prompt=system_prompt,
            tools_schema=tools_schema,
            documents=documents,
            create_chat_session=create_chat_session,
            chat_history=chat_history,
        )

        model_response = self.generate_text(
            input_prompt,
            max_new_tokens=max_new_tokens,
            skip_special_tokens=skip_special_tokens,
            **kwargs,
        )
        # removing the prompt and special tokens from the model response
        model_response = model_response.replace(input_prompt, "")
        for token in special_tokens:
            model_response = model_response.replace(token, "")
        model_response = model_response.strip()
        # Add the user input and model response to the chat history
        _chat_history.append({"role": "user", "content": prompt})
        _chat_history.append({"role": "assistant", "content": model_response})

        return {"response": model_response, "chat_history": _chat_history}

    def chat(
        self,
        prompt: str,
        chat_history: List[Dict] = None,
        clear_session: bool = False,
        **kwargs,
    ) -> Dict[str, Any]:
        """
        Chat with the model.

        Parameters:
        ----------
        prompt : str
            The user prompt.
        clear_session : bool, optional
            Flag to indicate if the session history should be cleared (default is False).

        Returns:
        -------
        dict
            The response and chat history.
        """
        _history_checker: bool = (
            True  # flag to see if the chat history is passed, so we can return the chat history in the response without affecting original
        )
        if clear_session:
            self.clear_history()

        # Initialize chat history if not provided
        if chat_history is None:
            chat_history = []
            _history_checker = False

        # Determine if we need to create a new chat session
        create_chat_session = not self.history and not chat_history

        # If self.history exists, use it as chat_history
        if self.history and not chat_history:
            chat_history = self.history
        # Adding the chat prompt to chat history
        generated_response = self.generate_response(
            prompt,
            create_chat_session=create_chat_session,
            chat_history=chat_history,
            **kwargs,
        )

        extracted_response = generated_response.get(
            "response", "Error generating response"
        )

        # If no chat history is passed, add the user input and model response to the history
        if not _history_checker:
            self.add_to_history(prompt, extracted_response)
            generated_response["chat_history"] = self.history
        else:  # if chat history is passed, return the chat history as is
            generated_response["chat_history"] = chat_history
            generated_response["chat_history"].extend(
                [
                    {"role": "user", "content": prompt},
                    {"role": "assistant", "content": extracted_response},
                ]
            )

        return generated_response

    def load_prompt_templates(self):
        """
        Loads the prompt templates for the Llama.
        """
        self.system_template = (
            "<|start_header_id|>system<|end_header_id|> {system_prompt} <|eot_id|>"
        )
        self.user_turn_template = (
            "<|start_header_id|>user<|end_header_id|> {user_prompt} <|eot_id|>"
        )
        self.assistant_turn_template = "<|start_header_id|>assistant<|end_header_id|> {assistant_response} <|eot_id|>"
        self.assistant_template = "<|start_header_id|>assistant<|end_header_id|>"
        self.rag_prompt_template = f"{self.bos_token }\n{self.system_template}\n{self.user_turn_template}\n{self.assistant_template}"
        self.tools_prompt_template = f"{self.bos_token }\n{self.system_template}\n{self.user_turn_template}\n{self.assistant_template}"
        self.default_prompt_template = f"{self.bos_token }\n{self.system_template}\n{self.user_turn_template}\n{self.assistant_template}"
        self.non_sys_prompt_template = (
            f"{self.bos_token }\n{self.user_turn_template}\n{self.assistant_template}"
        )

In [7]:
# Testing the llama model
llama1 = LocalLLM()

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  5.45it/s]


In [8]:
llama1.get_templates()

{'user_turn_template': '<|start_header_id|>user<|end_header_id|> {user_prompt} <|eot_id|>',
 'assistant_turn_template': '<|start_header_id|>assistant<|end_header_id|> {assistant_response} <|eot_id|>',
 'assistant_template': '<|start_header_id|>assistant<|end_header_id|>',
 'rag_prompt': "You are an advanced AI assistant with expertise in retrieving and synthesizing information from provided references. Your role is to analyze the given documents and accurately answer the question based on their content.\n\n## Context:\nYou will be provided with multiple documents, each containing relevant information. Each document is referenced with a unique identifier. Your response should be derived strictly from the given documents while maintaining clarity and conciseness. If the documents do not contain sufficient information, indicate that explicitly.\n\n## Instructions:\n1. **Extract information** only from the provided documents.\n2. **Cite references** where applicable by mentioning the docum

In [10]:
# Let's test the generate_response method
response = llama1.generate_response("What is the capital of France?")

Tokenizing prompt... <|begin_of_text|>
<|start_header_id|>user<|end_header_id|> What is the capital of France? <|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
Generated response:  <|begin_of_text|><|begin_of_text|>
<|start_header_id|>user<|end_header_id|> What is the capital of France? <|eot_id|>
<|start_header_id|>assistant<|end_header_id|>

The capital of France is Paris.<|eot_id|>


In [11]:
rprint(response)

In [12]:
# Let's ask different question
response = llama1.generate_response("What is the capital of Germany?")
rprint(response)

Tokenizing prompt... <|begin_of_text|>
<|start_header_id|>user<|end_header_id|> What is the capital of Germany? <|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
Generated response:  <|begin_of_text|><|begin_of_text|>
<|start_header_id|>user<|end_header_id|> What is the capital of Germany? <|eot_id|>
<|start_header_id|>assistant<|end_header_id|>

The capital of Germany is Berlin.<|eot_id|>


In [13]:
# one more question with system prompt
response = llama1.generate_response("What is the capital of Italy?", system_prompt="You are a helpful AI assistant who always responds with one added zen quote")
rprint(response)

Tokenizing prompt... <|begin_of_text|>
<|start_header_id|>system<|end_header_id|> You are a helpful AI assistant who always responds with one added zen quote <|eot_id|>
<|start_header_id|>user<|end_header_id|> What is the capital of Italy? <|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
Generated response:  <|begin_of_text|><|begin_of_text|>
<|start_header_id|>system<|end_header_id|> You are a helpful AI assistant who always responds with one added zen quote <|eot_id|>
<|start_header_id|>user<|end_header_id|> What is the capital of Italy? <|eot_id|>
<|start_header_id|>assistant<|end_header_id|>

The capital of Italy is Rome. 

"The world is a book, and those who do not travel read only one page." - Saint Augustine<|eot_id|>


In [14]:
llama1.history

[]

In [15]:
# let's try chat method
response = llama1.chat("What is the capital of Spain?")
rprint(response)

Formatting prompt with chat history - create chat session
Tokenizing prompt... <|begin_of_text|>
<|start_header_id|>user<|end_header_id|> What is the capital of Spain? <|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
Generated response:  <|begin_of_text|><|begin_of_text|>
<|start_header_id|>user<|end_header_id|> What is the capital of Spain? <|eot_id|>
<|start_header_id|>assistant<|end_header_id|>

The capital of Spain is Madrid.<|eot_id|>


In [16]:
llama1.history

[{'role': 'user', 'content': 'What is the capital of Spain?'},
 {'role': 'assistant', 'content': 'The capital of Spain is Madrid.'}]

In [17]:
# now let's continue the chat as we already have the session
response = llama1.chat("can you tell me some history about it")
rprint(response)

Formatting prompt with chat history
Tokenizing prompt... <|begin_of_text|>
<|start_header_id|>user<|end_header_id|> What is the capital of Spain? <|eot_id|>
<|start_header_id|>assistant<|end_header_id|> The capital of Spain is Madrid. <|eot_id|>
<|start_header_id|>user<|end_header_id|> can you tell me some history about it <|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
Generated response:  <|begin_of_text|><|begin_of_text|>
<|start_header_id|>user<|end_header_id|> What is the capital of Spain? <|eot_id|>
<|start_header_id|>assistant<|end_header_id|> The capital of Spain is Madrid. <|eot_id|>
<|start_header_id|>user<|end_header_id|> can you tell me some history about it <|eot_id|>
<|start_header_id|>assistant<|end_header_id|>

Madrid, the vibrant capital of Spain, has a rich and fascinating history spanning over 2,000 years. Here's a brief overview:

**Ancient Times (9th century BC - 5th century AD)**

* The city of Madrid was founded by the Celtic tribe of the Matiloci in the

In [18]:
llama1.history

[{'role': 'user', 'content': 'What is the capital of Spain?'},
 {'role': 'assistant', 'content': 'The capital of Spain is Madrid.'},
 {'role': 'user', 'content': 'can you tell me some history about it'},
 {'role': 'assistant',
  'content': 'Madrid, the vibrant capital of Spain, has a rich and fascinating history spanning over 2,000 years. Here\'s a brief overview:\n\n**Ancient Times (9th century BC - 5th century AD)**\n\n* The city of Madrid was founded by the Celtic tribe of the Matiloci in the 9th century BC.\n* In 218 BC, the Romans conquered the city and named it "Madrid" after the Roman god of war, Mars.\n* During the Roman Empire, Madrid was an important center for trade and commerce.\n\n**Middle Ages (5th'}]

In [19]:
# let's check the tool calling prompt
function_definitions = """[
    {
        "name": "get_user_info",
        "description": "Retrieve details for a specific user by their unique identifier. Note that the provided function is in Python 3 syntax.",
        "parameters": {
            "type": "dict",
            "required": [
                "user_id"
            ],
            "properties": {
                "user_id": {
                "type": "integer",
                "description": "The unique identifier of the user. It is used to fetch the specific user details from the database."
            },
            "special": {
                "type": "string",
                "description": "Any special information or parameters that need to be considered while fetching user details.",
                "default": "none"
                }
            }
        }
    }
]
"""

response = llama1.generate_response("Can you retrieve the details for the user with the ID 7890, who has black as their special request?", tools_schema=function_definitions)
rprint(response)

Formatting prompt with tool schema [
    {
        "name": "get_user_info",
        "description": "Retrieve details for a specific user by their unique identifier. Note that the provided function is in Python 3 syntax.",
        "parameters": {
            "type": "dict",
            "required": [
                "user_id"
            ],
            "properties": {
                "user_id": {
                "type": "integer",
                "description": "The unique identifier of the user. It is used to fetch the specific user details from the database."
            },
            "special": {
                "type": "string",
                "description": "Any special information or parameters that need to be considered while fetching user details.",
                "default": "none"
                }
            }
        }
    }
]

Tokenizing prompt... <|begin_of_text|>
<|start_header_id|>system<|end_header_id|> You are an expert in composing functions. You are given a questio

In [20]:
# Let's now test the RAG prompt
documents = [
    {
        "reference": "Doc1",
        "content": "Quantum computing leverages quantum mechanics to perform computations at speeds unattainable by classical computers. It relies on principles like superposition, where quantum bits (qubits) exist in multiple states simultaneously, and entanglement, which enables qubits to be linked regardless of distance. These properties allow quantum computers to solve complex problems efficiently. Current research is focused on improving qubit stability and error correction."
    },
    {
        "reference": "Doc2",
        "content": "The theory of relativity, proposed by Albert Einstein, revolutionized our understanding of space and time. It consists of special relativity, which deals with objects moving at high velocities, and general relativity, which explains gravity as the curvature of spacetime. This theory has been experimentally confirmed through observations like gravitational lensing and time dilation. Modern GPS systems rely on relativity corrections for accurate positioning."
    },
    {
        "reference": "Doc3",
        "content": "Machine learning is a subset of artificial intelligence that enables computers to learn from data without explicit programming. It includes supervised, unsupervised, and reinforcement learning techniques. These models are used in applications like image recognition, fraud detection, and recommendation systems. The effectiveness of a machine learning model depends on the quality and quantity of training data."
    },
    {
        "reference": "Doc4",
        "content": "Blockchain technology provides a decentralized and secure way to record transactions. It uses cryptographic hashing and distributed consensus to ensure data integrity. Originally developed for Bitcoin, blockchain is now used in supply chain management, digital identity, and smart contracts. The technology faces challenges like scalability and energy consumption."
    },
    {
        "reference": "Doc5",
        "content": "The human brain consists of billions of neurons that communicate through electrical and chemical signals. Neural networks in artificial intelligence are inspired by this biological structure. The brain's plasticity allows it to adapt and learn new information throughout life. Research in neuroscience is uncovering new treatments for cognitive disorders."
    }
]

response = llama1.generate_response("What are the key principles of quantum computing?", documents=documents)
rprint(response)

Formatting prompt with documents
Tokenizing prompt... <|begin_of_text|>
<|start_header_id|>system<|end_header_id|> You are an advanced AI assistant with expertise in retrieving and synthesizing information from provided references. Your role is to analyze the given documents and accurately answer the question based on their content.

## Context:
You will be provided with multiple documents, each containing relevant information. Each document is referenced with a unique identifier. Your response should be derived strictly from the given documents while maintaining clarity and conciseness. If the documents do not contain sufficient information, indicate that explicitly.

## Instructions:
1. **Extract information** only from the provided documents.
2. **Cite references** where applicable by mentioning the document identifier.
3. **Maintain coherence** while summarizing details from multiple sources.
4. **Avoid speculation** or adding external knowledge.
5. **If unclear**, state that the a

In [21]:
rprint(response.get("response"))

In [22]:
# Let's try to make a chat session continuing this rag prompt
response = llama1.chat("What do you think about it on whole", chat_history=response.get("chat_history"))
rprint(response)


Formatting prompt with chat history
Tokenizing prompt... <|begin_of_text|>
<|start_header_id|>user<|end_header_id|> What are the key principles of quantum computing? <|eot_id|>
<|start_header_id|>assistant<|end_header_id|> According to Document Doc1, the key principles of quantum computing are:

1. **Superposition**: Quantum bits (qubits) can exist in multiple states simultaneously.
2. **Entanglement**: Qubits can be linked regardless of distance, enabling correlations between them.

These properties allow quantum computers to solve complex problems efficiently.

(Note: Document Doc1 is the primary source for this information.) <|eot_id|>
<|start_header_id|>user<|end_header_id|> What do you think about it on whole <|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
Generated response:  <|begin_of_text|><|begin_of_text|>
<|start_header_id|>user<|end_header_id|> What are the key principles of quantum computing? <|eot_id|>
<|start_header_id|>assistant<|end_header_id|> According to Do

In [23]:
# now let's check the history
llama1.history

[{'role': 'user', 'content': 'What is the capital of Spain?'},
 {'role': 'assistant', 'content': 'The capital of Spain is Madrid.'},
 {'role': 'user', 'content': 'can you tell me some history about it'},
 {'role': 'assistant',
  'content': 'Madrid, the vibrant capital of Spain, has a rich and fascinating history spanning over 2,000 years. Here\'s a brief overview:\n\n**Ancient Times (9th century BC - 5th century AD)**\n\n* The city of Madrid was founded by the Celtic tribe of the Matiloci in the 9th century BC.\n* In 218 BC, the Romans conquered the city and named it "Madrid" after the Roman god of war, Mars.\n* During the Roman Empire, Madrid was an important center for trade and commerce.\n\n**Middle Ages (5th'}]

In [24]:
# as we could see the history remains intact while we could pass custome history to the chat method to generate the response