In [None]:
# Note: Make sure to download the NLTK packages, if you haven't already
# nltk.download('punkt')
# nltk.download('punkt_tab')
# nltk.download('wordnet')

In [5]:
# Extract pull quotes from each input document, generate a query/response pair per quote, and write the pairs to JSON.
import asyncio
import inspect
import re

import nltk
from aiolimiter import AsyncLimiter
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score

from app.definitions import INPUT_DOCS_DIR, EVALUATION_DATA_SET
from app.logger import set_logger, logger
from app.openai_llm import OpenAiLlm
from app.utilities import get_docs, read_file, create_file_if_not_exists, write_json



# Module-level LLM client; rate_limited_get_completion() sends every request through it.
llm = OpenAiLlm()
# Shared throttle for LLM calls, configured as max_rate=100 per time_period=1
# (aiolimiter documents time_period in seconds, i.e. roughly 100 requests/second).
llm_limiter = AsyncLimiter(max_rate=100, time_period=1)


async def rate_limited_get_completion(*args, **kwargs):
    """Forward a completion request to the shared LLM client under the rate limiter.

    All positional and keyword arguments are passed through unchanged to
    ``llm.get_completion``; its awaited result is returned as-is.
    """
    # Wait for capacity from the shared limiter before issuing the request.
    async with llm_limiter:
        completion = await llm.get_completion(*args, **kwargs)
    return completion


def calculate_bleu(reference_text: str, candidate_text: str) -> float:
    """Return the smoothed sentence-level BLEU score of a candidate against one reference.

    Both texts are lower-cased and tokenized with ``nltk.word_tokenize`` before
    scoring, so the comparison is case-insensitive. Smoothing ``method1`` is
    applied via ``SmoothingFunction``.

    Args:
        reference_text: The text treated as ground truth.
        candidate_text: The text being scored.

    Returns:
        The value returned by ``sentence_bleu`` for the single reference.
    """
    ref_tokens, cand_tokens = (
        nltk.word_tokenize(text.lower())
        for text in (reference_text, candidate_text)
    )
    return sentence_bleu(
        [ref_tokens],
        cand_tokens,
        smoothing_function=SmoothingFunction().method1,
    )


def calculate_meteor(reference_text: str, candidate_text: str) -> float:
    """Return the METEOR score of a candidate against one reference.

    Both texts are lower-cased and tokenized with ``nltk.word_tokenize`` before
    being handed to ``meteor_score``.

    Args:
        reference_text: The text treated as ground truth.
        candidate_text: The text being scored.

    Returns:
        The value returned by ``meteor_score`` for the single reference.
    """
    ref_tokens, cand_tokens = [
        nltk.word_tokenize(text.lower())
        for text in (reference_text, candidate_text)
    ]
    # meteor_score expects a list of tokenized references.
    return meteor_score([ref_tokens], cand_tokens)


async def process_doc(doc, additional_context=None):
    """Extract pull quotes from one document and build a query/response pair for each.

    The document is read with ``read_file``, the LLM is asked (through the
    rate limiter) to return pull quotes wrapped in ``<pull-quote>`` tags, and
    every extracted quote is then handed to ``process_pull_quote`` concurrently.

    Args:
        doc: Document identifier passed straight to ``read_file``
            (presumably a file path -- confirm against ``get_docs``).
        additional_context: Optional extra context forwarded unchanged to
            ``process_pull_quote``.

    Returns:
        A list with one entry per extracted pull quote, in extraction order.
        Each entry is whatever ``process_pull_quote`` returned, which may be
        ``None`` for quotes it skipped.
    """
    logger.info(f"Processing document: {doc}")
    content = read_file(doc)
    # The prompt asks for "up to three" quotes; that limit is only requested
    # of the model, it is not enforced by the code below.
    raw_pull_quotes = await rate_limited_get_completion(inspect.cleandoc(f"""
        Extract up to three pull quote facts from this document. Each quote must be wrapped in a <pull-quote> tag.

        Remove all markdown formatting.
        Use no markdown formatting in your response.
        Pull quotes must be a minimum of ten words long.
        The quotes should be broadly related to the document content.
        Quotes must not be vague or unclear.
        It is of paramount importance that the quotes are accurate and use the precise copy of the document.
        Never make up quotes.
        Paraphrasing is permissible as a last resort.

        <content>
        {content}
        </content>

        Example One:
        <content>
        The study published by the National Oceanic and Atmospheric Administration revealed that the Arctic is warming nearly four times faster than the global average, resulting in unprecedented sea-ice loss and ecosystem disruption. Researchers warn that without immediate carbon emission reductions, the region could see ice-free summers as early as 2030.
        </content>
        <pull-quote>
        The Arctic is warming nearly four times faster than the global average, resulting in unprecedented sea-ice loss and ecosystem disruption.
        </pull-quote>
        <pull-quote>
        Without immediate carbon emission reductions, the region could see ice-free summers as early as 2030.
        </pull-quote>

        Example Two:
        <content>
        During its annual keynote, Horizon Dynamics unveiled the Helios X1, a lightweight electric aircraft capable of flying 500 miles on a single charge. CEO Maya Chen said the aircraft "reduces per-passenger emissions by 70 percent compared with regional jets" and will enter commercial service in 2027.
        </content>
        <pull-quote>
        the Helios X1, a lightweight electric aircraft capable of flying 500 miles on a single charge
        </pull-quote>
        <pull-quote>
        CEO Maya Chen said the aircraft "reduces per-passenger emissions by 70 percent compared with regional jets" and will enter commercial service in 2027
        </pull-quote>

        Example Three:
        <content>
        In her landmark address to Parliament on 18 June 1942, Noor Inayat Khan urged legislators to support the formation of a unified resistance council, stating, "We must stand together now, or we shall surely fall apart later." The speech galvanized cross-party backing and marked a turning point in wartime collaboration.
        </content>
        <pull-quote>
        In her landmark address to Parliament on 18 June 1942, Noor Inayat Khan urged legislators to support the formation of a unified resistance council
        </pull-quote>
        <pull-quote>
        Noor Inayat Khan urged legislators to support the formation of a unified resistance council, stating, "We must stand together now, or we shall surely fall apart later."
        </pull-quote>
    """))

    # Non-greedy match with DOTALL so a quote may span several lines; the
    # surrounding whitespace inside the tags is dropped.
    pull_quotes = [
        match.strip()
        for match in re.findall(r"<pull-quote>\s*(.*?)\s*</pull-quote>", raw_pull_quotes, flags=re.DOTALL)
    ]

    logger.info(f"Extracted {len(pull_quotes)} pull quotes from {doc}")
    # Fan out one task per quote; asyncio.gather returns results in the same
    # order as the tasks were passed in.
    pull_quote_tasks = [process_pull_quote(pull_quote, content, additional_context) for pull_quote in pull_quotes]
    results = await asyncio.gather(*pull_quote_tasks)
    return results


async def process_pull_quote(pull_quote, content, additional_context=None):
    """Generate one evaluation query/response pair that targets a single pull quote.

    The LLM is asked (through the rate limiter) for a question on the first
    line and the expected answer on the second. The answer is then compared
    with the pull quote using BLEU and METEOR.

    Args:
        pull_quote: The quote the question/answer must target.
        content: Full text of the source document, included in the prompt.
        additional_context: Optional text appended to the prompt inside an
            ``<additional-context>`` tag.

    Returns:
        A dict with keys ``pull_quote``, ``query``, ``response``, ``bleu`` and
        ``meteor``, or ``None`` when the pair is skipped because
        (a) the reply does not consist of exactly two non-blank lines, or
        (b) the response's BLEU score against the pull quote is >= 0.85.
    """
    logger.info(f"Processing pull quote: {pull_quote}")
    prompt = inspect.cleandoc(f"""
        Ask a question that relates to the pull quote and an expected response that you might expect from a rag that contains this information

        The question and response must directly target this pull quote.

        Use no markdown formatting in your response
        The question and response should each be on a single line:
         - the question must be on the first line
         - the response must be on the second line
        respond with only the question and response

        <content>
        {content}
        </content>

        <pull-quote>
        {pull_quote}
        </pull-quote>
    """)

    if additional_context is not None:
        prompt += f"\n\n<additional-context>\n{additional_context}\n</additional-context>"

    raw_query_response = await rate_limited_get_completion(prompt)

    # Keep only non-blank lines. Filtering on the stripped line (rather than
    # collapsing runs of "\n") also discards whitespace-only separator lines
    # such as "Q\n \nA", which would otherwise make a valid pair look like
    # three lines. An empty/None completion falls through to the skip below.
    clean_result = [
        line.strip()
        for line in (raw_query_response or "").split('\n')
        if line.strip()
    ]
    if len(clean_result) != 2:
        logger.info(f"Pull quote skipped due to unexpected answer format: {pull_quote}")
        return None
    (query, response) = clean_result
    bleu = calculate_bleu(pull_quote, response)
    meteor = calculate_meteor(pull_quote, response)
    # Discard responses that are near-verbatim copies of the pull quote: they
    # would make the evaluation a string-matching exercise.
    if bleu >= 0.85:
        logger.info(f"Pull quote skipped due to high BLEU score (likely identical): {pull_quote}")
        return None
    logger.info(f"Generated query/response pair for pull quote (BLEU={bleu:.2f}, METEOR={meteor:.2f})")
    return {
        "pull_quote": pull_quote,
        "query": query,
        "response": response,
        "bleu": bleu,
        "meteor": meteor
    }



async def generate():
    """Build the evaluation data set from every document in ``INPUT_DOCS_DIR``.

    Each document is processed concurrently with ``process_doc``; the
    per-document result lists are flattened, skipped (``None``) entries are
    dropped, and the remaining query/response pairs are written to
    ``EVALUATION_DATA_SET`` with ``write_json``.
    """
    logger.info("Starting test set generation process.")
    logger.info(f"INPUT_DOCS_DIR: {INPUT_DOCS_DIR}")
    logger.info(f"EVALUATION_DATA_SET: {EVALUATION_DATA_SET}")

    # Make sure the output file exists before any work is done.
    create_file_if_not_exists(EVALUATION_DATA_SET, "")
    docs = get_docs(INPUT_DOCS_DIR)
    logger.info(f"Found {len(docs)} documents to process.")

    per_doc_results = await asyncio.gather(*[process_doc(doc) for doc in docs])

    # Flatten the list of lists, keeping only pairs that were not skipped.
    clean_results = []
    for doc_results in per_doc_results:
        for pair in doc_results:
            if pair is not None:
                clean_results.append(pair)

    logger.info(f"Writing {len(clean_results)} query/response pairs to {EVALUATION_DATA_SET}")
    write_json(EVALUATION_DATA_SET, clean_results)
    logger.info("Test set generation completed successfully.")

In [6]:
# Configure logging for this run (presumably to the named log file -- confirm in app.logger),
# then build the evaluation data set. Top-level `await` relies on the notebook's running event loop.
set_logger("generate_test_set.log")
await generate()

In [7]:
import asyncio
import inspect

from app.definitions import EVALUATION_DATA_SET
from app.openai_llm import OpenAiLlm
from app.smol_rag import SmolRag
from app.utilities import get_json
from app.logger import logger

# RAG instance under evaluation; evaluate() queries it and get_review() uses its rate-limited completion call.
rag = SmolRag()
# NOTE(review): `llm` is not referenced by evaluate()/get_review() in this cell; it rebinds the
# module-level name that the generation cell's rate_limited_get_completion() reads.
llm = OpenAiLlm()


async def evaluate():
    """Score the RAG's answers against the generated evaluation data set.

    Every row's ``query`` is sent to ``rag.mix_query``; the answer is then
    judged by ``get_review`` against the row's ``pull_quote``. The final score
    is the mean of all review verdicts (each 0 or 1).

    Returns:
        float: Fraction of reviews that judged the answer accurate, in
        [0.0, 1.0]. Returns 0.0 (after logging a warning) when the data set
        is empty, instead of raising ``ZeroDivisionError``.
    """
    data_set = get_json(EVALUATION_DATA_SET)
    logger.info(f"Loaded dataset from {EVALUATION_DATA_SET} with {len(data_set)} items.")

    # Nothing to evaluate: avoid dividing by zero when averaging below.
    if not data_set:
        logger.warning(f"Evaluation data set {EVALUATION_DATA_SET} is empty; nothing to evaluate.")
        return 0.0

    query_tasks = [rag.mix_query(row["query"]) for row in data_set]
    responses = await asyncio.gather(*query_tasks)
    logger.info("Completed generating responses for all queries.")

    tasks = []
    for (response, row) in zip(responses, data_set):
        prompt = inspect.cleandoc(f"""
            Does the <response> accurately answer the <query> in relation to the <source>?

            Response parameters:
            You must answer with either "yes" or "no"
            There must be no other content in your response

            <query>
            {row["query"]}
            </query>

            <response>
            {response}
            </response>

            <source>
            {row["pull_quote"]}
            </source>
        """)

        # Three independent reviews per row. get_review() itself gathers three
        # votes, so each row costs nine judge completions.
        # NOTE(review): confirm the 3x3 repetition is intentional.
        tasks.extend([get_review(prompt, row["query"]) for _ in range(3)])

    results = await asyncio.gather(*tasks)

    score = sum(results) / len(results)
    logger.info(f"Evaluation completed. Final accuracy score: {score:.3f}")

    return score

async def get_review(prompt, query):
    """Ask the LLM judge three times and return the majority verdict.

    Args:
        prompt: Fully rendered yes/no judging prompt.
        query: The query being judged; used only in log messages.

    Returns:
        int: 1 when strictly more than half of the valid votes are "yes",
        otherwise 0. Also 0 (with a warning logged) when none of the replies
        could be read as "yes" or "no".
    """
    tasks = [
        rag.rate_limited_get_completion(prompt, use_cache=False)
        for _ in range(3)
    ]
    logger.info(f"Evaluating accuracy for query: {query!r}")
    results = await asyncio.gather(*tasks)

    # Normalise each reply before comparing: models often answer "Yes",
    # "yes." or '"yes"' despite the instructions. Non-string replies are ignored.
    verdicts = [
        result.strip(" \t\r\n\"'.!").lower()
        for result in results
        if isinstance(result, str)
    ]
    responses = [
        1 if verdict == "yes" else 0
        for verdict in verdicts
        if verdict in ("yes", "no")
    ]

    # Without this guard an all-invalid batch would divide by zero below.
    if not responses:
        logger.warning(f"No valid yes/no votes for query {query!r}; scoring as 0")
        return 0

    accuracy = 1 if sum(responses) / len(responses) > 0.5 else 0
    logger.info(
        f"Evaluation result for query {query!r}: {sum(responses)}/{len(responses)} votes yes --> {accuracy}")

    return accuracy

INFO:nano-vectordb:Load (226, 1536) data
INFO:nano-vectordb:Init {'embedding_dim': 1536, 'metric': 'cosine', 'storage_file': '/Users/orderandchaos/code/salable-smol-rag/app/data/embeddings_db.json'} 226 data
INFO:nano-vectordb:Load (935, 1536) data
INFO:nano-vectordb:Init {'embedding_dim': 1536, 'metric': 'cosine', 'storage_file': '/Users/orderandchaos/code/salable-smol-rag/app/data/entities_db.json'} 935 data
INFO:nano-vectordb:Load (997, 1536) data
INFO:nano-vectordb:Init {'embedding_dim': 1536, 'metric': 'cosine', 'storage_file': '/Users/orderandchaos/code/salable-smol-rag/app/data/relationships_db.json'} 997 data
INFO:mini-rag:Knowledge graph loaded from /Users/orderandchaos/code/salable-smol-rag/app/data/kg_db.graphml


In [8]:
# Configure logging for this run (presumably to the named log file -- confirm in app.logger),
# then evaluate; the cell's displayed value is the score returned by evaluate().
set_logger("evaluate_test_set.log")
await evaluate()

INFO:mini-rag:Loaded dataset from /Users/orderandchaos/code/salable-smol-rag/app/evaluation/test_sets/evaluation_data_set.json with 176 items.
INFO:mini-rag:New query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:mini-rag:Processed high/low level keywords for mixed KG query.
INFO:mini-rag:Found 5 low-level keywords.
INFO:mini-rag:New query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:mini-rag:New query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:mini-rag:New query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:mini-rag:New query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:mini-rag:New query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:mini-rag:New query
INFO:httpx:HTTP Request: POST htt

0.8446969696969697