In [None]:
import os
from dotenv import load_dotenv
import textwrap

import llama_index
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Document, VectorStoreIndex, get_response_synthesizer
from llama_index.core.prompts import PromptTemplate
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.ingestion import IngestionPipeline

from llama_index.core.query_pipeline import QueryPipeline, InputComponent, Link, FunctionComponent

from scraper.scraper import start_scraping

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI API key is supplied — confirm.
load_dotenv()

In [None]:
# Scrape the manual pages.
# All crawl settings are kept as module-level names and gathered into one
# keyword dictionary, so the call itself stays a one-liner.
# NOTE(review): the exact meaning of max_depth / num_workers is defined by
# scraper.scraper.start_scraping — confirm there before tuning.
log_filename = "scrapping.log"
start_url = "https://manuals.sma.de/STPxx50/en-US/index.html"
url_prefix = "https://manuals.sma.de/STPxx50/en-US"
max_depth = 5
num_workers = 100

scrape_kwargs = {
    "start_url": start_url,
    "url_prefix": url_prefix,
    "max_depth": max_depth,
    "num_workers": num_workers,
    "log_filename": log_filename,
}
scraped_data = start_scraping(**scrape_kwargs)

In [None]:
# Model handles shared by the later cells:
#   emb_model - embedding model used for ingestion and for the vector index
#   llm_model - chat model used by the response synthesizer
emb_model = OpenAIEmbedding()
llm_model = OpenAI(model="gpt-4o-mini")


In [None]:
# ingest data
# Wrap every scraped page in a Document, keeping its URL and path as metadata
# so that answers can list their sources later on.
documents = [
    Document(text=page['text'], metadata={'url': page['url'], 'path': page['path']})
    for page in scraped_data
]

# Split the pages into sentence-aware chunks *before* embedding them.
# Embedding whole pages means one vector has to represent an entire page, and
# long pages can exceed the embedding model's input limit.
# chunk_size / chunk_overlap are spelled out so they are easy to tune.
pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_size=1024, chunk_overlap=200),
        emb_model,
    ]
)
nodes = pipeline.run(documents=documents)

In [None]:
# Build the vector index from the ingested nodes. The embedding model that was
# used in the ingestion pipeline is handed to the index as well.
index = VectorStoreIndex(
    nodes=nodes,
    embed_model=emb_model,
)

In [None]:
# Retriever over the vector index, configured with similarity_top_k=5.
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=5,
)

In [None]:
# Response synthesizer setup.
# Two prompt templates are defined and both are handed to a synthesizer that
# runs in 'refine' mode with the chat model.

# Question-answering prompt: placeholders are {context_str} and {query_str}.
QA_PROMPT_TMPL = (
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, "
    "answer the query.\n"
    "Query: {query_str}\n"
    "Answer: "
)

# Refinement prompt: placeholders are {query_str}, {existing_answer} and
# {context_msg}; it asks for a concise answer limited to the query's scope.
REFINE_PROMPT_TMPL = (
    "The original query is as follows: {query_str}\n"
    "We have provided an existing answer: {existing_answer}\n"
    "We have the opportunity to refine the existing answer "
    "(only if needed) with some more context below.\n"
    "------------\n"
    "{context_msg}\n"
    "------------\n"
    "Given the new context, refine the original answer to better answer the query. "
    "If the context isn't useful, return the original answer.\n"
    "Always try to keep the answer concise and relevant to the query.\n"
    "There is no need to include information beyond the scope of the query.\n"
    "Refined Answer: "
)

QA_PROMPT = PromptTemplate(QA_PROMPT_TMPL)
REFINE_PROMPT = PromptTemplate(REFINE_PROMPT_TMPL)

response_synthesizer = get_response_synthesizer(
    llm=llm_model,
    text_qa_template=QA_PROMPT,
    refine_template=REFINE_PROMPT,
    response_mode='refine',
)

In [None]:
# Combine the retriever and the response synthesizer into one query engine.
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
)

In [None]:
def send_query(query):
    """Run ``query`` through ``query_engine`` and print the answer and its sources.

    Args:
        query: Question text passed to ``query_engine.query``.

    Returns:
        The response object returned by ``query_engine.query``, so callers can
        inspect the answer and source nodes programmatically. (Previously the
        function returned ``None`` even though its result was being assigned.)
    """
    WRAP_LEN = 200

    def _wrap(text):
        # With replace_whitespace=False, textwrap counts embedded newlines as
        # ordinary characters, so lines after a newline are wrapped at the
        # wrong column. Wrapping each line on its own keeps the original line
        # breaks and limits every printed line to WRAP_LEN.
        return "\n".join(
            textwrap.fill(line, WRAP_LEN, replace_whitespace=False)
            for line in text.splitlines()
        )

    response = query_engine.query(query)
    # Treat a missing answer as empty text instead of crashing in textwrap.
    answer = response.response or ""

    print(f"Query: {_wrap(query)}")
    print("Response:")
    print(_wrap(answer))

    print('\n\nsources')
    # Ascending by score (same order as before); nodes without a score sort
    # first instead of raising a TypeError when compared with floats.
    ranked_nodes = sorted(
        response.source_nodes,
        key=lambda n: float('-inf') if n.score is None else n.score,
    )
    for i, node in enumerate(ranked_nodes):
        print(f"Node {i}")
        print(f"URL: {node.metadata['url']}")
        print(f"Path: {' > '.join(node.metadata['path'])}")

    return response

In [None]:
# Example multi-part question about the inverter, sent through the query engine.
x = send_query(
    "what are the different output power classes of the inverter? "
    "For each what is the mpp voltage range? "
    "what is the euro-efficiency of the inverter?"
)

In [None]:
# query pipeline

def output_formatter(llm_response, source_nodes):
    """Render a synthesized answer and its source nodes as one printable string.

    Args:
        llm_response: Object whose ``response`` attribute holds the answer
            text; ``None`` is treated as an empty answer.
        source_nodes: Iterable of retrieved nodes. Each node must expose a
            ``score`` attribute and a ``metadata`` mapping with a ``'url'``
            entry and a ``'path'`` entry (an iterable of strings).

    Returns:
        str: ``"Response:"`` followed by the wrapped answer, a blank line, the
        heading ``"sources"`` and, for every node in ascending score order,
        its index, URL and ``' > '``-joined path.
    """
    WRAP_LEN = 200

    def _wrap(text):
        # With replace_whitespace=False, textwrap counts embedded newlines as
        # ordinary characters, so lines after a newline are wrapped at the
        # wrong column. Wrapping each line on its own keeps the original line
        # breaks and limits every output line to WRAP_LEN.
        return "\n".join(
            textwrap.fill(line, WRAP_LEN, replace_whitespace=False)
            for line in text.splitlines()
        )

    # Collect the pieces in a list and join once instead of repeated `+=`.
    parts = ["Response:", _wrap(llm_response.response or ""), "", "sources"]

    # Ascending by score (same order as before); nodes without a score sort
    # first instead of raising a TypeError when compared with floats.
    ranked_nodes = sorted(
        source_nodes,
        key=lambda n: float('-inf') if n.score is None else n.score,
    )
    for i, node in enumerate(ranked_nodes):
        parts.append(f"Node {i}")
        parts.append(f"URL: {node.metadata['url']}")
        parts.append(f"Path: {' > '.join(node.metadata['path'])}")

    return "\n".join(parts)

# Assemble the same RAG flow as an explicit query pipeline.
rag_2 = QueryPipeline(verbose=True)

# Named building blocks of the pipeline.
rag_2_modules = {
    'input': InputComponent(),
    'retriever': retriever,
    'response_synthesizer': response_synthesizer.as_query_component(),
    'output_formatter': FunctionComponent(output_formatter),
}

# Wiring between the blocks:
#   input     -> retriever
#   input     -> response_synthesizer (as `query_str`)
#   retriever -> response_synthesizer (as `nodes`)
#   retriever -> output_formatter     (as `source_nodes`)
#   response_synthesizer -> output_formatter (as `llm_response`)
rag_2_links = [
    Link('input', 'retriever'),
    Link('retriever', 'response_synthesizer', dest_key='nodes'),
    Link('retriever', 'output_formatter', dest_key='source_nodes'),
    Link('input', 'response_synthesizer', dest_key='query_str'),
    Link('response_synthesizer', 'output_formatter', dest_key='llm_response'),
]

rag_2.add_modules(module_dict=rag_2_modules)
rag_2.add_links(rag_2_links)


In [None]:
# Run the query pipeline with the example question. The query text previously
# read "what area the ..." (typo); it now matches the wording used for the
# query-engine example.
out = rag_2.run(
    question=(
        "what are the different output power classes of the inverter? "
        "For each what is the mpp voltage range? "
        "what is the euro-efficiency of the inverter?"
    )
)
# Show the pipeline result; without this it is only stored in `out`.
print(out)