# INTEL GETI Docs Chatbot

## Crawling Pages

Starting from the provided links, fetch each page, collect links to further pages from the sidebar navigation, and keep crawling until every discovered page has been fetched.

In [1]:
from superduperdb import superduper
import os
# The backend URI is read from SUPERDUPERDB_DATA_BACKEND; when the variable is
# unset the fallback "mongomock://test" is used.
mongodb_uri = os.getenv("SUPERDUPERDB_DATA_BACKEND","mongomock://test")
db = superduper(mongodb_uri)
# NOTE(review): destructive -- drop(force=True) presumably wipes whatever this
# backend already holds without prompting; confirm before pointing the
# environment variable at a shared database.
db.drop(force=True)

  from .autonotebook import tqdm as notebook_tqdm
2024-03-06 22:43:37,030	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


[32m 2024-Mar-06 22:43:37.03[0m| [1mINFO    [0m | [36mzhouhaha-2.local[0m| [36msuperduperdb.base.build[0m:[36m65  [0m | [1mData Client is ready. mongomock.MongoClient('localhost', 27017)[0m
[32m 2024-Mar-06 22:43:37.04[0m| [1mINFO    [0m | [36mzhouhaha-2.local[0m| [36msuperduperdb.base.build[0m:[36m38  [0m | [1mConnecting to Metadata Client with engine:  mongomock.MongoClient('localhost', 27017)[0m
[32m 2024-Mar-06 22:43:37.04[0m| [1mINFO    [0m | [36mzhouhaha-2.local[0m| [36msuperduperdb.base.build[0m:[36m148 [0m | [1mConnecting to compute client: local[0m
[32m 2024-Mar-06 22:43:37.04[0m| [1mINFO    [0m | [36mzhouhaha-2.local[0m| [36msuperduperdb.base.datalayer[0m:[36m85  [0m | [1mBuilding Data Layer[0m


In [2]:
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin

def process_code_snippets(text):
    """Flatten every ``<pre>`` block to plain text prefixed with ``CODE::``.

    The HTML is parsed with the built-in ``html.parser``; each ``<pre>`` tag
    has its children replaced by a single string made of the marker
    ``CODE::`` followed by the tag's text content. ``element2text`` later in
    this notebook looks for that marker to emit a fenced code block.

    Args:
        text: HTML source of a page.

    Returns:
        The modified document serialised back to a string.
    """
    soup = BeautifulSoup(text, "html.parser")

    for pre_tag in soup.find_all("pre"):
        # Capture the text before clearing, since clear() removes the children.
        marked_text = "CODE::" + soup.new_string(str(pre_tag.text))
        pre_tag.clear()
        pre_tag.append(marked_text)

    return str(soup)


def process_py_class(source_html):
    """Turn Sphinx-style Python class signatures into heading + link pairs.

    For every ``<dl class="py class">`` whose ``<dt class="sig sig-object py">``
    signature carries an ``id``, the ``<dt>`` is replaced by

    * an ``<h3>`` wrapping a link whose text is ``Class: <id>`` and whose
      ``href`` is taken from the last ``a.headerlink`` inside the signature
      (empty string when there is none), and
    * an ``<a>`` tag holding the signature's plain text.

    Signatures without an ``id`` attribute are left untouched instead of
    raising ``KeyError``, and a signature without any ``headerlink`` anchor no
    longer raises ``IndexError``.

    Args:
        source_html: HTML source of a page.

    Returns:
        The modified document serialised back to a string.
    """
    soup = BeautifulSoup(source_html, "html.parser")

    for dl in soup.find_all("dl", class_="py class"):
        dt_tag = dl.find("dt", class_="sig sig-object py")
        if not dt_tag:
            continue

        class_id = dt_tag.get("id")
        if class_id is None:
            # Nothing to build the heading from; keep the original markup.
            continue

        # find_all returns an empty list when no headerlink exists, so guard
        # before indexing the last one.
        headerlinks = dt_tag.find_all("a", class_="headerlink")
        href = headerlinks[-1].get("href", "") if headerlinks else ""

        new_h3 = soup.new_tag("h3")
        new_a_inside_h3 = soup.new_tag("a", href=href)
        new_a_inside_h3.string = f"Class: {class_id}"
        new_h3.append(new_a_inside_h3)

        # The signature text is kept in an <a> tag, as in the original markup
        # transformation.
        new_code = soup.new_tag("a")
        new_code.string = dt_tag.text

        dt_tag.insert_before(new_h3)
        dt_tag.insert_before(new_code)
        dt_tag.decompose()

    return str(soup)


def parse_url(seed_url, timeout=30):
    """Download a page and return its HTML after notebook-specific preprocessing.

    The response body is passed through ``process_code_snippets`` and then
    ``process_py_class``.

    Args:
        seed_url: URL of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without a timeout a stalled server would block the
            crawl indefinitely.

    Returns:
        The preprocessed HTML as a string.
    """
    print(f"parse {seed_url}")
    response = requests.get(seed_url, timeout=timeout)

    source_html = response.text
    source_html = process_code_snippets(source_html)
    source_html = process_py_class(source_html)

    return source_html


def url2html(url):
    """Return the preprocessed HTML for ``url`` by delegating to ``parse_url``."""
    html = parse_url(url)
    return html


## Importing Webpage Data into Database

### Using SuperduperDB to Connect to Database

In [3]:
from superduperdb import superduper
# NOTE(review): this rebinds `db`, replacing the connection the first cell built
# from SUPERDUPERDB_DATA_BACKEND with a hard-coded local MongoDB URI.
db = superduper("mongodb://127.0.0.1:27017/intel-geti")
# NOTE(review): destructive -- presumably wipes the existing "intel-geti" data
# each time this cell runs; confirm that is intended.
db.drop(force=True)

  from .autonotebook import tqdm as notebook_tqdm
2024-03-04 21:49:47,055	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


[32m 2024-Mar-04 21:49:47.06[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36msuperduperdb.base.build[0m:[36m65  [0m | [1mData Client is ready. MongoClient(host=['127.0.0.1:27017'], document_class=dict, tz_aware=False, connect=True, serverselectiontimeoutms=5000)[0m
[32m 2024-Mar-04 21:49:47.07[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36msuperduperdb.base.build[0m:[36m38  [0m | [1mConnecting to Metadata Client with engine:  MongoClient(host=['127.0.0.1:27017'], document_class=dict, tz_aware=False, connect=True, serverselectiontimeoutms=5000)[0m
[32m 2024-Mar-04 21:49:47.08[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36msuperduperdb.base.build[0m:[36m148 [0m | [1mConnecting to compute client: local[0m
[32m 2024-Mar-04 21:49:47.08[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36msuperduperdb.base.datalayer[0m:[36m85  [0m | [1mBuilding Data Layer[0m


Parse each crawled page with `unstructured`, then store the parsed elements in the database.

In [4]:
from unstructured.partition.html import partition_html
from superduperdb.ext.unstructured.encoder import unstructured_encoder

# Register the encoder with the database before it is used to wrap elements below.
db.add(unstructured_encoder)

# NOTE(review): `pages` is not defined in any visible cell; it is iterated as
# (url, html) pairs, presumably produced by a crawl step that calls url2html.
# A fresh "Restart & Run All" will fail here until that step is restored.
datas = []
for url, source_html in pages:
    elements = partition_html(text=source_html, html_assemble_articles=True)
    # Pages for which partition_html returns nothing are skipped.
    if elements:
        datas.append({'url': url, 'elements': unstructured_encoder(elements)})

from superduperdb import Document
from superduperdb.backends.mongodb import Collection
# Wrap each record in a Document and insert them all into the "pages" collection.
documents = list(map(Document, datas))
collection = Collection("pages")
collection.insert_many(documents).execute(db)

[32m 2024-Mar-04 21:49:49.18[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36msuperduperdb.components.component[0m:[36m333 [0m | [1mInitializing DataType : dill[0m
[32m 2024-Mar-04 21:49:49.18[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36msuperduperdb.components.component[0m:[36m336 [0m | [1mInitialized  DataType : dill successfully[0m


[2024-03-04 21:49:50] unstructured INFO Reading document from string ...
[2024-03-04 21:49:50] unstructured INFO Reading document ...


[32m 2024-Mar-04 21:49:50.83[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36msuperduperdb.components.component[0m:[36m333 [0m | [1mInitializing DataType : unstructured[0m
[32m 2024-Mar-04 21:49:50.83[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36msuperduperdb.components.component[0m:[36m336 [0m | [1mInitialized  DataType : unstructured successfully[0m


[ObjectId('65e5d17ea0a7ed69fcf48cd7')]

## Parsing and Chunking Webpages

Define a title-recognition function; the titles it recognizes are used as chunk identifiers.

In [5]:
from unstructured.documents.elements import ElementType

def get_title_data(element):
    """Extract the anchor link and heading depth from a linked title element.

    An element qualifies only when all of the following hold:

    * its ``category`` equals ``ElementType.TITLE``;
    * ``metadata.to_dict()`` contains both ``link_urls`` and ``category_depth``;
    * the first entry of ``metadata.link_texts`` is non-empty;
    * ``metadata.link_urls`` is non-empty.

    Missing or empty ``link_texts`` yields an empty result rather than an
    unpacking error.

    Args:
        element: An element exposing ``category`` and ``metadata``
            (presumably an ``unstructured`` document element).

    Returns:
        ``{'link': <first link url>, 'category_depth': <depth>}`` for a
        qualifying title, otherwise an empty dict.
    """
    data = {}
    if element.category != ElementType.TITLE:
        return data

    metadata = element.metadata
    # Serialise once; both membership checks read the same mapping.
    present_fields = metadata.to_dict()
    if 'link_urls' not in present_fields or 'category_depth' not in present_fields:
        return data

    # link_texts can be None or an empty list; treat both as "no link text".
    link_texts = metadata.link_texts or []
    if not link_texts or not link_texts[0]:
        return data

    link_urls = metadata.link_urls
    if not link_urls:
        return data

    return {'link': link_urls[0], 'category_depth': metadata.category_depth}

Define conversion methods for different types of text, such as titles, lists, tables, and code.


In [6]:
import pandas as pd
from io import StringIO
def element2text(element):
    """Render one parsed element as Markdown-flavoured text.

    * Linked titles (as recognised by ``get_title_data``) become ``#`` headings
      with ``category_depth + 1`` hashes; trailing ``#`` characters are then
      stripped from the line.
    * List items are prefixed with ``- ``.
    * Tables are rebuilt from ``metadata.text_as_html`` (with ``|`` removed)
      into a Markdown table followed by ``'  \\n'``. When no HTML rendering is
      available the element's plain text is kept instead of failing.
    * Any resulting text that starts with the ``CODE::`` marker is wrapped in a
      fenced code block with the marker removed.

    Args:
        element: An element exposing ``text``, ``category`` and ``metadata``.

    Returns:
        The rendered text.
    """
    code_marker = "CODE::"
    title_message = get_title_data(element)
    text = element.text

    if title_message:
        title_tags = '#' * (title_message['category_depth'] + 1)
        text = title_tags + ' ' + text
        # Presumably removes a trailing '#' left over from the heading's
        # permalink anchor text.
        text = text.rstrip('#')

    elif element.category == ElementType.LIST_ITEM:
        text = '- ' + text

    elif element.category == ElementType.TABLE:
        html = element.metadata.text_as_html
        if html:
            # Pipes inside cells would be read as column separators in the
            # Markdown table, so they are dropped first.
            html = html.replace('|', '')
            df = pd.read_html(StringIO(html))[0]
            text = df.to_markdown(index=False)
            text = text + '  \n'

    if text.startswith(code_marker):
        text = f"```\n{text[len(code_marker):]}\n```"

    return text

Define chunking functions.

In [7]:
def get_chunk_texts(text, chunk_size=1000, overlap_size=300):
    """Split ``text`` into fixed-size windows that overlap their predecessor.

    Each chunk holds at most ``chunk_size`` characters and, except for the
    first, begins ``overlap_size`` characters before the end of the previous
    chunk. The final chunk ends exactly at the end of ``text``.

    Args:
        text: The string to split.
        chunk_size: Maximum length of each chunk; must be positive.
        overlap_size: Characters shared between consecutive chunks; must
            satisfy ``0 <= overlap_size < chunk_size``.

    Returns:
        A list of chunk strings; empty when ``text`` is empty.

    Raises:
        ValueError: If the size parameters would prevent the window from
            advancing (previously this looped forever).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap_size < chunk_size:
        raise ValueError(
            f"overlap_size must satisfy 0 <= overlap_size < chunk_size, "
            f"got overlap_size={overlap_size}, chunk_size={chunk_size}"
        )

    text_length = len(text)
    chunks = []
    start = 0

    while start < text_length:
        end = min(start + chunk_size, text_length)
        chunks.append(text[start:end])
        if end >= text_length:
            break
        # Step back so the next window re-reads the tail of this one.
        start = end - overlap_size

    return chunks

from collections import defaultdict
def get_chunks(elements):
    """Group elements under their heading path and split each group into chunks.

    Elements are walked in order. Every linked title (see ``get_title_data``)
    opens a new section whose key is the ``::``-joined path of title links
    starting at ``root``: a deeper title extends the current path, while a
    title at the same or a shallower depth replaces the path from that depth
    on. Non-title elements are added to the section that is currently open.

    For each section, the title elements of its ancestors (excluding ``root``
    and the section itself) are placed in front of the section's own elements,
    everything is rendered with ``element2text`` and joined by blank lines, and
    the result is split with ``get_chunk_texts``.

    NOTE(review): the return value is a list of plain strings. The
    ``vector_search`` and ``qa`` display cells later in this notebook index
    each stored chunk with ``'url'``, ``'text'`` and ``'content'`` keys, which
    strings do not provide. The previous version built such a dict here
    (url = last path segment, text = chunk, content = whole section) but never
    returned it; the string return type is kept so the embedding listener
    still receives text.

    Args:
        elements: Iterable of parsed page elements.

    Returns:
        list[str]: chunk texts for all sections, in section order.
    """
    chunk_tree = defaultdict(list)
    now_depth = -1
    now_path = 'root'

    for element in elements:
        title_data = get_title_data(element)
        if title_data:
            link = title_data['link']
            depth = title_data['category_depth']
            if depth > now_depth:
                now_path = now_path + "::" + link
            else:
                # Keep 'root' plus `depth` ancestors, then attach this title.
                now_path = '::'.join(now_path.split("::")[:depth + 1] + [link])
            now_depth = depth
        chunk_tree[now_path].append(element)

    chunks = []
    for node_path, node_elements in chunk_tree.items():
        nodes = node_path.split("::")

        parent_elements = []
        for i in range(1, len(nodes) - 1):
            # .get avoids inserting a key into the defaultdict while it is
            # being iterated.
            ancestor_elements = chunk_tree.get("::".join(nodes[:i + 1]))
            if ancestor_elements and ancestor_elements[0]:
                parent_elements.append(ancestor_elements[0])

        content = '\n\n'.join(map(element2text, [*parent_elements, *node_elements]))
        chunks.extend(get_chunk_texts(content))

    return chunks

Define a chunking model and add a Listener to listen to data and chunk webpages.

In [8]:
from superduperdb import Model, Listener, Schema


# Wrap get_chunks as a model. flatten=True is set because get_chunks returns a
# list (presumably each item is then stored as its own output record -- TODO
# confirm against the superduperdb version in use).
chunk_model = Model(
    identifier='chunk',
    object=get_chunks,
    flatten=True,
    model_update_kwargs={"document_embedded": False},
    output_schema=Schema(identifier="myschema", fields={"text": "string"}),
)

# Run the chunk model over the "elements" field of every document in "pages".
# The recorded output names the resulting listener 'chunk/elements'.
db.add(
    Listener(
        model=chunk_model,
        select=Collection('pages').find(),
        key="elements",
    )
)

[32m 2024-Mar-04 21:49:51.04[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36msuperduperdb.components.component[0m:[36m333 [0m | [1mInitializing DataType : dill[0m
[32m 2024-Mar-04 21:49:51.04[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36msuperduperdb.components.component[0m:[36m336 [0m | [1mInitialized  DataType : dill successfully[0m
[32m 2024-Mar-04 21:49:51.09[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36msuperduperdb.backends.local.compute[0m:[36m32  [0m | [1mSubmitting job. function:<function method_job at 0x12f494430>[0m


1it [00:00, 689.17it/s]

[32m 2024-Mar-04 21:49:51.10[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36msuperduperdb.components.component[0m:[36m333 [0m | [1mInitializing ObjectModel : chunk[0m
[32m 2024-Mar-04 21:49:51.10[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36msuperduperdb.components.component[0m:[36m336 [0m | [1mInitialized  ObjectModel : chunk successfully[0m
[32m 2024-Mar-04 21:49:51.10[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36msuperduperdb.components.model[0m:[36m649 [0m | [1mAdding 1 model outputs to `db`[0m
[32m 2024-Mar-04 21:49:51.12[0m| [32m[1mSUCCESS [0m | [36m183eefeaab2d[0m| [36msuperduperdb.backends.local.compute[0m:[36m38  [0m | [32m[1mJob submitted.  function:<function method_job at 0x12f494430> future:0c2da506-8f0a-44f0-9f2a-d544cab83345[0m





([<superduperdb.jobs.job.ComponentJob at 0x291b115d0>],
 Listener(identifier='chunk/elements', key='elements', model=ObjectModel(identifier='chunk', signature='*args,**kwargs', datatype=None, output_schema=Schema(identifier='myschema', fields={'text': 'string'}), flatten=True, model_update_kwargs={'document_embedded': False}, metrics=(), validation_sets=None, predict_kwargs={}, object=<function get_chunks at 0x28e1b5b40>, num_workers=0), select=<superduperdb.backends.mongodb.query.MongoCompoundSelect[
     [92m[1mpages.find({}, {})[0m
 ] object at 0x291b13850>, active=True, predict_kwargs={}))

## Building Vector Search Feature Using OpenAIEmbedding Model

In [11]:
from superduperdb.ext.openai import OpenAIEmbedding
from tqdm import tqdm

from superduperdb.ext.openai import OpenAIEmbedding
from superduperdb import VectorIndex

def preprocess(x):
    """Pull the chunk text out of a dict of model outputs.

    Non-dict inputs are returned unchanged. For a dict, the entry with the
    greatest key (by sort order) is selected and its ``"text"`` value returned.
    """
    if not isinstance(x, dict):
        return x
    # For model chains, the logic of this key needs to be optimized.
    last_key = sorted(x)[-1]
    return x[last_key]["text"]

# Create an instance of the OpenAIEmbedding model with the specified identifier ('text-embedding-ada-002')
model = OpenAIEmbedding(
    identifier='text-embedding-ada-002',
    model="text-embedding-ada-002",
)
# preprocess = Model("preprocess", object=preprocess)

# from superduperdb.components.model import SequentialModel
# emb_model = SequentialModel(identifier="emb", predictors=[preprocess, model])


db.add(
    VectorIndex(
        identifier='vector_index',
        indexing_listener=Listener(
            select=Collection('_outputs.elements.chunk').find(),
            key='_outputs.elements.chunk',  # Key for the documents
            model=model,  # Specify the model for processing
            predict_kwargs={"max_chunk_size": 64},
        ),
    )
)


[2024-03-04 21:53:16] httpx INFO HTTP Request: GET https://api.openai.com/v1/models "HTTP/1.1 200 OK"


[32m 2024-Mar-04 21:53:16.83[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36msuperduperdb.backends.local.compute[0m:[36m32  [0m | [1mSubmitting job. function:<function method_job at 0x12f494430>[0m


6it [00:00, 2459.28it/s]
  0%|                                                                           | 0/1 [00:00<?, ?it/s][2024-03-04 21:53:17] httpx INFO HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
100%|███████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.18s/it]


[32m 2024-Mar-04 21:53:18.05[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36msuperduperdb.components.model[0m:[36m649 [0m | [1mAdding 6 model outputs to `db`[0m
[32m 2024-Mar-04 21:53:18.08[0m| [32m[1mSUCCESS [0m | [36m183eefeaab2d[0m| [36msuperduperdb.backends.local.compute[0m:[36m38  [0m | [32m[1mJob submitted.  function:<function method_job at 0x12f494430> future:6352ce50-2098-460e-a965-51216908a4c8[0m
[32m 2024-Mar-04 21:53:18.09[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36msuperduperdb.backends.local.compute[0m:[36m32  [0m | [1mSubmitting job. function:<function callable_job at 0x12f494700>[0m


KeyError: '_outputs'

Define a function for vector search.

# Create vector search and Chatbot applications

In [None]:
# Reconnect to the same hard-coded local MongoDB database used when importing
# the pages; unlike the earlier cells, nothing is dropped here.
db = superduper("mongodb://127.0.0.1:27017/intel-geti")

In [None]:
def vector_search(db, query, top_k=5):
    """Run a vector search over the stored chunks and print the matches.

    The query is compared against the ``'_outputs.elements.chunk'`` field via
    the ``"vector_index"`` index. Results are printed in descending score
    order; for each one the originating page is looked up through the
    result's ``'_source'`` id to build the full URL.

    NOTE(review): each chunk output is indexed with ``'url'`` and
    ``'content'`` keys, so this assumes chunks are stored as dicts -- confirm
    against what ``get_chunks`` actually returns.

    Args:
        db: Connected superduperdb datalayer.
        query: Natural-language query text.
        top_k: Number of nearest chunks to request.

    Returns:
        None. Results are printed.
    """
    # Imported locally: no other cell in this notebook imports logging, so the
    # call below previously raised NameError.
    import logging

    logging.info(f"Vector search query: {query}")
    collection = Collection('_outputs.elements.chunk')
    outs = db.execute(
        collection.like(
            Document({"_outputs.elements.chunk": query}),
            vector_index="vector_index",
            n=top_k,
        ).find({})
    )
    if outs:
        outs = sorted(outs, key=lambda x: x.content["score"], reverse=True)

    for out in outs:
        print("-" * 20, '\n')

        # '_source' holds the _id of the page document the chunk came from.
        source = out.content['_source']
        source_url = Collection('pages').find_one({"_id": source}).execute(db)['url']
        data = out.outputs("elements", 'chunk')
        url = source_url + data['url']
        print(url, out['score'])
        print(data["content"])

In [None]:
# Example query; vector_search prints the matching chunks and returns nothing.
vector_search(db, "What parameters does the DeployedModel class have?")

## Building Document Question Answering Using the ChatGPT Model

In [None]:
from superduperdb.ext.openai import OpenAIChatCompletion
# Prompt template for the chat model. The {context} placeholder is presumably
# filled with the retrieved chunk contents at prediction time, and the user's
# question is appended after "Question: " -- TODO confirm against
# OpenAIChatCompletion's prompt handling.
prompt = """
As an Intel GETI assistant, based on the provided documents and the question, answer the question.
If the document does not provide an answer, offer a safe response without fabricating an answer.

Documents:
{context}

Question: """

# The identifier 'gpt-3.5-turbo' is the name the qa function passes to db.predict.
llm = OpenAIChatCompletion(identifier='gpt-3.5-turbo', prompt=prompt)

db.add(llm)

# List the registered models as a check that the LLM was added.
print(db.show('model'))

In [None]:
def qa(db, query, vector_search_top_k=5):
    """Answer ``query`` with the chat model, using retrieved chunks as context.

    The ``'gpt-3.5-turbo'`` model registered in ``db`` is called through
    ``db.predict``; its context is selected by a vector search on the
    ``'_outputs.elements.chunk'`` field using the ``"vector_index"`` index.

    Args:
        db: Connected superduperdb datalayer.
        query: The user's question.
        vector_search_top_k: Number of chunks to retrieve as context.

    Returns:
        tuple: ``(output, sources)`` as returned by ``db.predict``, with
        ``sources`` sorted by descending ``content["score"]`` when non-empty.
    """
    # Imported locally: no other cell in this notebook imports logging, so the
    # call below previously raised NameError.
    import logging

    logging.info(f"QA query: {query}")
    collection = Collection("_outputs.elements.chunk")
    output, sources = db.predict(
        model_name='gpt-3.5-turbo',
        input=query,
        context_select=collection.like(
            Document({"_outputs.elements.chunk": query}),
            vector_index="vector_index",
            n=vector_search_top_k,
        ).find({}),
        # NOTE(review): this path reads a 'content' key from the chunk output,
        # i.e. it assumes chunks are stored as dicts -- confirm.
        context_key="_outputs.elements.chunk.0.content",
    )
    if sources:
        sources = sorted(sources, key=lambda x: x.content["score"], reverse=True)
    return output, sources


In [None]:
from IPython.display import Markdown, display

# Ask one example question, render the answer as Markdown, then list source URLs.
output, sources = qa(db, "What parameters does the DeployedModel class have?")
display(Markdown(output.content))
for source in sources:
    # '_source' holds the _id of the page document the chunk came from.
    source_data = source.content['_source']
    source_url = Collection('pages').find_one({"_id": source_data}).execute(db)['url']
    data = source.outputs("elements", 'chunk')
    # NOTE(review): assumes the chunk output is a dict with a 'url' key -- confirm.
    url = source_url + data['url']
    print(url)

In [None]:
# NOTE(review): relies on `source` leaking out of the loop in the previous cell
# (the last source); it is undefined if `sources` was empty.
source.outputs("elements", 'chunk')["text"]