# INTEL GETI Docs Chatbot

Install related dependencies

In [1]:
# !pip install superduperdb unstructured pandas openai aiohttp

## Crawling Pages

Crawl pages based on the provided links. Additionally, retrieve a list of new pages from the sidebar directory information and continue crawling until all pages have been crawled.

In [2]:
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin

def is_toctree_class(tag):
    classes = tag.get('class', [])
    return any(re.match('toctree-l\d+', cls) for cls in classes)

def filter_sub_urls(all_urls):
    # remove the URL with #, for example: http://xxxx.com/xxx#P1
    base_urls_set = {url for _, url in all_urls if '#' not in url}
    new_urls = []
    for page_name, url in all_urls:
        if '#' in url and url.split('#')[0] in base_urls_set:
            continue
        else:
            new_urls.append((page_name, url))
    return new_urls

def process_code_snippets(text):
    soup = BeautifulSoup(text, 'html.parser')
    pre_tags = soup.find_all('pre')

    for pre in pre_tags:
        processed_text = str(pre.text)
        new_content = "CODE::"+soup.new_string(processed_text)
        pre.clear()
        pre.append(new_content)
    return str(soup)
            
    
def parse_url(seed_url):
    print(f"parse {seed_url}")
    response = requests.get(seed_url)
    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')
    page_urls = []
    for l in soup.find_all(is_toctree_class):
        page_name = l.find('a').text.strip()
        href = l.find('a')['href'] if l.find('a') else ''
        if href:
            url = urljoin(seed_url, href)
            page_urls.append((page_name, url))

    page_urls = filter_sub_urls(page_urls)
    source_html = response.text
    source_html = process_code_snippets(source_html)
            
    return source_html, page_urls

# URL of the page to scrape
filter_tag = "geti_sdk."
url_sets = set()
url_sets.add("https://openvinotoolkit.github.io/geti-sdk/index.html")
url_sets.add("https://docs.geti.intel.com/on-prem/1.8/guide/get-started/introduction.html")
url_waiting_list = url_sets.copy()
pages = list()
while url_waiting_list:
    url = url_waiting_list.pop()
    source_html, page_urls = parse_url(url)
    pages.append((url, source_html))
    break
    new_urls = {url for _, url in page_urls if url not in url_sets}
    new_urls = {url for url in new_urls if filter_tag not in url}
    url_waiting_list.update(new_urls)
    url_sets.update(new_urls)

parse https://openvinotoolkit.github.io/geti-sdk/index.html


## Importing Webpage Data into Database

### Using SuperduperDB to Connect to Database

In [3]:
from superduperdb import superduper
db = superduper("mongodb://127.0.0.1:27017/intel-geti")
db.drop(force=True)

  from .autonotebook import tqdm as notebook_tqdm
2024-02-19 19:59:53,006	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


[32m 2024-Feb-19 19:59:53.01[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m1cd33afc-bd71-4b70-bdcf-eac97e6bf5b6[0m| [36msuperduperdb.base.build[0m:[36m61  [0m | [1mData Client is ready. MongoClient(host=['127.0.0.1:27017'], document_class=dict, tz_aware=False, connect=True, serverselectiontimeoutms=5000)[0m
[32m 2024-Feb-19 19:59:53.02[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m1cd33afc-bd71-4b70-bdcf-eac97e6bf5b6[0m| [36msuperduperdb.base.build[0m:[36m36  [0m | [1mConnecting to Metadata Client with engine:  MongoClient(host=['127.0.0.1:27017'], document_class=dict, tz_aware=False, connect=True, serverselectiontimeoutms=5000)[0m
[32m 2024-Feb-19 19:59:53.02[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m1cd33afc-bd71-4b70-bdcf-eac97e6bf5b6[0m| [36msuperduperdb.base.build[0m:[36m144 [0m | [1mConnecting to compute client: local[0m
[32m 2024-Feb-19 19:59:53.02[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m1cd33afc-bd71-4b70-bdcf-eac97e6bf

Store the webpage data into the database after unstructured parsing.

In [None]:
from unstructured.partition.html import partition_html
from superduperdb.ext.unstructured.encoder import unstructured_encoder

db.add(unstructured_encoder)

datas = []
for url, source_html in pages:
    elements = partition_html(text=source_html, html_assemble_articles=True)
    if elements:
        datas.append({'url': url, 'elements': unstructured_encoder(elements)})

from superduperdb import Document
from superduperdb.backends.mongodb import Collection
documents = list(map(Document, datas))
collection = Collection("pages")
collection.insert_many(documents).execute(db)

## Parsing and Chunking Webpages

Define an title ecognition function to be used as chunk identifiers.

In [None]:
from unstructured.documents.elements import ElementType

def get_title_data(element):
    data = {}
    if element.category != ElementType.TITLE:
        return data
    if 'link_urls' not in element.metadata.to_dict():
        return data

    if 'category_depth' not in element.metadata.to_dict():
        return data

    [link_text, *_] = element.metadata.link_texts

    if not link_text:
        return data

    link_urls = element.metadata.link_urls
    if not link_urls:
        return data
    category_depth = element.metadata.category_depth
    return {'link': link_urls[0], 'category_depth':category_depth}

Define conversion methods for different types of text, such as titles, lists, tables, and code.


In [6]:
import pandas as pd
from io import StringIO
def element2text(element):
    title_message = get_title_data(element)
    text = element.text
    if title_message:
        title_tags = '#' * (title_message['category_depth'] + 1)
        text = title_tags + ' ' + text
        text = text.rstrip('#')

    elif element.category == ElementType.LIST_ITEM:
        text = '- ' + text

    elif element.category == ElementType.TABLE:
        html = element.metadata.text_as_html
        html = html.replace('|', '')
        df = pd.read_html(StringIO(html))[0]
        text = df.to_markdown(index=False)
        text = text + '  \n'

    if text.startswith("CODE::"):
        text = f"```\n{text[6:]}\n```"

    return text

Define chunking functions.

In [7]:
from collections import defaultdict
def get_chunks(elements):
    chunk_tree = defaultdict(list)
    now_depth = -1
    now_path = 'root'
    for element in elements:
        title_data = get_title_data(element)
        if not title_data:
            chunk_tree[now_path].append(element)
        else:
            link = title_data['link']
            depth = title_data['category_depth']
            if depth > now_depth:
                now_path = now_path + "::" +link
            else:
                now_path = '::'.join(now_path.split("::")[:depth+1] + [link])
            now_depth = depth
            chunk_tree[now_path].append(element)
     
    chunks = []
    for node_path, node_elements in chunk_tree.items():
        new_elements = []
        nodes = node_path.split("::")
        parent_elements = []
        for i in range(1, len(nodes) - 1):
            [parent_element, *_] = chunk_tree["::".join(nodes[:i+1])] or [None]
            if parent_element:
                parent_elements.append(parent_element)
        node_elements = [*parent_elements, *node_elements]
        chunk = {"url": nodes[-1], 'text': '\n\n'.join(map(lambda x: element2text(x), node_elements))}
        chunks.append(chunk)
    return chunks

Define a chunking model and add a Listener to listen to data and chunk webpages.

In [8]:
from superduperdb import Model, Listener, Schema


chunk_model = Model(
    identifier='chunk',
    object=get_chunks,
    flatten=True,
    model_update_kwargs={"document_embedded": False},
    output_schema=Schema(identifier="myschema", fields={"text": "string"}),
)

db.add(
    Listener(
        model=chunk_model,
        select=Collection('pages').find(),
        key="elements",
    )
)

1it [00:00, 747.25it/s]

[32m 2024-Feb-19 19:48:40.72[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36mab6ab314-5767-4bfb-aac6-1a2084725da3[0m| [36msuperduperdb.components.model[0m:[36m477 [0m | [1mAdding 1 model outputs to `db`[0m





([None],
 Listener(identifier='chunk/elements', key='elements', model=Model(identifier='chunk', encoder=None, output_schema=Schema(identifier='myschema', fields={'text': 'string', '_fold': FieldType(identifier='String')}), flatten=True, preprocess=None, postprocess=None, collate_fn=None, batch_predict=False, takes_context=False, metrics=(), model_update_kwargs={'document_embedded': False}, validation_sets=None, predict_X=None, predict_select=None, predict_max_chunk_size=None, predict_kwargs=None, object=<Artifact artifact=<function get_chunks at 0x292c0eb90> serializer=dill>, model_to_device_method=None, metric_values={}, predict_method=None, serializer='dill', device='cpu', preferred_devices=('cuda', 'mps', 'cpu'), training_configuration=None, train_X=None, train_y=None, train_select=None), select=<superduperdb.backends.mongodb.query.MongoCompoundSelect[
     [92m[1mpages.find({'_id': "{'$in': '[65d34017d9885df39a4b3843]'}"}, {})[0m
 ] object at 0x2966423b0>, active=True, predict_k

## Building Vector Search Feature Using OpenAIEmbedding Model

In [15]:
from superduperdb.ext.openai import OpenAIEmbedding
from superduperdb.base.artifact import Artifact
from tqdm import tqdm
def _predict(self, X, one: bool = False, **kwargs):
    if isinstance(X, str) or one:
        if isinstance(self.preprocess, Artifact):
            X = self.preprocess.artifact(X)
        return self._predict_one(X)

    if isinstance(self.preprocess, Artifact):
        X = [self.preprocess.artifact(i) for i in X]

    out = []
    batch_size = kwargs.pop("batch_size", 100)
    for i in tqdm(range(0, len(X), batch_size)):
        out.extend(self._predict_a_batch(X[i : i + batch_size], **kwargs))
    return out


OpenAIEmbedding._predict = _predict

from superduperdb.ext.openai import OpenAIEmbedding
from superduperdb.base.artifact import Artifact
from superduperdb import VectorIndex

def preprocess(x):
    print(x)
    if isinstance(x, dict):
        # For model chains, the logic of this key needs to be optimized.
        chunk = sorted(x.items())[-1][1]
        return chunk["text"]
    return x

# Create an instance of the OpenAIEmbedding model with the specified identifier ('text-embedding-ada-002')
model = OpenAIEmbedding(
    identifier='text-embedding-ada-002',
    model="text-embedding-ada-002",
    preprocess=Artifact(preprocess),
)


db.add(
    VectorIndex(
        identifier='vector_index',
        indexing_listener=Listener(
            select=Collection('pages').find().outputs(elements='chunk'),
            key='_outputs.elements.chunk',  # Key for the documents
            model=model,  # Specify the model for processing
            predict_kwargs={"max_chunk_size": 64},
        ),
    )
)


InvalidDocument: documents must have only string keys, key was $_outputs.elements.chunk.{version}

Define a function for vector search.

In [None]:
def vector_search(db, query, top_k=5):
    logging.info(f"Vector search query: {query}")
    collection = Collection('_outputs.elements.chunk')
    outs = db.execute(
        collection.like(
            Document({"_outputs.elements.chunk": query}),
            vector_index="vector_index",
            n=top_k,
        ).find({})
    )
    if outs:
        outs = sorted(outs, key=lambda x: x.content["score"], reverse=True)
    for out in outs:
        print("-" * 20, '\n')
        data = out.outputs("elements", 'chunk')
    
        source = out.content['_source']
        source_url = Collection('pages').find_one({"_id": source}).execute(db)['url']
        data = out.outputs("elements", 'chunk')
        url = source_url + data['url']
        print(url, out['score'])
        print(data["text"])

In [41]:
# db = superduper("mongodb://127.0.0.1:27017/intel-geti")

In [32]:
outs = vector_search(db, "Learn how to interact with the Intel® Geti™ platform programmatically, bypassing the user interface.")


[2024-02-19 16:29:19] root INFO Vector search query: Learn how to interact with the Intel® Geti™ platform programmatically, bypassing the user interface.
[2024-02-19 16:29:19] httpx INFO HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


[32m 2024-Feb-19 16:29:19.80[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m84b42ae7-76eb-459e-97d7-5d784a1233ee[0m| [36msuperduperdb.base.datalayer[0m:[36m154 [0m | [1mloading of vectors of vector-index: 'vector_index'[0m
[32m 2024-Feb-19 16:29:19.80[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m84b42ae7-76eb-459e-97d7-5d784a1233ee[0m| [36msuperduperdb.base.datalayer[0m:[36m170 [0m | [1m<superduperdb.backends.mongodb.query.MongoCompoundSelect[
    [92m[1m_outputs.elements.chunk.elements.chunk.find({}, {'_outputs.elements.text-embedding-ada-002.0': '1', '_outputs.elements.text-embedding-ada-002/0': '1', '_id': '1'})[0m
] object at 0x29d0dcbe0>[0m


Loading vectors into vector-table...: 370it [00:00, 778.33it/s]

-------------------- 

https://docs.geti.intel.com/on-prem/1.8/guide/get-started/introduction.html#introduction 0.8708402222041286
# Introduction

Welcome to IntelÂ® Getiâ¢ documentation! Our documentation is broadly divided into the following blocks on the sidebar:

- Get Started - Discover what the IntelÂ® Getiâ¢ platform is, eplore its capabilities, and learn how to train your very first model using the IntelÂ® Getiâ¢ platform.

- Understanding the UI - Discover the core functionality of the IntelÂ® Getiâ¢ platform and get to know your way around the user interface.

- REST API - Learn how to interact with the IntelÂ® Getiâ¢ platform programmatically, bypassing the user interface.

- Additional Resources - Understand the fundamental concepts of artificial intelligence, and learn how to optimize the use of the IntelÂ® Getiâ¢ platform with IntelÂ® Hardware, among other things.

- On-prem Installation - Dive into the step-by-step process of installing and configuring the IntelÂ® 




## Building Document Functionality Using ChatGPT Model

In [33]:
from superduperdb.ext.openai import OpenAIChatCompletion
prompt = """
As an Intel GETI assistant, based on the provided document snippets and the question, answer the question.
If the document does not provide an answer, offer a safe response without fabricating an answer.

Document snippet: {context}

Question: """

llm = OpenAIChatCompletion(identifier='gpt-3.5-turbo', prompt=prompt)

db.add(llm)

print(db.show('model'))

['chunk', 'gpt-3.5-turbo', 'text-embedding-ada-002']


In [46]:
def qa(db, query, vector_search_top_k=5):
    logging.info(f"QA query: {query}")
    collection = Collection("_outputs.elements.chunk")
    output, sources = db.predict(
        model_name='gpt-3.5-turbo',
        input=query,
        context_select=collection.like(
            Document({"_outputs.elements.chunk": query}),
            vector_index="vector_index",
            n=vector_search_top_k,
        ).find({}),
        context_key="_outputs.elements.chunk.0.text",
    )
    if sources:
        sources = sorted(sources, key=lambda x: x.content["score"], reverse=True)
    return output, sources


In [52]:
def process_sources(sources):
    results = []
    for source in sources:
        source_data = source.content["_source"]
        source_url = (
            Collection("pages").find_one({"_id": source_data}).execute(db)["url"]
        )
        data = source.outputs("elements", "chunk")
        url = source_url + data["url"]
        results.append(
            {
                "score": source["score"],
                "url": url,
            }
        )
    return results


In [53]:
process_sources(sources)

[{'score': 0.8585718976409569,
  'url': 'https://docs.geti.intel.com/on-prem/1.8/guide/installation-guide/installation.html#installation-at-a-glance'},
 {'score': 0.8243211971856605,
  'url': 'https://docs.geti.intel.com/on-prem/1.8/guide/installation-guide/installation.html#installation-guide'},
 {'score': 0.8153833191961699,
  'url': 'https://openvinotoolkit.github.io/geti-sdk/getting_started.html#installing-from-the-git-repo'},
 {'score': 0.8080372840084254,
  'url': 'https://docs.geti.intel.com/on-prem/1.8/guide/installation-guide/installation/downloading-package.html#part-2-downloading-the-package'},
 {'score': 0.8061480481045303,
  'url': 'https://docs.geti.intel.com/on-prem/1.8/guide/installation-guide/installation/install-guide.html#installing-the-intel-geti-platform'}]

In [50]:
from IPython.display import Markdown, display

output, sources = qa(db, "How to install")
display(Markdown(output.content))
for source in sources:
    source_data = source.content['_source']
    source_url = Collection('pages').find_one({"_id": source_data}).execute(db)['url']
    data = out.outputs("elements", 'chunk')
    url = source_url + data['url']
    print(url)

[2024-02-19 16:55:51] root INFO QA query: How to install
[2024-02-19 16:55:52] httpx INFO HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[2024-02-19 16:55:57] httpx INFO HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


To install the Intel GETI platform, you can follow these general steps:

1. Ensure that all hardware, software, network, and security prerequisites are met on your Ubuntu machine.
2. Download and extract the Intel GETI installation package on your server.
3. Navigate to the extracted folder and choose between installation modes: with wizard (interactive) or with configuration file (pre-filled data).
4. Follow the installation instructions provided in the platform installation package.

For specific and detailed installation instructions, please refer to the official Intel GETI platform documentation or installation guide.

https://docs.geti.intel.com/on-prem/1.8/guide/installation-guide/installation.html#installing-the-intel-geti-platform
https://docs.geti.intel.com/on-prem/1.8/guide/installation-guide/installation.html#installing-the-intel-geti-platform
https://openvinotoolkit.github.io/geti-sdk/getting_started.html#installing-the-intel-geti-platform
https://docs.geti.intel.com/on-prem/1.8/guide/installation-guide/installation/downloading-package.html#installing-the-intel-geti-platform
https://docs.geti.intel.com/on-prem/1.8/guide/installation-guide/installation/install-guide.html#installing-the-intel-geti-platform
