# INTEL GETI Docs Chatbot

Install related dependencies

In [1]:
# !pip install superduperdb unstructured pandas openai aiohttp

## Crawling Pages

Crawl pages based on the provided links. Additionally, retrieve a list of new pages from the sidebar directory information and continue crawling until all pages have been crawled.

In [2]:
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin

def is_toctree_class(tag):
    """Return True if ``tag`` carries a Sphinx sidebar class such as ``toctree-l1``.

    Intended as a filter callable for ``soup.find_all``.

    Args:
        tag: Any object exposing ``get('class', default)`` that yields an
            iterable of class-name strings.

    Returns:
        bool: True when at least one class name starts with ``toctree-l``
        followed by one or more digits.
    """
    classes = tag.get('class', [])
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    return any(re.match(r'toctree-l\d+', cls) for cls in classes)

def filter_sub_urls(all_urls):
    """Drop fragment links whose base page is already in the list.

    A URL such as ``http://xxxx.com/xxx#P1`` is removed only when
    ``http://xxxx.com/xxx`` itself also appears in ``all_urls``; fragment
    URLs without a listed base page are kept.

    Args:
        all_urls: Sequence of ``(page_name, url)`` pairs.

    Returns:
        list: The surviving ``(page_name, url)`` pairs, in input order.
    """
    pages_without_fragment = set()
    for _, candidate in all_urls:
        if '#' not in candidate:
            pages_without_fragment.add(candidate)

    def is_redundant(candidate):
        # A fragment link is redundant when the page it points into is listed.
        base, separator, _ = candidate.partition('#')
        return bool(separator) and base in pages_without_fragment

    return [
        (page_name, candidate)
        for page_name, candidate in all_urls
        if not is_redundant(candidate)
    ]

def process_code_snippets(text):
    """Flatten every ``<pre>`` block to plain text prefixed with ``CODE::``.

    The marker lets a later stage recognise code snippets after the HTML
    structure has been discarded.

    Args:
        text: HTML source of a page.

    Returns:
        str: The re-serialised HTML with each ``<pre>`` body replaced.
    """
    soup = BeautifulSoup(text, 'html.parser')

    for pre_tag in soup.find_all('pre'):
        # Capture the text before clearing, since clear() removes the children.
        snippet = str(pre_tag.text)
        marked_snippet = "CODE::" + soup.new_string(snippet)
        pre_tag.clear()
        pre_tag.append(marked_snippet)

    return str(soup)
            
    
def process_py_class(source_html):
    """Rewrite Sphinx Python class signatures as an ``<h3>`` heading plus text.

    For every ``<dl class="py class">`` the signature ``<dt>`` is replaced by:

    * an ``<h3>`` containing a link titled ``Class: <id>`` that points at the
      signature's last ``headerlink`` anchor, and
    * an ``<a>`` element holding the original signature text.

    Signatures without an ``id`` attribute are left untouched, and a missing
    ``headerlink`` (or one without ``href``) yields an empty ``href`` instead
    of raising.

    Args:
        source_html: HTML source of a page.

    Returns:
        str: The re-serialised HTML.
    """
    soup = BeautifulSoup(source_html, 'html.parser')

    for dl in soup.find_all('dl', class_='py class'):
        dt_tag = dl.find('dt', class_='sig sig-object py')
        if not dt_tag:
            continue

        class_id = dt_tag.get('id')
        if not class_id:
            # Nothing to title the heading with; keep the original markup.
            continue

        # find_all() returns an empty list when there is no headerlink, so
        # check before indexing rather than after.
        headerlinks = dt_tag.find_all('a', class_='headerlink')
        href = headerlinks[-1].get('href', '') if headerlinks else ''

        new_h3 = soup.new_tag("h3")
        new_a_inside_h3 = soup.new_tag("a", href=href)
        new_a_inside_h3.string = f"Class: {class_id}"
        new_h3.append(new_a_inside_h3)

        # The signature text is carried in an <a> element, as in the original.
        new_code = soup.new_tag("a")
        new_code.string = dt_tag.text

        dt_tag.insert_before(new_h3)
        dt_tag.insert_before(new_code)
        dt_tag.decompose()

    return str(soup)

def parse_url(seed_url):
    """Download one page and return its processed HTML plus sidebar links.

    Args:
        seed_url: Absolute URL of the page to fetch.

    Returns:
        tuple: ``(source_html, page_urls)`` where ``source_html`` is the page
        HTML after ``process_code_snippets`` and ``process_py_class``, and
        ``page_urls`` is a list of ``(page_name, absolute_url)`` pairs taken
        from the toctree entries, filtered by ``filter_sub_urls``.
    """
    print(f"parse {seed_url}")
    response = requests.get(seed_url)
    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    page_urls = []
    for entry in soup.find_all(is_toctree_class):
        # Look the anchor up once and check it before use: a toctree entry
        # without an <a> child would otherwise raise AttributeError.
        anchor = entry.find('a')
        if anchor is None:
            continue
        href = anchor.get('href', '')
        if href:
            # Sidebar hrefs are resolved against the page they appear on.
            page_urls.append((anchor.text.strip(), urljoin(seed_url, href)))

    page_urls = filter_sub_urls(page_urls)
    source_html = response.text
    source_html = process_code_snippets(source_html)
    source_html = process_py_class(source_html)

    return source_html, page_urls

# Seed URLs of the documentation sites to scrape.
# `url_sets` holds every URL discovered so far; `url_waiting_list` holds the
# ones that still have to be fetched.
url_sets = {
    "https://openvinotoolkit.github.io/geti-sdk/index.html",
    "https://docs.geti.intel.com/on-prem/1.8/guide/get-started/introduction.html",
}
url_waiting_list = set(url_sets)
pages = []
while url_waiting_list:
    url = url_waiting_list.pop()
    source_html, page_urls = parse_url(url)
    pages.append((url, source_html))
    # Queue only links that have not been seen before.
    new_urls = {link for _, link in page_urls} - url_sets
    url_waiting_list |= new_urls
    url_sets |= new_urls

parse https://docs.geti.intel.com/on-prem/1.8/guide/get-started/introduction.html
parse https://docs.geti.intel.com/on-prem/1.8/guide/account-management/account-management.html
parse https://docs.geti.intel.com/on-prem/1.8/guide/additional-resources/openvino/test-optimize-deploy-openvino.html
parse https://docs.geti.intel.com/on-prem/1.8/guide/datasets/statistics.html
parse https://docs.geti.intel.com/on-prem/1.8/guide/additional-resources/ai-fundamentals/anomaly-classification-project.html
parse https://docs.geti.intel.com/on-prem/1.8/guide/rest-api/rest-api-redirect.html
parse https://docs.geti.intel.com/on-prem/1.8/guide/annotations/video-annotation.html
parse https://docs.geti.intel.com/on-prem/1.8/guide/installation-guide/additional-resources/uninstall-guide.html
parse https://docs.geti.intel.com/on-prem/1.8/guide/installation-guide/installation.html
parse https://docs.geti.intel.com/on-prem/1.8/guide/datasets/media.html
parse https://docs.geti.intel.com/on-prem/1.8/guide/get-star

## Importing Webpage Data into Database

### Using SuperduperDB to Connect to Database

In [3]:
from superduperdb import superduper
# Connect to the local MongoDB server, using the "intel-geti" database.
db = superduper("mongodb://127.0.0.1:27017/intel-geti")
# NOTE(review): presumably wipes existing data and metadata in this database so
# the notebook starts from a clean state; force=True looks like it skips the
# confirmation prompt — confirm before pointing this at a shared database.
db.drop(force=True)

  from .autonotebook import tqdm as notebook_tqdm
2024-02-28 15:00:34,663	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


[32m 2024-Feb-28 15:00:34.67[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.base.build[0m:[36m61  [0m | [1mData Client is ready. MongoClient(host=['127.0.0.1:27017'], document_class=dict, tz_aware=False, connect=True, serverselectiontimeoutms=5000)[0m
[32m 2024-Feb-28 15:00:34.68[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.base.build[0m:[36m36  [0m | [1mConnecting to Metadata Client with engine:  MongoClient(host=['127.0.0.1:27017'], document_class=dict, tz_aware=False, connect=True, serverselectiontimeoutms=5000)[0m
[32m 2024-Feb-28 15:00:34.68[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.base.build[0m:[36m144 [0m | [1mConnecting to compute client: local[0m
[32m 2024-Feb-28 15:00:34.68[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52

Parse each crawled page with `unstructured` and store the resulting elements in the database.

In [4]:
from unstructured.partition.html import partition_html
from superduperdb.ext.unstructured.encoder import unstructured_encoder

from superduperdb import Document
from superduperdb.backends.mongodb import Collection

# Register the encoder before inserting data that was encoded with it.
db.add(unstructured_encoder)

# One record per crawled page; pages that yield no elements are skipped.
datas = []
for url, source_html in pages:
    elements = partition_html(text=source_html, html_assemble_articles=True)
    if not elements:
        continue
    datas.append({'url': url, 'elements': unstructured_encoder(elements)})

documents = [Document(data) for data in datas]
collection = Collection("pages")
collection.insert_many(documents).execute(db)

[2024-02-28 15:00:37] unstructured INFO Reading document from string ...
[2024-02-28 15:00:37] unstructured INFO Reading document ...
[2024-02-28 15:00:37] unstructured INFO Reading document from string ...
[2024-02-28 15:00:37] unstructured INFO Reading document ...
[2024-02-28 15:00:37] unstructured INFO Reading document from string ...
[2024-02-28 15:00:37] unstructured INFO Reading document ...
[2024-02-28 15:00:37] unstructured INFO Reading document from string ...
[2024-02-28 15:00:37] unstructured INFO Reading document ...
[2024-02-28 15:00:37] unstructured INFO Reading document from string ...
[2024-02-28 15:00:37] unstructured INFO Reading document ...
[2024-02-28 15:00:37] unstructured INFO Reading document from string ...
[2024-02-28 15:00:37] unstructured INFO Reading document ...
[2024-02-28 15:00:37] unstructured INFO Reading document from string ...
[2024-02-28 15:00:37] unstructured INFO Reading document ...
[2024-02-28 15:00:37] unstructured INFO Reading document from 

[ObjectId('65deda17b9b5b0b929a00663'),
 ObjectId('65deda17b9b5b0b929a00664'),
 ObjectId('65deda17b9b5b0b929a00665'),
 ObjectId('65deda17b9b5b0b929a00666'),
 ObjectId('65deda17b9b5b0b929a00667'),
 ObjectId('65deda17b9b5b0b929a00668'),
 ObjectId('65deda17b9b5b0b929a00669'),
 ObjectId('65deda17b9b5b0b929a0066a'),
 ObjectId('65deda17b9b5b0b929a0066b'),
 ObjectId('65deda17b9b5b0b929a0066c'),
 ObjectId('65deda17b9b5b0b929a0066d'),
 ObjectId('65deda17b9b5b0b929a0066e'),
 ObjectId('65deda17b9b5b0b929a0066f'),
 ObjectId('65deda17b9b5b0b929a00670'),
 ObjectId('65deda17b9b5b0b929a00671'),
 ObjectId('65deda17b9b5b0b929a00672'),
 ObjectId('65deda17b9b5b0b929a00673'),
 ObjectId('65deda17b9b5b0b929a00674'),
 ObjectId('65deda17b9b5b0b929a00675'),
 ObjectId('65deda17b9b5b0b929a00676'),
 ObjectId('65deda17b9b5b0b929a00677'),
 ObjectId('65deda17b9b5b0b929a00678'),
 ObjectId('65deda17b9b5b0b929a00679'),
 ObjectId('65deda17b9b5b0b929a0067a'),
 ObjectId('65deda17b9b5b0b929a0067b'),
 ObjectId('65deda17b9b5b0

## Parsing and Chunking Webpages

Define a title recognition function; the recognized titles are used as chunk identifiers.

In [5]:
from unstructured.documents.elements import ElementType

def get_title_data(element):
    """Extract the link and heading depth of a linked title element.

    An element counts as a linked title when its category is
    ``ElementType.TITLE``, its metadata dict contains both ``link_urls`` and
    ``category_depth``, its first link text is non-empty, and it has at least
    one link URL.

    Args:
        element: An ``unstructured`` element with ``category`` and ``metadata``.

    Returns:
        dict: ``{'link': <first link url>, 'category_depth': <depth>}`` for a
        linked title, otherwise an empty dict.
    """
    if element.category != ElementType.TITLE:
        return {}

    metadata = element.metadata
    # Serialise once and reuse the result for both key checks.
    metadata_fields = metadata.to_dict()
    if 'link_urls' not in metadata_fields:
        return {}
    if 'category_depth' not in metadata_fields:
        return {}

    # Guard against a missing or empty link_texts: unpacking the first item
    # of an empty list (or None) would raise instead of returning {}.
    link_texts = getattr(metadata, 'link_texts', None)
    if not link_texts or not link_texts[0]:
        return {}

    link_urls = metadata.link_urls
    if not link_urls:
        return {}

    return {'link': link_urls[0], 'category_depth': metadata.category_depth}

Define conversion methods for different types of text, such as titles, lists, tables, and code.


In [6]:
import pandas as pd
from io import StringIO
def element2text(element):
    """Render one parsed element as Markdown-flavoured text.

    * Linked titles become ``#`` headings, one ``#`` per depth level plus one;
      trailing ``#`` characters are stripped from the result.
    * List items get a ``- `` prefix.
    * Tables are converted from their HTML form to a Markdown table, with
      ``|`` characters removed from the HTML first.
    * Any resulting text that starts with ``CODE::`` is wrapped in a fenced
      code block with the marker removed.

    Args:
        element: An ``unstructured`` element.

    Returns:
        str: The rendered text.
    """
    heading = get_title_data(element)
    rendered = element.text

    if heading:
        hashes = '#' * (heading['category_depth'] + 1)
        rendered = (hashes + ' ' + rendered).rstrip('#')
    elif element.category == ElementType.LIST_ITEM:
        rendered = '- ' + rendered
    elif element.category == ElementType.TABLE:
        # Pipes inside cells would clash with the Markdown column separator.
        table_html = element.metadata.text_as_html.replace('|', '')
        table_frame = pd.read_html(StringIO(table_html))[0]
        rendered = table_frame.to_markdown(index=False) + '  \n'

    marker = "CODE::"
    if rendered.startswith(marker):
        snippet = rendered[len(marker):]
        rendered = f"```\n{snippet}\n```"

    return rendered

Define chunking functions.

In [16]:
def get_chunk_texts(text, chunk_size=1000, overlap_size=300):
    """Split ``text`` into fixed-size windows that overlap their predecessor.

    Every chunk after the first starts ``overlap_size`` characters before the
    end of the previous chunk. The last chunk may be shorter than
    ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap_size: Number of characters shared between consecutive chunks;
            must satisfy ``0 <= overlap_size < chunk_size``.

    Returns:
        list[str]: The chunks in order; empty when ``text`` is empty.

    Raises:
        ValueError: If the sizes would prevent the window from advancing
            (previously an endless loop) or would skip text (negative overlap).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap_size < chunk_size:
        raise ValueError("overlap_size must satisfy 0 <= overlap_size < chunk_size")

    chunks = []
    text_length = len(text)
    start = 0
    while start < text_length:
        end = min(start + chunk_size, text_length)
        chunks.append(text[start:end])
        if end >= text_length:
            break
        # Step back so the next window re-reads the tail of this one.
        start = end - overlap_size

    return chunks

from collections import defaultdict
def get_chunks(elements):
    """Group page elements under their headings and cut each group into chunks.

    Elements are collected in a tree keyed by a ``::``-joined path of heading
    links, starting at ``'root'``. A heading deeper than the previous one
    extends the current path; otherwise the path is truncated to the
    heading's depth before the link is appended. Non-heading elements are
    attached to the current path.

    For every node, the first element of each ancestor node (excluding the
    root) is prepended, the elements are rendered with ``element2text`` and
    joined by blank lines, and the result is split by ``get_chunk_texts``.

    Args:
        elements: Iterable of ``unstructured`` elements of one page.

    Returns:
        list[dict]: One dict per chunk with keys ``url`` (link of the node's
        heading), ``text`` (the chunk) and ``content`` (the node's full text).
    """
    chunk_tree = defaultdict(list)
    current_depth = -1
    current_path = 'root'
    for element in elements:
        title_data = get_title_data(element)
        if title_data:
            link = title_data['link']
            depth = title_data['category_depth']
            if depth > current_depth:
                current_path = current_path + "::" + link
            else:
                ancestors = current_path.split("::")[:depth + 1]
                current_path = '::'.join(ancestors + [link])
            current_depth = depth
        # Headings and body elements alike are filed under the current path.
        chunk_tree[current_path].append(element)

    chunks = []
    for node_path, node_elements in chunk_tree.items():
        nodes = node_path.split("::")

        # Collect the leading element of every ancestor below the root.
        parent_elements = []
        for level in range(2, len(nodes)):
            ancestor_elements = chunk_tree["::".join(nodes[:level])]
            if ancestor_elements and ancestor_elements[0]:
                parent_elements.append(ancestor_elements[0])

        content = '\n\n'.join(
            element2text(item) for item in [*parent_elements, *node_elements]
        )
        # The url field is used to save the jump link
        # The text field is used for vector search
        # The content field is used to submit to LLM for answer
        chunks.extend(
            {"url": nodes[-1], 'text': chunk_text, 'content': content}
            for chunk_text in get_chunk_texts(content)
        )
    return chunks

Define a chunking model and add a Listener to listen to data and chunk webpages.

In [8]:
from superduperdb import Model, Listener, Schema


# Wrap get_chunks as a superduperdb model identified as 'chunk'.
# NOTE(review): flatten=True presumably stores each dict returned by
# get_chunks as its own output record, and document_embedded=False presumably
# writes outputs to a separate collection instead of the source document —
# confirm against the superduperdb Model documentation.
chunk_model = Model(
    identifier='chunk',
    object=get_chunks,
    flatten=True,
    model_update_kwargs={"document_embedded": False},
    output_schema=Schema(identifier="myschema", fields={"text": "string"}),
)

# Attach the model to the 'pages' collection: it is applied to the
# "elements" field of the documents selected by the query below.
db.add(
    Listener(
        model=chunk_model,
        select=Collection('pages').find(),
        key="elements",
    )
)

74it [00:00, 33500.11it/s]


[32m 2024-Feb-28 15:00:40.10[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.components.model[0m:[36m477 [0m | [1mAdding 74 model outputs to `db`[0m


([None],
 Listener(identifier='chunk/elements', key='elements', model=Model(identifier='chunk', encoder=None, output_schema=Schema(identifier='myschema', fields={'text': 'string', '_fold': FieldType(identifier='String')}), flatten=True, preprocess=None, postprocess=None, collate_fn=None, batch_predict=False, takes_context=False, metrics=(), model_update_kwargs={'document_embedded': False}, validation_sets=None, predict_X=None, predict_select=None, predict_max_chunk_size=None, predict_kwargs=None, object=<Artifact artifact=<function get_chunks at 0x2956a37f0> serializer=dill>, model_to_device_method=None, metric_values={}, predict_method=None, serializer='dill', device='cpu', preferred_devices=('cuda', 'mps', 'cpu'), training_configuration=None, train_X=None, train_y=None, train_select=None), select=<superduperdb.backends.mongodb.query.MongoCompoundSelect[
     [92m[1mpages.find({'_id': "{'$in': '[65deda17b9b5b0b929a00663, 65deda17b9b5b0b929a00664, 65deda17b9b5b0b929a00665, 65deda17b9

## Building Vector Search Feature Using OpenAIEmbedding Model

In [9]:
from superduperdb.ext.openai import OpenAIEmbedding
from superduperdb.base.artifact import Artifact
from tqdm import tqdm
def _predict(self, X, one: bool = False, **kwargs):
    """Replacement for ``OpenAIEmbedding._predict`` with batched requests.

    Args:
        X: A single input (when it is a ``str`` or ``one`` is true) or a
            sequence of inputs.
        one: Treat ``X`` as a single input even if it is not a string.
        **kwargs: ``batch_size`` (default 100) is consumed here; the rest is
            forwarded to ``self._predict_a_batch``.

    Returns:
        The result of ``self._predict_one`` for a single input, otherwise a
        list with the concatenated batch results.
    """
    has_preprocess = isinstance(self.preprocess, Artifact)

    if one or isinstance(X, str):
        single_input = self.preprocess.artifact(X) if has_preprocess else X
        return self._predict_one(single_input)

    if has_preprocess:
        X = [self.preprocess.artifact(item) for item in X]

    batch_size = kwargs.pop("batch_size", 100)
    results = []
    for offset in tqdm(range(0, len(X), batch_size)):
        batch = X[offset : offset + batch_size]
        results.extend(self._predict_a_batch(batch, **kwargs))
    return results


# Monkeypatch: every OpenAIEmbedding instance now uses the batched version.
OpenAIEmbedding._predict = _predict

from superduperdb.ext.openai import OpenAIEmbedding
from superduperdb.base.artifact import Artifact
from superduperdb import VectorIndex

def preprocess(x):
    """Pull the chunk text out of a model-output dict; pass other inputs through.

    For a dict, the value stored under the greatest key (in sorted order) is
    taken as the chunk and its ``"text"`` entry is returned.
    """
    if not isinstance(x, dict):
        return x
    # For model chains, the logic of this key needs to be optimized.
    last_key = sorted(x)[-1]
    return x[last_key]["text"]

# Create an instance of the OpenAIEmbedding model with the specified identifier ('text-embedding-ada-002')
# `preprocess` is wrapped in an Artifact so the patched `_predict` applies it
# to each input before requesting embeddings.
model = OpenAIEmbedding(
    identifier='text-embedding-ada-002',
    model="text-embedding-ada-002",
    preprocess=Artifact(preprocess),
)


# Register a vector index named 'vector_index' whose vectors are produced by
# running the embedding model over the chunk outputs collection.
# NOTE(review): max_chunk_size=64 presumably limits how many documents are
# processed per prediction round — confirm against the superduperdb docs.
db.add(
    VectorIndex(
        identifier='vector_index',
        indexing_listener=Listener(
            select=Collection('_outputs.elements.chunk').find(),
            key='_outputs.elements.chunk',  # Key for the documents
            model=model,  # Specify the model for processing
            predict_kwargs={"max_chunk_size": 64},
        ),
    )
)


[2024-02-28 15:00:41] httpx INFO HTTP Request: GET https://api.openai.com/v1/models "HTTP/1.1 200 OK"
983it [00:00, 137506.70it/s]


[32m 2024-Feb-28 15:00:41.36[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.components.model[0m:[36m417 [0m | [1mComputing chunk 0/15[0m


  0%|                                                                                                                                                          | 0/1 [00:00<?, ?it/s][2024-02-28 15:00:42] httpx INFO HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.75s/it]


[32m 2024-Feb-28 15:00:43.11[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.components.model[0m:[36m477 [0m | [1mAdding 64 model outputs to `db`[0m
[32m 2024-Feb-28 15:00:43.20[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.components.model[0m:[36m417 [0m | [1mComputing chunk 1/15[0m


  0%|                                                                                                                                                          | 0/1 [00:00<?, ?it/s][2024-02-28 15:00:43] httpx INFO HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.19s/it]


[32m 2024-Feb-28 15:00:44.41[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.components.model[0m:[36m477 [0m | [1mAdding 64 model outputs to `db`[0m
[32m 2024-Feb-28 15:00:44.48[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.components.model[0m:[36m417 [0m | [1mComputing chunk 2/15[0m


  0%|                                                                                                                                                          | 0/1 [00:00<?, ?it/s][2024-02-28 15:00:44] httpx INFO HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.50it/s]


[32m 2024-Feb-28 15:00:45.15[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.components.model[0m:[36m477 [0m | [1mAdding 64 model outputs to `db`[0m
[32m 2024-Feb-28 15:00:45.25[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.components.model[0m:[36m417 [0m | [1mComputing chunk 3/15[0m


  0%|                                                                                                                                                          | 0/1 [00:00<?, ?it/s][2024-02-28 15:00:45] httpx INFO HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.58it/s]


[32m 2024-Feb-28 15:00:45.89[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.components.model[0m:[36m477 [0m | [1mAdding 64 model outputs to `db`[0m
[32m 2024-Feb-28 15:00:45.96[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.components.model[0m:[36m417 [0m | [1mComputing chunk 4/15[0m


  0%|                                                                                                                                                          | 0/1 [00:00<?, ?it/s][2024-02-28 15:00:46] httpx INFO HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.48it/s]


[32m 2024-Feb-28 15:00:46.64[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.components.model[0m:[36m477 [0m | [1mAdding 64 model outputs to `db`[0m
[32m 2024-Feb-28 15:00:46.77[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.components.model[0m:[36m417 [0m | [1mComputing chunk 5/15[0m


  0%|                                                                                                                                                          | 0/1 [00:00<?, ?it/s][2024-02-28 15:00:47] httpx INFO HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.36it/s]


[32m 2024-Feb-28 15:00:47.52[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.components.model[0m:[36m477 [0m | [1mAdding 64 model outputs to `db`[0m
[32m 2024-Feb-28 15:00:47.58[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.components.model[0m:[36m417 [0m | [1mComputing chunk 6/15[0m


  0%|                                                                                                                                                          | 0/1 [00:00<?, ?it/s][2024-02-28 15:00:48] httpx INFO HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.35it/s]


[32m 2024-Feb-28 15:00:48.33[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.components.model[0m:[36m477 [0m | [1mAdding 64 model outputs to `db`[0m
[32m 2024-Feb-28 15:00:48.39[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.components.model[0m:[36m417 [0m | [1mComputing chunk 7/15[0m


  0%|                                                                                                                                                          | 0/1 [00:00<?, ?it/s][2024-02-28 15:00:48] httpx INFO HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.25it/s]


[32m 2024-Feb-28 15:00:49.20[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.components.model[0m:[36m477 [0m | [1mAdding 64 model outputs to `db`[0m
[32m 2024-Feb-28 15:00:49.26[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.components.model[0m:[36m417 [0m | [1mComputing chunk 8/15[0m


  0%|                                                                                                                                                          | 0/1 [00:00<?, ?it/s][2024-02-28 15:00:49] httpx INFO HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.31it/s]


[32m 2024-Feb-28 15:00:50.03[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.components.model[0m:[36m477 [0m | [1mAdding 64 model outputs to `db`[0m
[32m 2024-Feb-28 15:00:50.10[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.components.model[0m:[36m417 [0m | [1mComputing chunk 9/15[0m


  0%|                                                                                                                                                          | 0/1 [00:00<?, ?it/s][2024-02-28 15:00:50] httpx INFO HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.11it/s]


[32m 2024-Feb-28 15:00:51.01[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.components.model[0m:[36m477 [0m | [1mAdding 64 model outputs to `db`[0m
[32m 2024-Feb-28 15:00:51.07[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.components.model[0m:[36m417 [0m | [1mComputing chunk 10/15[0m


  0%|                                                                                                                                                          | 0/1 [00:00<?, ?it/s][2024-02-28 15:00:51] httpx INFO HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.12it/s]


[32m 2024-Feb-28 15:00:51.97[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.components.model[0m:[36m477 [0m | [1mAdding 64 model outputs to `db`[0m
[32m 2024-Feb-28 15:00:52.06[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.components.model[0m:[36m417 [0m | [1mComputing chunk 11/15[0m


  0%|                                                                                                                                                          | 0/1 [00:00<?, ?it/s][2024-02-28 15:00:52] httpx INFO HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.54it/s]


[32m 2024-Feb-28 15:00:52.72[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.components.model[0m:[36m477 [0m | [1mAdding 64 model outputs to `db`[0m
[32m 2024-Feb-28 15:00:52.80[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.components.model[0m:[36m417 [0m | [1mComputing chunk 12/15[0m


  0%|                                                                                                                                                          | 0/1 [00:00<?, ?it/s][2024-02-28 15:00:53] httpx INFO HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.31s/it]


[32m 2024-Feb-28 15:00:54.12[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.components.model[0m:[36m477 [0m | [1mAdding 64 model outputs to `db`[0m
[32m 2024-Feb-28 15:00:54.18[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.components.model[0m:[36m417 [0m | [1mComputing chunk 13/15[0m


  0%|                                                                                                                                                          | 0/1 [00:00<?, ?it/s][2024-02-28 15:00:54] httpx INFO HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.30it/s]


[32m 2024-Feb-28 15:00:54.96[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.components.model[0m:[36m477 [0m | [1mAdding 64 model outputs to `db`[0m
[32m 2024-Feb-28 15:00:55.02[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.components.model[0m:[36m417 [0m | [1mComputing chunk 14/15[0m


  0%|                                                                                                                                                          | 0/1 [00:00<?, ?it/s][2024-02-28 15:00:55] httpx INFO HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.10it/s]


[32m 2024-Feb-28 15:00:55.94[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.components.model[0m:[36m477 [0m | [1mAdding 64 model outputs to `db`[0m
[32m 2024-Feb-28 15:00:56.01[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.components.model[0m:[36m417 [0m | [1mComputing chunk 15/15[0m


  0%|                                                                                                                                                          | 0/1 [00:00<?, ?it/s][2024-02-28 15:00:56] httpx INFO HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.13it/s]

[32m 2024-Feb-28 15:00:56.49[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.components.model[0m:[36m477 [0m | [1mAdding 23 model outputs to `db`[0m





([None],
 VectorIndex(identifier='vector_index', indexing_listener=Listener(identifier='text-embedding-ada-002/elements', key='_outputs.elements.chunk', model=OpenAIEmbedding(encoder=Encoder(identifier='vector[1536]', decoder=None, encoder=None, shape=(1536,), load_hybrid=True), output_schema=None, flatten=False, preprocess=<Artifact artifact=<function preprocess at 0x2960ed2d0> serializer=dill>, postprocess=None, collate_fn=None, batch_predict=False, takes_context=False, metrics=(), model_update_kwargs={}, validation_sets=None, predict_X=None, predict_select=None, predict_max_chunk_size=None, predict_kwargs=None, identifier='text-embedding-ada-002', model='text-embedding-ada-002', client_kwargs={}, shape=(1536,)), select=<superduperdb.backends.mongodb.query.MongoCompoundSelect[
     [92m[1m_outputs.elements.chunk.elements.chunk.find({'_id': "{'$in': '[65deda18b9b5b0b929a00a73, 65deda18b9b5b0b929a00a74, 65deda18b9b5b0b929a00a75, 65deda18b9b5b0b929a00a76, 65deda18b9b5b0b929a00a77, 65d

Define a function for vector search.

# Create vector search and Chatbot applications

In [10]:
# Reconnect to the same MongoDB database used above; nothing is dropped here.
db = superduper("mongodb://127.0.0.1:27017/intel-geti")

[32m 2024-Feb-28 15:00:56.59[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.base.build[0m:[36m61  [0m | [1mData Client is ready. MongoClient(host=['127.0.0.1:27017'], document_class=dict, tz_aware=False, connect=True, serverselectiontimeoutms=5000)[0m
[32m 2024-Feb-28 15:00:56.59[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.base.build[0m:[36m36  [0m | [1mConnecting to Metadata Client with engine:  MongoClient(host=['127.0.0.1:27017'], document_class=dict, tz_aware=False, connect=True, serverselectiontimeoutms=5000)[0m
[32m 2024-Feb-28 15:00:56.59[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.base.build[0m:[36m144 [0m | [1mConnecting to compute client: local[0m
[32m 2024-Feb-28 15:00:56.59[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52

In [11]:
def vector_search(db, query, top_k=5):
    """Print the chunks most similar to ``query`` together with their source URLs.

    Args:
        db: Datalayer against which the vector search and the page lookups
            are executed.
        query: Text to search for through the ``"vector_index"`` vector index.
        top_k: Number of nearest chunks to request (forwarded as ``n``).

    Returns:
        None. For every hit a separator line, the chunk URL with its score,
        and the chunk content are printed.
    """
    logging.info(f"Vector search query: {query}")
    collection = Collection('_outputs.elements.chunk')
    outs = db.execute(
        collection.like(
            Document({"_outputs.elements.chunk": query}),
            vector_index="vector_index",
            n=top_k,
        ).find({})
    )
    if outs:
        # Show the best-scoring hits first.
        outs = sorted(outs, key=lambda x: x.content["score"], reverse=True)
    for out in outs:
        print("-" * 20, '\n')
        # Fetch the chunk output once and reuse it for both the URL and the content.
        data = out.outputs("elements", 'chunk')

        # `_source` is used as the `_id` of the originating document in the `pages` collection.
        source = out.content['_source']
        source_url = Collection('pages').find_one({"_id": source}).execute(db)['url']
        # Full link = page URL followed by the chunk's own `url` value.
        url = source_url + data['url']
        print(url, out['score'])
        print(data["content"])

In [12]:
# Try the retrieval step on its own with a sample question.
sample_question = "What parameters does the DeployedModel class have?"
vector_search(db, sample_question)

[2024-02-28 15:00:56] root INFO Vector search query: What parameters does the DeployedModel class have?
[2024-02-28 15:00:57] httpx INFO HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


[32m 2024-Feb-28 15:00:57.39[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.base.datalayer[0m:[36m154 [0m | [1mloading of vectors of vector-index: 'vector_index'[0m
[32m 2024-Feb-28 15:00:57.39[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m31cafef7-ff65-4fb9-a076-ac55aac52f8c[0m| [36msuperduperdb.base.datalayer[0m:[36m170 [0m | [1m<superduperdb.backends.mongodb.query.MongoCompoundSelect[
    [92m[1m_outputs.elements.chunk.elements.chunk.find({}, {'_outputs.elements.text-embedding-ada-002.0': '1', '_outputs.elements.text-embedding-ada-002/0': '1', '_id': '1'})[0m
] object at 0x297fb9f30>[0m


Loading vectors into vector-table...: 983it [00:00, 1124.06it/s]

-------------------- 

https://openvinotoolkit.github.io/geti-sdk/geti_sdk.data_models.html#deployment-related-entities 0.8028081338966921
# geti_sdk.data_models

## Module contents

### Deployment-related entities
-------------------- 

https://docs.geti.intel.com/on-prem/1.8/guide/deployments/deployments.html#deployments 0.8019525830346235
# Deployments

Important

The expected color code is RGB for IntelÂ® Getiâ¢ exportable code and deployment.

The Deployments screen allows users to export deployment code for the trained models.

Once you have tested and optimized the model, you are ready to download and deploy your solution. To download the code, click on Select model for deployment. In the dialog box, choose the model architecture and its version as well as the model optimization if available. In the case of a task chain project, you will need to select two models through a wizard. Upon your selection, click Download.
-------------------- 

https://openvinotoolkit.github.io/g




## Building Document Q&A Functionality Using the ChatGPT Model

In [13]:
from superduperdb.ext.openai import OpenAIChatCompletion

# Prompt template for the chat model. `{context}` is the placeholder for the
# retrieved documents; the template ends with "Question: ".
prompt = """
As an Intel GETI assistant, based on the provided documents and the question, answer the question.
If the document does not provide an answer, offer a safe response without fabricating an answer.

Documents:
{context}

Question: """

# Wrap the OpenAI chat model with the prompt and register it with the datalayer.
chat_model_id = 'gpt-3.5-turbo'
llm = OpenAIChatCompletion(identifier=chat_model_id, prompt=prompt)
db.add(llm)

# List the models known to the datalayer to confirm the registration.
registered_models = db.show('model')
print(registered_models)

['chunk', 'gpt-3.5-turbo', 'text-embedding-ada-002']


In [14]:
def qa(db, query, vector_search_top_k=5, model_name='gpt-3.5-turbo'):
    """Answer ``query`` with a registered chat model, using retrieved chunks as context.

    Args:
        db: Datalayer on which the prediction is run.
        query: The user's question. It is used both as the model input and as
            the text for the vector search that selects the context.
        vector_search_top_k: Number of chunks to retrieve as context
            (forwarded as ``n`` to the vector search).
        model_name: Identifier of the model to call via ``db.predict``.
            Defaults to ``'gpt-3.5-turbo'``, the model this function
            previously always used.

    Returns:
        A ``(output, sources)`` tuple as returned by ``db.predict``; when
        ``sources`` is truthy it is sorted by descending ``score``.
    """
    logging.info(f"QA query: {query}")
    collection = Collection("_outputs.elements.chunk")
    output, sources = db.predict(
        model_name=model_name,
        input=query,
        context_select=collection.like(
            Document({"_outputs.elements.chunk": query}),
            vector_index="vector_index",
            n=vector_search_top_k,
        ).find({}),
        # Path of the chunk text that is injected into the prompt as context.
        context_key="_outputs.elements.chunk.0.content",
    )
    if sources:
        # Most relevant source first.
        sources = sorted(sources, key=lambda x: x.content["score"], reverse=True)
    return output, sources


In [15]:
from IPython.display import Markdown, display

# Ask a sample question and render the model's answer as Markdown.
output, sources = qa(db, "What parameters does the DeployedModel class have?")
display(Markdown(output.content))

# Print the link of every chunk that was supplied to the model as context:
# the page URL (looked up by the chunk's `_source` id) followed by the chunk's own `url`.
pages = Collection('pages')
for source in sources:
    page = pages.find_one({"_id": source.content['_source']}).execute(db)
    chunk = source.outputs("elements", 'chunk')
    print(page['url'] + chunk['url'])

[2024-02-28 15:00:58] root INFO QA query: What parameters does the DeployedModel class have?
[2024-02-28 15:00:58] httpx INFO HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[2024-02-28 15:01:02] httpx INFO HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


The DeployedModel class in the geti_sdk.deployment module has the following parameters:

- Name
- FPS throughput
- Latency
- Precision
- Creation date
- Size
- Target device
- Target device type
- Previous revision id
- Previous trained revision id
- Score
- Performance
- ID
- Label schema in sync
- Model format
- Has XAI head
- Model status
- Optimization methods
- Optimization objectives
- Optimization type
- Version
- Configurations
- Hyper parameters

These parameters are used to represent an Intel® Geti™ model that has been deployed for inference.

https://openvinotoolkit.github.io/geti-sdk/geti_sdk.data_models.html#deployment-related-entities
https://docs.geti.intel.com/on-prem/1.8/guide/deployments/deployments.html#deployments
https://openvinotoolkit.github.io/geti-sdk/geti_sdk.deployment.html#geti_sdk.deployment.deployed_model.DeployedModel
https://docs.geti.intel.com/on-prem/1.8/guide/release-notes/1.8/release-1.8.html#new-default-model-deployment
https://openvinotoolkit.github.io/geti-sdk/geti_sdk.deployment.data_models.html#module-geti_sdk.deployment.data_models


In [18]:
# Inspect the raw text of the last source returned by `qa`. Index `sources`
# explicitly instead of depending on a loop variable left over from another cell.
sources[-1].outputs("elements", 'chunk')["text"]

'# geti_sdk.deployment.data_models package\uf0c1\n\n## Module contents\uf0c1'