## Setup

In [None]:
!pip install matplotlib
!pip install openai
!pip install plotly.express
!pip install scikit-learn
!pip install singlestoredb
!pip install tabulate
!pip install wget

In [None]:
import openai

import pandas as pd
import os
import wget
from ast import literal_eval

In [None]:
# Name of the OpenAI embedding model; search_wikipedia() passes it to get_embedding()
# when embedding a search query.
# NOTE(review): presumably the same model that produced the pre-computed vectors in the
# downloaded dataset — confirm before changing it.
EMBEDDING_MODEL = "text-embedding-ada-002"

## Load Data

In [None]:
embeddings_url = 'https://cdn.openai.com/API/examples/data/vector_database_wikipedia_articles_embedded.zip'

# Local file name of the archive; the extraction cell opens the archive under this name
embeddings_zip = "vector_database_wikipedia_articles_embedded.zip"

# The file is ~700 MB so this will take some time.
# Skip the download when the archive is already on disk: otherwise every re-run of this
# cell fetches the whole file again and wget stores it as an additional, renamed copy.
if not os.path.exists(embeddings_zip):
    wget.download(embeddings_url, out=embeddings_zip)

In [None]:
import zipfile

# Unpack every member of the downloaded archive into the local "data" directory
with zipfile.ZipFile("vector_database_wikipedia_articles_embedded.zip", mode="r") as archive:
    archive.extractall(path="data")

In [None]:
%%time

# Load the extracted CSV into a DataFrame
article_df = pd.read_csv("data/vector_database_wikipedia_articles_embedded.csv")

In [None]:
# Preview the first rows of the loaded DataFrame
article_df.head()

In [None]:
%%time

def _parse_vector(value):
    """Parse `value` with literal_eval when it is still a string; return it unchanged otherwise."""
    return literal_eval(value) if isinstance(value, str) else value


# Read vectors from strings back into a list.
# Only values that are still strings are parsed, so re-running this cell after the columns
# were already converted leaves them as they are instead of failing inside literal_eval.
article_df['title_vector'] = article_df.title_vector.apply(_parse_vector)
article_df['content_vector'] = article_df.content_vector.apply(_parse_vector)

# Set vector_id to be a string
article_df['vector_id'] = article_df['vector_id'].apply(str)

In [None]:
# Print a summary of the DataFrame's columns, including the non-null count of each column
article_df.info(show_counts=True)

## SingleStoreDB

## Create Table

In [None]:
import singlestoredb as s2

# Connection URL for SingleStoreDB. It is taken from the SINGLESTOREDB_URL environment
# variable when that is set, so real credentials do not have to be typed into (and saved
# with) the notebook. Otherwise the placeholder below is used and must be edited first.
connection_url = os.getenv("SINGLESTOREDB_URL", "root:<password>@<host>:3306/openai_demo")

conn = s2.connect(connection_url)

# Cursor used by the later cells that execute SQL
cur = conn.cursor()

In [None]:
# Create the `wikipedia` table. IF NOT EXISTS is part of the statement, so it can be
# executed again when the table is already there.
# The two *_vector columns are BLOBs: the INSERT statement used in this notebook fills
# them with the output of JSON_ARRAY_PACK_F64(...).
stmt = """
    CREATE TABLE IF NOT EXISTS wikipedia (
        id INT PRIMARY KEY,
        url VARCHAR(255),
        title VARCHAR(100),
        text TEXT,
        title_vector BLOB,
        content_vector BLOB,
        vector_id INT
    )
"""

# Run the DDL on the shared cursor
cur.execute(stmt)

## Populate Table

In [None]:
%%time

# Parameterised INSERT for one article; the two embeddings are passed as text and
# handed to the SQL function JSON_ARRAY_PACK_F64 inside the statement
stmt = """
    INSERT INTO wikipedia (
        id,
        url,
        title,
        text,
        title_vector,
        content_vector,
        vector_id
    )
    VALUES (
        %s,
        %s,
        %s,
        %s,
        JSON_ARRAY_PACK_F64(%s),
        JSON_ARRAY_PACK_F64(%s),
        %s
    )
"""

# Number of rows sent with each executemany() call
batch_size = 1000

# NumPy record array of the DataFrame (index left out), so rows can be sliced by position
records = article_df.to_records(index=False)

# Send the records slice by slice; fields are read by position, the two vector fields are
# converted to their string form and the last field to an int
for start in range(0, len(records), batch_size):
    cur.executemany(stmt, [
        (rec[0], rec[1], rec[2], rec[3], str(rec[4]), str(rec[5]), int(rec[6]))
        for rec in records[start:start + batch_size]
    ])

## Search Data

In [None]:
from openai.embeddings_utils import get_embedding

In [None]:
# Hand the API key from the environment to the openai module, and report
# whether the OPENAI_API_KEY variable was found
if os.getenv("OPENAI_API_KEY") is None:
    print ("OPENAI_API_KEY environment variable not found")
else:
    openai.api_key = os.getenv("OPENAI_API_KEY")
    print ("OPENAI_API_KEY is ready")

In [None]:
from typing import Tuple, List

def search_wikipedia(
    query: str,
    column1: str,
    column2: str,
    num_rows: int = 10
) -> Tuple[List[str], List[float]]:
    """Runs a similarity search for `query` over the `wikipedia` table.

    The query is embedded with `get_embedding` using `EMBEDDING_MODEL`. Every row is then
    scored with `DOT_PRODUCT_F64` between that embedding and the vector stored in
    `column2`, and the rows with the highest scores are returned.

    Args:
        query: The query to search for.
        column1: The name of the column to return for each result (e.g. "title").
        column2: The name of the vector column to score against (e.g. "title_vector").
        num_rows: The maximum number of results to return.

    Returns:
        A tuple `(values, scores)` of two lists of equal length, ordered by descending
        score: `values` holds the `column1` value of each result and `scores` the
        corresponding dot-product score.

    Raises:
        ValueError: If `column1` or `column2` is not a plain identifier.
    """

    # Column names cannot be passed as query parameters, so they are formatted into the
    # SQL text below. Accept only plain identifiers so that nothing but a column name can
    # end up in the statement, and check before requesting the embedding so that bad
    # input fails without an API call.
    for column in (column1, column2):
        if not (isinstance(column, str) and column.isidentifier()):
            raise ValueError("Invalid column name: {!r}".format(column))

    # Get the embedding of the query
    embedding = get_embedding(query, EMBEDDING_MODEL)

    # Create the SQL statement
    stmt = """
        SELECT
            {column1},
            DOT_PRODUCT_F64(JSON_ARRAY_PACK_F64(%s), {column2}) AS score
        FROM wikipedia
        ORDER BY score DESC
        LIMIT %s
    """.format(column1=column1, column2=column2)

    # Execute the SQL statement; the embedding is sent as text and packed by the statement
    cur.execute(stmt, [str(embedding), num_rows])

    # Get the results
    results = cur.fetchall()

    # Separate the results into two lists
    values = [row[0] for row in results]
    scores = [row[1] for row in results]

    # Return the results
    return values, scores

In [None]:
%%time

# Top 5 titles for the query, scored against the title vectors
values1, scores1 = search_wikipedia(
    "modern art in Europe", column1="title", column2="title_vector", num_rows=5
)

In [None]:
from tabulate import tabulate

# Build one (rank, title, score) row per result, with ranks starting at 1
table_data1 = [
    (rank, title, score)
    for rank, (title, score) in enumerate(zip(values1, scores1), start=1)
]

# Render the rows as a text table and print it
table1 = tabulate(table_data1, headers=["Rank", "Title", "Score"])
print(table1)

In [None]:
%%time

# Top 5 article texts for the query, scored against the content vectors
values2, scores2 = search_wikipedia(
    "Famous battles in Scottish history", column1="text", column2="content_vector", num_rows=5
)

In [None]:
# Build one (rank, text, score) row per result, with ranks starting at 1;
# only the first 50 characters of each text are kept
table_data2 = [
    (rank, text[:50], score)
    for rank, (text, score) in enumerate(zip(values2, scores2), start=1)
]

# Render the rows as a text table and print it
table2 = tabulate(table_data2, headers=["Rank", "Text", "Score"])
print(table2)

## License

Small code sections in this notebook are from the [Using Vector Databases for Embeddings Search](https://github.com/openai/openai-cookbook/blob/main/examples/vector_databases/Using_vector_databases_for_embeddings_search.ipynb) notebook from OpenAI.

MIT License

Copyright (c) 2023 OpenAI

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.