In [1]:
import json, os, time
import copy


import pprint

# OPENAI
from openai import OpenAI

# LANGCHAIN
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings
from langchain.prompts.prompt import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.vectorstores import Chroma
from langchain_core.messages import AIMessage, HumanMessage
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
import sqlite3
import numpy as np

# GENERAL
from dotenv import find_dotenv, load_dotenv
from rich.console import Console

In [2]:
console = Console()
load_dotenv()
if load_dotenv():
    print("Success: .env file found with some environment variables")
else:
    print(
        "Caution: No environment variables found. Please create .env file in the root directory or add environment variables in the .env file"
    )
api_key = os.environ["OPENAI_API_KEY"]

client = OpenAI()


if api_key:
    try:
        client.models.list()
        print("OPENAI_API_KEY is set and is valid:", api_key)
    except openai.APIError as e:
        print(f"OpenAI API returned an API Error: {e}")
        pass
    except openai.APIConnectionError as e:
        print(f"Failed to connect to OpenAI API: {e}")
        pass
    except openai.RateLimitError as e:
        print(f"OpenAI API request exceeded rate limit: {e}")
        pass

else:
    print("Please set you OpenAI API key as an environment variable OPENAI_API_KEY")

Success: .env file found with some environment variables
OPENAI_API_KEY is set and is valid: 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


In [3]:
embedding_function = OpenAIEmbeddings()

In [4]:
client = OpenAI()


def get_embedding(text, model="text-embedding-3-small"):

    # text = text.replace("\n", " ")

    try:
        embedding = (
            client.embeddings.create(input=[text], model=model).data[0].embedding
        )

    except Exception as e:

        print(f"Embedding failed: {text} | {e}")

        embedding = None

    return embedding

In [5]:
filename = "TMCB_43_2256640.pdf.json"
# filename = "post_ocr.pdf.json"
# filename = "mindset.pdf.json"
category = "nih"
data_json_directory = f"./pdf_output/{category}/"
file = data_json_directory + filename
console.print(f"File: {file}")


with open(file) as f:
    data = json.load(f)


num_el = len(data)


print(f"{num_el} elements to load")

333 elements to load


In [11]:
# create a connection to the SQLite DB
conn = sqlite3.connect("rag.db")
# Create a cursor object to execute SQL Commands
cursor = conn.cursor()
cursor.execute("""drop table if exists vectors;""")
cursor.execute(
    """

CREATE TABLE IF NOT EXISTS vectors (
    id  INTEGER PRIMARY KEY,
    doc_id TEXT  NULL,
    category TEXT  NULL,
    content TEXT  NULL,
    filename TEXT  NULL,
    page_number INTEGER  NULL,
    el_type TEXT  NULL,
    vector BLOB  NULL
)
"""
)

<sqlite3.Cursor at 0x1ce65346ac0>

In [13]:
for i in range(5):
    metadata = dict(data[i]["metadata"])
    el_type = data[i]["type"]
    page_number = metadata["page_number"]
    doc_id = data[i]["element_id"]
    content = data[i]["text"]
    embed = get_embedding(content)
    embed = np.array(embed)

    meta = {
        "category": category,
        "doc_id": doc_id,
        "filename": filename,
        "page_number": page_number,
        "type": el_type,
        "content": content,
    }
    print(f"EMBED: {i} | {doc_id} | {embed}")
    try:
        if embed is None:
            continue
        else:
            print("Doing insert...")
            cursor.execute(
                "INSERT INTO vectors (vector) VALUES (?)",
                (sqlite3.Binary(embed.tobytes()),),
            )

    except Exception as e:
        print("ERROR")
        print(f"Upsert failed: {doc_id}\n{e}")

# See how many vectors have been upserted

EMBED: 0 | f506b2a3fdfda7bd084acfb2fb5ff0f6 | [ 0.01742555  0.00664365  0.00104547 ... -0.00308344 -0.03764317
  0.02657458]
Doing insert...
EMBED: 1 | a5a52e72e96eb4065948f79512a09cac | [ 0.00934763  0.01649228  0.00951015 ... -0.00209163 -0.03021579
  0.03185298]
Doing insert...
EMBED: 2 | 51d4010aff5815b966a7719895572a51 | [ 0.01174418 -0.00392162  0.01552106 ... -0.01641704 -0.01354302
 -0.0034564 ]
Doing insert...
EMBED: 3 | b5cde38640ea1e370f3fbf91604c3f1b | [ 0.01887637  0.02648989 -0.00666541 ...  0.02462944 -0.01804633
  0.01326641]
Doing insert...
EMBED: 4 | 835f90ace46841639cd6f6f1b50ce9fe | [-0.02747788 -0.01412614  0.03217769 ...  0.02335219 -0.00616517
 -0.00279719]
Doing insert...


In [23]:
query_vect = get_embedding("What is the title ")

In [29]:
cursor.execute("SELECT doc_id,vector FROM vectors")
rows = cursor.fetchall()
rows

[('f506b2a3fdfda7bd084acfb2fb5ff0f6',
  b'\x00\x00\x00\xa0\x00\xd8\x91?\x00\x00\x00\x00^6{?\x00\x00\x00`\x05!Q?\x00\x00\x00\x80\xcb\x9d\xac?\x00\x00\x00@\x85\x96\xa4\xbf\x00\x00\x00\xa0S\xa2x?\x00\x00\x00\xe0\xc0\x1d\xa1?\x00\x00\x00\xe0/\x9dq\xbf\x00\x00\x00 \'\xf8\x9d\xbf\x00\x00\x00\x00%\xf3\x93\xbf\x00\x00\x00\xa0\\3\xa5?\x00\x00\x00\x80\xfe\x94n?\x00\x00\x00\xc01\xb6\x92?\x00\x00\x00`\xfbx\x87?\x00\x00\x00 \xac\xe0\x92?\x00\x00\x00\x80\x9a\xd4t?\x00\x00\x00\xa0R\x8b\xab?\x00\x00\x00@0\x9e\xa3\xbf\x00\x00\x00\xe0\xd1\x13\xa4?\x00\x00\x00 \xf3\xe9\xae?\x00\x00\x00\x80x\xd3\xa5?\x00\x00\x00\x005\xe7\x82\xbf\x00\x00\x00\xa0]J\x92\xbf\x00\x00\x00 \xd5D\x84\xbf\x00\x00\x00\x00\xf2\xfb\xa1\xbf\x00\x00\x00 \xd2\xff\\\xbf\x00\x00\x00\x00\nU\x97\xbf\x00\x00\x00\x803\xba\x8a?\x00\x00\x00\x80X\x00\xa1?\x00\x00\x00\xe0pZ\x88\xbf\x00\x00\x00\x00\xe6%\x9e?\x00\x00\x00\xc0\x0e\x9e\xa6\xbf\x00\x00\x00@7\xec\x8c?\x00\x00\x00`t\xa0\xa1\xbf\x00\x00\x00\xe0\xec\xb1\xb0\xbf\x00\x00\x00@\x8c\xe4\x9d?\x0

In [39]:
query_vect = np.array([1.0, 3.2, 2.0, 0.5])

In [None]:
cursor.execute(
    """
    SELECT vector FROM vectors ORDER BY abs(vector - ?) ASC
    """,
    (sqlite3.Binary(query_vect.tobytes()),),
)

In [None]:
embedding = [0.1, 0.2, 0.3, 0.4]
result = db.execute("select vec_length(?)", [serialize_float32(embedding)])

print(result.fetchone()[0])  # 4