In [1]:
import json, os, time
import copy
import pprint

# OPENAI
import openai
from openai import OpenAI

# LANGCHAIN
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings
from langchain.prompts.prompt import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.vectorstores import Chroma
from langchain_core.messages import AIMessage, HumanMessage
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_pinecone import PineconeVectorStore

# PINECONE
import pinecone
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

# GENERAL
from dotenv import find_dotenv, load_dotenv
from rich.console import Console

In [2]:
# Configuration cell: load environment variables, read the API settings and
# verify that the OpenAI key works. Secrets are never printed in full because
# notebook outputs are saved (and shared) together with the code.
console = Console()


def _mask_secret(value, visible=4):
    """Return ``value`` with everything but the last ``visible`` characters hidden."""
    if not value:
        return "<unset>"
    if len(value) <= visible:
        return "*" * len(value)
    return "*" * 8 + value[-visible:]


# load_dotenv() is called once; its return value says whether any variable was set.
if load_dotenv():
    print("Success: .env file found with some environment variables")
else:
    print(
        "Caution: No environment variables found. Please create .env file in the root directory or add environment variables in the .env file"
    )

# .get() (instead of []) so a missing OpenAI key reaches the friendly message
# in the else-branch below rather than raising KeyError here.
api_key = os.environ.get("OPENAI_API_KEY")
# The Pinecone settings are required by the following cells: fail fast if absent.
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]
PINECONE_ENV = os.environ["PINECONE_ENV"]
PINECONE_INDEX = os.environ["PINECONE_INDEX"]
PINCONE_INDEX = PINECONE_INDEX  # misspelled alias kept because later cells reference it

print(f"{_mask_secret(PINECONE_API_KEY)} | {PINECONE_ENV} | {PINCONE_INDEX}")

if api_key:
    client = OpenAI()
    try:
        # Cheap authenticated request used only to validate the key
        client.models.list()
        print("OPENAI_API_KEY is set and is valid:", _mask_secret(api_key))
    # Most specific exceptions first: APIConnectionError and RateLimitError are
    # subclasses of APIError, so catching APIError first would hide them.
    except openai.RateLimitError as e:
        print(f"OpenAI API request exceeded rate limit: {e}")
    except openai.APIConnectionError as e:
        print(f"Failed to connect to OpenAI API: {e}")
    except openai.APIError as e:
        print(f"OpenAI API returned an API Error: {e}")
else:
    client = None
    print("Please set you OpenAI API key as an environment variable OPENAI_API_KEY")

Success: .env file found with some environment variables
[REDACTED] | us-east-1 | test
OPENAI_API_KEY is set and is valid: [REDACTED]


In [3]:
# LangChain OpenAI embeddings wrapper created with its default settings.
# It is not referenced by any other cell visible in this notebook.
embedding_function = OpenAIEmbeddings()

In [4]:
# Initialize the Pinecone client (`Pinecone` is the PineconeGRPC class aliased in
# the import cell) with the key and environment read from the environment variables.
pc = Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)

In [5]:
# Name of the Pinecone index to use, taken from the PINECONE_INDEX environment variable.
index_name = PINCONE_INDEX

In [None]:
# Show the names of the indexes reported by the Pinecone client.
print(pc.list_indexes().names())

In [None]:
# Show the statistics Pinecone reports for the selected index.
print(pc.Index(index_name).describe_index_stats())

In [4]:
# OpenAI client used by get_embedding below (the configuration cell also creates one).
client = OpenAI()


def get_embedding(text, model="text-embedding-3-small"):
    """Request an embedding for ``text`` from the OpenAI embeddings endpoint.

    Args:
        text: The string to embed; it is sent as a single-item input list.
        model: Name of the embedding model to use.

    Returns:
        The ``embedding`` attribute of the first item in the response, or
        ``None`` when the request (or reading the response) raised.
    """
    try:
        response = client.embeddings.create(input=[text], model=model)
        return response.data[0].embedding
    except Exception as err:
        # Best effort: report the failure and let the caller skip this text
        print(f"Embedding failed: {text} | {err}")
        return None

In [9]:
# Poll Pinecone once per second until the index reports itself as ready,
# then open a handle to it.
index_name = PINCONE_INDEX
while True:
    if pc.describe_index(index_name).status["ready"]:
        break
    time.sleep(1)
index = pc.Index(index_name)

In [5]:
# Load the parsed-PDF elements from their JSON file into `data`.
# filename = "TMCB_43_2256640.pdf.json"
filename = "mindset.pdf.json"
persist_directory = "./data/db/chroma/"
data_json_directory = "./pdf_output/"

# Build the path with os.path.join so it stays valid even if the directory
# setting is later written without a trailing slash.
file = os.path.join(data_json_directory, filename)

console.print(f"File: {file}")

# Read the file as UTF-8 explicitly: without `encoding`, open() falls back to
# the platform default, which can fail on non-ASCII text.
with open(file, encoding="utf-8") as f:
    data = json.load(f)

num_el = len(data)
print(f"{num_el} elements to load")

374 elements to load


In [6]:
import sqlite3
import numpy as np

In [7]:
# Open a connection to the SQLite database file "ragdb" (relative to the working directory)
conn = sqlite3.connect("ragdb")

# Cursor object used by the following cells to execute SQL commands

cursor = conn.cursor()

In [8]:
# Create the table for vector data (does nothing if it already exists):
#   id       - integer primary key
#   doc_id   - text column
#   metadata - text column
#   vector   - BLOB, required (NOT NULL)
cursor.execute(
    """
CREATE TABLE IF NOT EXISTS vectors (
    id INTEGER PRIMARY KEY,
    doc_id text,
    metadata text,
    vector BLOB NOT NULL
)
"""
)

<sqlite3.Cursor at 0x20c1cbb6dc0>

In [None]:
# Fetch the next row from the cursor's current result set.
# NOTE(review): no SELECT is executed before this in the visible cells, so there
# appears to be nothing to fetch here - confirm which query was intended.
res = cursor.fetchone()  # finding the top one

In [None]:
# Decode the first column of the fetched row as a float64 NumPy array.
# NOTE(review): assumes res[0] is the `vector` BLOB and that it was written as
# float64 bytes - confirm against the query and the insert code.
np.frombuffer(res[0], dtype=np.float64)  # most similar vector

In [None]:
# Commit pending changes and close the database connection.
# NOTE(review): the insert loop in the next cell still uses `cursor`; when the
# notebook is run top to bottom this close happens first - consider moving this
# cell to the end of the notebook.
conn.commit()
conn.close()

In [12]:
# Embed the first 10 parsed elements and store each one as a row of the SQLite
# `vectors` table (doc_id, JSON metadata, embedding bytes).
# Requires `conn` / `cursor` to still be open: an earlier cell calls conn.close(),
# so run this cell before that one (or reconnect first).
for i, element in enumerate(data[:10]):  # slicing tolerates files with fewer than 10 elements
    metadata = dict(element["metadata"])
    el_type = element["type"]
    page_number = metadata["page_number"]
    doc_id = element["element_id"]
    content = element["text"]
    # Expected to be a sequence of floats, or None when the embedding call failed
    embed = get_embedding(content)

    meta = {
        "doc_id": doc_id,
        "filename": filename,
        "page_number": page_number,
        "type": el_type,
        "content": content,
    }
    print(f"Upserting: {i} | {doc_id}")

    if embed is None:
        # get_embedding already reported the failure; there is nothing to store
        continue

    try:
        # Serialize as float64 so the bytes match the dtype the read-back cell
        # passes to np.frombuffer.
        vector_blob = np.asarray(embed, dtype=np.float64).tobytes()
        cursor.execute(
            "INSERT INTO vectors (doc_id, metadata, vector) VALUES (?, ?, ?)",
            (doc_id, json.dumps(meta), sqlite3.Binary(vector_blob)),
        )
    except Exception as e:
        print(f"Upsert failed: {doc_id}\n{e}")

# Without a commit the inserted rows are not persisted to the database file
conn.commit()

Upserting: 0 | 5ca69274a5fdd7bed6a160e7e686658e
Upserting: 1 | 5739b692a97926a35a1675ef47a46733
Upserting: 2 | b00b7517512ad9ab46c12bdf32d07972
Upserting: 3 | a7e1220813696c0da336d0fdfd619487
Upserting: 4 | fc8d4ac3c55e7ce3e5039298add4f1fd
Upserting: 5 | c671e89b41f287a0b49c6318128fb1ef
Upserting: 6 | c8c7244c09eeefef3cabb747021860a6
Upserting: 7 | a180fa02107492026261de159d6110d1
Upserting: 8 | fc7d0ce80265d1516a8b8da8fdfe0bd7
Upserting: 9 | 8efbc87ad928fc52b9c464f1622616cb
Index after upsert:
{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 0}},
 'total_vector_count': 0}


