In [1]:
import copy
import json, os, time
import pprint

# OPENAI
import openai
from openai import OpenAI

# LANGCHAIN
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings
from langchain.prompts.prompt import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.vectorstores import Chroma
from langchain_core.messages import AIMessage, HumanMessage
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_pinecone import PineconeVectorStore

# PINECONE
import pinecone
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

# GENERAL
from dotenv import find_dotenv, load_dotenv
from rich.console import Console

In [None]:
# Rich console used for formatted output (console.print) in later cells.
console = Console()

In [None]:
# Load variables from a .env file once. load_dotenv() reports whether any
# variable was found, so a single call both loads and tells us the outcome.
if load_dotenv():
    print("Success: .env file found with some environment variables")
else:
    print(
        "Caution: No environment variables found. Please create .env file in the root directory or add environment variables in the .env file"
    )

# Required settings: indexing os.environ raises KeyError right away if one is missing.
api_key = os.environ["OPENAI_API_KEY"]
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]
PINECONE_ENV = os.environ["PINECONE_ENV"]
PINCONE_INDEX = os.environ["PINECONE_INDEX"]

# Never echo secret values into notebook output (outputs get saved and shared);
# report only that the key was loaded, plus the non-secret settings.
print(f"PINECONE_API_KEY loaded | {PINECONE_ENV} | {PINCONE_INDEX}")
client = OpenAI()


if api_key:
    try:
        # Cheap authenticated request used purely to validate the key.
        client.models.list()
        print("OPENAI_API_KEY is set and is valid")
    # Most specific handlers first: APIConnectionError and RateLimitError are
    # subclasses of APIError, so a leading APIError clause would shadow them.
    except openai.APIConnectionError as e:
        print(f"Failed to connect to OpenAI API: {e}")
    except openai.RateLimitError as e:
        print(f"OpenAI API request exceeded rate limit: {e}")
    except openai.APIError as e:
        print(f"OpenAI API returned an API Error: {e}")
else:
    print("Please set you OpenAI API key as an environment variable OPENAI_API_KEY")

In [55]:
# OpenAI embeddings wrapper constructed with its default settings.
# NOTE(review): not referenced by any other cell visible here — confirm it is still needed.
embedding_function = OpenAIEmbeddings()

In [56]:
# Initialize a Pinecone client (PineconeGRPC, imported as Pinecone) from the
# credentials read from the environment.
# NOTE(review): confirm the installed client version still accepts `environment`.
pc = Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)

In [57]:
# Name of the Pinecone index used by the following cells (value of PINECONE_INDEX).
index_name = PINCONE_INDEX

In [None]:
# Print the names of the indexes returned by the Pinecone client.
print(pc.list_indexes().names())

In [None]:
# Print the current statistics of the target index (state before the upsert cell runs).
print(pc.Index(index_name).describe_index_stats())

In [60]:
# OpenAI client used by get_embedding below. No api_key is passed here, so the
# key presumably comes from the OPENAI_API_KEY environment variable — verify.
client = OpenAI()


def get_embedding(text, model="text-embedding-3-small"):
    """Return the OpenAI embedding for ``text``, or ``None`` when unavailable.

    Args:
        text: The string to embed.
        model: Name of the OpenAI embedding model to request.

    Returns:
        The ``embedding`` of the first item in the API response, or ``None``
        if ``text`` is empty/``None`` or the request raised an exception.
    """
    # The embeddings endpoint rejects empty input, so skip the network round
    # trip and report the reason directly instead of surfacing an API error.
    if not text:
        print("Embedding skipped: empty text")
        return None

    try:
        response = client.embeddings.create(input=[text], model=model)
        return response.data[0].embedding
    except Exception as e:
        # Deliberately best-effort: failures are reported and signalled with
        # None so that callers can skip the item and carry on.
        print(f"Embedding failed: {text} | {e}")
        return None

In [61]:
# Wait for the index to be ready, polling once per second. The wait is bounded
# so the cell fails with a clear error instead of spinning forever when the
# index never reports ready.
index_name = PINCONE_INDEX
INDEX_READY_TIMEOUT_S = 300
deadline = time.monotonic() + INDEX_READY_TIMEOUT_S
while not pc.describe_index(index_name).status["ready"]:
    if time.monotonic() >= deadline:
        raise TimeoutError(
            f"Index '{index_name}' not ready after {INDEX_READY_TIMEOUT_S} seconds"
        )
    time.sleep(1)
index = pc.Index(index_name)

In [None]:
# Input selection: the JSON file to load and the category it is filed under.
filename = "TMCB_43_2256640.pdf.json"
# filename = "post_ocr.pdf.json"
# filename = "mindset.pdf.json"
category = "nih"

# NOTE(review): persist_directory is not used by any cell visible here.
persist_directory = "./data/db/chroma/"
data_json_directory = f"./pdf_output/{category}/"
file = data_json_directory + filename

console.print(f"File: {file}")

# Read explicitly as UTF-8: without an encoding argument open() falls back to
# the platform's locale encoding, which can mis-decode non-ASCII text.
with open(file, encoding="utf-8") as f:
    data = json.load(f)

num_el = len(data)
print(f"{num_el} elements to load")

In [None]:
# Number of leading elements to embed and upsert in this cell.
MAX_ELEMENTS = 3

# Slicing caps the sample at the data length, so a file with fewer than
# MAX_ELEMENTS entries is processed in full instead of raising IndexError.
for i, element in enumerate(data[:MAX_ELEMENTS]):
    metadata = element["metadata"]
    el_type = element["type"]
    page_number = metadata["page_number"]
    doc_id = element["element_id"]
    content = element["text"]

    # The text is stored alongside the vector so it can be read back from the index.
    meta = {
        "category": category,
        "doc_id": doc_id,
        "filename": filename,
        "page_number": page_number,
        "type": el_type,
        "content": content,
    }

    embed = get_embedding(content)
    if embed is None:
        # get_embedding signals failure with None; there is nothing to upsert.
        print(f"Skipping (no embedding): {i} | {doc_id}")
        continue

    print(f"Upserting: {i} | {doc_id}")
    try:
        index.upsert(
            vectors=[{"id": doc_id, "values": embed, "metadata": meta}],
            namespace="",
        )
    except Exception as e:
        # Best-effort load: report the failure and continue with the next element.
        print(f"Upsert failed: {doc_id}\n{e}")

# See how many vectors have been upserted
print("Index after upsert:")
print(pc.Index(index_name).describe_index_stats())
print("\n")