In [1]:
import json, os, time
import copy


import pprint

# OPENAI
import openai
from openai import OpenAI

# LANGCHAIN
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings
from langchain.prompts.prompt import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.vectorstores import Chroma
from langchain_core.messages import AIMessage, HumanMessage
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_pinecone import PineconeVectorStore

# PINECONE
import pinecone
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

# GENERAL
from dotenv import find_dotenv, load_dotenv
from rich.console import Console

In [2]:
# Notebook setup: load environment variables, read the required credentials
# and settings, and verify that the OpenAI key works.
console = Console()

# A single load_dotenv() call both loads the .env file and reports whether
# anything was found; there is no need to call it a second time.
if load_dotenv():
    print("Success: .env file found with some environment variables")
else:
    print(
        "Caution: No environment variables found. Please create .env file in the root directory or add environment variables in the .env file"
    )

# os.environ[...] raises KeyError when a variable is missing, so the notebook
# stops here rather than failing later with a less obvious error.
api_key = os.environ["OPENAI_API_KEY"]
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]
PINECONE_ENV = os.environ["PINECONE_ENV"]
# NOTE: the variable name is misspelled ("PINCONE") but later cells refer to
# it under this name, so it is kept as-is.
PINCONE_INDEX = os.environ["PINECONE_INDEX"]

# Never print secret values: cell output is saved inside the notebook file
# and is easily committed or shared. Only non-secret settings are shown.
print(f"PINECONE_API_KEY is set | {PINECONE_ENV} | {PINCONE_INDEX}")
client = OpenAI()


if api_key:
    try:
        # Cheap authenticated request used only to validate the key.
        client.models.list()
        print("OPENAI_API_KEY is set and is valid")
    # Most specific exceptions first: openai.APIError is the base class of
    # the other two, so catching it first would shadow their handlers.
    except openai.RateLimitError as e:
        print(f"OpenAI API request exceeded rate limit: {e}")
    except openai.APIConnectionError as e:
        print(f"Failed to connect to OpenAI API: {e}")
    except openai.APIError as e:
        print(f"OpenAI API returned an API Error: {e}")

else:
    print("Please set you OpenAI API key as an environment variable OPENAI_API_KEY")

Success: .env file found with some environment variables
[REDACTED] | us-east-1 | test
OPENAI_API_KEY is set and is valid: [REDACTED]


In [3]:
# LangChain OpenAI embeddings wrapper, constructed with its default arguments.
embedding_function = OpenAIEmbeddings()

In [4]:
# Initialize a client
# `Pinecone` is the gRPC client here (imported as `PineconeGRPC as Pinecone`),
# configured with the PINECONE_API_KEY / PINECONE_ENV values read earlier.
# NOTE(review): confirm the installed client version still honours the
# `environment` argument.
pc = Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)

In [5]:
# Name of the Pinecone index used by the following cells. `PINCONE_INDEX`
# (sic) holds the value of the PINECONE_INDEX environment variable.
index_name = PINCONE_INDEX

In [6]:
# Show the index names returned by the client, as a sanity check that
# `index_name` is among them.
print(pc.list_indexes().names())

['test']


In [7]:
# Print the stats reported for the target index (dimension, per-namespace and
# total vector counts) before any data is written.
print(pc.Index(index_name).describe_index_stats())

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 0}},
 'total_vector_count': 0}


In [8]:
# OpenAI client used by get_embedding below. The setup cell already creates an
# identical client; this rebinds `client` to a new default-configured one.
client = OpenAI()


def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for ``text``, or ``None`` on failure.

    Uses the module-level OpenAI ``client``. This is best-effort: any error
    raised by the request is printed and turned into ``None`` so that a
    caller iterating over many texts can skip the failed item and carry on.

    Args:
        text: The string to embed.
        model: Name of the OpenAI embedding model to use.

    Returns:
        The embedding of ``text`` as returned by the API (``data[0].embedding``),
        or ``None`` when ``text`` is empty/``None`` or the request fails.
    """
    # An empty input cannot be embedded, so skip the network round-trip
    # instead of sending a request that can only come back as an error.
    if not text:
        print("Embedding skipped: empty text")
        return None

    try:
        response = client.embeddings.create(input=[text], model=model)
        embedding = response.data[0].embedding
    except Exception as e:
        # Deliberately broad: one bad element must not abort a bulk load.
        print(f"Embedding failed: {text} | {e}")
        embedding = None

    return embedding

In [9]:
# Poll the index description once per second until it reports ready, then
# open a handle to it. Note that this wait has no timeout.
index_name = PINCONE_INDEX
while True:
    if pc.describe_index(index_name).status["ready"]:
        break
    time.sleep(1)
index = pc.Index(index_name)

In [10]:
# Input selection and locations for this run.
# Alternative input: filename = "TMCB_43_2256640.pdf.json"
filename = "mindset.pdf.json"
persist_directory = "./data/db/chroma/"
data_json_directory = "./pdf_output/"

# Full path of the JSON file to load.
file = os.path.join(data_json_directory, filename)

console.print(f"File: {file}")

# Read explicitly as UTF-8: without `encoding`, open() uses the platform's
# locale encoding, which can mis-decode or reject non-ASCII text in the JSON.
with open(file, encoding="utf-8") as f:
    data = json.load(f)

# Number of top-level elements in the loaded JSON; used by the upsert loop.
num_el = len(data)
print(f"{num_el} elements to load")

374 elements to load


In [11]:
# Embed every loaded element and upsert it into the Pinecone index, one
# vector per element, keyed by the element's id.
for i, element in enumerate(data):
    element_metadata = dict(element["metadata"])
    el_type = element["type"]
    page_number = element_metadata["page_number"]
    doc_id = element["element_id"]
    content = element["text"]

    embed = get_embedding(content)
    if embed is None:
        # No vector to store for this element; skip it and keep going.
        print(f"Skipping: {i} | {doc_id} (no embedding)")
        continue

    # The element text is stored in the metadata alongside its provenance so
    # that it comes back with query results.
    meta = {
        "doc_id": doc_id,
        "filename": filename,
        "page_number": page_number,
        "type": el_type,
        "content": content,
    }
    print(f"Upserting: {i} | {doc_id}")
    try:
        # This call must sit outside the `embed is None` branch: placed after
        # the `continue` it could never run and nothing was written.
        index.upsert(
            vectors=[
                {"id": doc_id, "values": embed, "metadata": meta},
            ],
            namespace="",
        )
    except Exception as e:
        # Best-effort load: report the failed element and continue.
        print(f"Upsert failed: {doc_id}\n{e}")

# See how many vectors have been upserted
print("Index after upsert:")
print(pc.Index(index_name).describe_index_stats())
print("\n")

Upserting: 0 | 5ca69274a5fdd7bed6a160e7e686658e
Upserting: 1 | 5739b692a97926a35a1675ef47a46733
Upserting: 2 | b00b7517512ad9ab46c12bdf32d07972
Upserting: 3 | a7e1220813696c0da336d0fdfd619487
Upserting: 4 | fc8d4ac3c55e7ce3e5039298add4f1fd
Upserting: 5 | c671e89b41f287a0b49c6318128fb1ef
Upserting: 6 | c8c7244c09eeefef3cabb747021860a6
Upserting: 7 | a180fa02107492026261de159d6110d1
Upserting: 8 | fc7d0ce80265d1516a8b8da8fdfe0bd7
Upserting: 9 | 8efbc87ad928fc52b9c464f1622616cb
Upserting: 10 | 4fbd49b2ed0036405a805fd910ccdf67
Upserting: 11 | 30f2f93f075a29f5680f68d95c2bd26f
Upserting: 12 | 998e4d2b257d1f8829df94efca945b3b
Upserting: 13 | 672cdb638408604d9a3d48f81a1b2416
Upserting: 14 | ffd636104e8d8e14923f32cf07c99843
Upserting: 15 | 2b6d45f990e2ceb02bdd27768e014090
Upserting: 16 | 31a07596ce3abb4a47ce507a570ebbb2
Upserting: 17 | dc6ab4cc71cd1f9301e4d6ec1c33db8d
Upserting: 18 | 2c48cc5ca38752a97346fba5933f5c1a
Upserting: 19 | d0e8e4a684826ff8fafb4ccd06e2a66b
Upserting: 20 | 4b077fa05d3403