In [57]:
import os, json, sys
from dotenv import load_dotenv
from uuid import uuid4

from pinecone import Pinecone, ServerlessSpec

from langchain_openai import AzureOpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_core.documents import Document

In [2]:
load_dotenv()

True

In [3]:
embed_model = AzureOpenAIEmbeddings(
    azure_endpoint=os.getenv("AZURE_OPENAI_EMBEDDING_ENDPOINT"),
    openai_api_key=os.getenv("AZURE_OPENAI_EMBEDDING_API_KEY"),
    openai_api_version=os.getenv("AZURE_OPENAI_EMBEDDING_API_VERSION"),
    azure_deployment=os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME")
)

In [4]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

In [5]:
index_name = os.getenv("PINECONE_INDEX_NAME")

In [6]:
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )


In [7]:
index = pc.Index(index_name)

In [9]:
# Delete all records in vector store
index.delete(delete_all=True)

{}

In [10]:
vector_store = PineconeVectorStore(index=index, embedding=embed_model)

In [23]:
source_path = "../data/processed"

In [24]:
# Get list of relevant files for vector store
files = []
for file in os.listdir(source_path):
    if file.endswith(".json"):
        files.append(file)


In [25]:
files

['dds.json', 'ddls.json']

In [31]:
def convert_to_documents(file_name: str) -> list[Document]:
    """Convert contents of file to Langchain Documents."""
    with open(os.path.join(source_path, file_name), "r") as f:
        content = json.loads(f.read())
    docs = [Document(page_content=item["content"], metadata=item["metadata"]) for item in content]
    return docs


In [36]:
# Prepare Documents
documents = []
for processed_file in files:
    documents.extend(convert_to_documents(processed_file))
    print(f"Processed File: {processed_file}")


Processed File: dds.json
Processed File: ddls.json


In [37]:
documents

[Document(metadata={'file_name': 'mcd_eligibility_mcd_data_dictionary.json', 'file_path': '../data/raw', 'table_name': 'eligibility_mcd', 'client_name': 'Medicaid', 'client_abbr': 'MCD'}, page_content='column_name: sub_member_id_code\ndata_type: varchar\ncolumn_size: 50\ncolumn_description: Unique identifier code for the subscriber member.'),
 Document(metadata={'file_name': 'mcd_eligibility_mcd_data_dictionary.json', 'file_path': '../data/raw', 'table_name': 'eligibility_mcd', 'client_name': 'Medicaid', 'client_abbr': 'MCD'}, page_content='column_name: first_name\ndata_type: varchar\ncolumn_size: 35\ncolumn_description: First name of the subscriber member.'),
 Document(metadata={'file_name': 'mcd_eligibility_mcd_data_dictionary.json', 'file_path': '../data/raw', 'table_name': 'eligibility_mcd', 'client_name': 'Medicaid', 'client_abbr': 'MCD'}, page_content='column_name: mem_last_name_code\ndata_type: varchar\ncolumn_size: 60\ncolumn_description: Last name code of the subscriber member

In [38]:
# Generate unique ids for each document
document_ids = [str(uuid4()) for _ in range(len(documents))]

In [39]:
document_ids

['07e197d1-0813-4b39-9fe7-741f96c32510',
 '3aafee1e-1295-4c00-8b84-e2c332bb0f2c',
 'd657436a-5b50-49e8-b3e3-1edfb6ca93ef',
 '4dacecfd-212d-40bd-bf64-3464fdcd8b2e',
 '7ea1c6c2-91d8-4d38-9cae-b2c6453c8bf1',
 '630a7fea-21f2-4690-8292-865e1244a2e3',
 '6b120dbb-69e3-490c-9e7f-a5c35a1bf6a9',
 'b6c6c993-28a4-4e6b-be1b-d2b9ebd36117',
 '78e699cc-e54a-441f-a275-ef6d42a4cfc1',
 '61521da4-f381-40a0-a222-4478ee6b4186',
 'b081d63c-5ff1-4599-8567-02cefbe71460',
 '9d61e4ae-3dcf-4aad-9e88-118c834e954e',
 '7a9dc74c-3c06-44c1-8382-cbb37fc87185',
 '35ec0b91-b35f-4c83-8d49-2512dcea22e8',
 '284ff416-a821-47ab-abc2-47e6cdd13e0d',
 '0ba20730-bccf-4509-aab5-20c2994f6a55',
 'f9782596-6fce-4712-97c8-2b829d472bf5',
 'd734a58b-7eda-4b02-bfe3-d17595884ecc',
 'b8d7b1de-9647-4b1b-84cf-3fb8faa54cfa',
 '3841f606-7321-41d5-bbd4-121345a230d1',
 '8b34dc43-37d1-41a9-b70c-f4868dddb686',
 '0af14a74-fa1c-4161-bf08-788f72dff57c',
 '756fadb1-93cc-4217-8dc9-c3c78e6f0dc3',
 '3fbcc8fd-1994-48b0-920c-2a1aed6d9f1e',
 '8b327c1b-7423-

In [40]:
len(document_ids)

788

In [93]:
# Publish knowledgebase to vector store
failed = []
for doc, doc_id in zip(documents, document_ids):
    try:
        vector_store.add_documents(documents=[doc], ids=[doc_id])
        print(f"Added: {doc_id}")
    except Exception as ex:
        failed.append([doc, doc_id])
        print(f"Failed to add: {doc_id}")
    

Added: 07e197d1-0813-4b39-9fe7-741f96c32510
Added: 3aafee1e-1295-4c00-8b84-e2c332bb0f2c
Added: d657436a-5b50-49e8-b3e3-1edfb6ca93ef
Added: 4dacecfd-212d-40bd-bf64-3464fdcd8b2e
Added: 7ea1c6c2-91d8-4d38-9cae-b2c6453c8bf1
Added: 630a7fea-21f2-4690-8292-865e1244a2e3
Added: 6b120dbb-69e3-490c-9e7f-a5c35a1bf6a9
Added: b6c6c993-28a4-4e6b-be1b-d2b9ebd36117
Added: 78e699cc-e54a-441f-a275-ef6d42a4cfc1
Added: 61521da4-f381-40a0-a222-4478ee6b4186
Added: b081d63c-5ff1-4599-8567-02cefbe71460
Added: 9d61e4ae-3dcf-4aad-9e88-118c834e954e
Added: 7a9dc74c-3c06-44c1-8382-cbb37fc87185
Added: 35ec0b91-b35f-4c83-8d49-2512dcea22e8
Added: 284ff416-a821-47ab-abc2-47e6cdd13e0d
Added: 0ba20730-bccf-4509-aab5-20c2994f6a55
Added: f9782596-6fce-4712-97c8-2b829d472bf5
Added: d734a58b-7eda-4b02-bfe3-d17595884ecc
Added: b8d7b1de-9647-4b1b-84cf-3fb8faa54cfa
Added: 3841f606-7321-41d5-bbd4-121345a230d1
Added: 8b34dc43-37d1-41a9-b70c-f4868dddb686
Added: 0af14a74-fa1c-4161-bf08-788f72dff57c
Added: 756fadb1-93cc-4217-8dc9-c