In [2]:
from dotenv import load_dotenv
import os

# Load key/value pairs from a local .env file into the process environment.
load_dotenv()

# Credentials used by the rest of the notebook; os.getenv returns None when unset.
CLAUDE_API_KEY = os.getenv('CLAUDE_API_KEY')
MONGODB_URI = os.getenv('MONGODB_URI')

# Fail fast and name every variable that is unset or empty, so the reader
# knows exactly which entry to add to the .env file.
_required_env = {"CLAUDE_API_KEY": CLAUDE_API_KEY, "MONGODB_URI": MONGODB_URI}
_missing_env = [name for name, value in _required_env.items() if not value]
if _missing_env:
    raise ValueError(
        "One or more required environment variables are missing: "
        + ", ".join(_missing_env)
    )


In [4]:
from langchain.chains import create_tagging_chain
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
#from langchain_anthropic import AnthropicEmbeddings
from langchain_anthropic import ChatAnthropic
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_transformers.openai_functions import (
    create_metadata_tagger,
)
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import MongoDBAtlasVectorSearch
from pymongo import MongoClient
from sentence_transformers import SentenceTransformer



# MongoDB client plus the database/collection that will receive the chunks.
client = MongoClient(MONGODB_URI)
dbName = "book_mongodb_chunks"
collectionName = "chunked_data"
collection = client[dbName][collectionName]

# Forward slashes are used on purpose: the previous ".\sample_files\..."
# literal depended on invalid escape sequences (\s, \L) and only resolved on
# Windows. This relative path works on Windows, macOS and Linux.
loader = PyPDFLoader("./sample_files/LightRAG.pdf")
pages = loader.load()

# Pages with this many space-separated tokens or fewer are dropped before
# tagging and chunking.
MIN_TOKENS_PER_PAGE = 20
cleaned_pages = [
    page
    for page in pages
    if len(page.page_content.split(" ")) > MIN_TOKENS_PER_PAGE
]

# Chunking configuration used when the tagged pages are split later on.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=150)

# Metadata fields the LLM is asked to produce for every page.
schema = {
    "properties": {
        "title": {"type": "string"},
        "keywords": {"type": "array", "items": {"type": "string"}},
        "hasCode": {"type": "boolean"},
    },
    "required": ["title", "keywords", "hasCode"],
}

# temperature=0 is passed through to the chat model used for tagging.
llm = ChatAnthropic(
    anthropic_api_key=CLAUDE_API_KEY, temperature=0, model="claude-3-sonnet-20240229"
)


ModuleNotFoundError: No module named 'pymongo'

In [None]:

# Build a chain that extracts the `schema` fields from a page of text.
# NOTE(review): create_tagging_chain appears to rely on OpenAI-style function
# calling; confirm it works with ChatAnthropic before running at scale.
tagging_chain = create_tagging_chain(schema, llm)

# Tag every page and keep the result as a Document: split_documents() reads
# `.page_content` and `.metadata`, so plain dicts would fail there and would
# also lose the page text. On key collisions the loader's own metadata wins,
# matching the original merge order.
docs = []
for page in cleaned_pages:
    tagged_content = tagging_chain.run(page.page_content)
    docs.append(
        Document(
            page_content=page.page_content,
            metadata={**tagged_content, **page.metadata},
        )
    )

#document_transformer = create_metadata_tagger(metadata_schema=schema, llm=llm)

#docs = document_transformer.transform_documents(cleaned_pages)

# Split the tagged pages into chunks using the splitter configured earlier.
split_docs = text_splitter.split_documents(docs)

# The vector store calls embed_documents()/embed_query() on the object it is
# given; a raw SentenceTransformer only exposes encode(), so the model is
# wrapped in LangChain's sentence-transformers adapter instead.
embeddings = HuggingFaceEmbeddings(model_name="hkunlp/instructor-xl")

# Embed every chunk and insert text, vector and metadata into the collection.
vectorStore = MongoDBAtlasVectorSearch.from_documents(
    split_docs, embeddings, collection=collection
)