In [1]:
# Load Kedro's IPython extension. Later cells use `catalog` without defining
# it, so it is presumably injected into the namespace by this extension — confirm.
%load_ext kedro.ipython

## Import Dependencies

In [2]:
import json
import os
import re
import uuid
from pathlib import Path

import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions
from kedro.config import OmegaConfigLoader
from kedro.framework.project import settings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.documents.base import Document
from langchain_openai import OpenAIEmbeddings

## Load Credentials

In [3]:
# Resolve the project's configuration directory relative to the parent of the
# current working directory (assumes the notebook is run from a sub-folder of
# the project root — TODO confirm).
conf_path = str(Path.cwd().parent / settings.CONF_SOURCE)
conf_loader = OmegaConfigLoader(conf_source=conf_path)
credentials = conf_loader["credentials"]
credentials.keys()  # view the available credentials to load (key names only, not the values)

dict_keys(['OPENAI_API_KEY'])

### `OPENAI_API_KEY`

In [4]:
# Secret used to authenticate against the OpenAI API, taken from the
# credentials mapping loaded above
OPENAI_API_KEY = credentials["OPENAI_API_KEY"]

# Name of the embedding model, as configured in the project parameters
embedding_model_name = catalog.load("params:embedding_model_name")

# Embedding client used when indexing documents into the vector database
embedding_model = OpenAIEmbeddings(
    model=embedding_model_name,
    openai_api_key=OPENAI_API_KEY,
)

## Load Parameters

### Catalog

In [5]:
# View everything available in the catalog: datasets as well as the
# `params:`-prefixed parameter entries that the cells below load
catalog.list()


[1m[[0m
    [32m'docs_dict'[0m,
    [32m'pdfs_dict'[0m,
    [32m'parameters'[0m,
    [32m'params:vector_db'[0m,
    [32m'params:vector_db.path'[0m,
    [32m'params:vector_db.collection_name'[0m,
    [32m'params:websites'[0m,
    [32m'params:pdfs_dir_path'[0m,
    [32m'params:splitter'[0m,
    [32m'params:splitter.chunk_size'[0m,
    [32m'params:splitter.chunk_overlap'[0m,
    [32m'params:splitter.separators'[0m,
    [32m'params:embedding_model_name'[0m
[1m][0m

In [6]:
# Vector database settings: the directory the store is persisted to and
# the name of the collection to read from / write to
db_params = catalog.load("params:vector_db")
db_path, collection_name = db_params["path"], db_params["collection_name"]

In [7]:
# Text-splitter settings, unpacked in one go from the parameters mapping
splitter_params = catalog.load("params:splitter")
chunk_size, chunk_overlap, separators = (
    splitter_params[key] for key in ("chunk_size", "chunk_overlap", "separators")
)

In [8]:
# Load the websites from parameters: the list of page URLs to scrape and index
websites = catalog.load("params:websites")
websites  # display the configured URLs

[1m[[0m[32m'https://www.healthhub.sg/a-z/diseases-and-conditions/diabetes-treatment-capsules--tablets'[0m[1m][0m

In [9]:
def strip_content(page_content: str) -> str:
    """Collapse every run of whitespace into a single space and trim both ends.

    Args:
        page_content: Raw text, e.g. as extracted from a web page.

    Returns:
        The text with newlines, tabs and repeated spaces replaced by single
        spaces, and with no leading or trailing whitespace.
    """
    # Raw string: "\s" inside a plain literal is an invalid escape sequence,
    # which newer Python versions warn about.
    return re.sub(r"\s+", " ", page_content).strip()


def websites_to_docs(
    websites: list[str], chunk_size: int, chunk_overlap: int, separators: list[str]
) -> tuple[list[Document], list[dict]]:
    """Download web pages, clean their text and split them into chunks.

    Args:
        websites: URLs of the pages to load.
        chunk_size: Forwarded to the text splitter as `chunk_size`.
        chunk_overlap: Forwarded to the text splitter as `chunk_overlap`.
        separators: Forwarded to the text splitter as `separators`.

    Returns:
        A pair `(data_split, docs_dict)`: the chunked documents, and the same
        chunks converted to plain dictionaries (one dictionary per chunk).
    """
    loader = WebBaseLoader(websites)
    data = loader.load()

    # Normalise whitespace before splitting, so the chunks are built from the
    # cleaned text rather than the raw page text
    for d in data:
        d.page_content = strip_content(d.page_content)

    # Define text chunk strategy
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, separators=separators
    )
    # Split documents into chunks
    data_split = splitter.split_documents(data)
    # Convert to JSON serializable format
    docs_dict = [dict(ds) for ds in data_split]

    return data_split, docs_dict

In [10]:
# Persistent Chroma client whose storage directory is resolved relative to
# the parent of the current working directory
client = chromadb.Client(
    Settings(
        persist_directory=str(Path.cwd().parent / db_path),
        is_persistent=True,
    )
)

# Names of the collections currently present in the store
collections = [existing.name for existing in client.list_collections()]
collections

[1m[[0m[1m][0m

In [11]:
# First run: the collection is missing, so build it from scratch by scraping,
# chunking and indexing every configured website
if collection_name not in collections:
    print(
        f"Collection: {collection_name} does not exist. Creating collection and indexing all documents."
    )
    data_split, docs_dict = websites_to_docs(
        websites=websites,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=separators,
    )

    # Embed the chunks and persist them in the same directory the client uses
    db = Chroma.from_documents(
        documents=data_split,
        embedding=embedding_model,
        collection_name=collection_name,
        persist_directory=str(Path.cwd().parent / db_path),
    )

    # Keep a copy of the chunks through the `docs_dict` catalog entry
    catalog.save("docs_dict", docs_dict)

Collection: healthcare does not exist. Creating collection and indexing all documents.


In [12]:
# If the collection exists, we want to check if there are
# any new documents. If so, we want to add them to the collection
if collection_name in collections:
    print(
        f"Collection: {collection_name} already exists. Checking for new documents to index into collection."
    )
    collection = client.get_collection(name=collection_name)

    # Get all the websites already in collection. Entries stored without
    # metadata, or without a "source" key, are skipped instead of raising.
    stored_metadatas = collection.get()["metadatas"] or []
    website_urls = {
        metadata["source"]
        for metadata in stored_metadatas
        if metadata and "source" in metadata
    }

    # From the websites, only keep those which do not already appear in the collection
    # (we do not want to index the same website twice)
    new_websites = [website for website in websites if website not in website_urls]

    # If there are new websites, index them into the collection
    if new_websites:
        print(f"Indexing all {len(new_websites)} new documents into collection.")
        data_split, new_docs_dict = websites_to_docs(
            new_websites, chunk_size, chunk_overlap, separators
        )

        # Get JSON already saved to be updated with new documents
        try:
            docs_dict = catalog.load("docs_dict")
        except Exception as exc:
            # Best effort: fall back to an empty list when the saved documents
            # cannot be loaded, but report why instead of hiding the error.
            # `Exception` (not a bare `except`) lets KeyboardInterrupt through.
            print(
                f"Could not load existing docs_dict ({exc!r}); starting from an empty list."
            )
            docs_dict = []

        print(f"Before updating: {len(docs_dict)}")
        # Extend the docs_dict_to_update with the new documents
        docs_dict.extend(new_docs_dict)
        print(f"After updating: {len(docs_dict)}")

        # Save the updated docs_dict_to_update to the JSON
        # NOTE(review): this writes to a hard-coded path rather than through the
        # `docs_dict` catalog entry used on first indexing — confirm both point
        # to the same file.
        # catalog.save("docs_dict", docs_dict)
        with open(os.path.join("..", "data", "02_intermediate", "websites.json"), "w", encoding="utf-8") as f:
            json.dump(docs_dict, f, ensure_ascii=False, indent=4)

        # NOTE(review): the initial indexing embeds through the LangChain
        # `embedding_model`, whereas this path uses Chroma's own OpenAI
        # embedding function with the same model name — confirm both produce
        # compatible vectors.
        embedding_function = embedding_functions.OpenAIEmbeddingFunction(
            model_name=embedding_model_name, api_key=OPENAI_API_KEY
        )

        documents = [ds.page_content for ds in data_split]
        metadatas = [ds.metadata for ds in data_split]
        embeddings = embedding_function(documents)
        # One random, unique id per embedded chunk
        ids = [str(uuid.uuid4()) for _ in embeddings]

        collection.add(
            documents=documents, embeddings=embeddings, metadatas=metadatas, ids=ids
        )

    else:
        print("There are no new documents to index.")

## Load Content From Websites

**Note:** Only text is extracted; images are not. This is one avenue for improvement.

In [None]:
# Step-by-step version of the first stage of `websites_to_docs`:
# fetch every configured page
loader = WebBaseLoader(websites)
data = loader.load()
print(len(data))  # number of documents returned by the loader

In [None]:
# Normalise whitespace in place on every loaded document
for d in data:
    new_content = strip_content(d.page_content)
    d.page_content = new_content

## Chunk Strategy

**Note:** This is another avenue where improvements can be made. There are many chunking strategies that could be employed and evaluated.

In [None]:
# Define text chunk strategy. All three configured settings are passed,
# matching `websites_to_docs`, so that both code paths chunk the same way
splitter = RecursiveCharacterTextSplitter(
    chunk_size=splitter_params["chunk_size"],
    chunk_overlap=splitter_params["chunk_overlap"],
    separators=splitter_params["separators"],
)
# Split documents into chunks
data_split = splitter.split_documents(data)
print(len(data_split))

## Index Documents into Vector Database

In [None]:
# Persistent Chroma client whose storage directory is resolved relative to
# the parent of the current working directory
client = chromadb.Client(
    Settings(
        persist_directory=str(Path.cwd().parent / db_params["path"]),
        is_persistent=True,
    )
)

# Names of the collections currently present in the store
collections = [existing.name for existing in client.list_collections()]
collections

In [None]:
# Define embedding model
embedding_model = OpenAIEmbeddings(
    model=embedding_model_name,
    openai_api_key=OPENAI_API_KEY,
)

# Index the chunks only when the collection is not in the list gathered above
if db_params["collection_name"] not in collections:
    db = Chroma.from_documents(
        documents=data_split,
        embedding=embedding_model,
        collection_name=db_params["collection_name"],
        persist_directory=str(Path.cwd().parent / db_params["path"]),
    )

In [None]:
# List the collection names again to see the state of the store after indexing
collections = [collection.name for collection in client.list_collections()]
collections