In [None]:
"hi"

In [None]:
# IPython line magic: show the kernel's current working directory.
%pwd

In [None]:
import os

# Move the kernel's working directory one level up.
# NOTE: the path is relative, so re-running this cell moves up one more level each time.
parent_dir = "../"
os.chdir(parent_dir)

In [None]:
# Show the working directory again to confirm the effect of the chdir above.
%pwd

# RAG Chatbot architecture building

1. Load the documents
2. Preprocess the documents (drop the metadata, keeping only the `source` and `page_content`)
3. Chunk the data
4. Convert the chunks into embeddings

## 1. Load the PDF file

In [None]:
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from typing import List

def load_documents(data_path: str) -> List:
    """Load the PDF files found in a directory.

    Args:
        data_path: Directory handed to ``DirectoryLoader``; files matching
            the ``*.pdf`` glob are read with ``PyPDFLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for those files.
    """
    pdf_loader = DirectoryLoader(data_path, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()


In [None]:
# Load every PDF from the local "data" directory.
extracted_documents = load_documents("data")
# Preview only the first few entries; echoing the whole list floods the notebook output.
extracted_documents[:3]

In [None]:
# Number of document objects returned by the loader.
len(extracted_documents)

### Preprocess the extracted documents

In [None]:
from langchain.schema import Document
from typing import List

def filter_minimal_docs(docs: List[Document]) -> List[Document]:
    """Rebuild each document with its metadata reduced to the ``source`` entry.

    No document is dropped: the result has one entry per input document,
    in the same order. ``page_content`` is carried over unchanged.

    Args:
        docs: Documents whose metadata should be trimmed.

    Returns:
        New ``Document`` objects whose metadata is ``{"source": ...}``; the
        value is ``None`` when the input metadata has no ``source`` key.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]

In [None]:
# Strip each document's metadata down to its source.
minimal_docs = filter_minimal_docs(extracted_documents)
# Preview only the first few entries; echoing the whole list floods the notebook output.
minimal_docs[:3]

In [None]:
# Number of documents after metadata trimming.
len(minimal_docs)

## 2. Chunking

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# split the data into smaller chunks

def chunk_data(minimal_docs, chunk_size: int = 500, chunk_overlap: int = 20):
    """Split documents into smaller, overlapping chunks.

    Args:
        minimal_docs: Documents to split; passed to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Upper bound on the size of each chunk, as measured by
            the splitter. Defaults to 500.
        chunk_overlap: Amount of overlap the splitter keeps between
            consecutive chunks. Defaults to 20.

    Returns:
        The chunks produced by the splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(minimal_docs)


In [None]:
# Split the trimmed documents into chunks with the default splitter settings.
chunked_data = chunk_data(minimal_docs)
# Preview only the first few chunks; echoing the whole list floods the notebook output.
chunked_data[:3]

In [None]:
# Report how many chunks the splitter produced.
chunk_count = len(chunked_data)
print(f'number of chunks: {chunk_count}')

# Convert chunks into embeddings

In [None]:
# Download embeddings model from the Hugging face (sentence transformer)
from langchain.embeddings import HuggingFaceBgeEmbeddings

def download_embeddings():
    """Create the embedding model used for both documents and queries.

    The model is ``sentence-transformers/all-MiniLM-L6-v2``, a plain
    sentence-transformers model. It is wrapped with ``HuggingFaceEmbeddings``
    rather than the BGE-specific wrapper, because the BGE wrapper prepends a
    retrieval instruction to every query, which this model was not trained
    with and which makes query and document embeddings inconsistent.

    Returns:
        An embeddings object exposing ``embed_query`` / ``embed_documents``.
    """
    # Function-scope import keeps this definition self-contained.
    from langchain.embeddings import HuggingFaceEmbeddings

    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    return HuggingFaceEmbeddings(model_name=model_name)

In [None]:
# Instantiate the embedding model via download_embeddings().
embedding = download_embeddings()

In [None]:
# Echo the embedding object so its repr is shown as the cell output.
embedding

In [None]:
# Embed a sample sentence to check that the model produces a vector.
sample_query = "Hi, How are you?"
vector = embedding.embed_query(sample_query)
vector

In [None]:
# Report the length of the sample embedding.
vector_dim = len(vector)
print(f'The dimension of the vector is: {vector_dim}')

# Set up the Pinecone vector DB

In [None]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

pinecone_api_key = os.getenv("PINECONE_API_KEY")
# Never echo the key itself: cell outputs are saved with the notebook and
# would leak the secret. Show only whether a value was found.
bool(pinecone_api_key)

In [None]:
from pinecone import Pinecone

# Create the Pinecone client, authenticating with the API key read from the environment.
pc = Pinecone(api_key=pinecone_api_key)
print(f'Pinecone client: {pc}')


In [None]:
from pinecone import ServerlessSpec


index_name = "medical-chatbot"

# Create the serverless index only if it does not exist yet.
# has_index must be *called* with the index name: the bare attribute
# `pc.has_index` is a bound method, which is always truthy, so
# `not pc.has_index` would never be true and the index would never be created.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,  # assumed to equal the embedding length (len(vector)) — confirm
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Get a handle to the index (this does not create it) and echo it.
index = pc.Index(index_name)
index