In [1]:
from langchain_community.document_loaders import PDFPlumberLoader

In [None]:
# Load the PDF document.
# The path is relative, so it is resolved against the kernel's working directory.
pdf_path = "national-cancer-plan-508.pdf"
loader = PDFPlumberLoader(pdf_path)
# Later cells index and iterate `documents` and read/write `.page_content` on
# each item (presumably one item per PDF page -- confirm against the loader docs).
documents = loader.load()
# Bare last expression: the notebook displays the whole loaded collection.
documents

In [None]:
# Inspect the extracted text of the second loaded document (index 1).
documents[1].page_content

In [None]:
#Preprocessing
import re

def preprocess_text(text):
    """Strip the repeated "NATIONAL CANCER PLAN | <page number>" marker from text.

    Args:
        text: Text extracted from the PDF.

    Returns:
        The text with every occurrence of the marker, including the complete
        page number, removed. All other characters (whitespace and newlines
        included) are left untouched.
    """
    # Remove page numbers (e.g., "Page 1", "1 of 10") -- currently disabled
    #text = re.sub(r'Page \d+', '', text)
    #text = re.sub(r'\d+ of \d+', '', text)

    # Remove repetitive headers/footers (e.g., "NATIONAL CANCER PLAN | 12").
    # `\d+` consumes the whole page number; a single `\d` would leave the
    # trailing digits of multi-digit page numbers behind ("| 12" -> "2").
    text = re.sub(r'NATIONAL CANCER PLAN \| \d+', '', text)

    # Remove extra whitespace and newlines -- currently disabled
    #text = re.sub(r'\s+', ' ', text).strip()

    return text


# Clean every loaded document in place, so all later cells work on the
# cleaned text rather than the raw extraction.
for doc in documents:
    doc.page_content = preprocess_text(doc.page_content)

# Display the second document's text after cleaning.
documents[1].page_content

In [18]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking configuration: target size 1000 with an overlap of 200, both
# measured with len(); splitting is restricted to blank lines and newlines.
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n"],
    length_function=len,
    chunk_size=1000,
    chunk_overlap=200,
)

# Split the loaded documents into the chunks used by every later cell.
chunks = splitter.split_documents(documents)

In [None]:
# Inspect the metadata carried by the first chunk.
chunks[0].metadata

In [None]:
# Check which type of object the splitter produced.
type(chunks[0])

In [None]:
#test
# Exploratory check only: this model is used by the "#test" cells below.
# The FAISS vector store is built from a separate HuggingFaceEmbeddings
# instance that names the same model.
from sentence_transformers import SentenceTransformer

# Load the all-MiniLM-L6-v2 model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

In [22]:
#test
# Collect the text of every chunk, preserving chunk order
texts = [chunk.page_content for chunk in chunks]

In [None]:
# Display the full list of chunk texts.
texts

In [24]:
#test
# Create sentence embeddings for all document chunks in a batch.
# The result is iterated and sliced per chunk by the verification cell that
# follows (presumably one vector per entry of `texts` -- confirm).
embeddings = embedding_model.encode(texts)

In [None]:
#test
# Print embeddings for verification: the first five components of each vector
# and its total length, one output line per entry of `embeddings`.
for idx, embedding in enumerate(embeddings):
    print(f"Document {idx} embedding (first 5 dimensions): {embedding[:5]} ... Total dimensions: {len(embedding)}")

In [None]:
# Import from langchain_community / langchain_core rather than the deprecated
# `langchain.vectorstores`, `langchain.docstore` and `langchain.embeddings`
# re-export paths; this matches the langchain_community import already used
# for the PDF loader.
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document  # not used in this cell; kept so the name stays available
from langchain_community.embeddings import HuggingFaceEmbeddings

# Initialize the embedding model with all-MiniLM-L6-v2
embedding_model_hugg = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Create a FAISS vector store from the documents.
# LangChain automatically stores the metadata with each document internally.
vector_store = FAISS.from_documents(chunks, embedding_model_hugg)

In [None]:
# Peek at the entries held by the vector store's docstore.
# NOTE(review): `_dict` is a private attribute, so this is for ad-hoc
# inspection only and may break across library versions.
vector_store.docstore._dict.items()

In [None]:
#test
# Print every vector stored in the FAISS index, one per index position,
# separated by a 50-character dashed line.
for i in range(vector_store.index.ntotal):
    # Use FAISS's reconstruct to get the vector for a given index.
    embedding_vector = vector_store.index.reconstruct(i)

    # Disabled: look up and print the document that belongs to this vector.
    # NOTE(review): verify the attribute name `_index_to_docstore_id` against
    # the installed LangChain version before re-enabling these lines.
    # Retrieve the corresponding document id.
    #doc_id = vector_store._index_to_docstore_id[i]

    # Access the original Document object from the internal docstore.
    #doc_obj = vector_store.docstore._dict[doc_id]
    #print(f"Document ID: {doc_id}")
    #print("Text:", doc_obj.page_content)
    #print("Metadata:", doc_obj.metadata)
    print("Embedding vector:", embedding_vector)
    print("-" * 50)

In [58]:
# Save the vector store locally to a directory (e.g., "faiss_index").
# The directory name is relative, so it is resolved against the kernel's
# working directory.
save_directory = "faiss_index"
vector_store.save_local(save_directory)

In [59]:
from langchain.chat_models import ChatOpenAI          # Chat LLM (like GPT-4 or GPT-3.5)
from langchain.chains import ConversationalRetrievalChain  # Combines chat and retrieval
from langchain.memory import ConversationBufferMemory    # Keeps track of conversation history

In [None]:
from dotenv import load_dotenv
import os

load_dotenv()  # This will load the variables from your .env file into os.environ

# Verify that the API key is loaded WITHOUT echoing the secret itself:
# anything printed here is stored in the notebook's cell output and would be
# leaked to everyone the notebook is shared with or committed for.
print("OPENAI_API_KEY is set:", bool(os.environ.get("OPENAI_API_KEY")))

In [None]:
#Set Up the Chat Model
# No model name is passed, so the library default applies; temperature is
# pinned to 0. This is the instance the retrieval chain is built from.
chat_llm = ChatOpenAI(temperature=0)

In [None]:
# Show which model the default-configured chat_llm resolved to.
print(chat_llm.model_name)
# Instance explicitly pinned to gpt-3.5-turbo
chat_gpt35 = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")
print("Using model:", chat_gpt35.model_name)

# Instance using GPT-4 (requires proper access)
chat_gpt4 = ChatOpenAI(temperature=0, model_name="gpt-4")
print("Using model:", chat_gpt4.model_name)
# Neither chat_gpt35 nor chat_gpt4 is referenced by the other cells shown in
# this notebook; the conversation chain is built from chat_llm.

In [None]:
#Establish Conversation Memory
# The history is kept under the "chat_history" key and returned as message
# objects (return_messages=True); this object is handed to the retrieval chain.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

In [62]:
#Create the Conversational Retrieval Chain
# Wire the chat model, the conversation memory and a FAISS-backed retriever
# (top 5 most similar documents per query) into a single chain.
conversation_chain = ConversationalRetrievalChain.from_llm(
    memory=memory,
    retriever=vector_store.as_retriever(search_kwargs={"k": 5}),
    llm=chat_llm,
)

In [None]:
#Create an Interactive Chat Loop
# Reads questions until the user types 'exit' (any case, surrounding
# whitespace ignored), closes the input stream, or interrupts the kernel.
print("Chatbot is ready! Type 'exit' to quit.")
while True:
    try:
        user_input = input("User: ").strip()
    except (EOFError, KeyboardInterrupt):
        # End of input / kernel interrupt: leave the loop without a traceback.
        break
    if user_input.lower() == "exit":
        break
    if not user_input:
        # Skip blank lines instead of spending an API call on an empty question.
        continue
    # Pass the user query to the conversation chain. `.invoke` replaces the
    # deprecated direct call `conversation_chain({...})`.
    result = conversation_chain.invoke({"question": user_input})
    print("Chatbot:", result["answer"])