### Load data from pdf

In [18]:
import os
import time  # Import time to measure performance
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS


In [19]:
def load_pdf_and_split(pdf_directory):
    all_pdf_docs = []
    print("Loading documents...")
    for filename in os.listdir(pdf_directory):
        if filename.endswith(".pdf"):
            file_path = os.path.join(pdf_directory, filename)
            try:
                loader = PyPDFLoader(file_path)
                # .load() returns a list of Document objects, one for each page
                docs_for_file = loader.load()
                all_pdf_docs.extend(docs_for_file)
                print(f"Loaded {len(docs_for_file)} pages from {filename}")
            except Exception as e:
                print(f"Error loading {filename}: {e}")
    return all_pdf_docs

In [20]:
all_docs = []
nlp_doc = load_pdf_and_split("dataset/nlp")
all_docs.extend(nlp_doc)

Loading documents...
Loaded 114 pages from ANLP Session2_Week2_After Session-1.pdf
Loaded 118 pages from ANLP Session3_Week4_After session-2.pdf
Loaded 83 pages from ANLP Session1_Week1_Before Session-1.pdf
Loaded 190 pages from ANLP Session4_Week5_After Session.pdf
Loaded 27 pages from Extended topic - Text summarization slides.pdf


In [21]:
embeddings = OllamaEmbeddings(model="nomic-embed-text")
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, 
    chunk_overlap=200
)
splits = text_splitter.split_documents(all_docs)
vectorstore = FAISS.from_documents(documents=splits, embedding=embeddings)

### Train LLM

In [72]:
from langchain.retrievers import MultiQueryRetriever
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains import create_history_aware_retriever
from langchain_core.messages import HumanMessage, AIMessage

In [73]:
contextualize_q_system_prompt = (
    "You are a query rephrasing assistant. Your ONLY task is to rephrase a follow-up question into a standalone question. "
    "Use the following chat history and user question to formulate the standalone question. "
    "The standalone question MUST be understandable without the chat history. "
    "If the user question is ALREADY a standalone question, you MUST return it without any changes. "
    "Do NOT answer the question. Only provide the rephrased question."
    "\n\n"
    "<chat_history>"
    "{chat_history}"
    "</chat_history>"
    "\n\n"
    "<user_question>"
    "{input}"
    "</user_question>"
    "\n\n"
    "Standalone question:"
)

contextualize_q_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", contextualize_q_system_prompt),
            MessagesPlaceholder("chat_history"),
            ("human", "{input}"),
        ]
)

qa_system_prompt = (
    "You are a specialized assistant for answering questions based ONLY on the provided documents. "
    "The provided documents are excerpts from university course materials, including lecture slides, notes, assognments. "
    "You are a helpful AI tutor for university of technology sydney students"
    "Your task is to use the following pieces of retrieved context to answer the user's question. "
    "Follow these rules STRICTLY:\n"
    "1. You MUST ONLY use the information present in the context provided below. DO NOT use any of your internal knowledge.\n"
    "2. If the context does not contain the answer to the question, you MUST respond with the exact phrase: 'The provided documents do not contain enough information to answer this question.'\n"
    "3. Do not add any extra information, explanations, or apologies. Just provide the answer from the context or the 'I don't know' phrase.\n\n"
    "3. If the users greet you, simple greet back and dont mention anything else.\n\n"

    "----------------\n"
    "CONTEXT:\n"
    "{context}\n"
    "----------------"
)

# The qa_prompt creation remains the same
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_ollama import ChatOllama

llm = ChatOllama(model="llama3")


question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

# This chain is responsible for retrieving relevant documents from the vector store.
retriever = MultiQueryRetriever.from_llm(
        retriever=vectorstore.as_retriever(), llm=llm
    )

history_aware_retriever = create_history_aware_retriever(
        llm, retriever, contextualize_q_prompt
    )

retrieval_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)


In [None]:
import os
chat_history = []
def ask_RAG(question):
    # --- 5. Ask a Question ---

    print(f"\nAsking question: {question}")
    response = retrieval_chain.invoke({"input": question, "chat_history": chat_history})

    answer = response["answer"]
    source_documents = response["context"]

    # Print the final answer
    print("\n--- Answer ---")
    print(answer)

    # Append the user's message and the bot's response to the history
    chat_history.append(HumanMessage(content=question))
    chat_history.append(AIMessage(content=answer))

    # Process and display the sources
    if source_documents:
        print("\n--- Sources ---")
        # Use a set to store unique sources to avoid duplicates
        unique_sources = set()
        for doc in source_documents:
            # doc.metadata is a dictionary, e.g., {'source': 'path/file.pdf', 'page': 0}
            source_file = os.path.basename(doc.metadata['source']) # Get just the filename
            page_number = doc.metadata['page'] + 1 # Add 1 because pages are 0-indexed
            unique_sources.add(f"File: {source_file}, Page: {page_number}")
    
        for source in sorted(list(unique_sources)):
            print(source)

In [76]:
def ask_LLM(question):
    ask_RAG(question)

In [77]:
ask_LLM("Tell me what I will learn I this ANLP subject?")


Asking question: Tell me what I will learn I this ANLP subject?
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO:langchain.retrievers.multi_query:Generated queries: ['Here are three different versions of the original question, each with a unique perspective:', 'Tell me about the key takeaways and concepts that students typically explore in an Artificial Neural Networks for Language Processing (ANLP) course.', 'What skills and knowledge can I expect to gain from studying ANLP, and how might these insights be applied to real-world problems?', 'Can you provide me with an overview of the major topics and themes covered in an ANLP subject, including any relevant case studies or examples that illustrate key concepts?']
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"

--- Answer ---
According to the context provided, you will learn the following in this ANLP subject:

1. Machine learning:
   • Classification algorithms
   • 

In [78]:
ask_LLM("Can you elaborate more on the topics you meantion?")


Asking question: Can you elaborate more on the topics you meantion?
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO:langchain.retrievers.multi_query:Generated queries: ['Here are three alternative versions of the user question:', 'What are the key takeaways or insights related to specific topics that I should explore further?', 'Are there any specific themes, concepts, or ideas within topic areas that require more in-depth analysis or investigation?', 'Can you provide me with summaries or overviews of particular aspects of topics mentioned, such as main findings, supporting evidence, or relevant context?', 'What are the most significant connections or relationships between specific topics or their related aspects that I should be aware of or explore further?']
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"

--- Answer ---
The provided do

In [79]:
ask_LLM("What is Natural Language Processing (NLP)?")


Asking question: What is Natural Language Processing (NLP)?
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO:langchain.retrievers.multi_query:Generated queries: ['Here are three different versions of the user question:', 'What is Natural Language Processing (NLP)?', 'How do humans communicate with machines using natural language?', 'What is the field that enables computers to understand, interpret, and generate human-like text?', 'These alternative questions can help retrieve relevant documents from a vector database by providing multiple perspectives on the original question. By asking different questions, we can cover different aspects of NLP, such as its definition, functionality, or applications, which may not be captured by a single distance-based search query.']
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"

--- Answer ---
Accordin

In [80]:
ask_LLM("What you will learn in ANLP?")


Asking question: What you will learn in ANLP?
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO:langchain.retrievers.multi_query:Generated queries: ['Here are three different versions of the original question:', 'What topics or concepts will I gain knowledge about through studying ANLP?', 'What are the key takeaways and skills that someone learns when they study ANLP?', 'What are some examples of practical applications or new ideas that one can expect to learn from ANLP training?', 'These alternative questions aim to capture different aspects of the original question, such as focusing on the types of knowledge gained, the key takeaways, and the practical applications. This can help retrieve relevant documents from a vector database by providing multiple angles to search for related information.']
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 

In [81]:
ask_LLM("How to cook an egg")


Asking question: How to cook an egg
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO:langchain.retrievers.multi_query:Generated queries: ['Here are three different versions of the original question:', 'How to cook an egg', "What's a simple recipe for cooking eggs?", 'Instructions for preparing boiled, fried, or scrambled eggs', 'These alternative questions can help retrieve relevant documents from a vector database by providing multiple perspectives on the original query. This approach can overcome some limitations of distance-based similarity search, such as:', '* Different wording or phrasing: The alternative questions use slightly different language to query the same topic.', '* Broader or narrower focus: Some alternative questions may broaden the scope (e.g., "simple recipe") while others narrow it down (e.g., "boiled, fried, or scrambled eggs").', '* Contextual vari

In [82]:
ask_LLM("tell me about Long short-term memory in very detail")


Asking question: tell me about Long short-term memory in very detail
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO:langchain.retrievers.multi_query:Generated queries: ['Here are three different versions of the original question, each with a unique perspective:', 'Tell me about the architecture and key components of Long Short-Term Memory (LSTM) neural networks.', 'What are the advantages and limitations of using LSTMs for sequential data processing, such as natural language processing or time series forecasting?', 'Can you provide an in-depth explanation of how LSTMs differ from traditional Recurrent Neural Networks (RNNs), including their cell state mechanism and forget gate function?']
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"

--- Answer ---
According to the provided documents, here is the information on Long Short-Term Memory 

In [83]:
ask_LLM("Hi, it is nice to meet you")


Asking question: Hi, it is nice to meet you
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO:langchain.retrievers.multi_query:Generated queries: ['Here are three alternative versions of the original user question:', 'What information do you need assistance with?', "I'm an AI designed to provide helpful responses. What's on your mind? How can I assist you?", 'Seeking accurate and reliable information, what topic would you like me to address?', 'These alternative questions aim to capture the essence of the original query while providing different perspectives that might help retrieve relevant documents from a vector database.']
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"

--- Answer ---
Nice to meet you too! It's great to chat with you. I'm here to help answer your questions based on the provided documents. What's on your mind?

--- Sour

### Others Misc

In [84]:
import os
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS


def inspect_index(index_path):
    # We still need the embedding function to load the index
    embeddings = OllamaEmbeddings(model="nomic-embed-text")
    
    if not os.path.exists(index_path):
        print(f"Error: Index path '{index_path}' not found.")
        return

    print(f"Loading index from '{index_path}'...")
    try:
        vectorstore = FAISS.load_local(
            index_path, 
            embeddings, 
            allow_dangerous_deserialization=True
        )
    except Exception as e:
        print(f"Failed to load index: {e}")
        return

    # The docstore contains the mapping from index ID to the actual Document
    if not hasattr(vectorstore, 'docstore') or not hasattr(vectorstore.docstore, '_dict'):
        print("Could not find a valid docstore in the index.")
        return
        
    doc_dict = vectorstore.docstore._dict
    print(f"Found {len(doc_dict)} chunks in the vector store.")
    
    # Use a set to find all unique source PDF files
    unique_sources = set()
    for doc_id, document in doc_dict.items():
        if hasattr(document, 'metadata') and 'source' in document.metadata:
            unique_sources.add(document.metadata['source'])
            
    print("\n--- Unique PDF files found in the index ---")
    if not unique_sources:
        print("No source files found in the metadata.")
    else:
        for source in sorted(list(unique_sources)):
            print(f"- {os.path.basename(source)}")
    print("------------------------------------------")

if __name__ == "__main__":
    inspect_index("vector_store")

Loading index from 'vector_store'...
Found 555 chunks in the vector store.

--- Unique PDF files found in the index ---
- ANLP Session1_Week1_Before Session-1.pdf
- ANLP Session2_Week2_After Session-1.pdf
- ANLP Session3_Week4_After session-2.pdf
- ANLP Session4_Week5_After Session.pdf
- Extended topic - Text summarization slides.pdf
------------------------------------------


In [85]:
# test_retriever.py
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS

FAISS_INDEX_PATH = "vector_store"

def test_retrieval(index_path, query):
    print(f"--- Testing query: '{query}' ---")
    
    embeddings = OllamaEmbeddings(model="nomic-embed-text")
    
    if not os.path.exists(index_path):
        print("Index not found. Please run your main script first.")
        return
        
    vectorstore = FAISS.load_local(
        index_path, 
        embeddings, 
        allow_dangerous_deserialization=True
    )
    
    retriever = vectorstore.as_retriever(search_kwargs={"k": 3}) # Get top 3 results
    
    # Use the retriever directly to find relevant documents
    retrieved_docs = retriever.invoke(query)
    
    if not retrieved_docs:
        print("Retriever found NO documents.")
        return
        
    print(f"\nRetriever found {len(retrieved_docs)} documents:")
    for i, doc in enumerate(retrieved_docs):
        source_file = os.path.basename(doc.metadata.get('source', 'Unknown'))
        page_number = doc.metadata.get('page', -1) + 1
        print(f"  Result {i+1}: From '{source_file}', Page {page_number}")
        # print(f"    Content: {doc.page_content[:200]}...") # Uncomment to see content
    print("----------------------------------\n")

if __name__ == "__main__":
    # Query 1: This will probably fail to find the week 1 doc
    test_retrieval(FAISS_INDEX_PATH, "What was in the lecture for week 1?")
    
    # Query 2: Find a keyword that you KNOW is inside lecture_week_1.pdf
    # For example, if week 1 was about "neural networks", try that.
    # Replace "keyword from week 1 pdf" with a real keyword.
    test_retrieval(FAISS_INDEX_PATH, "What is Natural Language Processing (NLP)?")

--- Testing query: 'What was in the lecture for week 1?' ---

Retriever found 3 documents:
  Result 1: From 'ANLP Session2_Week2_After Session-1.pdf', Page 8
  Result 2: From 'ANLP Session1_Week1_Before Session-1.pdf', Page 81
  Result 3: From 'ANLP Session1_Week1_Before Session-1.pdf', Page 80
----------------------------------

--- Testing query: 'What is Natural Language Processing (NLP)?' ---

Retriever found 3 documents:
  Result 1: From 'ANLP Session1_Week1_Before Session-1.pdf', Page 27
  Result 2: From 'ANLP Session3_Week4_After session-2.pdf', Page 5
  Result 3: From 'ANLP Session1_Week1_Before Session-1.pdf', Page 26
----------------------------------



In [86]:
import logging

# Set up logging to see the generated queries
#logging.basicConfig()
#logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)

FAISS_INDEX_PATH = "vector_store"

def test_multiquery(index_path, query):
    print(f"--- Testing MultiQuery for: '{query}' ---")
    
    embeddings = OllamaEmbeddings(model="nomic-embed-text")
    vectorstore = FAISS.load_local(
        index_path, embeddings, allow_dangerous_deserialization=True
    )
    llm = ChatOllama(model="llama3")
    
    # This will automatically log the generated queries to your console
    retriever = MultiQueryRetriever.from_llm(
        retriever=vectorstore.as_retriever(), llm=llm
    )
    
    # We invoke it to trigger the query generation
    retrieved_docs = retriever.invoke(query)
    
    print(f"\nMultiQuery retriever found {len(retrieved_docs)} documents.")
    for doc in retrieved_docs:
        print(f"  - Source: {os.path.basename(doc.metadata['source'])}")

if __name__ == "__main__":
    unique_query = "What is Natural Language Processing (NLP)?"
    test_multiquery(FAISS_INDEX_PATH, unique_query)

--- Testing MultiQuery for: 'What is Natural Language Processing (NLP)?' ---
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO:langchain.retrievers.multi_query:Generated queries: ['Here are three different versions of the original question:', 'What is Natural Language Processing NLP?', 'What does Natural Language Processing entail, and how does it differ from other areas of artificial intelligence?', 'What can you tell me about the field of Natural Language Processing, its applications, and its relationship to human language comprehension?', 'These alternative questions aim to capture different nuances and perspectives on the original question, which can help retrieve relevant documents that may not be immediately retrieved by a distance-based similarity search.']

MultiQuery retriever found 14 documents.
  - Source: ANLP Session4_Week5_After Session.pdf
  - Source: ANLP Session4_Week5_After Session.pdf
  - Source: ANLP Session2_Week2_After Session-1.

In [87]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
import os

FAISS_INDEX_PATH = "vector_store"

def definitive_retrieval_test(index_path):
    print("--- Running Definitive Retrieval Test ---")
    
    # 1. This is the "key" maker in your current notebook
    print("Initializing the embedding model for the query...")
    embeddings = OllamaEmbeddings(model="nomic-embed-text")
    
    # 2. This is the "lock" created by your other notebook
    print(f"Loading vector store from '{index_path}'...")
    if not os.path.exists(index_path):
        print("ERROR: Index path not found. Cannot run test.")
        return
        
    vectorstore = FAISS.load_local(
        index_path, 
        embeddings, 
        allow_dangerous_deserialization=True
    )
    
    # 3. We now perform a direct search. This tests if the key fits the lock.
    # Replace with a real keyword you know for a fact is in ONE of your documents.
    specific_query = "What is Natural Language Processing (NLP)?"
    print(f"\nPerforming direct similarity search for: '{specific_query}'")
    
    # .similarity_search() is the most basic search function.
    retrieved_docs = vectorstore.similarity_search(specific_query, k=4)
    
    if not retrieved_docs:
        print("!!! TEST FAILED: The search returned ZERO documents.")
        return

    print("\n--- Search Results ---")
    found_match = False
    for doc in retrieved_docs:
        source_file = os.path.basename(doc.metadata.get('source', 'Unknown'))
        print(f"  - Found source: {source_file}, Page: {doc.metadata.get('page', -1) + 1}")
        if "your_document_name" in source_file: # Check if it found the right file
            found_match = True

    if found_match:
        print("\nSUCCESS: The search found the correct document!")
    else:
        print("\n!!! TEST FAILED: The search returned documents, but not the correct ones.")
        print("This strongly suggests an embedding mismatch.")

# Run the test
definitive_retrieval_test(FAISS_INDEX_PATH)

--- Running Definitive Retrieval Test ---
Initializing the embedding model for the query...
Loading vector store from 'vector_store'...

Performing direct similarity search for: 'What is Natural Language Processing (NLP)?'

--- Search Results ---
  - Found source: ANLP Session1_Week1_Before Session-1.pdf, Page: 27
  - Found source: ANLP Session3_Week4_After session-2.pdf, Page: 5
  - Found source: ANLP Session1_Week1_Before Session-1.pdf, Page: 26
  - Found source: ANLP Session3_Week4_After session-2.pdf, Page: 37

!!! TEST FAILED: The search returned documents, but not the correct ones.
This strongly suggests an embedding mismatch.
