In [1]:
!pip install -U langchain langchain-openai langchain-groq openai pymupdf spacy chromadb

Collecting openai
  Downloading openai-1.65.4-py3-none-any.whl.metadata (27 kB)
Downloading openai-1.65.4-py3-none-any.whl (473 kB)
Installing collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.65.3
    Uninstalling openai-1.65.3:
      Successfully uninstalled openai-1.65.3
Successfully installed openai-1.65.4



[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


The code sets up the CRAG framework by:
- Importing Libraries: Loading critical libraries for document processing, embeddings, and model interactions.
- API Key Setup: Storing API keys for LangSmith and Groq services securely.
- Model Initialization: Initializing the LLaMA 3 model for response generation.
- Embeddings & Vector Store: Utilizing HuggingFaceEmbeddings to transform text into numerical vectors and storing them in Chroma for optimized document retrieval.

In [2]:
import getpass
import os
from langchain.chat_models import init_chat_model
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
import spacy
import re

# Set up environment variables for API keys
os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_API_KEY"] = getpass.getpass("Enter LangSmith API Key: ")
os.environ["GROQ_API_KEY"] = getpass.getpass("Enter API key for Groq: ")

# Initialize LLaMA 3 chat model for response generation
llm = init_chat_model("llama3-8b-8192", model_provider="groq")



Enter LangSmith API Key:  ········
Enter API key for Groq:  ········


In [3]:
!python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --- ------------------------------------ 1.0/12.8 MB 5.6 MB/s eta 0:00:03
     ------- -------------------------------- 2.4/12.8 MB 6.1 MB/s eta 0:00:02
     ----------- ---------------------------- 3.7/12.8 MB 5.9 MB/s eta 0:00:02
     ------------- -------------------------- 4.5/12.8 MB 5.7 MB/s eta 0:00:02
     ----------------- ---------------------- 5.5/12.8 MB 5.3 MB/s eta 0:00:02
     ------------------- -------------------- 6.3/12.8 MB 5.1 MB/s eta 0:00:02
     ---------------------- ----------------- 7.3/12.8 MB 5.0 MB/s eta 0:00:02
     -------------------------- ------------- 8.4/12.8 MB 5.0 MB/s eta 0:00:01
     ----------------------------- ---------- 9.4/12.8 MB 5.0 MB/s eta 0:00:01
     -------------------------------- ---


[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


- PDF Loading: The PDF file is loaded by PyMuPDFLoader, which retrieves text content from every page.
- Text Cleaning: Unnecessary or blank spaces are eliminated to prevent processing of only relevant text.
- Document Object Creation: Each of the cleaned text segments is translated into a Document object for processing.
- Text Splitting: The text is split into smaller overlapping pieces (500 characters with 100 character overlap) using RecursiveCharacterTextSplitter so that the data can be embedded and retrieved while generating responses.

In [4]:
# Load and process PDF
pdf_path = "C:/Users/User/Downloads/c04b3e14-0919-4f9f-8064-5d2f5f2df99e_NLP_EXP.pdf"  # Update path if needed
loader = PyMuPDFLoader(pdf_path)
docs = loader.load()
cleaned_docs = [doc.page_content.strip() for doc in docs if doc.page_content]
cleaned_documents = [Document(page_content=text) for text in cleaned_docs]

# Split text into smaller chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
all_splits = text_splitter.split_documents(cleaned_documents)

- Embeddings Creation: HuggingFaceEmbeddings model (multi-qa-mpnet-base-dot-v1) maps text chunks into numerical vector forms.
- Vector Store Initialization: The Chroma vector store is initialized to cache these embeddings for fast similarity search.
- Document Addition: The divided documents (all_splits) are inserted into the vector store, allowing retrieval of meaningful text from user queries.

In [5]:
# Create embeddings and vector store
embeddings = HuggingFaceEmbeddings(model_name="multi-qa-mpnet-base-dot-v1")
vector_store = Chroma(embedding_function=embeddings)
vector_store.add_documents(documents=all_splits)

  embeddings = HuggingFaceEmbeddings(model_name="multi-qa-mpnet-base-dot-v1")





  vector_store = Chroma(embedding_function=embeddings)


['6f19f821-8b91-4c6e-9096-2755e2b2503f',
 'b07c65a2-9de1-45fa-97db-c86add9f6e4e',
 '3d33fd32-42a0-42f1-8e0a-71273b9304ac',
 '59aba70d-414f-4592-9ed9-742d6b6cec37',
 'ddbde104-e55d-4781-a229-be85d5e16248',
 '6ea9f57b-bef5-4acc-9d78-ac9035f1d3c1',
 'a457c7be-920e-4946-b62b-449da5c2230e',
 'e94cca42-ef4b-47e0-8901-082c94186232',
 'feccba42-8a83-47b4-9eec-180c1ae2cebc',
 '08b416e2-917c-493c-b63b-d34c022d9d74',
 '0a32a224-4101-47a1-b6ab-b4340e49d9ba',
 'aba91814-41cf-497e-b2ab-77ff80e3373f',
 '88da9a65-e1b0-49e1-8e41-449421022512',
 'b1b12cfc-1fbb-4380-9dd0-02d3ad18b183',
 'edc7e435-ebfb-468d-9b94-b8143f6f0281',
 '625b3265-0314-4451-aa1b-bc485b43130e',
 '920693f9-93eb-4fed-a217-e17dabba06ce',
 '6ad534dd-e38e-4609-bd3f-cd170bf1832b',
 '9de8dec3-ca0b-4eea-8ba7-459a0a75229f',
 '925442ba-c900-4cff-8232-4caebb7ca1e4',
 '39cd1174-c080-4447-81ef-63c3db4071f0',
 '0c1a0c2c-a8f8-42a0-b044-d9ac3d4b073a',
 '80dd0ca2-b4bb-4674-b939-647399d98362',
 '73b4c947-0f46-4298-abdb-e3f1b2757e30',
 '303bce9a-8d71-

- Function Purpose: The function detect_errors_in_documents detects documents that can potentially contain errors or out-of-date information.
- Logic Applied: It looks for the existence of some keywords such as "2019", "outdated", "incorrect", or "not verified" in the document text.
- Output: Documents containing any of these terms are flagged and added to the error_docs list for further correction.

In [6]:
# Function for Detecting Errors in Retrieved Documents
def detect_errors_in_documents(docs):
    """Detect potential errors or outdated information in documents."""
    error_docs = []
    for doc in docs:
        if any(term in doc.page_content.lower() for term in ["2019", "outdated", "incorrect", "not verified"]):
            error_docs.append(doc)
    return error_docs

- Function Purpose: The correct_information function automatically corrects the old or wrong information in the documents retrieved.
- Logic Applied: It substitutes specific words:
"2019" → "2023"
"outdated" → "updated"
"incorrect" → "verified"
- Output: The corrected text is kept in a new list named corrected_docs for further processing.

In [7]:
# Function for Correcting Information
def correct_information(docs):
    """Correct information in retrieved documents."""
    corrected_docs = []
    for doc in docs:
        corrected_text = (doc.page_content
                          .replace("2019", "2023")
                          .replace("outdated", "updated")
                          .replace("incorrect", "verified"))
        corrected_docs.append(Document(page_content=corrected_text))
    return corrected_docs

- Function Purpose: The function retrieve_and_correct_documents fetches suitable documents from the vector store against the user query.
- Error Detection: It identifies potential errors through the detect_errors_in_documents function.
- Correction Process: The documents are corrected automatically upon the detection of errors using the correct_information function.
- The function gives back corrected or original documents based on whether or not there were errors.

In [8]:
# Function to Retrieve and Correct Documents
def retrieve_and_correct_documents(query, vector_store):
    """Retrieve documents and correct information if needed."""
    retrieved_docs = vector_store.similarity_search(query, k=5)
    error_docs = detect_errors_in_documents(retrieved_docs)

    if error_docs:
        print("\n🔹 Detected potential errors in retrieved documents. Correcting...")
        corrected_docs = correct_information(retrieved_docs)
    else:
        corrected_docs = retrieved_docs

    return corrected_docs

- Function Purpose: The handle_query function handles user queries with the corrective RAG system.
- Document Retrieval and Correction: It invokes the retrieve_and_correct_documents function to retrieve and correct documents when necessary.
- Message Formatting: The information obtained is formatted into messages based on HumanMessage and SystemMessage schemas.
- Response Generation: The structured messages are forwarded to the LLaMA 3 model, which creates the final response based on improved information.

In [12]:
# Import necessary schema for message formatting
from langchain.schema import HumanMessage, SystemMessage

# Improved Query Handling with Correction
def handle_query(query, vector_store):
    """Handle user queries with corrective RAG."""
    corrected_docs = retrieve_and_correct_documents(query, vector_store)
    doc_texts = "\n".join([doc.page_content for doc in corrected_docs])

    # Prepare messages in the correct format for LLaMA 3
    messages = [
        SystemMessage(content="You are a helpful assistant that corrects outdated information."),
        HumanMessage(content=f"User: {query}\nRelevant Info: {doc_texts}\nBot:")
    ]

    # Generate response using LLaMA 3 with corrected format
    response = llm.invoke(messages)

    return response

In [13]:
# Example Queries Demonstrating Corrective RAG
queries = [
    "Which are the top states for tourism in India?",
    "Which festivals are celebrated in Maharashtra?",
    "Best time to visit Rajasthan",
    "What language is spoken in Kerala?",
    "Tell me about foreign tourist visits in 2023",
    "Tell me about tourism growth in 2019."
]

# Run all example queries
for i, query in enumerate(queries, 1):
    print(f"🔹 Query {i}:\n{query}")
    print("🔹 Response:\n", handle_query(query, vector_store))
    print("\n" + "=" * 80 + "\n")

🔹 Query 1:
Which are the top states for tourism in India?
🔹 Response:
 content='According to the latest available data, the top states for tourism in India in 2022 are:\n\n1. Uttar Pradesh - 317.91 million domestic tourist visits (18.37% share)\n2. Tamil Nadu - 218.58 million domestic tourist visits (12.63% share)\n3. Andhra Pradesh - 192.72 million domestic tourist visits (11.13% share)\n4. Karnataka - 182.41 million domestic tourist visits (10.54% share)\n5. Gujarat - 135.81 million domestic tourist visits (7.85% share)\n\nPlease note that these figures are based on 2022 data, and the numbers may have changed since then.' additional_kwargs={} response_metadata={'token_usage': {'completion_tokens': 142, 'prompt_tokens': 750, 'total_tokens': 892, 'completion_time': 0.118333333, 'prompt_time': 0.140527242, 'queue_time': 0.020405167000000002, 'total_time': 0.258860575}, 'model_name': 'llama3-8b-8192', 'system_fingerprint': 'fp_179b0f92c9', 'finish_reason': 'stop', 'logprobs': None} id='r