In [10]:
#step-1
#!pip install --upgrade langchain langchain-community faiss-cpu pypdf tiktoken sentence-transformers transformers accelerate bitsandbytes
!pip install --upgrade langchain langchain-community faiss-cpu pypdf tiktoken sentence-transformers transformers accelerate bitsandbytes pdfminer.six



In [10]:
#step-2
# Obtain the PDF to index: upload it through the Colab widget when available,
# otherwise fall back to a file that already exists next to the notebook.
# Result: `pdf_path` holds the path of the PDF used by the following cells.

import os

print("Please upload your PDF file (e.g., 'The Basics of Nutrition.pdf').")
try:
    # Imported inside the try so that running outside Colab (where this module
    # does not exist) reaches the local-file fallback instead of crashing.
    from google.colab import files

    uploaded = files.upload()
    if not uploaded:
        # The upload dialog was cancelled or returned nothing.
        raise ValueError("no file was uploaded")
    pdf_path = next(iter(uploaded))
    print(f"Uploaded: {pdf_path}")
except Exception as e:
    print(f"Error uploading file: {e}")
    # Fallback for local testing if file not uploaded
    pdf_path = 'The Basics of Nutrition.pdf' # <--- IMPORTANT: Change this if your PDF has a different name
    if not os.path.exists(pdf_path):
        # Raise instead of sys.exit(): it stops "Run All" with a readable error
        # and does not try to terminate the kernel process.
        raise FileNotFoundError(
            f"Error: PDF file '{pdf_path}' not found. Please upload it or ensure it's in the correct directory."
        ) from e

In [10]:
#step-3
#Function to clean and refine text for better readability

def clean_output(text):
    """Tidy text extracted from the PDF (or produced by the LLM) for readability.

    The cleanup removes the stray 'E' / 'O' characters that the PDF extraction
    leaves behind as separators (e.g. "FoodsE ProteinsE" -> "Foods Proteins"),
    strips known page header/footer lines, splits merged "WordWord" pairs,
    drops separator rules and repeated words, normalizes spacing around
    punctuation, and finally capitalizes the first letter and terminates the
    text with a period.

    Unlike a blanket deletion of capital E/O, the artifact rules only fire where
    the letter cannot be part of a real word, so "Energy", "Omega-3" and
    "PROTEIN" are left intact. Numbers ("2.5", "1,000") and short unit symbols
    ("mL", "pH", "kJ") are preserved as well.

    Known limitation: a whitespace-delimited single 'E' or 'O' is still treated
    as an artifact (only the phrase "vitamin E" is protected), and words that
    merged without any capital letter ("Ironintakesrequired") cannot be
    separated by pattern matching.

    Args:
        text: The raw string to clean. ``None`` or an empty string is accepted.

    Returns:
        The cleaned string; ``""`` when the input is empty or nothing remains.
    """
    import re

    if not text:
        return ""

    # Normalize newlines and tabs to single spaces first.
    text = re.sub(r'[\n\t]+', ' ', text)

    # Remove page header/footer remnants. The module header is matched both in
    # its intact form and in the form where the extractor dropped the E/O
    # letters ("M DUL 2 – TH BASICS F NUTRITI N ...").
    text = re.sub(
        r'M\s*O?\s*DUL\s*E?\s*\d+\s*–\s*TH\s*E?\s*BASICS\s*O?\s*F\s*NUTRITI\s*O?\s*N'
        r'\s*©\s*ACF\s*–\s*Technical and Research Department\s*–\s*\d+',
        '', text, flags=re.IGNORECASE)
    text = re.sub(r'Skerrett and Willett Page \d+', '', text)
    text = re.sub(r'NIH-PA Author Manuscript', '', text)

    # Artifact glued to the end of a lowercase word: "FoodsE", "FoodsEProteins".
    # It must follow a lowercase letter and be followed by the start of a new
    # capitalized word, a non-word character, or the end of the text.
    text = re.sub(r'(?<=[a-z])[EO]+(?=[A-Z][a-z]|\W|$)', ' ', text)

    # Whitespace-delimited runs of artifacts: " E ", " O ", " EE ", " E E ".
    # "vitamin E" is a real term in this document's domain, so it is kept.
    text = re.sub(r'(?<!\S)(?<!itamin )(?<!ITAMIN )[EO]+(?:\s+[EO]+)*(?!\S)', ' ', text)

    # Split merged words ("WordWord" -> "Word Word"). Requiring two lowercase
    # letters before the capital keeps unit symbols such as "mL" and "pH" whole.
    text = re.sub(r'(?<=[a-z]{2})([A-Z])', r' \1', text)

    # Remove long runs of rule characters (dashes, underscores, etc.).
    text = re.sub(r'[-=_~]{3,}', '', text)

    # Collapse immediate repetitions of the same word ("word word word" -> "word").
    text = re.sub(r'(\b\w+\b)(?:\s+\1\b)+', r'\1', text, flags=re.IGNORECASE)

    # Spacing around punctuation.
    text = re.sub(r'(\w+)\s*>\s*(\w+)', r'\1 > \2', text)  # 'Word>Word' -> 'Word > Word'
    # Space after a comma between words, but never inside a number like 1,000.
    text = re.sub(r'(?<=\w),(?=[^\W\d])|(?<=[^\W\d]),(?=\d)', ', ', text)
    # Space after a period that is directly followed by a letter (not 2.5).
    text = re.sub(r'\.(?=[^\W\d_])', '. ', text)
    # Artifact removal can leave "word ." - pull closing punctuation back.
    text = re.sub(r'\s+([.,;:!?])(?=\s|$)', r'\1', text)

    # Collapse all remaining whitespace runs and trim the ends.
    text = re.sub(r'\s+', ' ', text).strip()

    # Capitalize the first letter if it is lowercase.
    if text and text[0].islower():
        text = text[0].upper() + text[1:]

    # Add a period if the text ends without punctuation and is not a list item.
    if text and not re.search(r'[.!?]$', text) and not text.startswith('- '):
        text += '.'

    return text

In [10]:
#step-4: Load and chunk the PDF
# Extracts the PDF text with pdfminer, cleans it page by page with clean_output(),
# and splits the cleaned pages into small overlapping chunks for retrieval.
# Result: `chunks` is a list of Documents (empty if anything went wrong).

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from pdfminer.high_level import extract_text # For alternative extraction

# Chunking parameters (in characters); tune these to trade retrieval precision
# against the amount of context handed to the LLM.
CHUNK_SIZE = 200
CHUNK_OVERLAP = 40

print(f"Loading PDF from: {pdf_path}")
try:
    # Extract the whole document as one string; the text is split on the form
    # feed character to recover page boundaries.
    raw_pdf_text = extract_text(pdf_path)
    pages_content_list = raw_pdf_text.split('\f')

    documents = []
    for i, page_content in enumerate(pages_content_list):
        cleaned_page_content = clean_output(page_content) # Clean each page immediately
        if cleaned_page_content.strip(): # Only add if content is not just empty after cleaning
            documents.append(Document(page_content=cleaned_page_content, metadata={"page": i + 1, "source": pdf_path}))

    if not documents:
        # Fail loudly here instead of producing zero chunks and a confusing
        # error later in the embedding step.
        raise ValueError("no extractable text was found in the PDF (it may contain only scanned images)")

    print(f"Extracted and initially cleaned {len(documents)} document-like sections.")

    # Split the pre-cleaned pages into the final retrieval chunks.
    splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
    chunks = splitter.split_documents(documents)

    print(f"Split cleaned document sections into {len(chunks)} final chunks.")

except Exception as e:
    print(f"Error loading or chunking PDF: {e}")
    chunks = []

In [10]:
#step-5
# Generate vector embeddings using Hugging Face and index them in FAISS.
# Result: `vectorstore` is the FAISS index, or None if it could not be built.

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

print("Generating vector embeddings...")
try:
    if not chunks:
        # Nothing to index: skip loading the embedding model and report the real
        # cause instead of letting FAISS fail on an empty input.
        raise ValueError("no text chunks available; check the PDF loading step above")

    # Using a good default embedding model from Hugging Face
    embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    # `chunks` contains the pre-cleaned content produced by the previous step
    vectorstore = FAISS.from_documents(chunks, embedding)
    print("Vector store created successfully.")
except Exception as e:
    print(f"Error generating embeddings or creating vector store: {e}")
    vectorstore = None

In [10]:
#step-6: Use a free Hugging Face model for LLM
# Loads a FLAN-T5 checkpoint, wraps it in a transformers text2text pipeline and
# exposes it to LangChain.
# Result: `llm` is the LangChain wrapper, or None if loading failed.

from langchain_community.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch

print("Loading Hugging Face LLM...")
try:
    # 'base' is the default: faster and lighter on memory.
    model_name = "google/flan-t5-base"
    # model_name = "google/flan-t5-large" # Uncomment to try the larger model if your environment can handle it

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # bfloat16 weights are requested only when CUDA is available; otherwise the
    # library default dtype is used.
    if torch.cuda.is_available():
        weights_dtype = torch.bfloat16
    else:
        weights_dtype = None

    # device_map="auto" lets the library place the layers on the available devices.
    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_name,
        torch_dtype=weights_dtype,
        device_map="auto",
    )

    # Generation settings handed to the pipeline.
    generation_settings = {
        "max_length": 150,          # upper bound on the generated length
        "temperature": 0.2,         # low temperature keeps sampling focused
        "top_k": 50,                # sample from the top K most likely tokens
        "top_p": 0.95,              # nucleus sampling threshold
        "repetition_penalty": 1.1,  # discourage repeated tokens
        "do_sample": True,          # sampling must be on for temperature/top_k/top_p
        "num_return_sequences": 1,  # a single answer per prompt
    }
    pipe = pipeline(
        "text2text-generation",
        model=model,
        tokenizer=tokenizer,
        **generation_settings,
    )
    llm = HuggingFacePipeline(pipeline=pipe)
    print(f"Hugging Face LLM '{model_name}' loaded successfully.")

except Exception as e:
    print(f"Error loading Hugging Face LLM: {e}")
    llm = None

In [10]:
# step-7: Set up the RAG Chain
# Combines the FAISS retriever and the LLM into a RetrievalQA chain driven by a
# custom prompt.
# Result: `rag_chain` is the chain, or None if an earlier step failed.

from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

print("Setting up RAG chain...")
if vectorstore and llm:
    # Retrieve the 2 most similar chunks per question.
    # The number of results must be passed through `search_kwargs`; a bare `k=2`
    # keyword on as_retriever() is not applied to the search and the retriever
    # falls back to its default number of documents.
    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 2})

    # Custom prompt: answer only from the retrieved context, and be concise.
    qa_template = """You are a helpful assistant specialized in nutrition, based on the provided text.
    Use ONLY the following pieces of information (context) to answer the question at the end.
    If the context does not contain enough information to answer the question, state clearly and concisely: "I don't have enough information from the provided document to answer that question."
    Do NOT try to make up an answer or use external knowledge.
    Be extremely concise and direct. Provide the exact answer from the context if possible, otherwise summarize it in one to two sentences.
    Ensure your answer is grammatically correct and flows naturally.

    Context:
    {context}

    Question: {question}
    Helpful Answer:"""

    QA_PROMPT = PromptTemplate(template=qa_template, input_variables=["context", "question"])

    rag_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        return_source_documents=False, # Only the answer text is returned, no source details
        chain_type="stuff", # 'stuff' concatenates all retrieved documents into the prompt
        chain_type_kwargs={"prompt": QA_PROMPT} # Pass the custom prompt
    )
    print("RAG chain setup complete.")
else:
    print("RAG chain could not be set up due to previous errors. Please check previous cell outputs.")
    rag_chain = None

In [10]:
#step-8: Ask questions
# Interactive question/answer loop over the RAG chain. Type 'exit' to leave.

print("\n--- Basics of Nutrition Chatbot (Hugging Face) ---")

if not rag_chain:
    # Checked once up front: without a chain there is nothing to ask, so do not
    # trap the user in a prompt loop that can only report the same error.
    print("Chatbot is not initialized due to previous errors. Cannot process query.")
else:
    print("Ask me anything about nutrition from the PDF. Type 'exit' to quit.")

    while True:
        try:
            query = input("\nYour Question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Input stream closed or the cell was interrupted: leave cleanly.
            print("\nExiting chatbot. Goodbye!")
            break

        if not query:
            # Ignore blank lines instead of sending an empty question to the chain.
            continue

        if query.lower() == 'exit':
            print("Exiting chatbot. Goodbye!")
            break

        print("Thinking...")
        try:
            # .invoke() takes the question under the "query" key and returns a
            # dict whose "result" entry holds the answer text.
            result = rag_chain.invoke({"query": query})
            answer = result['result']

            cleaned_answer = clean_output(answer)
            print(f"Answer: {cleaned_answer}")

        except Exception as e:
            print(f"An error occurred: {e}")
            print("Please try re-running the notebook or check for specific error messages above.")
