In [None]:
!pip install langchain faiss-cpu transformers sentence-transformers pypdf PyPDF2

import os
from PyPDF2 import PdfReader
import textwrap

# PDFs to index (paths as uploaded to the Colab session).
pdf_paths = [
    '/content/1706.03762v7.pdf',
    '/content/2005.11401v4.pdf',
    '/content/2005.14165v4.pdf',
]

# One entry per PDF that exists on disk: the text of all its pages,
# concatenated in page order.
documents = []

for path in pdf_paths:
    # Missing files are reported and skipped rather than aborting the run.
    if not os.path.exists(path):
        print(f"File not found: {path}")
        continue

    reader = PdfReader(path)
    # Pages whose extracted text is empty/None contribute nothing.
    extracted = (page.extract_text() for page in reader.pages)
    text = "".join(piece for piece in extracted if piece)
    documents.append(text)

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configured for 1000-character chunks with a 200-character overlap
# between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Flat list of chunks across every loaded document, in document order.
all_chunks = [
    chunk
    for doc in documents
    for chunk in text_splitter.split_text(doc)
]

print(f"Total chunks created: {len(all_chunks)}")

from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Fail early with a clear message: with no chunks there is nothing to index,
# and the embedding/index steps below have no meaningful result to work with.
if not all_chunks:
    raise ValueError(
        "No text chunks to index - check that the PDF paths exist "
        "and that the files contain extractable text."
    )

# Sentence-embedding model, used for both the chunks and (later) the queries.
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(all_chunks)

# Exact (brute-force) L2 index sized to the embedding width; row i of the
# index corresponds to all_chunks[i].
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(embeddings))

def retrieve_top_chunks(query, top_k=8):  # Increased from 5 to 8 for more context
    """Return the indexed text chunks nearest to ``query``.

    The query is embedded with the module-level ``model`` and looked up in the
    module-level FAISS ``index``; the matching entries of ``all_chunks`` are
    returned in the order FAISS reports them.

    Args:
        query: Question or search text to embed.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of at most ``top_k`` chunk strings. Fewer are returned when the
        index holds fewer vectors; an empty list when nothing can be retrieved.
    """
    # Never request more neighbours than the index contains.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []

    query_embedding = model.encode([query])
    _distances, indices = index.search(np.array(query_embedding), k)

    # FAISS marks missing results with -1; used as a list index that would
    # silently return the *last* chunk, so drop those slots explicitly.
    return [all_chunks[i] for i in indices[0] if i >= 0]

from transformers import pipeline

# Text-to-text generation pipeline that produces the final answer from the
# retrieval prompt.
qa_pipeline = pipeline(
    task="text2text-generation",
    model="google/flan-t5-large",
)

def format_answer(text, width=80):
    """Format an answer as wrapped, blank-line-separated sentences.

    Literal ``\\n`` escape sequences and every run of whitespace (spaces,
    tabs, real newlines) are collapsed to single spaces. The text is then
    split on ``'. '``; each sentence is wrapped to ``width`` columns and the
    sentences are joined with a blank line between them.

    Args:
        text: Raw answer text.
        width: Maximum line width passed to ``textwrap.fill``.

    Returns:
        The formatted text; an empty string if ``text`` has no content.
    """
    # Normalise whitespace in one pass. A single replace('  ', ' ') only
    # halves runs of spaces, which left stray double spaces and sentences
    # starting with a space.
    text = ' '.join(text.replace('\\n', '\n').split())
    if not text:
        return ""

    # Split into sentences for better formatting
    sentences = text.split('. ')
    last = len(sentences) - 1

    paragraphs = []
    for i, sentence in enumerate(sentences):
        # split() removed the terminating period from every sentence but the
        # last one; put it back before wrapping.
        if i < last:
            sentence += '.'
        paragraphs.append(textwrap.fill(sentence, width=width))

    # Blank line between sentences for readability.
    return '\n\n'.join(paragraphs)

def answer_query(query):
    """Answer ``query`` from the indexed documents and return formatted text.

    Retrieves the nearest chunks with ``retrieve_top_chunks``, joins them into
    a context capped at 4500 characters, asks ``qa_pipeline`` to generate an
    answer from the prompt, and runs the result through ``format_answer``.

    Args:
        query: The user's question.

    Returns:
        The generated answer as formatted by ``format_answer``.
    """
    top_chunks = retrieve_top_chunks(query)

    # Increased context length for more detailed answers. Text beyond the cap
    # is cut off, possibly in the middle of a chunk.
    max_context_length = 4500
    context = "\n".join(top_chunks)[:max_context_length]

    # Enhanced prompt for more detailed responses
    prompt = f"""Based on the provided context, give a comprehensive and detailed answer to the question.
    Explain the concepts clearly and provide specific details from the context.

Context:
{context}

Question: {query}

Provide a detailed answer with explanations:"""

    # Greedy decoding (do_sample=False). `temperature` is deliberately not
    # passed: it only applies when sampling, and combining it with
    # do_sample=False just makes transformers emit a warning.
    response = qa_pipeline(
        prompt,
        max_new_tokens=500,  # Increased from 300
        do_sample=False,
    )[0]['generated_text']

    return format_answer(response.strip())

print("🚀 RAG System Ready! Ask your questions about the uploaded documents.")
print("=" * 100)

# Interactive question/answer loop: runs until the user types 'exit' or the
# input stream ends / the cell is interrupted.
while True:
    try:
        # Strip so that ' exit ' or a trailing newline artefact still matches.
        user_question = input("\n💭 Ask your question (or type 'exit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupted cell: leave cleanly, no traceback.
        print("\n👋 Exiting the QA system. Goodbye!")
        break

    if user_question.lower() == 'exit':
        print("\n👋 Exiting the QA system. Goodbye!")
        break

    # Don't spend a retrieval + generation round on blank input.
    if not user_question:
        continue

    print(f"\n🔍 Processing your question: '{user_question}'")
    print("-" * 80)

    # Top-level boundary: report any failure and keep the session alive.
    try:
        answer = answer_query(user_question)
        print("\n🧠 Detailed Answer:")
        print("-" * 50)
        print(answer)
        print("\n" + "=" * 100)
    except Exception as e:
        print(f"\n❌ Error generating answer: {str(e)}")
        print("Please try rephrasing your question.")
        print("=" * 100)

Total chunks created: 430


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
