<a href="https://colab.research.google.com/github/ShivEla/LibriQuery/blob/main/LibriQuery_RAG_Book_Q%26A_System_(LangChain_FAISS_Gemini).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install langchain-google-genai langchain-community python-dotenv requests



In [None]:

# Install necessary libraries in Colab
!pip install langchain-google-genai langchain-community python-dotenv requests faiss-cpu

import requests
import json
import os
import time # For adding a small delay between API calls
from dotenv import load_dotenv

# LangChain Imports for Gemini
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_community.vectorstores import FAISS
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain_core.documents import Document # For creating Document objects for FAISS

# Load environment variables from .env file (for GEMINI_API_KEY)
load_dotenv()

# --- Configuration ---
OPEN_LIBRARY_API_URL_SEARCH = "https://openlibrary.org/search.json"
OPEN_LIBRARY_API_URL_WORKS = "https://openlibrary.org/works/" # For detailed work data
OPEN_LIBRARY_API_URL_AUTHORS = "https://openlibrary.org/authors/" # For author data
OPEN_LIBRARY_API_URL_BOOKS_BY_BIBKEYS = "https://openlibrary.org/api/books" # For ISBN lookup
OPEN_LIBRARY_API_URL_COVERS = "https://covers.openlibrary.org/b/" # For cover images

# Gemini API Key - used for both embeddings and the ChatGoogleGenerativeAI LLM.
# SECURITY: the key is read from the environment and must never be hard-coded in
# this file. A key that was previously committed here is public and must be
# revoked and regenerated.
# Set GEMINI_API_KEY in your environment variables or in a .env file (picked up by
# the load_dotenv() call above); in Colab, assign os.environ["GEMINI_API_KEY"] in a
# private cell before running this one.
# Example .env entry: GEMINI_API_KEY="AIzaSy..."
# An unset variable yields an empty string so the name is always a str.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "").strip()


# --- Helper Functions for Open Library Data Retrieval ---

def make_api_request(url: str, params: dict = None, headers: dict = None, timeout: float = 10.0) -> dict:
    """
    Helper function to make HTTP GET requests and handle common errors.

    Args:
        url: Absolute URL to request.
        params: Optional query-string parameters.
        headers: Optional HTTP headers; an empty dict is sent when omitted.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout `requests` can block forever on an unresponsive host.

    Returns:
        The decoded JSON object, or an empty dict when the request fails, the
        body is not valid JSON, or the JSON payload is not an object. Errors
        are printed rather than raised so callers can treat `{}` as "no data".
    """
    try:
        response = requests.get(url, params=params, headers=headers or {}, timeout=timeout)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
        payload = response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error making API request to {url}: {e}")
        return {}
    except ValueError as e:
        # Older `requests` releases raise a plain ValueError (not a
        # RequestException) when the body is not valid JSON.
        print(f"Error decoding JSON response from {url}: {e}")
        return {}

    # Every caller uses dict methods on the result, so never hand back a list
    # or scalar even if the endpoint returns one.
    if not isinstance(payload, dict):
        print(f"Unexpected non-object JSON response from {url}.")
        return {}
    return payload

def fetch_open_library_data_for_context(query: str, limit: int = 5) -> list:
    """
    Run a search against the Open Library Search API and return its hits.

    Args:
        query: Free-text search string sent as the `q` parameter.
        limit: Maximum number of results requested from the API.

    Returns:
        The list stored under "docs" in the response, or an empty list when
        the response has no such key (including when the request failed).
    """
    print(f"  Searching Open Library for '{query}' as initial context...")
    search_response = make_api_request(
        OPEN_LIBRARY_API_URL_SEARCH,
        params={"q": query, "limit": limit},
    )
    return search_response.get("docs", [])

def fetch_detailed_work_data(work_olid: str) -> dict:
    """
    Look up a work via the /works/{OLID}.json API and extract selected fields.

    Args:
        work_olid: Open Library work identifier (without the "/works/" prefix).

    Returns:
        A dict with three keys:
        'description' (the work's description text, or a placeholder sentence),
        'subjects' (the work's subject list, or an empty list) and
        'authors_ol_ids' (author OLIDs taken from the last path segment of
        each author key).
    """
    work = make_api_request(f"{OPEN_LIBRARY_API_URL_WORKS}{work_olid}.json")

    # The description arrives either as a plain string or as a
    # {'value': ...} mapping; anything else keeps the placeholder.
    raw_description = work.get('description')
    if isinstance(raw_description, dict) and 'value' in raw_description:
        description_text = raw_description['value']
    elif isinstance(raw_description, str):
        description_text = raw_description
    else:
        description_text = 'No detailed description available.'

    raw_subjects = work.get('subjects')
    subject_list = raw_subjects if raw_subjects and isinstance(raw_subjects, list) else []

    author_ids = []
    raw_authors = work.get('authors')
    if raw_authors and isinstance(raw_authors, list):
        author_ids = [
            entry['author']['key'].split('/')[-1]
            for entry in raw_authors
            if 'author' in entry and 'key' in entry['author']
        ]

    return {
        'description': description_text,
        'subjects': subject_list,
        'authors_ol_ids': author_ids,  # To store author OLIDs
    }

def fetch_author_details(author_olid: str) -> dict:
    """
    Look up an author via the /authors/{OLID}.json API.

    Args:
        author_olid: Open Library author identifier.

    Returns:
        A dict with 'name', 'bio', 'birth_date', 'death_date' and
        'author_subjects'. Missing text fields fall back to 'N/A' (or a
        placeholder sentence for the biography); missing subjects fall back
        to an empty list.
    """
    author = make_api_request(f"{OPEN_LIBRARY_API_URL_AUTHORS}{author_olid}.json")

    # The biography is either a plain string or a {'value': ...} mapping.
    raw_bio = author.get('bio')
    if isinstance(raw_bio, dict) and 'value' in raw_bio:
        bio_text = raw_bio['value']
    elif isinstance(raw_bio, str):
        bio_text = raw_bio
    else:
        bio_text = 'No biography available.'

    return {
        'name': author.get('name', 'N/A'),
        'bio': bio_text,
        'birth_date': author.get('birth_date', 'N/A'),
        'death_date': author.get('death_date', 'N/A'),
        'author_subjects': author.get('subjects', []),  # Subjects associated with the author
    }

def fetch_book_by_isbn(isbn: str) -> dict:
    """
    Look up edition details via the /api/books?bibkeys=ISBN:{ISBN} API.

    Args:
        isbn: ISBN used to build the `ISBN:{isbn}` bibkey.

    Returns:
        A dict with 'isbn_title', 'publish_date', 'number_of_pages',
        'publishers' and 'isbn_authors', or an empty dict when the response
        has no entry for the requested bibkey.
    """
    bibkey = f"ISBN:{isbn}"
    response = make_api_request(
        OPEN_LIBRARY_API_URL_BOOKS_BY_BIBKEYS,
        params={"bibkeys": bibkey, "format": "json", "jscmd": "data"},
    )
    if not response or bibkey not in response:
        return {}

    record = response[bibkey]

    # Keep only entries that actually carry a non-empty name.
    publisher_names = []
    for publisher in record.get('publishers', []):
        publisher_name = publisher.get('name')
        if publisher_name:
            publisher_names.append(publisher_name)

    author_names = []
    for author in record.get('authors', []):
        author_name = author.get('name')
        if author_name:
            author_names.append(author_name)

    return {
        'isbn_title': record.get('title', 'N/A'),
        'publish_date': record.get('publish_date', 'N/A'),
        'number_of_pages': record.get('number_of_pages', 'N/A'),
        'publishers': publisher_names,
        'isbn_authors': author_names,
    }

def get_cover_url(olid: str = None, isbn: str = None, size: str = 'M') -> str:
    """
    Build a cover image URL from an OLID or an ISBN.

    Args:
        olid: Open Library ID; takes priority when both identifiers are given.
        isbn: ISBN used only when no OLID is supplied.
        size: 'S' (small), 'M' (medium) or 'L' (large).

    Returns:
        The cover URL, or the text "No cover available." when neither
        identifier is provided.
    """
    if not (olid or isbn):
        return "No cover available."

    if olid:
        id_kind, identifier = "olid", olid
    else:
        id_kind, identifier = "isbn", isbn
    return f"{OPEN_LIBRARY_API_URL_COVERS}{id_kind}/{identifier}-{size}.jpg"

# --- RAG Core Functions (using LangChain with Gemini) ---

def _as_text_list(value) -> list:
    """Flatten a string, {'value': ...} mapping, or (nested) sequence into a list of non-empty strings."""
    if value is None:
        return []
    if isinstance(value, str):
        return [value] if value.strip() else []
    if isinstance(value, dict):
        return _as_text_list(value.get('value'))
    if isinstance(value, (list, tuple, set)):
        flattened = []
        for item in value:
            flattened.extend(_as_text_list(item))
        return flattened
    return [str(value)]


def _joined_field(book: dict, keys: tuple, default: str = 'N/A') -> str:
    """Return the comma-joined text of the first key in `keys` that has any content, else `default`."""
    for key in keys:
        values = _as_text_list(book.get(key))
        if values:
            return ', '.join(values)
    return default


def create_faiss_index(books_data: list, embeddings_model: GoogleGenerativeAIEmbeddings) -> FAISS:
    """
    Creates a FAISS vector store from enriched book data.

    Each book dict becomes one Document whose page_content is a labelled,
    newline-separated summary (title, author, description, genres, publication
    data, author biography and dates) and whose metadata carries the title,
    author and work OLID for source attribution.

    Missing, None, empty, or non-string field values are replaced by the same
    placeholders used for absent keys, so one malformed record cannot abort
    index creation.

    Args:
        books_data: List of book dicts (search results, optionally enriched).
        embeddings_model: Embedding model passed to FAISS.from_documents.

    Returns:
        The FAISS vector store built from the generated documents.

    Raises:
        ValueError: If `books_data` yields no documents.
    """
    documents = []
    for book in books_data:
        authors_text = _joined_field(book, ('author_name',))

        # Prefer the description; fall back to the first sentence only when the
        # description is absent or empty. The fallback is computed lazily so an
        # empty `first_sentence` list cannot raise.
        description_candidates = (
            _as_text_list(book.get('description'))
            or _as_text_list(book.get('first_sentence'))
        )
        description_text = (
            description_candidates[0] if description_candidates else 'No description available.'
        )

        # Combine relevant information into page_content for embedding
        content = "\n".join([
            f"Title: {book.get('title') or 'N/A'}",
            f"Author: {authors_text}",
            f"Description: {description_text}",
            # An empty 'subjects' list falls through to the 'subject' key.
            f"Genres: {_joined_field(book, ('subjects', 'subject'))}",
            f"Publish Date: {_joined_field(book, ('publish_date',))}",
            f"Number of Pages: {book.get('number_of_pages', 'N/A')}",
            f"Publishers: {_joined_field(book, ('publishers',))}",
            f"Author Bio: {book.get('author_bio') or 'No author biography available.'}",
            f"Author Birth Date: {book.get('author_birth_date') or 'N/A'}",
            f"Author Death Date: {book.get('author_death_date') or 'N/A'}",
        ])

        # Store useful metadata for source attribution
        metadata = {
            "title": book.get('title') or 'N/A',
            "author": authors_text,
            "olid": (book.get('key') or '').replace('/works/', '')  # Use work OLID as a stable ID
        }
        documents.append(Document(page_content=content, metadata=metadata))

    if not documents:
        raise ValueError("No documents generated to create FAISS index.")

    print("  Creating FAISS vector store...")
    vectorstore = FAISS.from_documents(documents, embeddings_model)
    print("  FAISS vector store created.")
    return vectorstore

# Leading phrases the LLM sometimes keeps despite the prompt's instructions.
_SEARCH_TERM_NOISE_PREFIXES = (
    "plot of", "storyline of", "summary of", "rating of", "review of", "what is the", "tell me about",
)


def _strip_noise_prefixes(term: str) -> str:
    """Remove leading noise phrases (case-insensitively, whole words only) while keeping the term's case."""
    term = term.strip()
    stripped_something = True
    while term and stripped_something:
        stripped_something = False
        lowered = term.lower()
        for phrase in _SEARCH_TERM_NOISE_PREFIXES:
            if not lowered.startswith(phrase):
                continue
            remainder = term[len(phrase):]
            # Require a word boundary so "review official" is not cut mid-word.
            if remainder and not remainder[0].isspace():
                continue
            term = remainder.strip()
            stripped_something = True
            break
    return term


def _fallback_search_term(question: str) -> str:
    """Derive a search term from the raw question without the LLM; returns 'book' when nothing usable is found."""
    question_lower = question.lower()
    for marker in ("plot of", "storyline of"):
        if marker in question_lower:
            candidate = question_lower.split(marker, 1)[1].replace("?", "").replace(".", "").strip()
            if candidate:
                return candidate
    return "book"


def extract_search_terms_from_question(question: str, llm_model: "ChatGoogleGenerativeAI") -> str:
    """
    Uses LLM to extract relevant search terms from a user's question.
    Now uses LangChain's ChatGoogleGenerativeAI.

    Args:
        question: The user's raw question.
        llm_model: Chat model; only `invoke(prompt)` and the `.content` string
            of its result are used.

    Returns:
        A short search phrase for Open Library. Leading filler phrases such as
        "plot of" are removed from the LLM's answer. If the LLM call fails or
        yields nothing usable, the text after "plot of" / "storyline of" in
        the question is used, and finally the literal 'book'.
    """
    prompt = f"""From the following user question, identify the primary book title, author name, or series name that would be most effective for searching Open Library. If the question is about a general topic or genre, extract that.
    Return only the most relevant single keyword or a short phrase for Open Library search. Do NOT include words like 'plot', 'storyline', 'summary', 'rating', 'review', 'what is the', 'tell me about' in the extracted terms.
    If no specific book/author/series is mentioned, return a relevant genre or 'book'.

    User Question: "{question}"

    Search Term:"""

    try:
        # Using LLM directly for extraction
        response = llm_model.invoke(prompt)
        raw_term = response.content.strip()
    except Exception as e:
        # Deliberate best-effort boundary: any LLM/client failure degrades to
        # simple parsing instead of aborting the session.
        print(f"Error extracting search terms with LLM: {e}. Falling back to simple parsing.")
        return _fallback_search_term(question)

    # Basic cleanup for common phrases that LLM might still include
    extracted_term = _strip_noise_prefixes(raw_term)
    return extracted_term or _fallback_search_term(question)


# Define custom prompt templates for the ConversationalRetrievalChain
# The template restricts the model to the supplied context and tells it to
# answer "I don't have specific information about that." when the context
# lacks the answer. {context}, {question} and {chat_history} are placeholders
# that are filled in when the prompt is formatted.
qa_prompt_template = """
You are a helpful chatbot that answers questions about books.
Use ONLY the following pieces of context to answer the question at the end.
If the information is not explicitly mentioned in the context, say "I don't have specific information about that."
Do not make up or infer information that is not directly stated in the context.

Context:
{context}

Question: {question}

Chat History:
{chat_history}

Answer:
"""

# Prompt object handed to the chain in main() via combine_docs_chain_kwargs.
# input_variables lists the same three placeholder names used in the template.
QA_PROMPT = PromptTemplate(
    template=qa_prompt_template,
    input_variables=["context", "question", "chat_history"]
)

# --- Main Application Logic ---

def _enrich_book_with_details(book: dict) -> dict:
    """Merge work, first-author, ISBN and cover details from Open Library into `book` in place and return it."""
    # Fetch detailed work data
    work_olid = (book.get('key') or '').replace('/works/', '')
    if work_olid:
        book.update(fetch_detailed_work_data(work_olid))
        time.sleep(0.05) # Small delay

    # Fetch author details if OLID is available from work data
    author_ol_ids = book.get('authors_ol_ids') or []
    if author_ol_ids:
        # For simplicity, just fetch details for the first author found
        author_details = fetch_author_details(author_ol_ids[0])
        book['author_bio'] = author_details.get('bio')
        book['author_birth_date'] = author_details.get('birth_date')
        book['author_death_date'] = author_details.get('death_date')
        book['author_subjects'] = author_details.get('author_subjects') # Add author's subjects
        time.sleep(0.05) # Small delay

    # Fetch ISBN-specific data if ISBNs are available
    isbns = book.get('isbn') or []
    if isbns:
        # Use the first ISBN found for detailed lookup
        book.update(fetch_book_by_isbn(isbns[0])) # Merge ISBN-specific data
        time.sleep(0.05) # Small delay

    # Construct cover URL; `or ''` guards against the key being present but None.
    cover_olid = (book.get('cover_edition_key') or '').replace('/books/', '') or book.get('olid')
    book['cover_url'] = get_cover_url(olid=cover_olid, isbn=isbns[0] if isbns else None)
    return book


def main():
    """
    Run the interactive command-line Q&A loop.

    For every question the loop: extracts a search term with the LLM, searches
    Open Library, enriches each hit with work/author/ISBN details, builds a
    fresh FAISS index over the enriched records, and answers through a
    ConversationalRetrievalChain. The conversation memory is created once and
    shared by every chain, so chat history persists across questions while the
    retrieval context is rebuilt each time.

    The loop ends when the user types 'exit' or closes the input stream
    (Ctrl-D / Ctrl-C). Errors while enriching, indexing or answering are
    printed and the loop continues with the next question.
    """
    print("--- LibriQuery: Command-Line RAG Book Q&A System ---")
    print("Ask any question about books, and I'll try to answer based on Open Library data.")
    print("Type 'exit' to quit.")

    if not GEMINI_API_KEY:
        # Fail early with a clear message instead of an opaque client error later.
        print("GEMINI_API_KEY is not set. Please set it in your environment or .env file and try again.")
        return

    # Initialize LangChain components once; only the retriever/chain is rebuilt per question.
    # Changed model to gemini-1.5-flash for broader availability
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GEMINI_API_KEY)
    llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0.7, google_api_key=GEMINI_API_KEY)
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True,
        output_key="answer"
    )

    while True:
        try:
            user_question = input("\nYour Question (or 'exit' to quit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C is treated like typing 'exit'.
            print("\nExiting Q&A session. Goodbye!")
            break
        if user_question.lower() == 'exit':
            print("Exiting Q&A session. Goodbye!")
            break
        if not user_question:
            print("Please enter a question.")
            continue

        print("Understanding your question and searching for context...")

        # Step 1: Extract search terms from the user's question using LLM
        search_terms = extract_search_terms_from_question(user_question, llm)
        print(f"Identified search terms: '{search_terms}'")

        # Step 2: Fetch initial book data from Open Library based on extracted terms
        initial_search_results = fetch_open_library_data_for_context(search_terms)

        if not initial_search_results:
            print("No books found matching your query for context. Please try a different question.")
            continue

        # Steps 3-5 share one error boundary so that a malformed record or an
        # API failure ends only this question, not the whole session.
        try:
            # Step 3: Enrich book data with more details from various Open Library APIs
            print("  Enriching book data with more details from multiple Open Library APIs...")
            books_for_qa_context = []
            for i, book in enumerate(initial_search_results):
                print(f"    Enriching book {i+1}/{len(initial_search_results)}: {book.get('title', 'N/A')}")
                books_for_qa_context.append(_enrich_book_with_details(book))

            # Step 4: Create FAISS index and RAG chain from the enriched data
            vectorstore = create_faiss_index(books_for_qa_context, embeddings)
            retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})

            # Re-initialize conversation chain with the new retriever
            conversation_chain = ConversationalRetrievalChain.from_llm(
                llm=llm,
                retriever=retriever,
                memory=memory,
                combine_docs_chain_kwargs={"prompt": QA_PROMPT},
                return_source_documents=True
            )
            print("  RAG chain initialized with new context.")

            # Step 5: Answer the user's question using the RAG chain
            print("Generating answer using RAG chain...")
            # `invoke` replaces the deprecated `chain({...})` call form.
            result = conversation_chain.invoke({"question": user_question})
            answer = result["answer"]
            source_documents = result["source_documents"]

            print("\nAnswer:", answer)
            print("\nSources:")
            if source_documents:
                for i, doc in enumerate(source_documents[:3]): # Show only top 3 sources
                    # Displaying metadata for context
                    source_title = doc.metadata.get('title', 'Unknown Title')
                    source_author = doc.metadata.get('author', 'Unknown Author')
                    print(f"Source {i + 1}: Title: '{source_title}', Author: '{source_author}'")
            else:
                print("No specific sources found in the retrieved context.")

        except ValueError as e:
            print(f"Error during RAG process: {e}")
            print("Please ensure your Gemini API key is correct and try a different query.")
        except Exception as e:
            print(f"An unexpected error occurred during RAG process: {e}")
            print("Please try again.")

# Start the interactive session only when this file is executed as the main
# module (as a script or notebook cell), not when it is imported.
if __name__ == "__main__":
    main()


Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m60.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0
--- LibriQuery: Command-Line RAG Book Q&A System ---
Ask any question about books, and I'll try to answer based on Open Library data.
Type 'exit' to quit.

Your Question (or 'exit' to quit): chetan bhagat
Understanding your question and searching for context...
Identified search terms: 'Chetan Bhagat'
  Searching Open Library for 'Chetan Bhagat' as initial context...
  Enriching book data with more details from multiple Open Library APIs...
    Enriching book 1/5: Half Girlfriend Chetan Bhagat
    Enriching book 2/5: three mistakes of my life by chetan bhagat
    Enriching book 3/5: 2 states
    Enrich

  result = conversation_chain({"question": user_question})



Answer: Chetan Bhagat is the author of *Half Girlfriend* and *2 States*.  His birth date is 1974.  I don't have specific information about his death date or a biography.

Sources:
Source 1: Title: 'three mistakes of my life by chetan bhagat', Author: 'N/A'
Source 2: Title: 'Half Girlfriend Chetan Bhagat', Author: 'Chetan Bhagat'
Source 3: Title: '2 states', Author: 'Chetan Bhagat'
