In [1]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain_community.llms import Ollama
from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
from langchain import hub
import trafilatura

# For SOTA topic extraction using sentence transformers
from sentence_transformers import SentenceTransformer, util
import torch

# Predefined list of webpages (format: {topic: [urls]})
PREDEFINED_URLS = {
    "AI safety": ["https://www.deepmind.com/about/safety-and-security",  #scrapping blocked on this shit
                  "https://openai.com/safety"],
    "quantum computing": ["https://qiskit.org/learn/", #ts works
                           "https://quantum.microsoft.com/"]
}
# List of topics for similarity comparisons
PREDEFINED_TOPICS = list(PREDEFINED_URLS.keys())

# Initialize the SOTA embedding model for topic extraction
topic_extractor = SentenceTransformer('all-MiniLM-L6-v2')

def extract_topic(query: str) -> str:
    """
    Extract the most semantically similar topic from the predefined topics.
    Returns the topic if the similarity score exceeds a threshold, else None.
    """
    query_embedding = topic_extractor.encode(query, convert_to_tensor=True)
    topics_embeddings = topic_extractor.encode(PREDEFINED_TOPICS, convert_to_tensor=True)
    cosine_scores = util.cos_sim(query_embedding, topics_embeddings)
    best_score, best_index = torch.max(cosine_scores, dim=1)
    # Set a threshold (e.g., 0.6) for considering the topic as a match
    if best_score.item() >= 0.6:
        return PREDEFINED_TOPICS[best_index.item()]
    else:
        return None

def web_search(query):
    """Fallback web search using DuckDuckGo"""
    search = DuckDuckGoSearchAPIWrapper()
    return search.results(query, 3)  # Get top 3 results

from typing import List
from langchain.docstore.document import Document

def process_urls(urls: List[str]) -> List[Document]:
    """Process URLs into document chunks with error handling"""
    documents = []
    for url in urls:
        try:
            downloaded = trafilatura.fetch_url(url)
            text = trafilatura.extract(downloaded)
            if text:
                documents.append(Document(
                    page_content=text,
                    metadata={"source": url}
                ))
                print(f"Successfully processed: {url}")
            else:
                print(f"No content found at: {url}")
        except Exception as e:
            print(f"Error processing {url}: {str(e)}")
    
    if not documents:
        raise ValueError("No valid documents found from the provided URLs")
    
    # Split documents into manageable chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=200
    )
    return text_splitter.split_documents(documents)

# Initialize LLM (Make sure Ollama is running)
llm = Ollama(model="llama3.2:3b")

def answer_with_context(question: str, context_docs: List[Document]):
    """Generate answer using context documents"""
    if not context_docs:
        return "No relevant information found to answer this question."
    
    # Create retrieval chain
    embeddings = OllamaEmbeddings(model="llama3.2:3b")
    try:
        vectorstore = FAISS.from_documents(context_docs, embeddings)
    except Exception as e:
        print(f"Error creating vector store: {str(e)}")
        return "Failed to process documents for answering."
    
    retriever = vectorstore.as_retriever(k=3)
    
    # Create QA chain
    qa_prompt = hub.pull("rlm/rag-prompt")
    qa_chain = LLMChain(llm=llm, prompt=qa_prompt)
    combine_docs_chain = StuffDocumentsChain(
        llm_chain=qa_chain, document_variable_name="context"
    )
    
    try:
        result = combine_docs_chain.invoke({
            "input_documents": context_docs,
            "question": question
        })
        return result["output_text"]
    except Exception as e:
        print(f"Error generating answer: {str(e)}")
        return "Failed to generate answer."

# --- Caching Strategy Implementation ---

# Cache for topic-based FAISS vector stores
VECTOR_STORE_CACHE = {}

def get_vector_store_for_topic(topic: str, urls: List[str]):
    """
    Retrieve or create a FAISS vector store for the given topic.
    Uses caching to avoid re-vectorizing documents for repeated topics.
    """
    if topic in VECTOR_STORE_CACHE:
        print(f"Using cached vector store for topic: {topic}")
        return VECTOR_STORE_CACHE[topic]
    
    # Process URLs and create a new vector store
    docs = process_urls(urls)
    embeddings = OllamaEmbeddings(model="llama3.2:3b")
    
    try:
        vectorstore = FAISS.from_documents(docs, embeddings)
        VECTOR_STORE_CACHE[topic] = vectorstore  # Cache it for future use
        return vectorstore
    except Exception as e:
        print(f"Error creating vector store: {str(e)}")
        return None

def answer_question(question: str) -> str:
    try:
        # Use SOTA topic extraction to check for predefined topics
        topic = extract_topic(question)
        
        if topic:
            print(f"Extracted topic: {topic}")
            urls = PREDEFINED_URLS[topic]
            vectorstore = get_vector_store_for_topic(topic, urls)
            
            if vectorstore:
                retriever = vectorstore.as_retriever(k=3)
                context_docs = retriever.get_relevant_documents(question)
                return answer_with_context(question, context_docs)
            else:
                return "Failed to retrieve information for the topic."
        else:
            # Optionally, you can perform a web search if the topic is not supported
            print("Query topic is not supported. Skipping response.")
            return "The query topic is not supported as it doesn't match any predefined topics."
    
    except Exception as e:
        print(f"Unexpected error: {str(e)}")
        return "An error occurred while processing your request."


# First query on AI safety (this will create and cache the vector store)
question1 = "Tell me about quantum computing"
print("Answer 1:", answer_question(question1))



USER_AGENT environment variable not set, consider setting it to identify your requests.
  from .autonotebook import tqdm as notebook_tqdm
  llm = Ollama(model="llama3.2:3b")


Extracted topic: quantum computing
Successfully processed: https://qiskit.org/learn/
Successfully processed: https://quantum.microsoft.com/


  embeddings = OllamaEmbeddings(model="llama3.2:3b")
  context_docs = retriever.get_relevant_documents(question)
  qa_chain = LLMChain(llm=llm, prompt=qa_prompt)
  combine_docs_chain = StuffDocumentsChain(


Answer 1: Quantum computing is a type of computing that uses quantum mechanics to perform calculations, leveraging key concepts such as superposition, entanglement, and interference to solve complex problems. It has various applications, including cryptography, optimization, and machine learning, with the goal of enabling breakthroughs in fields like chemistry, materials science, and pharmaceutical research. Quantum computers have the potential to dramatically reduce noise effects and enable highly reliable logical qubits.


In [2]:
# Second query on the same topic (this will reuse the cached vector store)
question2 = "what are qubits in quantum computing?"
print("Answer 2:", answer_question(question2))


Extracted topic: quantum computing
Using cached vector store for topic: quantum computing




Answer 2: Qubits are the fundamental units of quantum computing, representing both 0 and 1 values simultaneously. They can exist in multiple states at once, known as a superposition, allowing for exponentially more processing power than classical computers. This property enables quantum computers to solve complex problems efficiently.


In [3]:

# Query on an unsupported topic (no processing or caching occurs)
question3 = "What are the latest trends in fusion energy?"
print("Answer 3:", answer_question(question3))

Query topic is not supported. Skipping response.
Answer 3: The query topic is not supported as it doesn't match any predefined topics.


# Google Scholar Update

In [None]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain_community.llms import Ollama
from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
from langchain import hub
import trafilatura

# For SOTA topic extraction using sentence transformers
from sentence_transformers import SentenceTransformer, util
import torch

# For our custom Google Scholar loader
from scholarly import scholarly

# Predefined list of webpages (format: {topic: [urls]})
PREDEFINED_URLS = {
    "AI safety": [
        "https://www.deepmind.com/about/safety-and-security",
        "https://openai.com/safety"
    ],
    "quantum computing": [
        "https://qiskit.org/learn/",
        "https://quantum.microsoft.com/"
    ]
}
# List of topics for similarity comparisons
PREDEFINED_TOPICS = list(PREDEFINED_URLS.keys())

# Initialize the SOTA embedding model for topic extraction
topic_extractor = SentenceTransformer('all-MiniLM-L6-v2')

def extract_topic(query: str) -> str:
    """
    Extract the most semantically similar topic from the predefined topics.
    Returns the topic if the similarity score exceeds a threshold, else None.
    """
    query_embedding = topic_extractor.encode(query, convert_to_tensor=True)
    topics_embeddings = topic_extractor.encode(PREDEFINED_TOPICS, convert_to_tensor=True)
    cosine_scores = util.cos_sim(query_embedding, topics_embeddings)
    best_score, best_index = torch.max(cosine_scores, dim=1)
    # Set a threshold (e.g., 0.6) for considering the topic as a match
    if best_score.item() >= 0.6:
        return PREDEFINED_TOPICS[best_index.item()]
    else:
        return None

def web_search(query):
    """Fallback web search using DuckDuckGo"""
    search = DuckDuckGoSearchAPIWrapper()
    return search.results(query, 3)  # Get top 3 results

from typing import List
from langchain.docstore.document import Document

def process_urls(urls: List[str]) -> List[Document]:
    """Process URLs into document chunks with error handling"""
    documents = []
    for url in urls:
        try:
            downloaded = trafilatura.fetch_url(url)
            text = trafilatura.extract(downloaded)
            if text:
                documents.append(Document(
                    page_content=text,
                    metadata={"source": url}
                ))
                print(f"Successfully processed: {url}")
            else:
                print(f"No content found at: {url}")
        except Exception as e:
            print(f"Error processing {url}: {str(e)}")
    
    if not documents:
        raise ValueError("No valid documents found from the provided URLs")
    
    # Split documents into manageable chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    return text_splitter.split_documents(documents)

def process_google_scholar(topic: str) -> List[Document]:
    """
    Use the scholarly package to fetch scholarly documents on the topic,
    then create Document objects.
    """
    documents = []
    try:
        search_query = scholarly.search_pubs(topic)
        # Limit to a few results (e.g., top 3)
        for i, result in enumerate(search_query):
            if i >= 3:
                break
            bib = result.get('bib', {})
            title = bib.get('title', 'No Title')
            abstract = bib.get('abstract', 'No Abstract Available')
            content = f"Title: {title}\nAbstract: {abstract}"
            documents.append(Document(
                page_content=content,
                metadata={"source": "Google Scholar"}
            ))
            print(f"Fetched Google Scholar result: {title}")
        if not documents:
            print("No documents found via Google Scholar.")
        return documents
    except Exception as e:
        print(f"Error processing Google Scholar data: {str(e)}")
        return []

# Initialize LLM (Make sure Ollama is running)
llm = Ollama(model="llama3.2:3b")

def answer_with_context(question: str, context_docs: List[Document]):
    """Generate answer using context documents"""
    if not context_docs:
        return "No relevant information found to answer this question."
    
    # Create retrieval chain using the provided context
    embeddings = OllamaEmbeddings(model="llama3.2:3b")
    try:
        vectorstore = FAISS.from_documents(context_docs, embeddings)
    except Exception as e:
        print(f"Error creating vector store: {str(e)}")
        return "Failed to process documents for answering."
    
    retriever = vectorstore.as_retriever(k=3)
    
    # Create QA chain using a predefined prompt
    qa_prompt = hub.pull("rlm/rag-prompt")
    qa_chain = LLMChain(llm=llm, prompt=qa_prompt)
    combine_docs_chain = StuffDocumentsChain(
        llm_chain=qa_chain, document_variable_name="context"
    )
    
    try:
        result = combine_docs_chain.invoke({
            "input_documents": context_docs,
            "question": question
        })
        return result["output_text"]
    except Exception as e:
        print(f"Error generating answer: {str(e)}")
        return "Failed to generate answer."

# --- Caching Strategy Implementation ---
# Cache for topic-based FAISS vector stores
VECTOR_STORE_CACHE = {}

def get_vector_store_for_topic(topic: str, urls: List[str]):
    """
    Retrieve or create a FAISS vector store for the given topic.
    This function uses caching and downloads data from both predefined URLs
    and Google Scholar.
    """
    if topic in VECTOR_STORE_CACHE:
        print(f"Using cached vector store for topic: {topic}")
        return VECTOR_STORE_CACHE[topic]
    
    # Process documents from predefined URLs
    docs_web = process_urls(urls)
    # Process scholarly documents via our custom Google Scholar loader
    docs_scholar = process_google_scholar(topic)
    
    # Combine documents from both sources
    all_docs = docs_web + docs_scholar
    embeddings = OllamaEmbeddings(model="llama3.2:3b")
    
    try:
        vectorstore = FAISS.from_documents(all_docs, embeddings)
        VECTOR_STORE_CACHE[topic] = vectorstore  # Cache for future use
        return vectorstore
    except Exception as e:
        print(f"Error creating vector store: {str(e)}")
        return None

def answer_question(question: str) -> str:
    try:
        # Use SOTA topic extraction to check for a predefined topic
        topic = extract_topic(question)
        
        if topic:
            print(f"Extracted topic: {topic}")
            urls = PREDEFINED_URLS[topic]
            vectorstore = get_vector_store_for_topic(topic, urls)
            
            if vectorstore:
                retriever = vectorstore.as_retriever(k=3)
                context_docs = retriever.get_relevant_documents(question)
                return answer_with_context(question, context_docs)
            else:
                return "Failed to retrieve information for the topic."
        else:
            # If the topic is unsupported, do not process further.
            print("Query topic is not supported. Skipping response.")
            return "The query topic is not supported as it doesn't match any predefined topics."
    
    except Exception as e:
        print(f"Unexpected error: {str(e)}")
        return "An error occurred while processing your request."

# --- Example usage ---

# First query on AI safety (this will create and cache the vector store)
question1 = "Tell me about quantum computing"
print("Answer 1:", answer_question(question1))

In [None]:
# Second query on the same topic (this will reuse the cached vector store)
question2 = "what are qubits in quantum computing?"
print(answer_question(question2))



In [None]:
# Query on an unsupported topic (no processing or caching occurs)
question3 = "What are the latest trends in fusion energy?"
print(answer_question(question3))

In [None]:
# Query on an unsupported topic (no processing or caching occurs)
question3 = "What are quantum gates"
print(answer_question(question3))