In [14]:
import os
import wikipedia
from serpapi import GoogleSearch
from duckduckgo_search import DDGS
import json

def search_serpapi(query, num_results=5):
    """Search Google through SerpApi and return normalized result dicts.

    Args:
        query: The search string sent as the ``q`` parameter.
        num_results: Maximum number of organic results to return.

    Returns:
        A list of dicts with the keys ``title``, ``link`` and ``snippet``
        (values are ``None`` when SerpApi omits a field). An empty list is
        returned when the API key is missing, when SerpApi reports an error,
        or when the response has no organic results.
    """
    print("Trying SerpApi")
    api_key = os.getenv("SERPAPI_API_KEY")
    if not api_key:
        # No credentials: skip the request instead of sending api_key=None.
        print("SERPAPI_API_KEY is not set; skipping SerpApi.")
        return []

    params = {
        "engine": "google",
        "q": query,
        "api_key": api_key,
    }

    search = GoogleSearch(params)
    results = search.get_dict()

    # Surface an API-level failure instead of letting it look like
    # "no results"; the caller still gets [] and can fall back.
    error = results.get("error")
    if error:
        print(f"SerpApi returned an error: {error}")
        return []

    organic_results = results.get("organic_results", [])
    if not organic_results:
        return []

    return [
        {"title": item.get("title"), "link": item.get("link"), "snippet": item.get("snippet")}
        for item in organic_results[:num_results]
    ]


def search_ddg(query, num_results=5):
    """Run a DuckDuckGo text search and normalize the hits.

    Args:
        query: The search string.
        num_results: Passed to ``DDGS.text`` as ``max_results``.

    Returns:
        A list of dicts with the keys ``title``, ``link`` (taken from the
        hit's ``href``) and ``snippet`` (taken from ``body``); empty when
        the search yields nothing.

    Note:
        The progress message is printed on every call, regardless of
        whether SerpApi was actually tried first.
    """
    print("SerpApi failed. Trying DuckDuckGo")
    with DDGS() as client:
        raw_hits = list(client.text(query, max_results=num_results))

    normalized = []
    for hit in raw_hits:
        entry = {
            "title": hit.get("title"),
            "link": hit.get("href"),
            "snippet": hit.get("body"),
        }
        normalized.append(entry)
    return normalized


def search_wikipedia(query, num_results=3):
    """Search Wikipedia and return a short summary for each matching page.

    Args:
        query: The search string.
        num_results: Maximum number of page titles requested from the search.

    Returns:
        A list of dicts with the keys ``title``, ``link`` and ``snippet``
        (a two-sentence summary). Titles that are ambiguous or fail to load
        are skipped with a message. Any failure of the search itself is
        reported and results in an empty list.
    """
    print("DuckDuckGo failed. Trying Wikipedia...")
    try:
        page_titles = wikipedia.search(query, results=num_results)
        if not page_titles:
            return []

        formatted_results = []
        for title in page_titles:
            try:
                page = wikipedia.page(title, auto_suggest=False)
                # Use the same exact-title lookup as for the page above, so
                # the snippet cannot come from a different "suggested" article.
                summary = wikipedia.summary(title, sentences=2, auto_suggest=False)
                formatted_results.append({
                    "title": page.title,
                    "link": page.url,
                    "snippet": summary
                })
            except wikipedia.exceptions.DisambiguationError:
                print(f"'{title}' was ambiguous, skipping.")
            except Exception as e:
                print(f"Couldn't get page for '{title}': {e}")

        return formatted_results
    except Exception as e:
        print(f"Something went wrong with Wikipedia search: {e}")
        return []


def robust_web_search(query, num_results=5):
    """Try each search backend in turn and return the first non-empty result.

    The order is SerpApi, then DuckDuckGo, then Wikipedia. A backend that
    raises is reported and treated like one that returned nothing.

    Args:
        query: The search string handed to every backend.
        num_results: Result limit handed to every backend.

    Returns:
        The result list of the first backend that produced any results, or
        an empty list when none did.
    """
    # Lambdas defer the name lookup and the call into the try block below,
    # so any failure is handled per backend.
    backends = (
        ("SerpApi", lambda: search_serpapi(query, num_results)),
        ("DuckDuckGo", lambda: search_ddg(query, num_results)),
        ("Wikipedia", lambda: search_wikipedia(query, num_results)),
    )

    for label, run_backend in backends:
        try:
            hits = run_backend()
            if hits:
                print(f"Success with {label}!")
                return hits
        except Exception as err:
            print(f"{label} error: {err}")

    print("Sorry, all search methods failed.")
    return []


if __name__ == "__main__":
    # Demo: run the fallback search once and pretty-print whatever came back.
    search_query = "What are LSTMs ?"
    search_results = robust_web_search(search_query)

    # Nothing is printed here on failure; robust_web_search already reports it.
    if search_results:
        print("\nResults:", json.dumps(search_results, indent=2), sep="\n")

Trying SerpApi
Success with SerpApi!

Results:
[
  {
    "title": "Long short-term memory",
    "link": "https://en.wikipedia.org/wiki/Long_short-term_memory",
    "snippet": "The long short-term memory (LSTM) cell can process data sequentially and keep its hidden state through time. An LSTM unit is typically composed of a cell and ..."
  },
  {
    "title": "What is LSTM - Long Short Term Memory?",
    "link": "https://www.geeksforgeeks.org/deep-learning/deep-learning-introduction-to-long-short-term-memory/",
    "snippet": "Long Short-Term Memory (LSTM) is an enhanced version of the Recurrent Neural Network (RNN) designed by Hochreiter and Schmidhuber."
  },
  {
    "title": "What is LSTM? Introduction to Long Short-Term Memory",
    "link": "https://www.analyticsvidhya.com/blog/2021/03/introduction-to-long-short-term-memory-lstm/",
    "snippet": "LSTM (Long Short-Term Memory) is a recurrent neural network (RNN) architecture widely used in Deep Learning. It excels at capturing long-

In [15]:
import os
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_google_genai import GoogleGenerativeAI




# --- RAG Agent Code ---

def scrape_websites(urls):
    """Download each URL and return the visible text of all pages as one string.

    For every page the ``script``, ``style``, ``nav``, ``footer``, ``header``
    and ``aside`` elements are removed before the text is extracted. Each
    page's text is prefixed with a ``--- Content from <url> ---`` marker.

    Pages that cannot be fetched or parsed, and pages that yield no text at
    all, are skipped with a message, so the result is an empty string when
    nothing usable was scraped.

    Args:
        urls: Iterable of URL strings to fetch.

    Returns:
        The concatenated page sections, or ``""`` if no page produced text.
    """
    print("\nStarting to scrape websites")
    sections = []
    for url in urls:
        try:
            print(f"Scraping: {url}")
            response = requests.get(url, timeout=10, headers={'User-Agent': 'Mozilla/5.0'})
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Drop non-content elements before extracting text.
            for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
                element.decompose()

            text = soup.get_text(separator='\n', strip=True)
        except requests.RequestException as exc:
            print(f"Skipping {url}, couldn't fetch it: {exc}")
            continue
        except Exception as exc:
            print(f"Something else went wrong with {url}: {exc}")
            continue

        if not text:
            # A marker with no content would make the corpus look non-empty
            # to callers that check for an empty scrape.
            print(f"No readable text found at {url}, skipping.")
            continue

        sections.append(f"\n\n--- Content from {url} ---\n\n{text}")

    print("Finished scraping.")
    return "".join(sections)

def create_rag_agent(
    text_corpus,
    chunk_size=500,
    chunk_overlap=100,
    embedding_model="all-MiniLM-L6-v2",
    llm_model="gemini-1.5-flash",
    top_k=3,
):
    """Build a retrieval-augmented QA chain over the given text.

    The text is split into overlapping chunks, embedded with a HuggingFace
    model, indexed in FAISS, and wired to a Google Generative AI LLM through
    a "stuff" ``RetrievalQA`` chain that also returns its source documents.

    Args:
        text_corpus: The raw text to index.
        chunk_size: Maximum chunk length in characters.
        chunk_overlap: Number of characters shared by neighbouring chunks.
        embedding_model: Model name passed to ``HuggingFaceEmbeddings``.
        llm_model: Model name passed to ``GoogleGenerativeAI``.
        top_k: Number of chunks the retriever fetches per query.

    Returns:
        The ``RetrievalQA`` chain, or ``None`` when the text is empty,
        whitespace-only, or produces no chunks.
    """
    print("\nNext, building the RAG system")
    if not text_corpus or text_corpus.isspace():
        print("Text is empty, can't build the agent.")
        return None

    print("1. Splitting the text into chunks...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )
    chunks = text_splitter.split_text(text_corpus)
    print(f"Made {len(chunks)} chunks of text.")
    if not chunks:
        # Don't hand an empty list to the vector store.
        print("Text is empty, can't build the agent.")
        return None

    print("2. Loading the embedding model")
    embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

    print("3. Storing chunks in a vector database")
    vector_store = FAISS.from_texts(chunks, embeddings)
    print("Vector database is ready.")

    print("4. Setting up the LLM")
    # Load .env before constructing the LLM.
    # NOTE(review): presumably this supplies the Google API key; confirm.
    load_dotenv()
    llm = GoogleGenerativeAI(model=llm_model)

    print("5. Creating the final QA chain")
    rag_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_store.as_retriever(search_kwargs={"k": top_k}),
        return_source_documents=True
    )

    return rag_chain


if __name__ == "__main__":
    # End-to-end demo: search -> scrape the hits -> build the RAG chain -> ask.

    # Load .env before searching: search_serpapi reads SERPAPI_API_KEY from
    # the environment, and the only other load_dotenv() call runs later,
    # inside create_rag_agent.
    load_dotenv()

    my_topic = "What are LSTMs?"
    search_results = robust_web_search(my_topic, num_results=5)

    if not search_results:
        print("Couldn't get any search results.")
    else:
        # Results without a link cannot be scraped.
        urls_to_scrape = [res['link'] for res in search_results if res.get('link')]

        scraped_content = scrape_websites(urls_to_scrape)

        rag_agent = create_rag_agent(scraped_content)

        if rag_agent:
            print("\nOkay, let's ask the agent a question.")
            rag_question = "What are LSTMs and how do they work?"

            print(f"\nQuestion: {rag_question}\n")

            # Calling the chain object directly is deprecated in LangChain;
            # invoke() takes the same input dict and returns the same output dict.
            response = rag_agent.invoke({"query": rag_question})

            print("--- Answer from RAG Agent ---")
            print(response["result"])
            
    

Trying SerpApi
Success with SerpApi!

Starting to scrape websites
Scraping: https://en.wikipedia.org/wiki/Long_short-term_memory
Scraping: https://www.geeksforgeeks.org/deep-learning/deep-learning-introduction-to-long-short-term-memory/
Scraping: https://www.analyticsvidhya.com/blog/2021/03/introduction-to-long-short-term-memory-lstm/
Scraping: https://www.machinelearningmastery.com/gentle-introduction-long-short-term-memory-networks-experts/
Scraping: https://developer.nvidia.com/discover/lstm
Finished scraping.

Next, building the RAG system
1. Splitting the text into chunks...
Made 292 chunks of text.
2. Loading the embedding model
3. Storing chunks in a vector database
Vector database is ready.
4. Setting up the LLM
5. Creating the final QA chain

Okay, let's ask the agent a question.

Question: What are LSTMs and how do they work?



/Users/harshsiddharthmalgatte/data_doritos/.conda/lib/python3.11/site-packages/langchain_google_genai/llms.py:44: PydanticDeprecatedSince211: Accessing the 'model_fields' attribute on the instance is deprecated. Instead, you should access this attribute from the model class. Deprecated in Pydantic V2.11 to be removed in V3.0.
  for field_name, field_info in self.model_fields.items():


--- Answer from RAG Agent ---
Based on the provided text, LSTMs are a type of recurrent neural network.  They use a self-connected hidden layer containing memory cells and gate units.  The text explains that understanding their operation is best done through analogy rather than equations, but it doesn't detail the specifics of that analogy or the workings of the memory cells and gate units.  The text mentions applications such as speech recognition, image captioning, and time series forecasting.
