In [1]:
pip install -U langchain-community langchain-huggingface requests beautifulsoup4 langchain-ollama pymupdf faiss-cpu python-dotenv tiktoken

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.1.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
import warnings
from dotenv import load_dotenv
import requests
from bs4 import BeautifulSoup
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_ollama import OllamaEmbeddings
import faiss

# Runtime setup for the notebook.
# KMP_DUPLICATE_LIB_OK is exported as the string 'True' -- presumably to tolerate
# a duplicate OpenMP runtime when faiss is loaded; TODO confirm it is still needed.
os.environ.update({'KMP_DUPLICATE_LIB_OK': 'True'})
# Blanket "ignore" filter: every Python warning is hidden from this point on.
warnings.simplefilter("ignore")
# Load environment variables via python-dotenv.
load_dotenv()

# Function to scrape website data
def scrape_website(url, timeout=10):
    """Download a web page and return its text content.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. ``requests``
            applies no timeout by default, so without one a stalled server
            would block the notebook indefinitely.

    Returns:
        The text of the page's ``<main>`` element when the page has one,
        otherwise the text of the whole document. Text fragments are
        whitespace-stripped and joined with newlines. An empty string is
        returned when anything goes wrong (the error is printed, not raised).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Prefer <main> so headers and menus are skipped; fall back to the
        # whole document for pages that do not use that element.
        main_content = soup.find('main')
        content_root = main_content if main_content else soup
        return content_root.get_text(separator="\n", strip=True)
    except Exception as e:
        # Best-effort scraping: report the failure and signal it with "" so the
        # caller can decide whether to skip the page.
        print(f"Failed to scrape {url}: {e}")
        return ""

# List of target websites
websites = [
    "https://www.washington.edu/",
    "https://www.stanford.edu/",
    "https://und.edu/"
]

# Scrape each website and store content with metadata.
# scrape_website() returns "" on failure, so failed pages are dropped here.
website_contents = []
for website in websites:
    content = scrape_website(website)
    if content:
        website_contents.append({"content": content, "metadata": {"url": website}})

# Split content into overlapping chunks and convert them to Document objects,
# tagging every chunk with the URL it came from.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

document_chunks = []
for item in website_contents:
    raw_chunks = text_splitter.split_text(item["content"])
    document_chunks.extend([
        Document(page_content=chunk, metadata={"url": item["metadata"]["url"]})
        for chunk in raw_chunks if len(chunk.strip()) > 50  # Skip overly short chunks
    ])


print(f"Number of chunks created: {len(document_chunks)}")

# Stop with a clear message when nothing survived scraping and filtering;
# handing an empty batch to the index would likely fail later with a far less
# helpful error.
if not document_chunks:
    raise RuntimeError("No text chunks were produced; check the scrape errors printed above.")

# Initialize embeddings and vector store
embeddings = OllamaEmbeddings(model='nomic-embed-text', base_url="http://localhost:11434")

# The flat L2 index has to be created with the embedding dimension, which is
# discovered here by embedding a throwaway string.
vector_store = FAISS(
    embedding_function=embeddings,
    index=faiss.IndexFlatL2(len(embeddings.embed_query("test"))),
    docstore=InMemoryDocstore(),
    index_to_docstore_id={}
)

# Add documents to the vector store
ids = vector_store.add_documents(documents=document_chunks)
print(f"Documents added: {len(ids)}")

# Save the vector store
db_name = "website_embeddings"
vector_store.save_local(db_name)

# Load the vector store for querying.
# SECURITY: allow_dangerous_deserialization=True permits unpickling the saved
# docstore. That is acceptable only because the files were written by this
# notebook a moment ago; never load an index from an untrusted source this way.
new_vector_store = FAISS.load_local(db_name, embeddings=embeddings, allow_dangerous_deserialization=True)


Number of chunks created: 14
Documents added: 14


In [3]:
question = "What is the history of Stanford University?"
docs = new_vector_store.search(query=question, search_type='similarity')

# Post-process: keep only the hits whose text mentions one of the topic words.
topic_words = ("history", "founded")
filtered_docs = []
for doc in docs:
    lowered_text = doc.page_content.lower()
    if any(word in lowered_text for word in topic_words):
        filtered_docs.append(doc)

# Show each surviving hit with its source URL, followed by blank lines.
for doc in filtered_docs:
    print(f"Source: {doc.metadata['url']}", doc.page_content, "\n", sep="\n")


Source: https://www.stanford.edu/
Main Content
A Societal Mission
Stanford was founded almost 150 years ago on a bedrock of societal purpose. Our mission is to contribute to the world by educating students for lives of leadership and contribution with integrity; advancing fundamental knowledge and cultivating creativity; leading in pioneering research for effective clinical therapies; and accelerating solutions and amplifying their impact.
More about Stanford
Campus News
Stories about people, research, and innovation across the Farm
Science & Engineering
Stanford welcomes first GPU-based supercomputer
Health & Medicine
Flu virus remains infectious in refrigerated raw milk for up to five days, new study shows
Science & Engineering
A new report warns of serious risks from ‘mirror life’
Awards
Five from Stanford named Marshall Scholars
Science & Engineering
New device produces critical fertilizer ingredient from thin air
Science & Engineering
Scientists call for all-out, global effort to 

In [4]:
import os
import warnings
from dotenv import load_dotenv
import requests
from bs4 import BeautifulSoup
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_ollama import OllamaEmbeddings
import faiss

# Runtime setup for the notebook.
# KMP_DUPLICATE_LIB_OK is exported as the string 'True' -- presumably to tolerate
# a duplicate OpenMP runtime when faiss is loaded; TODO confirm it is still needed.
os.environ.update({'KMP_DUPLICATE_LIB_OK': 'True'})
# Blanket "ignore" filter: every Python warning is hidden from this point on.
warnings.simplefilter("ignore")
# Load environment variables via python-dotenv.
load_dotenv()

# Function to scrape website data
def scrape_website(url, timeout=10):
    """Download a web page and return its text content.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. ``requests``
            applies no timeout by default, so without one a stalled server
            would block the notebook indefinitely.

    Returns:
        The text of the page's ``<main>`` element when the page has one,
        otherwise the text of the whole document. Text fragments are
        whitespace-stripped and joined with newlines. An empty string is
        returned when anything goes wrong (the error is printed, not raised).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract main content only; fall back to the whole document for pages
        # that do not use a <main> element.
        main_content = soup.find('main')
        content_root = main_content if main_content else soup
        return content_root.get_text(separator="\n", strip=True)
    except Exception as e:
        # Best-effort scraping: report the failure and signal it with "" so the
        # caller can decide how to proceed.
        print(f"Failed to scrape {url}: {e}")
        return ""

# Target website (Stanford University's "About" page)
website = "https://www.stanford.edu/about/"

# Scrape the website
content = scrape_website(website)
website_contents = [{"content": content, "metadata": {"url": website}}]

# Split content into chunks and convert to Document objects
text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)  # Smaller chunks for specificity
document_chunks = []
for item in website_contents:
    raw_chunks = text_splitter.split_text(item["content"])
    document_chunks.extend([
        Document(page_content=chunk, metadata={"url": item["metadata"]["url"]})
        for chunk in raw_chunks
    ])

print(f"Number of chunks created: {len(document_chunks)}")

# scrape_website() returns "" on failure, which produces no chunks. Stop here
# with a clear message; handing an empty batch to the index would likely fail
# later with a far less helpful error.
if not document_chunks:
    raise RuntimeError(f"No text could be extracted from {website}; nothing to index.")

# Initialize embeddings and vector store
embeddings = OllamaEmbeddings(model='nomic-embed-text')

# The flat L2 index has to be created with the embedding dimension, which is
# discovered here by embedding a throwaway string.
vector_store = FAISS(
    embedding_function=embeddings,
    index=faiss.IndexFlatL2(len(embeddings.embed_query("test"))),
    docstore=InMemoryDocstore(),
    index_to_docstore_id={}
)

# Add documents to the vector store
ids = vector_store.add_documents(documents=document_chunks)
print(f"Documents added: {len(ids)}")

# Save the vector store
db_name = "website_embeddings"
vector_store.save_local(db_name)

# Load the vector store for querying.
# SECURITY: allow_dangerous_deserialization=True permits unpickling the saved
# docstore. That is acceptable only because the files were written by this
# notebook a moment ago; never load an index from an untrusted source this way.
new_vector_store = FAISS.load_local(db_name, embeddings=embeddings, allow_dangerous_deserialization=True)

# Retrieval settings
TOP_K = 4  # Candidates fetched per question, so the keyword filter has alternatives
PUNCTUATION = "?.,!;:\"'()[]“”‘’"
STOPWORDS = frozenset({
    "a", "an", "the", "is", "are", "was", "were", "be", "at", "in", "on", "of",
    "to", "for", "and", "or", "do", "does", "did", "what", "when", "where",
    "who", "which", "why", "how",
})


def content_words(text):
    """Return the lower-cased words of ``text`` without punctuation or stopwords.

    Surrounding punctuation is stripped so that "founded?" becomes "founded",
    and filler words such as "is" or "at" are dropped so they cannot make an
    unrelated chunk look relevant.
    """
    words = (word.strip(PUNCTUATION).lower() for word in text.split())
    return {word for word in words if word and word not in STOPWORDS}


# Query the vector store
questions = [
    "When was Stanford University founded?",
    "What research is done at Stanford?"
]

for question in questions:
    # Fetch several candidates; they come back ordered from most to least similar
    docs = new_vector_store.search(query=question, search_type='similarity', k=TOP_K)

    # Keep the candidates that share at least one meaningful whole word with the
    # question. A question made only of stopwords cannot be filtered, so every
    # candidate is kept in that case.
    query_words = content_words(question)
    filtered_docs = [
        doc for doc in docs
        if not query_words or query_words & content_words(doc.page_content)
    ]

    # The list is still in similarity order, so the first survivor is the most
    # relevant chunk that also passed the keyword check.
    if filtered_docs:
        best_doc = filtered_docs[0]
        print(f"Results for Query: {question}")
        print(f"Source: {best_doc.metadata['url']}")
        print(best_doc.page_content)
        print("\n")
    else:
        print(f"No relevant information found for query: {question}")


Number of chunks created: 50
Documents added: 50
Results for Query: When was Stanford University founded?
Source: https://www.stanford.edu/about/
Stanford was founded in 1885 by California senator Leland Stanford and his wife, Jane, “to promote the public welfare by exercising an influence in behalf of humanity and civilization.” The university is governed by a Board of Trustees, President, Provost, Academic Council and a number of other


Results for Query: What research is done at Stanford?
Source: https://www.stanford.edu/about/
university’s mission, and students have extensive opportunities to join Stanford scholars in research that develops new knowledge and deepens understanding of ourselves and the world around us. A hallmark of Stanford is our extensive and vibrant ecosystem of interdisciplinary research. With all




In [5]:
import os
import warnings
from dotenv import load_dotenv
import requests
from bs4 import BeautifulSoup
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_ollama import OllamaEmbeddings
import faiss

# Runtime setup for the notebook.
# KMP_DUPLICATE_LIB_OK is exported as the string 'True' -- presumably to tolerate
# a duplicate OpenMP runtime when faiss is loaded; TODO confirm it is still needed.
os.environ.update({'KMP_DUPLICATE_LIB_OK': 'True'})
# Blanket "ignore" filter: every Python warning is hidden from this point on.
warnings.simplefilter("ignore")
# Load environment variables via python-dotenv.
load_dotenv()

# Function to scrape website data
def scrape_website(url, timeout=10):
    """Download a web page and return its text content.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. ``requests``
            applies no timeout by default, so without one a stalled server
            would block the notebook indefinitely.

    Returns:
        The text of the page's ``<main>`` element when the page has one,
        otherwise the text of the whole document. Text fragments are
        whitespace-stripped and joined with newlines. An empty string is
        returned when anything goes wrong (the error is printed, not raised).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract main content only; fall back to the whole document for pages
        # that do not use a <main> element.
        main_content = soup.find('main')
        content_root = main_content if main_content else soup
        return content_root.get_text(separator="\n", strip=True)
    except Exception as e:
        # Best-effort scraping: report the failure and signal it with "" so the
        # caller can decide how to proceed.
        print(f"Failed to scrape {url}: {e}")
        return ""

# Target website (Stanford University's "About" page)
website = "https://www.stanford.edu/about/"

# Scrape the website
content = scrape_website(website)
website_contents = [{"content": content, "metadata": {"url": website}}]

# Split content into chunks and convert to Document objects
text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)  # Smaller chunks for specificity
document_chunks = []
for item in website_contents:
    raw_chunks = text_splitter.split_text(item["content"])
    document_chunks.extend([
        Document(page_content=chunk, metadata={"url": item["metadata"]["url"]})
        for chunk in raw_chunks
    ])

print(f"Number of chunks created: {len(document_chunks)}")

# scrape_website() returns "" on failure, which produces no chunks. Stop here
# with a clear message; handing an empty batch to the index would likely fail
# later with a far less helpful error.
if not document_chunks:
    raise RuntimeError(f"No text could be extracted from {website}; nothing to index.")

# Initialize embeddings and vector store
embeddings = OllamaEmbeddings(model='nomic-embed-text')

# The flat L2 index has to be created with the embedding dimension, which is
# discovered here by embedding a throwaway string.
vector_store = FAISS(
    embedding_function=embeddings,
    index=faiss.IndexFlatL2(len(embeddings.embed_query("test"))),
    docstore=InMemoryDocstore(),
    index_to_docstore_id={}
)

# Add documents to the vector store
ids = vector_store.add_documents(documents=document_chunks)
print(f"Documents added: {len(ids)}")

# Save the vector store
db_name = "website_embeddings"
vector_store.save_local(db_name)

# Load the vector store for querying.
# SECURITY: allow_dangerous_deserialization=True permits unpickling the saved
# docstore. That is acceptable only because the files were written by this
# notebook a moment ago; never load an index from an untrusted source this way.
new_vector_store = FAISS.load_local(db_name, embeddings=embeddings, allow_dangerous_deserialization=True)

# Retrieval settings
TOP_K = 4  # Candidates fetched per question, so the keyword filter has alternatives
PUNCTUATION = "?.,!;:\"'()[]“”‘’"
STOPWORDS = frozenset({
    "a", "an", "the", "is", "are", "was", "were", "be", "at", "in", "on", "of",
    "to", "for", "and", "or", "do", "does", "did", "what", "when", "where",
    "who", "which", "why", "how",
})


def content_words(text):
    """Return the lower-cased words of ``text`` without punctuation or stopwords.

    Surrounding punctuation is stripped so that "mission?" becomes "mission",
    and filler words such as "is" or "at" are dropped so they cannot make an
    unrelated chunk look relevant.
    """
    words = (word.strip(PUNCTUATION).lower() for word in text.split())
    return {word for word in words if word and word not in STOPWORDS}


# Query the vector store with dynamic user input
print("Enter questions about Stanford University ('exit' to quit):")
while True:
    question = input("> ").strip()
    if question.lower() == 'exit':
        break
    if not question:
        # Blank line: nothing to search for, so just prompt again
        continue

    # Fetch several candidates; they come back ordered from most to least similar
    docs = new_vector_store.search(query=question, search_type='similarity', k=TOP_K)

    # Keep the candidates that share at least one meaningful whole word with the
    # question. A question made only of stopwords cannot be filtered, so every
    # candidate is kept in that case.
    query_words = content_words(question)
    filtered_docs = [
        doc for doc in docs
        if not query_words or query_words & content_words(doc.page_content)
    ]

    # The list is still in similarity order, so the first survivor is the most
    # relevant chunk that also passed the keyword check.
    if filtered_docs:
        best_doc = filtered_docs[0]
        print(f"Results for Query: {question}")
        print(f"Source: {best_doc.metadata['url']}")
        print(best_doc.page_content)
        print("\n")
    else:
        print(f"No relevant information found for query: {question}")


Number of chunks created: 50
Documents added: 50
Enter questions about Stanford University ('exit' to quit):
Results for Query: What is Stanford University's mission?
Source: https://www.stanford.edu/about/
Stanford was founded almost 150 years ago on a bedrock of societal purpose. Our mission is to contribute to the world by educating students for lives of leadership and contribution with integrity; advancing fundamental knowledge and cultivating creativity; leading in pioneering research for


Results for Query: What research is done at Stanford?
Source: https://www.stanford.edu/about/
university’s mission, and students have extensive opportunities to join Stanford scholars in research that develops new knowledge and deepens understanding of ourselves and the world around us. A hallmark of Stanford is our extensive and vibrant ecosystem of interdisciplinary research. With all




In [None]:
# Sample questions to try at the interactive prompt in the previous cell:
#"What is Stanford University's mission?",
#"When was Stanford University founded?",
#"What are Stanford's academic programs?",
#"What research is done at Stanford?"
