<a href="https://colab.research.google.com/github/Tejaswini-Kethepalli/Sitafal-/blob/main/Sitafal_Task_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install faiss-cpu


Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m45.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


In [None]:
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

In [None]:
# Step 1: Crawl and Scrape
def crawl_and_scrape(urls):
    website_data = {}
    for url in urls:
        try:
            response = requests.get(url)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            text = " ".join([p.get_text() for p in soup.find_all("p")])
            website_data[url] = text
        except Exception as e:
            print(f"Error scraping {url}: {e}")
    return website_data

In [None]:
# Step 2: Chunk and Embed
def chunk_and_embed(data, model, chunk_size=300):
    chunks = []
    for url, content in data.items():
        # Split content into chunks of specified size
        words = content.split()
        for i in range(0, len(words), chunk_size):
            chunk = " ".join(words[i:i + chunk_size])
            if len(chunk.strip()) > 0:
                chunks.append((url, chunk.strip()))
    # Generate embeddings for all chunks
    texts = [chunk[1] for chunk in chunks]
    embeddings = model.encode(texts, convert_to_tensor=False)
    return chunks, embeddings

In [None]:
# Step 3: Store in FAISS
def store_embeddings(embeddings):
    d = embeddings.shape[1]  # Dimension of embeddings
    index = faiss.IndexFlatL2(d)
    index.add(np.array(embeddings))
    return index


In [None]:
# Step 4: Query Handling
def query_vector_database(query, index, model, chunks, top_k=5):
    query_embedding = model.encode([query], convert_to_tensor=False)
    distances, indices = index.search(np.array(query_embedding), k=top_k)
    results = [chunks[i] for i in indices[0] if i < len(chunks)]
    return results


In [None]:
# Step 5: Generate Response
def generate_response(results):
    if not results:
        return "Sorry, I couldn't find any relevant information."
    response = "Based on your query, here are the results:\n\n"
    for url, text in results:
        response += f"URL: {url}\nContent: {text[:200]}...\n\n"  # Truncate long content
    return response

In [None]:
def main():
    # Step 1: Define URLs
    urls = [
        "https://www.uchicago.edu/",
        "https://www.washington.edu/",
        "https://www.stanford.edu/",
        "https://und.edu/",
    ]

    # Step 2: Initialize embedding model
    print("Loading embedding model...")
    model = SentenceTransformer("all-MiniLM-L6-v2")  # Pre-trained model

    # Step 3: Crawl and scrape websites
    print("Crawling and scraping websites...")
    website_data = crawl_and_scrape(urls)

    # Step 4: Chunk and embed content
    print("Chunking and embedding content...")
    chunks, embeddings = chunk_and_embed(website_data, model)

    # Step 5: Store embeddings in FAISS
    print("Storing embeddings in FAISS...")
    index = store_embeddings(embeddings)

    # Step 6: Process user query
    query = input("Enter your query: ")
    print("Searching content...")
    results = query_vector_database(query, index, model, chunks)
    print("Generating response...")
    response = generate_response(results)
    print(response)

if __name__ == "__main__":
    main()


Loading embedding model...
Crawling and scraping websites...
Error scraping https://www.uchicago.edu/: 403 Client Error: Forbidden for url: https://www.uchicago.edu/
Chunking and embedding content...
Storing embeddings in FAISS...
Enter your query: birds in india
Searching content...
Generating response...
Based on your query, here are the results:

URL: https://www.stanford.edu/
Content: a vibrant community of creative and accomplished people from around the world A residential campus with diverse housing, exceptional dining, and over 600 student organizations Student Affairs A rich t...

URL: https://www.washington.edu/
Content: UW astronomy undergrads are using cutting-edge coding skills to help scientists make the most of discoveries from a revolutionary new telescope. Read story Chris Mantegna, â21, is studying how pollu...

URL: https://www.stanford.edu/
Content: Other ways to search: Map Profiles Stanford Explore Stanford Stanford was founded almost 150 years ago on a bedrock 

In [2]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m64.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


In [5]:
from sentence_transformers import SentenceTransformer
import faiss
import requests
from bs4 import BeautifulSoup

def crawl_and_scrape(urls):
    """
    Crawl and scrape content from target websites with appropriate headers.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    content = []
    for url in urls:
        try:
            response = requests.get(url, headers=headers)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            text = ' '.join([p.get_text() for p in soup.find_all('p')])
            content.append(text)
        except Exception as e:
            print(f"Error scraping {url}: {e}")
    return ' '.join(content)


def chunk_text(text, chunk_size=100):
    """
    Split text into smaller chunks.
    """
    words = text.split()
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]

def store_embeddings(chunks, model):
    """
    Generate embeddings for text chunks and store them in a FAISS index.
    """
    embeddings = model.encode(chunks)
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    return index, chunks

def query_vector_database(query, index, model, chunks, top_k=3):
    """
    Retrieve the most relevant chunks using a query.
    """
    query_embedding = model.encode([query])
    distances, indices = index.search(query_embedding, top_k)
    return [chunks[i] for i in indices[0]]

def generate_response(results):
    """
    Generate a response from retrieved chunks.
    """
    return "\n".join(results)

def main():
    # Step 1: Define URLs
    urls = [
        "https://www.uchicago.edu/",
        "https://www.washington.edu/",
        "https://www.stanford.edu/",
        "https://und.edu/",
    ]

    # Step 2: Initialize embedding model
    print("Loading embedding model...")
    model = SentenceTransformer("all-MiniLM-L6-v2")  # Pre-trained model

    # Step 3: Crawl and scrape websites
    print("Crawling and scraping websites...")
    website_data = crawl_and_scrape(urls)

    # Step 4: Chunk and embed content
    print("Chunking and embedding content...")
    chunks = chunk_text(website_data)
    index, stored_chunks = store_embeddings(chunks, model)

    # Step 5: Handle user query
    query = input("Enter your query: ")
    print("Searching content...")
    results = query_vector_database(query, index, model, stored_chunks)
    print("Generating response...")
    response = generate_response(results)
    print(response)

if __name__ == "__main__":
    main()


Loading embedding model...
Crawling and scraping websites...
Chunking and embedding content...
Enter your query: give me about washington
Searching content...
Generating response...
thrilled to welcome him to UW.” Read story Capping a big — and BIG TEN — year, the Huskies are headed for the Tony the Tiger Sun Bowl! Join fellow fans in cheering on our favorite Dawgs against Louisville in El Paso, TX on December 31. Bowl Central David Baker, professor of biochemistry at the UW School of Medicine in Seattle, received the 2024 Nobel Prize in Chemistry. Nobel Week wove stately traditions with imaginative recognitions. Read story © 2024 University of Washington | Seattle, WA Other ways to search: Map Profiles Stanford Explore Stanford Stanford was founded almost 150
an international community of scholars working to solve the world's most pressing issues, with initiatives and programs on all seven continents. Chicago is not only in our name, it’s woven into the fabric of this institution. Loc