<a href="https://colab.research.google.com/github/VenkataBhanuTejaKonijeti/A-Semantic-Focused-Web-Crawler-Using-Sentence-BERT-and-Priority-Based-Crawling/blob/main/with_seed_urls.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sentence-transformers beautifulsoup4 requests nltk




In [None]:
import requests
import heapq
import re
import nltk
import numpy as np
import time
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
from nltk.corpus import stopwords
from urllib.parse import urljoin, urlparse


In [None]:
# Fetch the NLTK stop-word corpus; this must run before the corpus is read below.
nltk.download('stopwords')
# English stop words held in a set so clean_text can do fast membership tests.
STOP_WORDS = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Shared sentence-embedding model used to encode both the query and page text.
# The model files are downloaded from the Hugging Face Hub when not cached yet.
model = SentenceTransformer("all-MiniLM-L6-v2")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
def clean_text(text):
    """Normalise raw page text into a space-separated string of keywords.

    The text is lower-cased, anything starting with ``http`` up to the next
    whitespace is removed, every character that is not ``a``-``z`` or
    whitespace is deleted, and words found in ``STOP_WORDS`` are dropped.

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        The remaining words joined by single spaces (possibly empty).
    """
    lowered = text.lower()
    without_urls = re.sub(r'http\S+', '', lowered)
    # Characters are deleted, not replaced by a space, so words joined only by
    # punctuation or digits end up merged.
    letters_only = re.sub(r'[^a-z\s]', '', without_urls)
    return " ".join(
        token for token in letters_only.split() if token not in STOP_WORDS
    )


In [None]:
def fetch_page(url):
    """Download ``url`` and return its cleaned text and its outgoing links.

    Parameters
    ----------
    url : str
        Absolute http(s) URL of the page to fetch.

    Returns
    -------
    tuple
        ``(text, links)``. ``text`` is the visible page text after
        ``clean_text``. ``links`` is a list of absolute ``http``/``https``
        URLs found in ``<a href>`` tags, with ``#fragment`` parts removed and
        duplicates dropped (first-occurrence order is kept).
        ``(None, [])`` is returned when the request fails, the status code is
        not 200, or the page cannot be processed.
    """
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, headers=headers, timeout=10)

        if response.status_code != 200:
            return None, []

        soup = BeautifulSoup(response.text, "html.parser")

        # Remove elements whose content is not visible page text
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()

        text = clean_text(soup.get_text(separator=" "))

        # Relative hrefs must be resolved against the URL that was actually
        # served (after redirects), not the one that was requested.
        base_url = response.url or url

        links = []
        seen_links = set()
        for a in soup.find_all("a", href=True):
            try:
                parsed = urlparse(urljoin(base_url, a["href"]))
            except ValueError:
                # A single malformed href should not discard the whole page.
                continue
            if parsed.scheme not in ("http", "https"):
                continue
            # A fragment only addresses a position inside a page; keeping it
            # would make one page look like many distinct URLs.
            link = parsed._replace(fragment="").geturl()
            if link not in seen_links:
                seen_links.add(link)
                links.append(link)

        return text, links

    except Exception:
        # Best-effort crawl: any network or parsing failure skips this page.
        # `Exception` (not a bare `except`) lets KeyboardInterrupt stop the run.
        return None, []


In [None]:
def semantic_similarity(query_embedding, document_text):
    """Score how closely ``document_text`` matches an already-encoded query.

    The document is encoded with the shared ``model`` using normalised
    embeddings, and the dot product with ``query_embedding`` is returned.
    When ``query_embedding`` is normalised as well (as ``semantic_crawler``
    does), this dot product is the cosine similarity.

    Parameters
    ----------
    query_embedding : array-like
        Embedding of the query, as produced by ``model.encode``.
    document_text : str
        Text of the document to score.

    Returns
    -------
    float
        Dot product of the query and document embeddings.
    """
    document_vector = model.encode(document_text, normalize_embeddings=True)
    similarity = np.dot(query_embedding, document_vector)
    return float(similarity)


In [None]:
def _strip_fragment(url):
    """Return ``url`` without its ``#fragment`` (unparseable URLs pass through)."""
    try:
        return urlparse(url)._replace(fragment="").geturl()
    except ValueError:
        return url


def _page_key(url):
    """Return a de-duplication key for the page behind ``url``.

    The fragment is dropped and an empty path is treated as ``/`` so that
    ``https://host``, ``https://host/`` and ``https://host#top`` share a key.
    """
    try:
        parts = urlparse(url)
    except ValueError:
        return url
    return parts._replace(path=parts.path or "/", fragment="").geturl()


def semantic_crawler(
    seed_urls,
    query,
    max_pages=20,
    relevance_threshold=0.35,
    delay=1
):
    """Crawl outward from ``seed_urls``, following links of relevant pages first.

    Pages are taken from a priority queue ordered by the relevance score of
    the page on which the link was found (seeds get the top priority). Each
    page is fetched at most once; URLs that differ only by ``#fragment`` or by
    an empty path versus ``/`` count as the same page.

    Parameters
    ----------
    seed_urls : iterable of str
        URLs the crawl starts from.
    query : str
        Text describing the topic; pages are scored against its embedding.
    max_pages : int
        The crawl stops once this many *relevant* pages have been collected
        (pages below the threshold do not count towards the limit).
    relevance_threshold : float
        Minimum similarity score for a page to be reported and for its links
        to be followed.
    delay : float
        Seconds to sleep after every fetch attempt, successful or not.

    Returns
    -------
    list of dict
        One ``{"url": ..., "score": ...}`` entry per relevant page, sorted by
        descending score; scores are rounded to 4 decimals.
    """
    visited = set()        # page keys that have already been fetched
    best_priority = {}     # page key -> best (lowest) priority queued so far
    results = []

    query_embedding = model.encode(
        query,
        normalize_embeddings=True
    )

    # Priority queue (max-heap using negative score)
    frontier = []

    def enqueue(priority, url):
        """Queue ``url`` unless it is visited or already queued at least as well."""
        clean_url = _strip_fragment(url)
        key = _page_key(clean_url)
        if key in visited:
            return
        # Re-queue only when the new priority is strictly better; this keeps
        # the pop order unchanged while stopping the heap from filling up
        # with duplicates of the same page.
        if key in best_priority and best_priority[key] <= priority:
            return
        best_priority[key] = priority
        heapq.heappush(frontier, (priority, clean_url))

    for url in seed_urls:
        enqueue(-1.0, url)

    while frontier and len(results) < max_pages:
        _, url = heapq.heappop(frontier)

        key = _page_key(url)
        if key in visited:
            continue
        visited.add(key)

        print(f"🔍 Fetching LIVE URL: {url}")

        text, links = fetch_page(url)
        # Be polite after every request, including failed ones, so error
        # responses do not lead to rapid-fire retries against the same host.
        time.sleep(delay)

        if not text:
            continue

        score = semantic_similarity(query_embedding, text)

        if score >= relevance_threshold:
            results.append({
                "url": url,
                "score": round(score, 4)
            })

            # Outgoing links inherit the relevance of the page they came from
            for link in links:
                enqueue(-score, link)

    return sorted(results, key=lambda x: x["score"], reverse=True)


In [None]:
# Topic of the focused crawl; pages are scored against this text.
QUERY = "covid vaccine pandemic"

# Starting points of the crawl; semantic_crawler queues these with top priority.
SEED_URLS = [
    "https://www.who.int",
    "https://www.cdc.gov",
    "https://www.nih.gov"
]


In [None]:
# Run the crawl. This performs live HTTP requests and pauses `delay` seconds
# between pages, so the cell can take a while to finish.
results = semantic_crawler(
    seed_urls=SEED_URLS,
    query=QUERY,
    max_pages=20,
    relevance_threshold=0.35,
    delay=1
)


🔍 Fetching LIVE URL: https://www.cdc.gov
🔍 Fetching LIVE URL: https://www.nih.gov
🔍 Fetching LIVE URL: https://www.who.int
🔍 Fetching LIVE URL: https://archive.cdc.gov/
🔍 Fetching LIVE URL: https://hhs.gov
🔍 Fetching LIVE URL: https://jobs.cdc.gov/index.html
🔍 Fetching LIVE URL: https://oig.hhs.gov/
🔍 Fetching LIVE URL: https://stacks.cdc.gov/
🔍 Fetching LIVE URL: https://twitter.com/CDCgov
🔍 Fetching LIVE URL: https://usa.gov
🔍 Fetching LIVE URL: https://www.cdc.gov#aboutCDC
🔍 Fetching LIVE URL: https://www.cdc.gov#archive
🔍 Fetching LIVE URL: https://www.cdc.gov#contactUs
🔍 Fetching LIVE URL: https://www.cdc.gov#content
🔍 Fetching LIVE URL: https://www.cdc.gov#gov-notice
🔍 Fetching LIVE URL: https://www.cdc.gov#headerSearch
🔍 Fetching LIVE URL: https://www.cdc.gov#languages
🔍 Fetching LIVE URL: https://www.cdc.gov#mobile-footer-aboutCDC
🔍 Fetching LIVE URL: https://www.cdc.gov#mobile-footer-archive
🔍 Fetching LIVE URL: https://www.cdc.gov#mobile-footer-contactUs
🔍 Fetching LIVE URL: 

In [None]:
# Show the relevant pages as a numbered list (starting at 1); `results` is
# already sorted by descending score by semantic_crawler.
print("\n📌 FINAL RANKED LIVE URL RESULTS\n")

for i, r in enumerate(results, 1):
    print(f"{i}. Score = {r['score']} | {r['url']}")



📌 FINAL RANKED LIVE URL RESULTS

1. Score = 0.3829 | https://www.cdc.gov
2. Score = 0.3829 | https://www.cdc.gov#aboutCDC
3. Score = 0.3829 | https://www.cdc.gov#archive
4. Score = 0.3829 | https://www.cdc.gov#contactUs
5. Score = 0.3829 | https://www.cdc.gov#content
6. Score = 0.3829 | https://www.cdc.gov#gov-notice
7. Score = 0.3829 | https://www.cdc.gov#headerSearch
8. Score = 0.3829 | https://www.cdc.gov#languages
9. Score = 0.3829 | https://www.cdc.gov#mobile-footer-aboutCDC
10. Score = 0.3829 | https://www.cdc.gov#mobile-footer-archive
11. Score = 0.3829 | https://www.cdc.gov#mobile-footer-contactUs
12. Score = 0.3829 | https://www.cdc.gov#mobile-footer-languages
13. Score = 0.3829 | https://www.cdc.gov#mobile-footer-policies
14. Score = 0.3829 | https://www.cdc.gov#policies
15. Score = 0.3829 | https://www.cdc.gov/
16. Score = 0.3829 | https://www.cdc.gov/#aboutCDC
17. Score = 0.3829 | https://www.cdc.gov/#archive
18. Score = 0.3829 | https://www.cdc.gov/#contactUs
19. Score = 