<a href="https://colab.research.google.com/github/Sowrobh1/Data-Mining/blob/main/Domain_Specific_Search_Engine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [35]:
import requests

from bs4 import BeautifulSoup
from collections import defaultdict
import networkx as nx
import re


In [36]:
# Entry points for the football-domain crawl.
# Order is preserved as-is because it is the order handed to the crawler.
seed_urls = [
    "https://www.goal.com/en",
    "https://www.espn.com/soccer/",
    "https://www.skysports.com/football",
    "https://www.bbc.com/sport/football",
    "https://www.uefa.com/",
    "https://www.fifa.com/",
    "https://www.football365.com/",
    "https://www.sportingnews.com/uk/soccer",
    "https://www.theguardian.com/football",
    "https://www.cbssports.com/soccer/",
    "https://www.si.com/soccer",
    "https://www.premierleague.com/",
    "https://www.laliga.com/en-GB",
    "https://www.bundesliga.com/en/bundesliga",
    "https://www.mlssoccer.com/",
    "https://www.eurosport.com/football/",
    "https://www.whoscored.com/",
    "https://www.transfermarkt.com/",
    "https://www.soccerway.com/",
    "https://www.footballtransfers.com/en",
]


In [37]:
# Upper bound on the total number of pages to crawl (across all sites).
crawl_limit = 100

# Intended cap on pages visited per domain.
# NOTE(review): this value is assigned here but never passed to crawl() below,
# so it currently has no effect in this cell.
visit_limit = 20

# Start crawling from the seed URLs.
# NOTE(review): neither `crawl` nor `domain_restriction` is defined in any cell
# visible in this notebook. Unless they come from a cell not shown here, this
# cell raises NameError on a fresh kernel (Restart & Run All). Both must be
# defined in an earlier cell, for example:
# seed_urls = ['https://www.example.com']
# domain_restriction = 'example.com'

# Run the crawl with a total page cap; the result is unpacked into two values.
crawled_data, link_graph = crawl(seed_urls, domain_restriction, max_pages=crawl_limit)

# The call above only passes a total cap (max_pages). A per-domain limit
# (visit_limit) would require crawl() to accept and enforce it.
# TODO: either wire visit_limit into crawl() or remove the unused setting.

print(f"Crawled {len(crawled_data)} pages.")

Crawled 100 pages.


In [38]:
# Hand-written sample index: search term -> URLs of pages containing that term.
inverted_index = {
    'messi': [
        'https://goal.com/article1',
        'https://espn.com/messi-profile',
    ],
    'goal': [
        'https://fifa.com/match1',
        'https://bbc.com/sport-football',
    ],
}

# Hand-written sample link structure: source page -> pages it links to.
web_connection = {
    'https://goal.com/home': [
        'https://goal.com/article1',
        'https://espn.com/match-report',
    ],
    'https://espn.com/match-report': [
        'https://fifa.com/match',
        'https://bbc.com/sport-football',
    ],
}

# Echo both structures so the reader can see exactly what later cells consume.
for sample_mapping in (inverted_index, web_connection):
    print(sample_mapping)


{'messi': ['https://goal.com/article1', 'https://espn.com/messi-profile'], 'goal': ['https://fifa.com/match1', 'https://bbc.com/sport-football']}
{'https://goal.com/home': ['https://goal.com/article1', 'https://espn.com/match-report'], 'https://espn.com/match-report': ['https://fifa.com/match', 'https://bbc.com/sport-football']}


In [39]:
# web_connection is assumed to be in this format:
# {
#     'page1': ['link1', 'link2'],
#     'page2': ['link3', 'link4'],
#     ...
# }

def build_web_graph(web_connection):
    """Build a directed link graph from a page -> outgoing-links mapping.

    Args:
        web_connection: Mapping of source page URL to an iterable of the URLs
            that page links to. ``None`` or an empty mapping yields an empty
            graph; a ``None``/empty link list yields a node with no out-edges.

    Returns:
        A ``networkx.DiGraph`` with one edge per (source, target) link and a
        node for every source page, including pages without outgoing links.
    """
    G = nx.DiGraph()  # Directed graph: an edge u -> v means page u links to v
    for source, targets in (web_connection or {}).items():
        # Register the source explicitly: add_edge alone would silently drop
        # pages whose link list is empty, hiding them from PageRank/HITS.
        G.add_node(source)
        G.add_edges_from((source, target) for target in (targets or ()))
    return G

# Turn the page -> outlinks mapping into a directed graph.
web_graph = build_web_graph(web_connection)

# Quick size check: report how many pages and links ended up in the graph.
for summary_label, summary_value in (
    ("Number of nodes (pages):", web_graph.number_of_nodes()),
    ("Number of edges (links):", web_graph.number_of_edges()),
):
    print(summary_label, summary_value)



Number of nodes (pages): 5
Number of edges (links): 4


In [47]:
# PageRank: one score per page in the link graph.
page_rank_scores = nx.pagerank(web_graph)

# Keep the five highest-scoring (url, score) pairs, best first.
top_pagerank = list(page_rank_scores.items())
top_pagerank.sort(key=lambda pair: pair[1], reverse=True)
top_pagerank = top_pagerank[:5]

print("Top 5 pages by PageRank:")
for page, rank in top_pagerank:
    print(f"{page} — score: {rank:.5f}")


# HITS: returns hub scores and authority scores as two separate mappings.
# NOTE(review): the recorded output of this cell lists negative authority
# values for two pages. Authority scores are normally expected to be
# non-negative, so treat the HITS ranking on this tiny sample graph with
# caution — TODO confirm against the installed networkx/scipy versions.
hubs, authorities = nx.hits(web_graph, max_iter=100, normalized=True)

# Keep the five most authoritative (url, score) pairs, best first.
top_authorities = list(authorities.items())
top_authorities.sort(key=lambda pair: pair[1], reverse=True)
top_authorities = top_authorities[:5]

print("\nTop 5 pages by HITS Authority:")
for page, authority in top_authorities:
    print(f"{page} — authority: {authority:.5f}")






def search(query, inverted_index, ranking_scores, top_k=10):
    """Return up to ``top_k`` URLs matching any query term, best-scored first.

    Args:
        query: Free-text query. It is lower-cased and split on whitespace;
            a URL matches if it is indexed under at least one of the terms.
        inverted_index: Mapping of term -> iterable of URLs containing it.
        ranking_scores: Mapping of URL -> numeric score (e.g. PageRank or
            HITS authority). URLs missing from it are scored 0.
        top_k: Maximum number of URLs to return.

    Returns:
        A list of URLs ordered by descending score. URLs with equal scores
        are ordered alphabetically so the result is stable between runs.
    """
    matching_urls = set()

    # Gather URLs that contain any of the query terms
    for term in query.lower().split():
        matching_urls.update(inverted_index.get(term, ()))

    # A set iterates in hash order, which changes between interpreter runs for
    # strings. Sorting on score alone would therefore return tied (or
    # unscored) URLs in an unpredictable order; the URL itself breaks ties.
    ranked_results = sorted(
        matching_urls,
        key=lambda url: (-ranking_scores.get(url, 0), url),
    )

    return ranked_results[:top_k]

# Demo query ranked by PageRank.
# With the sample index defined in this notebook there is no entry for this
# term, so only the header line is printed.
query = "goalkeeper"
results = search(query, inverted_index, page_rank_scores)

print("\nTop results for:", query)
for position, hit in enumerate(results, start=1):
    print(f"{position}. {hit}")




Top 5 pages by PageRank:
https://fifa.com/match — score: 0.22739
https://bbc.com/sport-football — score: 0.22739
https://goal.com/article1 — score: 0.20181
https://espn.com/match-report — score: 0.20181
https://goal.com/home — score: 0.14162

Top 5 pages by HITS Authority:
https://goal.com/article1 — authority: 0.64300
https://espn.com/match-report — authority: 0.64300
https://goal.com/home — authority: 0.00000
https://fifa.com/match — authority: -0.14300
https://bbc.com/sport-football — authority: -0.14300

Top results for: goalkeeper


In [48]:
def search_engine(query, inverted_index, rank_scores, top_k=10):
    """Look up a query in the inverted index and rank the matching URLs.

    This has the same contract as ``search`` defined earlier in the notebook;
    consider consolidating the two into one function.

    Args:
        query: Free-text query. It is lower-cased and split on whitespace;
            a URL matches if it is indexed under at least one of the terms.
        inverted_index: Mapping of term -> iterable of URLs containing it.
        rank_scores: Mapping of URL -> numeric score (PageRank or HITS
            authority). URLs missing from it are scored 0.
        top_k: Maximum number of URLs to return.

    Returns:
        A list of URLs ordered by descending score. URLs with equal scores
        are ordered alphabetically so the result is stable between runs.
    """
    matched_urls = set()

    # Collect URLs that contain any query term
    for term in query.lower().split():
        matched_urls.update(inverted_index.get(term, ()))

    # Sorting a set on score alone leaves tied/unscored URLs in hash order,
    # which differs between interpreter runs; break ties on the URL itself.
    ranked_results = sorted(
        matched_urls,
        key=lambda url: (-rank_scores.get(url, 0), url),
    )

    return ranked_results[:top_k]


In [None]:
print("Domain-Specific Search Engine (type 'exit' to quit)")

# Choose ranking algorithm here ('pagerank' or 'hits').
# The choice does not change between queries, so it is resolved once up front
# instead of on every loop iteration.
ranking_choice = 'pagerank'  # or 'hits'

if ranking_choice == 'pagerank':
    scores = page_rank_scores
else:
    scores = authorities

while True:
    try:
        # Strip surrounding whitespace so inputs like " exit " still quit.
        user_query = input("\nEnter search keywords: ").strip()
    except (EOFError, KeyboardInterrupt):
        # stdin was closed or the kernel was interrupted: leave the loop
        # cleanly instead of ending the cell with a traceback.
        print("\nGoodbye!")
        break

    if user_query.lower() == 'exit':
        print("Goodbye!")
        break

    results = search_engine(user_query, inverted_index, scores, top_k=10)

    if results:
        print(f"\nTop results ({ranking_choice} ranking):")
        for i, url in enumerate(results, 1):
            print(f"{i}. {url}")
    else:
        print("No results found.")


Domain-Specific Search Engine (type 'exit' to quit)
