In [12]:
import requests
from bs4 import BeautifulSoup
from collections import defaultdict

# PageRank Explanation:
# PageRank is an algorithm developed by Google to rank web pages based on their importance.
# It assigns a score to each webpage, indicating its importance based on the number and quality of links pointing to it.
# The core idea is that a page is more important if it is linked to by other important pages.

# Uses of PageRank:
# PageRank is commonly used in search engines to rank search results, providing relevant and authoritative results.
# It is also used in social network analysis, ranking academic papers, and identifying influential nodes in a network.

# Why BeautifulSoup:
# BeautifulSoup is a Python library used for web scraping, making it easy to extract information from HTML and XML files.
# It parses HTML content, making it simple to navigate, search, and modify the parsed HTML tree.
# It is chosen here because it is easy to use and efficient for scraping simple HTML content.
# Other libraries like Selenium are used for more complex, dynamic content (e.g., JavaScript-based content),
# but BeautifulSoup is sufficient for scraping basic internal links from static HTML pages.

def get_links(url):
    """
    Scrape internal links from the given URL using BeautifulSoup.
    
    Arguments:
    url (str): The URL to scrape links from.
    
    Returns:
    set: A set of internal links found on the page.
    """
    try:
        # Send an HTTP GET request to the URL
        response = requests.get(url)
        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.text, "html.parser")
        
        # Initialize an empty set to store internal links
        links = set()
        # Loop through all <a> tags with 'href' attributes
        for a_tag in soup.find_all("a", href=True):
            href = a_tag['href']
            # Consider only relative URLs (internal links)
            if href.startswith("/"):
                # Convert relative link to full URL and add it to the set
                links.add(url + href)
        
        return links
    except Exception as e:
        # Print an error message if scraping fails
        print(f"Error scraping {url}: {e}")
        return set()

def build_graph(urls):
    """
    Create a graph dictionary where each URL points to its list of linked URLs.
    
    Arguments:
    urls (list): A list of URLs to include in the graph.
    
    Returns:
    dict: A dictionary representing the graph (URL -> set of linked URLs).
    """
    graph = {}
    # Iterate over each URL in the list
    for url in urls:
        # Get the links found on the current URL and add them to the graph
        graph[url] = get_links(url)
    return graph

def page_rank(graph, d=0.85, iterations=50):
    """
    Simple PageRank algorithm to calculate rank scores for each URL in the graph.
    
    Arguments:
    graph (dict): The graph dictionary where each URL points to its linked URLs.
    d (float): The damping factor (default is 0.85). It represents the probability that a user continues clicking on links.
    iterations (int): The number of iterations to run the algorithm (default is 50).
    
    Returns:
    dict: A dictionary with URLs as keys and their PageRank scores as values.
    """
    n = len(graph)  # Total number of URLs (nodes) in the graph
    # Initialize the rank of each URL to 1/n (equal distribution)
    ranks = {url: 1 / n for url in graph}

    # Iterate multiple times to update the ranks based on link structure
    for _ in range(iterations):
        new_ranks = {}  # Dictionary to store updated ranks
        for url in graph:
            # Calculate the new rank score for the current URL
            # Sum the contributions from all pages linking to this URL
            rank_sum = sum(ranks[link] / len(graph[link]) for link in graph if url in graph[link])
            # Update the rank using the damping factor formula
            new_ranks[url] = (1 - d) / n + d * rank_sum
        
        # Update the ranks dictionary with new values for the next iteration
        ranks = new_ranks
    
    return ranks

# Example usage of the functions defined above
if __name__ == "__main__":
    # Define a small list of example URLs from the same domain
    urls = [
        "https://example.com",
        "https://google.com/page1",
        "https://gmail.com/page2",
    ]

    # Build the graph from the given URLs
    graph = build_graph(urls)
    print("Graph structure:", graph)  # Print the structure of the graph (for debugging)

    # Calculate PageRank scores for the URLs in the graph
    scores = page_rank(graph)

    # Display the PageRank scores for each URL
    print("PageRank scores:", scores)


Graph structure: {'https://example.com': set(), 'https://google.com/page1': {'https://google.com/page1//www.google.com/'}, 'https://gmail.com/page2': {'https://gmail.com/page2//www.google.com/'}}
PageRank scores: {'https://example.com': 0.05000000000000001, 'https://google.com/page1': 0.05000000000000001, 'https://gmail.com/page2': 0.05000000000000001}


In [2]:
!pip install beautifulsoup4 requests


Collecting beautifulsoup4
  Downloading beautifulsoup4-4.12.3-py3-none-any.whl (147 kB)
     ---------------------------------------- 0.0/147.9 kB ? eta -:--:--
     ---------- ---------------------------- 41.0/147.9 kB 1.9 MB/s eta 0:00:01
     --------------- --------------------- 61.4/147.9 kB 656.4 kB/s eta 0:00:01
     ------------------------ ----------- 102.4/147.9 kB 737.3 kB/s eta 0:00:01
     ------------------------------------ 147.9/147.9 kB 882.6 kB/s eta 0:00:00
Collecting soupsieve>1.2
  Downloading soupsieve-2.6-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.12.3 soupsieve-2.6



[notice] A new release of pip is available: 23.0.1 -> 24.3.1
[notice] To update, run: C:\Users\khata\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip
