# Assignment - 04
+ Implementing the PageRank Algorithm

In [None]:
import networkx as nx
import requests
from bs4 import BeautifulSoup

def fetch_html(url, timeout=10):
    """Fetch HTML content from a URL.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            an explicit timeout ``requests.get`` may wait forever on an
            unresponsive host, stalling the whole crawl.

    Returns:
        The response body as text, or ``None`` when the request fails
        (connection problem, timeout, or an HTTP error status). The
        failure is reported on stdout rather than raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx statuses into exceptions
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None
    return response.text

def parse_html(html_content):
    """Return the absolute link targets found in an HTML document.

    Every ``<a>`` tag carrying an ``href`` attribute is inspected; only
    hrefs beginning with "http" are kept, which filters out relative
    URLs. Duplicates are not removed.
    """
    document = BeautifulSoup(html_content, 'html.parser')
    anchors = document.find_all('a', href=True)
    targets = (anchor['href'] for anchor in anchors)
    return [target for target in targets if target.startswith('http')]

def build_graph(urls):
    """Build a directed graph based on the links between the given URLs.

    Args:
        urls: Iterable of page URLs (hashable strings). Repeated entries
            are tolerated and processed only once.

    Returns:
        An ``nx.DiGraph`` with one node per URL that was fetched
        successfully and an edge ``source -> target`` whenever the page
        at ``source`` links to another URL from ``urls``. Links are
        matched by exact string equality, so variants of the same
        address (trailing slash, query string, ...) do not match.
    """
    graph = nx.DiGraph()

    # Drop repeated URLs while keeping first-seen order: each page is then
    # downloaded once, and the dict doubles as an O(1) membership test
    # instead of scanning the input list for every extracted link.
    targets = dict.fromkeys(urls)

    for url in targets:
        html_content = fetch_html(url)
        if not html_content:
            continue  # Fetch failed or page was empty; nothing to link from

        graph.add_node(url)  # Keep the page even if it has no matching links
        for link in parse_html(html_content):
            if link in targets:  # Ignore links that leave the analysed set
                graph.add_edge(url, link)

    return graph

def calculate_pagerank(graph):
    """Return the PageRank scores for the nodes of ``graph``.

    Delegates to ``nx.pagerank`` with its default settings and hands back
    whatever it returns.
    """
    return nx.pagerank(graph)

if __name__ == "__main__":
    # Pages to rank (adjust the URLs as needed). Each page is listed once;
    # repeated entries would add nothing to the graph.
    urls_to_analyze = [
        'https://pll.harvard.edu/subject/data-science',
        'https://www.ibm.com/topics/data-science',
        'https://www.coursera.org/specializations/jhu-data-science',
        'https://www.youtube.com/watch?v=GhFgnkLPZj4',
    ]

    # Step 1: Build the link graph between the listed pages
    web_graph = build_graph(urls_to_analyze)

    # Step 2: Calculate the PageRank of every node in that graph
    pagerank_values = calculate_pagerank(web_graph)

    # Step 3: Display the results, one line per URL
    print("PageRank Values:")
    for url, rank in pagerank_values.items():
        print(f"URL: {url}, PageRank: {rank:.4f}")

PageRank Values:
URL: https://pll.harvard.edu/subject/data-science, PageRank: 0.2500
URL: https://www.ibm.com/topics/data-science, PageRank: 0.2500
URL: https://www.coursera.org/specializations/jhu-data-science, PageRank: 0.2500
URL: https://www.youtube.com/watch?v=GhFgnkLPZj4, PageRank: 0.2500
