In [1]:
import requests
from bs4 import BeautifulSoup
import networkx as nx
import pandas as pd

In [2]:
def get_links_from_wikipedia(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    links = []
    for link in soup.find_all('a', href=True):
        href = link.get('href')
        if href.startswith('/wiki/') and ':' not in href:
            links.append(href)
    return links

In [5]:
def build_wikipedia_graph(start_url, max_depth, max_pages):
    G = nx.DiGraph()
    to_visit = [(start_url, 0)]
    visited = set()

    while to_visit:
        current_url, depth = to_visit.pop(0)

        if current_url not in visited and depth < max_depth:
            links = get_links_from_wikipedia("https://en.wikipedia.org" + current_url)
            visited.add(current_url)

            for link in links[:max_pages]:
                G.add_edge(current_url, link)
                to_visit.append((link, depth + 1))

    return G

In [6]:

def calculate_pagerank(graph):
    return nx.pagerank(graph)

In [7]:
start_url = "/wiki/PageRank"
max_depth = 2
max_pages = 10

In [8]:
graph = build_wikipedia_graph(start_url, max_depth, max_pages)
pagerank = calculate_pagerank(graph)

In [9]:
result_df = pd.DataFrame(pagerank.items(), columns=['Page', 'PageRank Score'])
result_df = result_df.sort_values(by='PageRank Score', ascending=False)

result_df


Unnamed: 0,Page,PageRank Score
1,/wiki/Main_Page,0.049815
3,/wiki/Google_Search,0.029284
11,/wiki/Smoking_on_My_Ex_Pack,0.029141
7,/wiki/Wikipedia,0.029141
8,/wiki/Free_content,0.029141
9,/wiki/Encyclopedia,0.029141
10,/wiki/English_language,0.029141
2,/wiki/Algorithm,0.028609
5,/wiki/Search_engine,0.028609
6,/wiki/Larry_Page,0.028609
