# Imports

In [1]:
from pyspark import SparkContext

# Helper Functions

In [None]:
def parse_links(line):
    """
    Parse one line of pagelinks.txt into (source, targets).

    Example: '1: [49, 5]' -> (1, [49, 5])

    Parameters:
    line (str): A line of the form '<source>: <literal>', where the text
                after the colon is a Python literal such as '[49, 5]'.

    Returns:
    tuple: (source, targets), where source is the int before the colon and
           targets is the parsed literal after it.

    Raises:
    ValueError: If the source is not an integer, or the text after the colon
                is not a plain literal (e.g. it contains a function call).
    SyntaxError: If the text after the colon is not valid Python syntax.
    IndexError: If the line contains no colon.
    """
    # Imported locally so the function stays self-contained.
    import ast

    parts = line.strip().split(":")
    source = int(parts[0])
    # The file content is data, not code: literal_eval only accepts literals,
    # so a crafted line cannot execute arbitrary expressions the way eval()
    # would. Leading whitespace is stripped explicitly because literal_eval
    # only tolerates it from Python 3.10 onwards.
    targets = ast.literal_eval(parts[1].strip())
    return (source, targets)


def compute_contributions(record):
    """
    Split a page's current rank evenly among the pages it links to.

    Parameters:
    record (tuple): (page, (neighbors, rank)), where
                      - page: the source page ID (not used in the result)
                      - neighbors (list): destination page IDs
                      - rank (float): the page's current rank

    Returns:
    list[tuple]: One (neighbor, contribution) pair per entry in neighbors,
                 each contribution being rank / len(neighbors).
                 An empty list when the page has no outgoing links.
    """
    _, (outlinks, current_rank) = record
    fanout = len(outlinks)
    if not fanout:
        # No outgoing links: nothing to distribute (and avoids dividing by 0).
        return []
    per_link = current_rank / fanout
    return [(destination, per_link) for destination in outlinks]

# Page Rank

## Initialize `SparkContext`

In [3]:
# Create the Spark context with master "local" and application name "PageRank".
sc = SparkContext(master="local", appName="PageRank")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/07 18:52:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Load and parse the link structure

In [4]:
# Read the raw text lines of the link file.
lines = sc.textFile("pagelinks.txt")
# Turn every line into a (source, targets) pair.
links = lines.map(parse_links)
# Mark the parsed link structure for caching; it is reused by the joins
# in the PageRank iterations.
links.cache()

## Initialize all ranks to 1.0

In [5]:
# Keep each source page as the key and replace its neighbor list with the
# starting rank of 1.0.
ranks = links.mapValues(lambda _neighbors: 1.0)

## PageRank Iterations

In [6]:
damping_factor = 0.85
num_iterations = 10

# A zero contribution for every source page. Without it, a page that receives
# no contributions (no inbound links) has no key in the reduced RDD, drops out
# of `ranks` after the first iteration, stops contributing through the join,
# and never receives the (1 - damping_factor) base rank.
zero_contributions = links.mapValues(lambda _: 0.0)

for i in range(num_iterations):
    # Pair each page's neighbor list with its current rank.
    joined = links.join(ranks)
    # Spread every page's rank evenly over its outgoing links.
    contributions_rdd = joined.flatMap(compute_contributions)
    # Sum the contributions per page (adding 0.0 leaves existing sums
    # unchanged), then apply the damping formula.
    ranks = (
        zero_contributions.union(contributions_rdd)
        .reduceByKey(lambda a, b: a + b)
        .mapValues(lambda rank: (1 - damping_factor) + damping_factor * rank)
    )

## Collect and sort final ranks

In [7]:
# Bring the (page, rank) pairs back to the driver.
final_ranks = ranks.collect()
# Sort a copy by rank, highest first, leaving `final_ranks` in collected order.
sorted_ranks = list(final_ranks)
sorted_ranks.sort(key=lambda pair: pair[1], reverse=True)

## Output top and bottom results

In [8]:
# Print the first five and the last five entries of the sorted ranking,
# each group under its own heading.
for heading, subset in (
    ("\nTop 5 nodes by PageRank:", sorted_ranks[:5]),
    ("\nBottom 5 nodes by PageRank:", sorted_ranks[-5:]),
):
    print(heading)
    for item in subset:
        print(f"Node {item[0]} → Rank: {item[1]:.5f}")


Top 5 nodes by PageRank:
Node 49 → Rank: 3.28504
Node 33 → Rank: 2.74142
Node 62 → Rank: 2.67268
Node 58 → Rank: 2.27572
Node 61 → Rank: 2.08110

Bottom 5 nodes by PageRank:
Node 98 → Rank: 0.25399
Node 65 → Rank: 0.24168
Node 20 → Rank: 0.23869
Node 66 → Rank: 0.20009
Node 75 → Rank: 0.17398


## Print highest and lowest

In [9]:
# The sorted list is in descending rank order, so its first entry is the
# highest-ranked node and its last entry the lowest-ranked one.
(highest_node, highest_score), (lowest_node, lowest_score) = (
    sorted_ranks[0],
    sorted_ranks[-1],
)

print(f"\nHighest Rank: Node {highest_node} with score {highest_score:.5f}")
print(f"Lowest Rank: Node {lowest_node} with score {lowest_score:.5f}")


Highest Rank: Node 49 with score 3.28504
Lowest Rank: Node 75 with score 0.17398


## Stop `SparkContext`

In [None]:
# Stop the SparkContext created earlier in the notebook.
sc.stop()