In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, lit, coalesce, collect_list, size, sum as Fsum
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
import requests
import csv
import os

spark = SparkSession.builder \
    .appName("Task04_PageRank") \
    .getOrCreate()


In [2]:
def normalize_url(base_url, link):
    parsed = urlparse(link)
    if parsed.netloc and parsed.netloc != urlparse(base_url).netloc:
        return None
    url = urljoin(base_url, parsed.path or "")
    url = url.split('#')[0]
    if url.endswith('/') and len(url) > len(base_url):
        url = url[:-1]
    return url

def crawl_and_save(start_url, output_csv, max_pages=5000, max_depth=3):
    from collections import deque

    visited = set([start_url])
    queue = deque([(start_url, 0)])
    static_ext = {'.jpg', '.jpeg', '.png', '.gif', '.svg', '.css', '.js',
                  '.ico', '.pdf', '.mp4', '.woff', '.ttf'}

    with open(output_csv, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["src", "dst"])

        while queue and len(visited) < max_pages:
            src, depth = queue.popleft()
            if depth >= max_depth:
                continue
            try:
                resp = requests.get(src, timeout=5)
                if 'text/html' not in resp.headers.get('Content-Type', ''):
                    continue
                soup = BeautifulSoup(resp.text, 'html.parser')
                for a in soup.find_all('a', href=True):
                    dst = normalize_url(start_url, a['href'])
                    if not dst or any(dst.lower().endswith(ext) for ext in static_ext):
                        continue
                    writer.writerow([src, dst])
                    if dst not in visited:
                        visited.add(dst)
                        queue.append((dst, depth + 1))
            except:
                pass


In [3]:
start_url = "https://it.tdtu.edu.vn"
edge_file = "edges.csv"

# Step 1: crawl and write directly to disk
crawl_and_save(start_url, edge_file, max_pages=5000, max_depth=3)

# Step 2: load edges into Spark
edges_df = spark.read.csv(edge_file, header=True) \
                     .select(col("src"), col("dst")) \
                     .persist()


In [4]:
class PageRank:
    def __init__(self, edges_df, damping=0.85):
        self.edges = edges_df
        self.damping = damping

    def run(self, num_iters=10):
        src_pages = self.edges.select("src").distinct()
        dst_pages = self.edges.select("dst").distinct()
        pages = src_pages.union(dst_pages).distinct().persist()

        N = pages.count()
        ranks = pages.withColumn("rank", lit(1.0 / N))

        links = self.edges.groupBy("src") \
                          .agg(collect_list("dst").alias("neighbors")) \
                          .persist()

        for _ in range(num_iters):
            contribs = links.join(ranks, "src") \
                            .select(explode("neighbors").alias("dst"),
                                    (col("rank") / size("neighbors")).alias("contrib"))

            sums = contribs.groupBy("dst") \
                           .agg(Fsum("contrib").alias("total_contrib"))

            dangling = ranks.join(links, "src", "left_anti")
            dangling_sum = dangling.agg(Fsum("rank")).first()[0] or 0.0
            dangling_share = dangling_sum / N

            teleport = (1.0 - self.damping) / N

            ranks = pages.withColumnRenamed("src", "dst") \
                         .join(sums, "dst", how="left") \
                         .select(
                             col("dst").alias("src"),
                             (lit(teleport) +
                              lit(self.damping) * (
                                  coalesce(col("total_contrib"), lit(0.0)) +
                                  lit(dangling_share)
                              )).alias("rank")
                         )

        self.ranks = ranks.persist()
        return self.ranks


In [5]:
# Run PageRank
pr = PageRank(edges_df, damping=0.85)
final_ranks = pr.run(num_iters=20)

# Show top 20 ranked pages
final_ranks.orderBy(col("rank").desc()) \
           .show(20, truncate=False)

# Write results to disk
final_ranks.orderBy(col("rank").desc()) \
           .write.csv("pagerank_output", header=True, mode="overwrite")

spark.stop()


+-----------------------------------------------------------------------------------------------------------------------+--------------------+
|src                                                                                                                    |rank                |
+-----------------------------------------------------------------------------------------------------------------------+--------------------+
|https://it.tdtu.edu.vn                                                                                                 |0.07672297279113768 |
|https://it.tdtu.edu.vn/giao-duc                                                                                        |0.04427371946846487 |
|https://it.tdtu.edu.vn/tuyen-sinh                                                                                      |0.03661015116861993 |
|https://it.tdtu.edu.vn/gioi-thieu                                                                                      |0.03627394582362061 |