In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, sum as _sum, collect_list, size, explode
from pyspark.sql.types import StructType, StructField, IntegerType

# Tunable job parameters, gathered in one place so a reader can change them
# without hunting through the pipeline below.
HDFS_ROOT = "hdfs://10.10.1.1:9000"
INPUT_PATH = HDFS_ROOT + "/data/web-BerkStan.txt"
OUTPUT_PATH = HDFS_ROOT + "/data/pageRank_lyang439_3"
EVENT_LOG_DIR = HDFS_ROOT + "/eventLogs"
NUM_ITERATIONS = 10
# rank = BASE_RANK + DAMPING * (sum of inbound contributions).
# Kept as two literals on purpose: 1 - 0.85 is not exactly 0.15 in floating point.
BASE_RANK = 0.15
DAMPING = 0.85

spark = (SparkSession.builder.appName("pageRank4")
         .config("spark.driver.memory", "30g")  # Sets the Spark driver memory to 30GB
         .config("spark.executor.memory", "30g")  # Sets the Spark executor memory to 30GB
         .config("spark.executor.cores", "5")  # Sets the number of cores for each executor to 5
         .config("spark.task.cpus", "1")  # Sets the number of cpus per task to be 1
         .config("spark.eventLog.enabled", "true")
         .config("spark.eventLog.dir", EVENT_LOG_DIR)
         .getOrCreate())

# Load the edge list: one tab-separated "page<TAB>link" pair of integer ids per line.
schema = StructType([
    StructField("page", IntegerType(), True),
    StructField("link", IntegerType(), True)
])
# comment="#" makes the reader skip lines that start with '#'. Without it such
# lines do not fit the integer schema and (in the reader's default permissive
# mode) end up as rows of nulls. Harmless if the file has no such lines.
df = spark.read.csv(INPUT_PATH, sep="\t", schema=schema, comment="#")

# Initialize page ranks: every page that has at least one outgoing link starts at 1.
pages = df.select("page").distinct()
# `links` (page -> list of outgoing link targets) is joined again in every
# iteration; caching it avoids re-reading the CSV and redoing the groupBy each time.
links = df.groupBy("page").agg(collect_list("link").alias("links")).cache()
ranks = pages.select("page", lit(1).alias("rank"))

# Calculate PageRank
for iteration in range(NUM_ITERATIONS):
    # Each page splits its current rank evenly across its outgoing links.
    # The inner join drops pages that currently have no rank row.
    contributions = links.join(ranks, "page").select("links", (col("rank") / size("links")).alias("contribution"))
    # One row per (target page, contribution received from one source page).
    contributions = contributions.withColumn("link", explode("links")).select("link", "contribution")

    # New rank of a page = BASE_RANK + DAMPING * total contribution it received.
    # Only pages that received at least one contribution appear in the new `ranks`.
    ranks = contributions.groupBy("link").agg(_sum("contribution").alias("sum_contributions"))
    ranks = ranks.select(col("link").alias("page"), (lit(BASE_RANK) + lit(DAMPING) * col("sum_contributions")).alias("rank"))

# Sort the ranks in descending order
ranks = ranks.orderBy(col("rank").desc())

# Save results to HDFS (overwrites any previous output at the same path)
ranks.write.format("csv").mode("overwrite").save(OUTPUT_PATH)

spark.stop()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/06 12:11:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

In [3]:
# Stops the Spark session. The first cell already ends with spark.stop(), so
# this call looks redundant — presumably a manual cleanup after an interrupted
# run; NOTE(review): confirm whether this cell is still needed.
spark.stop()

In [4]:
# Print the first 100 lines of the saved "page,rank" CSV output from HDFS.
# NOTE(review): this reads /data/pageRank_lyang439_2, but the job in the first
# cell writes to /data/pageRank_lyang439_3 — confirm which run's output is meant.
!hdfs dfs -cat /data/pageRank_lyang439_2/* | head -n 100

272919,6531.324623752468
438238,4335.3231585644535
571448,2383.897607411886
601656,2195.394075596731
316792,1855.690875790153
319209,1632.819368497569
184094,1532.284237448338
571447,1492.9301630938758
401873,1436.160093346927
66244,1261.578395867347
68949,1260.7919421349268
284306,1257.2475650644858
68948,1251.1723536459365
77284,1235.298540597639
68947,1235.298540597639
68946,1235.298540597639
66909,1235.298540597639
86239,1235.2985405976387
86237,1235.2985405976387
95551,1235.2985405976387
96070,1235.2985405976385
95552,1235.2985405976385
86238,1235.2985405976378
768,1225.5975665113062
927,1117.83830511419
210376,920.6701252803653
95527,919.6797146521216
100130,916.0190658202812
101163,912.5380530106072
95018,911.1831080078113
100646,909.7095673033139
96045,904.3981315809871
66879,895.7909746044886
210305,893.0386730972384
319412,887.9352083382672
571451,875.785254625559
570985,871.5825582573203
544858,869.6096568148233
184142,863.2307781841789
299039,832.3149809807296
49176,819.868