In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, sum as _sum, collect_list, size, explode
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# ---------------------------------------------------------------------------
# PageRank benchmark over a tab-separated (page, link) edge list stored in HDFS.
#
# Each iteration, every page that currently has a rank splits that rank evenly
# across its outgoing links; a page's new rank is
#     BASE_RANK + DAMPING_FACTOR * (sum of contributions it received).
# The final ranks are sorted in descending order and written to HDFS as CSV.
# ---------------------------------------------------------------------------

# Tunable settings, collected in one place so a reader can change them safely.
MASTER_URL = "spark://10.10.1.1:7077"
INPUT_PATH = "hdfs://10.10.1.1:9000/data/enwiki-pages-articles"
OUTPUT_PATH = "hdfs://10.10.1.1:9000/data/part3_benchmark_res"
NUM_ITERATIONS = 4
INITIAL_RANK = 1.0
# Kept as two explicit literals: computing 1 - 0.85 in floating point would not
# give exactly 0.15 and would slightly change the resulting ranks.
BASE_RANK = 0.15
DAMPING_FACTOR = 0.85

spark = (SparkSession.builder.appName("part3_benchmark")
         .config("spark.driver.memory", "30g")  # Sets the Spark driver memory to 30GB
         .config("spark.executor.memory", "30g")  # Sets the Spark executor memory to 30GB
         .config("spark.executor.cores", "5")  # Sets the number of cores for each executor to 5
         .config("spark.task.cpus", "1")  # Sets the number of cpus per task to be 1
         .config("spark.eventLog.enabled", "true")
         .config("spark.eventLog.dir", "/mnt/data/spark-event-logs")
         # NOTE(review): Spark logs a warning that spark.local.dir is overridden by
         # the cluster manager's SPARK_LOCAL_DIRS in standalone mode -- confirm the
         # workers are configured with the intended scratch directory.
         .config("spark.local.dir", "/mnt/data/temp")
         .master(MASTER_URL)
         .getOrCreate())

# Everything after session creation runs inside try/finally so the session (and
# the executors it holds on the cluster) is released even if the job fails.
try:
    # Load data: one (page, link) edge per line, tab-separated.
    schema = StructType([
        StructField("page", StringType(), True),
        StructField("link", StringType(), True)
    ])
    df = spark.read.csv(INPUT_PATH, sep="\t", schema=schema)

    # Initialize page ranks: every distinct source page starts at INITIAL_RANK.
    pages = df.select("page").distinct()
    # Adjacency list: one row per source page with all of its outgoing links.
    links = df.groupBy("page").agg(collect_list("link").alias("links"))
    ranks = pages.select("page", lit(INITIAL_RANK).alias("rank"))

    # Calculate PageRank.
    # NOTE(review): after the first iteration `ranks` only contains pages that
    # received at least one contribution, so the inner join below drops the link
    # lists of pages with no inbound links. This mirrors the stock Spark PageRank
    # example; confirm it is the intended treatment of such pages.
    for _ in range(NUM_ITERATIONS):
        # Each ranked page gives rank / out_degree to every page it links to.
        contributions = (
            links.join(ranks, "page")
            .select("links", (col("rank") / size("links")).alias("contribution"))
        )
        contributions = (
            contributions.withColumn("link", explode("links"))
            .select("link", "contribution")
        )

        # Sum the contributions received by each link target and apply damping.
        ranks = contributions.groupBy("link").agg(_sum("contribution").alias("sum_contributions"))
        ranks = ranks.select(
            col("link").alias("page"),
            (lit(BASE_RANK) + lit(DAMPING_FACTOR) * col("sum_contributions")).alias("rank"),
        )

    # Sort the ranks in descending order
    ranks = ranks.orderBy(col("rank").desc())

    # Save results to HDFS (overwrites any previous run's output).
    ranks.write.format("csv").mode("overwrite").save(OUTPUT_PATH)
finally:
    spark.stop()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/02/08 10:31:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/02/08 10:31:52 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


                                                                                

In [4]:
# Print the first 100 lines of the CSV part files stored under the given HDFS directory.
# NOTE(review): this reads /data/pageRank_enwiki_task1_res, whereas the PageRank cell
# above writes to /data/part3_benchmark_res -- confirm this is the intended result set.
!hdfs dfs -cat /data/pageRank_enwiki_task1_res/* | head -n 100

Project:AWB,35838.032145759484
WP:AES,30393.765128183117
Template:Redirect category shell,2884.716344355547
Special:DoubleRedirects,2719.107731847807
Wikipedia:Deletion review,2637.5522902511057
Category:Living people,2605.9503527614465
Wikipedia:Coordinates in infoboxes,2519.029977299664
WP:HC,2338.1262566653186
WP:GenFixes,1672.662058631576
Communes of France,1583.3889240918768
WP:F5,1296.7969193347044
User talk:Tom.Reding#Revise unhelpful potatoes MP#R,1246.1326767195278
United States,1245.173620287483
France,1202.7509778357037
Animal,1196.671056270897
Template:Taxonbar,1188.7357172394811
:Category:Taxonbar templates without from parameter,1168.2432227487902
Wikipedia:Articles for deletion/PAGENAME (2nd nomination),1143.2250085599621
Meanings of minor planet names,1111.0425842912314
en:User:COIBot#Blacklist,1014.4217942738013
Germany,975.1261313378924
Arthropod,974.3451008492086
bugzilla:42616,964.6366771344454
User:KolbertBot,955.2024184513782
Insect,901.642945834143
Help:Using tal