In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, sum as _sum, collect_list, size, explode
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Session settings, applied in order before the session is created.
_spark_settings = [
    # 30 GB of memory for the driver and for each executor.
    ("spark.driver.memory", "30g"),
    ("spark.executor.memory", "30g"),
    # 5 cores per executor, 1 CPU per task.
    ("spark.executor.cores", "5"),
    ("spark.task.cpus", "1"),
    # Write event logs to the given directory.
    ("spark.eventLog.enabled", "true"),
    ("spark.eventLog.dir", "/mnt/data/spark-event-logs"),
    # NOTE: Spark warns that the cluster manager may override this value
    # (SPARK_LOCAL_DIRS in standalone mode).
    ("spark.local.dir", "/mnt/data/temp"),
]

_session_builder = SparkSession.builder.appName("pageRank4_enwiki_task4_25")
for _setting_key, _setting_value in _spark_settings:
    _session_builder = _session_builder.config(_setting_key, _setting_value)

# Connect to the standalone master and create (or reuse) the session.
spark = _session_builder.master("spark://10.10.1.1:7077").getOrCreate()

# PageRank over the tab-separated (page, link) edge list stored in HDFS.
#
# Every page starts with rank 1.  On each of the 4 iterations a page gives
# rank / (number of outgoing links) to every page it links to, and a page's new
# rank becomes 0.15 + 0.85 * (sum of the contributions it received).  The final
# ranks are sorted in descending order and written back to HDFS as CSV.
#
# The whole pipeline runs inside try/finally so that the Spark application is
# always stopped -- and its executors released -- even when a step fails.
try:
    # Load data
    schema = StructType([
        StructField("page", StringType(), True),
        StructField("link", StringType(), True),
    ])
    df = spark.read.csv(
        "hdfs://10.10.1.1:9000/data/enwiki-pages-articles",
        sep="\t",
        schema=schema,
    )

    # Initialize page ranks
    pages = df.select("page").distinct()
    # The adjacency lists are joined against `ranks` in every iteration, so
    # mark them for caching: they are built once and reused afterwards instead
    # of being re-derived from the input.  cache() is lazy; nothing is computed
    # until the write below triggers the job.
    links = df.groupBy("page").agg(collect_list("link").alias("links")).cache()
    ranks = pages.select("page", lit(1).alias("rank"))

    # Calculate PageRank
    for iteration in range(4):
        # Each page splits its current rank evenly across its outgoing links.
        contributions = links.join(ranks, "page").select(
            "links", (col("rank") / size("links")).alias("contribution")
        )
        # One row per (target page, contribution received from one source).
        contributions = contributions.withColumn("link", explode("links")).select(
            "link", "contribution"
        )

        # NOTE: `ranks` is rebuilt only from pages that received at least one
        # contribution, so pages without inbound links drop out after the first
        # iteration and no longer contribute through the inner join above.
        ranks = contributions.groupBy("link").agg(
            _sum("contribution").alias("sum_contributions")
        )
        ranks = ranks.select(
            col("link").alias("page"),
            (lit(0.15) + lit(0.85) * col("sum_contributions")).alias("rank"),
        )

    # Sort the ranks in descending order
    ranks = ranks.orderBy(col("rank").desc())

    # Save results to HDFS
    ranks.write.format("csv").mode("overwrite").save(
        "hdfs://10.10.1.1:9000/data/pageRank_enwiki_task4_res"
    )
finally:
    # Always shut the application down, on success and on failure alike.
    spark.stop()

24/02/08 17:06:03 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).




24/02/08 17:09:21 ERROR TaskSchedulerImpl: Lost executor 1 on 10.10.1.2: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/02/08 17:09:21 WARN TaskSetManager: Lost task 118.0 in stage 8.0 (TID 397) (10.10.1.2 executor 1): ExecutorLostFailure (executor 1 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/02/08 17:09:21 WARN TaskSetManager: Lost task 117.0 in stage 8.0 (TID 396) (10.10.1.2 executor 1): ExecutorLostFailure (executor 1 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/02/08 17:09:21 WARN TaskSetManager: Lost task 123.0 in stage 8.0 (TID 402) (10.10.1.2 executor 1): ExecutorLostFailure (executor 1 exited caused b



24/02/08 17:09:24 WARN TaskSetManager: Lost task 119.0 in stage 8.0 (TID 398) (10.10.1.1 executor 0): FetchFailed(BlockManagerId(1, 10.10.1.2, 38389, None), shuffleId=0, mapIndex=84, mapId=84, reduceId=119, message=
org.apache.spark.shuffle.FetchFailedException
	at org.apache.spark.errors.SparkCoreErrors$.fetchFailedError(SparkCoreErrors.scala:312)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:1180)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:918)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:85)
	at org.apache.spark.util.CompletionIterator.next(CompletionIterator.scala:29)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.util.CompletionIterator.



24/02/08 17:09:25 WARN TaskSetManager: Lost task 124.0 in stage 8.0 (TID 403) (10.10.1.3 executor 2): FetchFailed(BlockManagerId(1, 10.10.1.2, 38389, None), shuffleId=0, mapIndex=5, mapId=5, reduceId=124, message=
org.apache.spark.shuffle.FetchFailedException
	at org.apache.spark.errors.SparkCoreErrors$.fetchFailedError(SparkCoreErrors.scala:312)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:1180)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:918)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:85)
	at org.apache.spark.util.CompletionIterator.next(CompletionIterator.scala:29)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.util.CompletionIterator.ha

[Stage 5:>   (0 + 7) / 30][Stage 6:>   (0 + 0) / 13][Stage 8:>(116 + 3) / 200]

24/02/08 17:09:25 WARN TaskSetManager: Lost task 127.0 in stage 8.0 (TID 406) (10.10.1.3 executor 2): FetchFailed(BlockManagerId(1, 10.10.1.2, 38389, None), shuffleId=0, mapIndex=5, mapId=5, reduceId=127, message=
org.apache.spark.shuffle.FetchFailedException
	at org.apache.spark.errors.SparkCoreErrors$.fetchFailedError(SparkCoreErrors.scala:312)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:1180)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:918)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:85)
	at org.apache.spark.util.CompletionIterator.next(CompletionIterator.scala:29)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.util.CompletionIterator.ha

                                                                                