In [10]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, sum as _sum, collect_list, size, explode
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

spark = (SparkSession.builder.appName("part3_task2")
         .config("spark.driver.memory", "30g")  # Sets the Spark driver memory to 30GB
         .config("spark.executor.memory", "30g")  # Sets the Spark executor memory to 30GB
         .config("spark.executor.cores", "5")  # Sets the number of cores for each executor to 5
         .config("spark.task.cpus", "1")  # Sets the number of cpus per task to be 1
         .config("spark.eventLog.enabled", "true")
         .config("spark.eventLog.dir", "/mnt/data/spark-event-logs")
         .config("spark.local.dir", "/mnt/data/temp") 
         .master("spark://10.10.1.1:7077")  
         .getOrCreate())

# Load data
schema = StructType([
    StructField("page", StringType(), True),
    StructField("link", StringType(), True)
])
df = spark.read.csv("hdfs://10.10.1.1:9000/data/enwiki-pages-articles", sep="\t", schema=schema)
print(f"Number of df partitions: {df.rdd.getNumPartitions()}")

# Initialize page ranks
pages = df.select("page").distinct()
print(f"Number of pages partitions: {pages.rdd.getNumPartitions()}")


links = df.groupBy("page").agg(collect_list("link").alias("links"))
print(f"Number of links partitions: {links.rdd.getNumPartitions()}")

ranks = pages.select("page", lit(1).alias("rank"))
print(f"Number of ranks partitions: {ranks.rdd.getNumPartitions()}")

# Calculate PageRank
for iteration in range(4):
    contributions = links.join(ranks, "page").select("links", (col("rank") / size("links")).alias("contribution"))
    print(f"Number of contributions1 partitions: {contributions.rdd.getNumPartitions()}")
    
    contributions = contributions.withColumn("link", explode("links")).select("link", "contribution")
    print(f"Number of contributions2 partitions: {contributions.rdd.getNumPartitions()}")
    
    ranks = contributions.groupBy("link").agg(_sum("contribution").alias("sum_contributions"))
    print(f"Number of ranks1 partitions: {ranks.rdd.getNumPartitions()}")
    
    ranks = ranks.select(col("link").alias("page"), (lit(0.15) + lit(0.85) * col("sum_contributions")).alias("rank"))
    print(f"Number of ranks2 partitions: {ranks.rdd.getNumPartitions()}")
    
# Sort the ranks in descending order
print(f"Number of ranks3 partitions: {ranks.rdd.getNumPartitions()}")
ranks = ranks.orderBy(col("rank").desc())
print(f"Number of ranks4 partitions: {ranks.rdd.getNumPartitions()}")

# Save results to HDFS
ranks.repartition(3).write.format("csv").mode("overwrite").save("hdfs://10.10.1.1:9000/data/part3_task2_res")

spark.stop()

Number of df partitions: 206




Number of pages partitions: 20




Number of links partitions: 100




Number of ranks partitions: 20




Number of contributions1 partitions: 100




Number of contributions2 partitions: 100




Number of ranks1 partitions: 100




Number of ranks2 partitions: 100




Number of contributions1 partitions: 100




Number of contributions2 partitions: 100




Number of ranks1 partitions: 100




Number of ranks2 partitions: 100




Number of contributions1 partitions: 100




Number of contributions2 partitions: 100




Number of ranks1 partitions: 100




Number of ranks2 partitions: 100




Number of contributions1 partitions: 100




Number of contributions2 partitions: 100




24/02/08 14:50:11 WARN TaskSetManager: Lost task 34.0 in stage 137.0 (TID 10462) (10.10.1.1 executor 0): java.io.IOException: No space left on device
	at java.io.FileOutputStream.writeBytes(Native Method)
	at java.io.FileOutputStream.write(FileOutputStream.java:326)
	at org.apache.spark.storage.TimeTrackingOutputStream.write(TimeTrackingOutputStream.java:59)
	at org.apache.spark.io.MutableCheckedOutputStream.write(MutableCheckedOutputStream.scala:43)
	at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82)
	at java.io.BufferedOutputStream.write(BufferedOutputStream.java:126)
	at net.jpountz.lz4.LZ4BlockOutputStream.flushBufferedData(LZ4BlockOutputStream.java:225)
	at net.jpountz.lz4.LZ4BlockOutputStream.write(LZ4BlockOutputStream.java:178)
	at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82)
	at java.io.BufferedOutputStream.write(BufferedOutputStream.java:126)
	at java.io.DataOutputStream.write(DataOutputStream.java:107)
	at org.apache.spark.sql.



24/02/08 14:50:23 WARN TaskSetManager: Lost task 88.0 in stage 137.0 (TID 10530) (10.10.1.1 executor 0): java.io.IOException: No space left on device
	at java.io.FileOutputStream.writeBytes(Native Method)
	at java.io.FileOutputStream.write(FileOutputStream.java:326)
	at org.apache.spark.storage.TimeTrackingOutputStream.write(TimeTrackingOutputStream.java:59)
	at org.apache.spark.io.MutableCheckedOutputStream.write(MutableCheckedOutputStream.scala:43)
	at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82)
	at java.io.BufferedOutputStream.write(BufferedOutputStream.java:126)
	at net.jpountz.lz4.LZ4BlockOutputStream.flushBufferedData(LZ4BlockOutputStream.java:225)
	at net.jpountz.lz4.LZ4BlockOutputStream.write(LZ4BlockOutputStream.java:178)
	at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82)
	at java.io.BufferedOutputStream.write(BufferedOutputStream.java:126)
	at java.io.DataOutputStream.write(DataOutputStream.java:107)
	at org.apache.spark.sql.



24/02/08 14:50:24 ERROR TaskSetManager: Task 59 in stage 137.0 failed 4 times; aborting job
24/02/08 14:50:24 WARN TaskSetManager: Lost task 89.1 in stage 137.0 (TID 10547) (10.10.1.1 executor 0): TaskKilled (Stage cancelled)
24/02/08 14:50:24 WARN TaskSetManager: Lost task 72.3 in stage 137.0 (TID 10548) (10.10.1.1 executor 0): TaskKilled (Stage cancelled)
24/02/08 14:50:24 WARN TaskSetManager: Lost task 98.0 in stage 137.0 (TID 10546) (10.10.1.1 executor 0): TaskKilled (Stage cancelled)
24/02/08 14:50:24 WARN TaskSetManager: Lost task 97.0 in stage 137.0 (TID 10544) (10.10.1.1 executor 0): TaskKilled (Stage cancelled)
24/02/08 14:50:24 WARN TaskSetManager: Lost task 99.0 in stage 137.0 (TID 10549) (10.10.1.1 executor 0): TaskKilled (Stage cancelled)




24/02/08 14:50:24 WARN TaskSetManager: Lost task 96.0 in stage 137.0 (TID 10543) (10.10.1.3 executor 1): TaskKilled (Stage cancelled)
24/02/08 14:50:24 WARN TaskSetManager: Lost task 95.0 in stage 137.0 (TID 10542) (10.10.1.3 executor 1): TaskKilled (Stage cancelled)
24/02/08 14:50:24 WARN TaskSetManager: Lost task 88.1 in stage 137.0 (TID 10545) (10.10.1.3 executor 1): TaskKilled (Stage cancelled)
24/02/08 14:50:24 WARN TaskSetManager: Lost task 94.0 in stage 137.0 (TID 10541) (10.10.1.3 executor 1): TaskKilled (Stage cancelled)
24/02/08 14:50:24 WARN TaskSetManager: Lost task 93.0 in stage 137.0 (TID 10540) (10.10.1.2 executor 2): TaskKilled (Stage cancelled)
24/02/08 14:50:24 WARN TaskSetManager: Lost task 92.0 in stage 137.0 (TID 10539) (10.10.1.2 executor 2): TaskKilled (Stage cancelled)
24/02/08 14:50:24 WARN TaskSetManager: Lost task 80.1 in stage 137.0 (TID 10535) (10.10.1.2 executor 2): TaskKilled (Stage cancelled)
24/02/08 14:50:24 WARN TaskSetManager: Lost task 79.1 in stage

Py4JJavaError: An error occurred while calling o1722.javaToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 59 in stage 137.0 failed 4 times, most recent failure: Lost task 59.3 in stage 137.0 (TID 10536) (10.10.1.1 executor 0): java.io.IOException: No space left on device
	at java.io.FileOutputStream.writeBytes(Native Method)
	at java.io.FileOutputStream.write(FileOutputStream.java:326)
	at org.apache.spark.storage.TimeTrackingOutputStream.write(TimeTrackingOutputStream.java:59)
	at org.apache.spark.io.MutableCheckedOutputStream.write(MutableCheckedOutputStream.scala:43)
	at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82)
	at java.io.BufferedOutputStream.write(BufferedOutputStream.java:126)
	at net.jpountz.lz4.LZ4BlockOutputStream.flushBufferedData(LZ4BlockOutputStream.java:225)
	at net.jpountz.lz4.LZ4BlockOutputStream.write(LZ4BlockOutputStream.java:178)
	at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82)
	at java.io.BufferedOutputStream.write(BufferedOutputStream.java:126)
	at java.io.DataOutputStream.write(DataOutputStream.java:107)
	at org.apache.spark.sql.catalyst.expressions.UnsafeRow.writeToStream(UnsafeRow.java:542)
	at org.apache.spark.sql.execution.UnsafeRowSerializerInstance$$anon$1.writeValue(UnsafeRowSerializer.scala:69)
	at org.apache.spark.storage.DiskBlockObjectWriter.write(DiskBlockObjectWriter.scala:310)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:171)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2668)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2604)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2603)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2603)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1178)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1178)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1178)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2798)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2787)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
Caused by: java.io.IOException: No space left on device
	at java.io.FileOutputStream.writeBytes(Native Method)
	at java.io.FileOutputStream.write(FileOutputStream.java:326)
	at org.apache.spark.storage.TimeTrackingOutputStream.write(TimeTrackingOutputStream.java:59)
	at org.apache.spark.io.MutableCheckedOutputStream.write(MutableCheckedOutputStream.scala:43)
	at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82)
	at java.io.BufferedOutputStream.write(BufferedOutputStream.java:126)
	at net.jpountz.lz4.LZ4BlockOutputStream.flushBufferedData(LZ4BlockOutputStream.java:225)
	at net.jpountz.lz4.LZ4BlockOutputStream.write(LZ4BlockOutputStream.java:178)
	at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82)
	at java.io.BufferedOutputStream.write(BufferedOutputStream.java:126)
	at java.io.DataOutputStream.write(DataOutputStream.java:107)
	at org.apache.spark.sql.catalyst.expressions.UnsafeRow.writeToStream(UnsafeRow.java:542)
	at org.apache.spark.sql.execution.UnsafeRowSerializerInstance$$anon$1.writeValue(UnsafeRowSerializer.scala:69)
	at org.apache.spark.storage.DiskBlockObjectWriter.write(DiskBlockObjectWriter.scala:310)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:171)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)




24/02/08 14:50:24 WARN TaskSetManager: Lost task 81.1 in stage 137.0 (TID 10537) (10.10.1.2 executor 2): TaskKilled (Stage cancelled)
24/02/08 14:50:24 WARN TaskSetManager: Lost task 91.0 in stage 137.0 (TID 10538) (10.10.1.2 executor 2): TaskKilled (Stage cancelled)


In [11]:
spark.stop()

In [5]:
spark = (SparkSession.builder.appName("part3_task2_small")
         .config("spark.driver.memory", "30g")  # Sets the Spark driver memory to 30GB
         .config("spark.executor.memory", "30g")  # Sets the Spark executor memory to 30GB
         .config("spark.executor.cores", "5")  # Sets the number of cores for each executor to 5
         .config("spark.task.cpus", "1")  # Sets the number of cpus per task to be 1
         .config("spark.eventLog.enabled", "true")
         .config("spark.eventLog.dir", "/mnt/data/spark-event-logs")
         .config("spark.local.dir", "/mnt/data/temp") 
         .master("spark://10.10.1.1:7077")  
         .getOrCreate())

# Load data
schema = StructType([
    StructField("page", StringType(), True),
    StructField("link", StringType(), True)
])
df = spark.read.csv("hdfs://10.10.1.1:9000/data/web-BerkStan.txt", sep="\t", schema=schema).repartition(4)
num_partitions = df.rdd.getNumPartitions()
print(f"Number of df partitions: {num_partitions}")

# Initialize page ranks
pages = df.select("page").distinct().repartition(4)
num_partitions = pages.rdd.getNumPartitions()
print(f"Number of pages partitions: {num_partitions}")

links = df.groupBy("page").agg(collect_list("link").alias("links")).repartition(4)
num_partitions = links.rdd.getNumPartitions()
print(f"Number of links partitions: {num_partitions}")

ranks = pages.select("page", lit(1).alias("rank")).repartition(4)
num_partitions = ranks.rdd.getNumPartitions()
print(f"Number of ranks partitions: {num_partitions}")

# Calculate PageRank
for iteration in range(3):
    contributions = links.join(ranks, "page").select("links", (col("rank") / size("links")).alias("contribution")).repartition(4)
    contributions = contributions.withColumn("link", explode("links")).select("link", "contribution")
    num_partitions = contributions.rdd.getNumPartitions()
    print(f"Number of contributions partitions: {num_partitions}")
    
    ranks = contributions.groupBy("link").agg(_sum("contribution").alias("sum_contributions")).repartition(4)
    ranks = ranks.select(col("link").alias("page"), (lit(0.15) + lit(0.85) * col("sum_contributions")).alias("rank"))
    num_partitions = ranks.rdd.getNumPartitions()
    print(f"Number of ranks 2 partitions: {num_partitions}")
    
# Sort the ranks in descending order
num_partitions = ranks.rdd.getNumPartitions()
print(f"Number of ranks 3 partitions: {num_partitions}")
ranks = ranks.orderBy(col("rank").desc()).repartition(4)
num_partitions = ranks.rdd.getNumPartitions()
print(f"Number of ranks 4 partitions: {num_partitions}")

# Save results to HDFS
ranks.write.format("csv").mode("overwrite").save("hdfs://10.10.1.1:9000/data/part3_task2_small_res")

spark.stop()

24/02/08 12:02:48 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).




Number of df partitions: 4




Number of pages partitions: 4




Number of links partitions: 4


[Stage 15:>                                                         (0 + 4) / 4]

Number of ranks partitions: 4




Number of contributions partitions: 4




Number of ranks 2 partitions: 4




Number of contributions partitions: 4


[Stage 265:>                                                        (0 + 4) / 4]

Number of ranks 2 partitions: 4




Number of contributions partitions: 4


[Stage 572:>                                                        (0 + 4) / 4]

Number of ranks 2 partitions: 4
Number of ranks 3 partitions: 4




Number of ranks 4 partitions: 4


                                                                                