In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, sum as _sum, collect_list, size, explode
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Session settings for the shuffle-partition experiment (all values are passed as strings).
session_conf = {
    "spark.driver.memory": "30g",        # Spark driver memory: 30GB
    "spark.executor.memory": "30g",      # Spark executor memory: 30GB
    "spark.executor.cores": "5",         # cores for each executor
    "spark.task.cpus": "1",              # cpus per task
    "spark.eventLog.enabled": "true",
    "spark.eventLog.dir": "/mnt/data/spark-event-logs",
    "spark.local.dir": "/mnt/data/temp",
    "spark.sql.shuffle.partitions": "400",
}

# Name the application, point it at the cluster master, then apply every setting.
builder = SparkSession.builder.appName("part3_task2_conf_shuffle_400").master("spark://10.10.1.1:7077")
for conf_key, conf_value in session_conf.items():
    builder = builder.config(conf_key, conf_value)
spark = builder.getOrCreate()

# Input is a tab-separated edge list: one (page, link) pair of nullable strings per row.
schema = StructType([StructField(name, StringType(), True) for name in ("page", "link")])
df = spark.read.csv(
    "hdfs://10.10.1.1:9000/data/enwiki-pages-articles",
    sep="\t",
    schema=schema,
)

# Adjacency lists: every source page together with the list of its outgoing links.
links = df.groupBy(col("page")).agg(collect_list(col("link")).alias("links"))

# Every distinct source page starts with a rank of 1.
pages = df.select(col("page")).distinct()
ranks = pages.select(col("page"), lit(1).alias("rank"))

# Run four PageRank rounds; each round replaces `ranks` with the newly computed values.
for iteration in range(4):
    # A page hands an equal share of its current rank to each of its outgoing links.
    per_link_share = (col("rank") / size("links")).alias("contribution")
    contributions = (
        links.join(ranks, "page")
        .select("links", per_link_share)
        .withColumn("link", explode("links"))
        .select("link", "contribution")
    )

    # New rank of a link target = 0.15 + 0.85 * (sum of the shares it received).
    damped_rank = (lit(0.15) + lit(0.85) * col("sum_contributions")).alias("rank")
    ranks = (
        contributions.groupBy("link")
        .agg(_sum("contribution").alias("sum_contributions"))
        .select(col("link").alias("page"), damped_rank)
    )
    
# Write the final ranks to HDFS as 3 CSV part files, highest rank first.
#
# Calling repartition(3) after orderBy() reshuffles the rows round-robin and
# throws the sort away, so the ordering is expressed through the partitioning
# itself: repartitionByRange() places the highest ranks in the first partition
# and sortWithinPartitions() orders the rows inside every part file.
rank_desc = col("rank").desc()
ranks = ranks.repartitionByRange(3, rank_desc).sortWithinPartitions(rank_desc)

try:
    # Save results to HDFS (this is the action that actually runs the whole job).
    ranks.write.format("csv").mode("overwrite").save("hdfs://10.10.1.1:9000/data/part3_task2_res")
finally:
    # Release the session even when the job fails, so a failed run does not
    # leave the application registered on the cluster.
    spark.stop()

24/02/08 21:38:25 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


                                                                                

In [11]:
# NOTE(review): stand-alone cleanup cell whose execution count (In [11]) is out of
# order with the surrounding cells; presumably run by hand to release a session left
# open by an interrupted run — confirm, or delete before sharing the notebook.
spark.stop()

In [5]:
# Session settings for the small-dataset run (all values are passed as strings).
session_conf = {
    "spark.driver.memory": "30g",        # Spark driver memory: 30GB
    "spark.executor.memory": "30g",      # Spark executor memory: 30GB
    "spark.executor.cores": "5",         # cores for each executor
    "spark.task.cpus": "1",              # cpus per task
    "spark.eventLog.enabled": "true",
    "spark.eventLog.dir": "/mnt/data/spark-event-logs",
    "spark.local.dir": "/mnt/data/temp",
}

# Name the application, point it at the cluster master, then apply every setting.
builder = SparkSession.builder.appName("part3_task2_small").master("spark://10.10.1.1:7077")
for conf_key, conf_value in session_conf.items():
    builder = builder.config(conf_key, conf_value)
spark = builder.getOrCreate()

# Input is a tab-separated edge list: one (page, link) pair of nullable strings per row.
# NOTE(review): if this is the unmodified SNAP download, the file begins with '#'
# header lines that would be read as rows — confirm they were stripped beforehand.
schema = StructType([StructField(name, StringType(), True) for name in ("page", "link")])
df = spark.read.csv(
    "hdfs://10.10.1.1:9000/data/web-BerkStan.txt",
    sep="\t",
    schema=schema,
).repartition(4)
num_partitions = df.rdd.getNumPartitions()
print(f"Number of df partitions: {num_partitions}")

# Distinct source pages, forced into 4 partitions.
pages = df.select(col("page")).distinct().repartition(4)
num_partitions = pages.rdd.getNumPartitions()
print(f"Number of pages partitions: {num_partitions}")

# Adjacency lists: every source page together with the list of its outgoing links.
links = (
    df.groupBy(col("page"))
    .agg(collect_list(col("link")).alias("links"))
    .repartition(4)
)
num_partitions = links.rdd.getNumPartitions()
print(f"Number of links partitions: {num_partitions}")

# Every distinct source page starts with a rank of 1.
ranks = pages.select(col("page"), lit(1).alias("rank")).repartition(4)
num_partitions = ranks.rdd.getNumPartitions()
print(f"Number of ranks partitions: {num_partitions}")

# Run three PageRank rounds, reporting the partition count after each stage.
for iteration in range(3):
    # A page hands an equal share of its current rank to each of its outgoing links;
    # the joined rows are forced into 4 partitions before the link lists are exploded.
    per_link_share = (col("rank") / size("links")).alias("contribution")
    contributions = (
        links.join(ranks, "page")
        .select("links", per_link_share)
        .repartition(4)
        .withColumn("link", explode("links"))
        .select("link", "contribution")
    )
    num_partitions = contributions.rdd.getNumPartitions()
    print(f"Number of contributions partitions: {num_partitions}")

    # New rank of a link target = 0.15 + 0.85 * (sum of the shares it received).
    damped_rank = (lit(0.15) + lit(0.85) * col("sum_contributions")).alias("rank")
    ranks = (
        contributions.groupBy("link")
        .agg(_sum("contribution").alias("sum_contributions"))
        .repartition(4)
        .select(col("link").alias("page"), damped_rank)
    )
    num_partitions = ranks.rdd.getNumPartitions()
    print(f"Number of ranks 2 partitions: {num_partitions}")
    
# Order the final ranks (highest first), report partition counts, and write them to HDFS.
try:
    num_partitions = ranks.rdd.getNumPartitions()
    print(f"Number of ranks 3 partitions: {num_partitions}")

    # Calling repartition(4) after orderBy() reshuffles the rows round-robin and
    # throws the sort away. Range-partitioning on the descending rank keeps the
    # highest ranks in the first partition, and sortWithinPartitions() orders the
    # rows inside each of the 4 part files.
    rank_desc = col("rank").desc()
    ranks = ranks.repartitionByRange(4, rank_desc).sortWithinPartitions(rank_desc)
    num_partitions = ranks.rdd.getNumPartitions()
    print(f"Number of ranks 4 partitions: {num_partitions}")

    # Save results to HDFS
    ranks.write.format("csv").mode("overwrite").save("hdfs://10.10.1.1:9000/data/part3_task2_small_res")
finally:
    # Release the session even when a job fails, so a failed run does not
    # leave the application registered on the cluster.
    spark.stop()

24/02/08 12:02:48 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).




Number of df partitions: 4




Number of pages partitions: 4




Number of links partitions: 4


[Stage 15:>                                                         (0 + 4) / 4]

Number of ranks partitions: 4




Number of contributions partitions: 4




Number of ranks 2 partitions: 4




Number of contributions partitions: 4


[Stage 265:>                                                        (0 + 4) / 4]

Number of ranks 2 partitions: 4




Number of contributions partitions: 4


[Stage 572:>                                                        (0 + 4) / 4]

Number of ranks 2 partitions: 4
Number of ranks 3 partitions: 4




Number of ranks 4 partitions: 4


                                                                                