In [15]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import collect_list, lit, explode, size, col,split, sum as sqlsum

# Resource settings applied to the "pageRank" Spark application.
spark_settings = {
    "spark.driver.memory": "30g",    # driver memory: 30GB
    "spark.executor.memory": "30g",  # memory per executor: 30GB
    "spark.executor.cores": "5",     # cores per executor
    "spark.task.cpus": "1",          # cpus reserved per task
}

# Feed each setting to the builder, then create the session (or reuse an existing one).
session_builder = SparkSession.builder.appName("pageRank")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)
spark = session_builder.getOrCreate()

# HDFS location of the input edge list
data_path = "hdfs://10.10.1.1:9000/data/web-BerkStan.txt"

# Load the file as one string column named 'value' and discard every line
# that begins with '#'
is_comment_line = col("value").startswith("#")
links_df_raw = spark.read.text(data_path).where(~is_comment_line)

# Break each remaining line on tab characters. Lines that do not produce
# exactly two fields are dropped; the two fields become 'source' and 'target'.
tab_separated_fields = split(col("value"), "\t")
links_df = (
    links_df_raw
    .withColumn("split_value", tab_separated_fields)
    .where(size(col("split_value")) == 2)
    .select(
        col("split_value").getItem(0).alias("source"),
        col("split_value").getItem(1).alias("target"),
    )
)

# Deduplicate the edges, then gather every source page's targets into a list
# column named 'targets'.
#
# The grouped adjacency table is joined against the ranks on every PageRank
# iteration. It is cached so that the read / split / distinct / groupBy
# pipeline is evaluated once instead of being recomputed for each iteration.
links_grouped = (
    links_df.distinct()
    .groupBy("source")
    .agg(collect_list("target").alias("targets"))
    .cache()
)

# Give every page that appears as a source an initial rank of 1.0.
# Note: pages that only ever appear as targets get no initial row here.
ranks_df = links_grouped.select("source").withColumn("rank", lit(1.0))

# How many PageRank update rounds to run
num_iterations = 10

# The adjacency table does not change between rounds, so alias it once.
# The alias lets the join below use qualified, unambiguous column names.
links_alias = links_grouped.alias("links")

for iteration in range(num_iterations):
    ranks_alias = ranks_df.alias("ranks")

    # Inner join on the page id: only sources that currently have a rank row
    # take part in this round.
    links_with_rank = links_alias.join(
        ranks_alias, col("links.source") == col("ranks.source")
    )

    # Each source splits its rank evenly across its targets: one output row
    # per (source, target) link carrying rank / out-degree.
    per_link_contribution = links_with_rank.select(
        explode("links.targets").alias("target"),
        (col("ranks.rank") / size("links.targets")).alias("contribution"),
    )

    # Add up everything each target received in this round.
    contributions = (
        per_link_contribution
        .groupBy("target")
        .agg(sqlsum("contribution").alias("total_contributions"))
    )

    # New rank = 0.15 + 0.85 * received contributions. The 'target' column is
    # renamed to 'source' so the result can be joined again next round.
    # Pages that received no contribution have no row in the new ranks_df.
    ranks_df = contributions.select(
        col("target").alias("source"),
        (lit(0.15) + lit(0.85) * col("total_contributions")).alias("rank"),
    )


# Order the pages from highest to lowest rank. No collect() is issued here;
# the sorted DataFrame is written out directly.
sorted_results = ranks_df.orderBy(ranks_df['rank'].desc())

try:
    # Save results to HDFS as CSV, replacing any previous output at this path
    sorted_results.write.format("csv").mode("overwrite").save("hdfs://10.10.1.1:9000/data/pageRank_chang256")
finally:
    # Stop the session even when the write raises, so a failed run does not
    # leave the Spark application (and its executors) running.
    spark.stop()

ConnectionRefusedError: [Errno 111] Connection refused

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/users/lyang439/.local/lib/python3.7/site-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/users/lyang439/.local/lib/python3.7/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/users/lyang439/.local/lib/python3.7/site-packages/py4j/clientserver.py", line 540, in send_command
    "Error while sending or receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while sending or receiving


In [12]:
# Concatenate every file under /data/pageRank_chang256 on HDFS and show only
# the first 100 lines of the combined output.
!hdfs dfs -cat /data/pageRank_chang256/* | head -n 100

272919,6531.324623752463
438238,4335.323158564451
571448,2383.897607411885
601656,2195.39407559673
316792,1855.6908757901526
319209,1632.8193684975708
184094,1532.2842374483505
571447,1492.930163093876
401873,1436.1600933469401
66244,1261.5783958673446
68949,1260.7919421349236
284306,1257.2475650644853
68948,1251.1723536459333
86239,1235.298540597636
66909,1235.298540597636
77284,1235.298540597636
95552,1235.298540597636
96070,1235.298540597636
86238,1235.2985405976358
86237,1235.2985405976358
95551,1235.2985405976358
68946,1235.2985405976358
68947,1235.2985405976358
768,1225.5975665113083
927,1117.8383051141857
210376,920.670125280365
95527,919.6797146521208
100130,916.0190658202802
101163,912.5380530106063
95018,911.1831080078103
100646,909.7095673033128
96045,904.3981315809862
66879,895.7909746044877
210305,893.0386730972384
319412,887.9352083382672
571451,875.7852546255594
570985,871.5825582573203
544858,869.6096568148229
184142,863.2307781841776
299039,832.3149809807288
49176,819.