# In [None]:
from pyspark.sql import (
    functions as f,
    SparkSession,
    types as t
)

# Attribution 3.0 Unported (CC BY 3.0)
# https://www.kaggle.com/datasets/csanhueza/the-marvel-universe-social-network

# Start (or reuse) the Spark session used by the rest of the script.
spark = SparkSession.builder.appName("df_most_popular").getOrCreate()

# Load the hero network CSV: the first line supplies the column names and
# the column types are inferred from the data.
csv_file_path = "file:///home/jovyan/work/sample/hero-network.csv"
df = spark.read.csv(csv_file_path, header=True, inferSchema=True)

# Build one row per hero holding the set of distinct heroes it is linked to.
# collect_set drops duplicates, so a repeated (hero1, hero2) pair counts once.
# NOTE(review): only the hero1 -> hero2 direction is aggregated. If the edge
# list stores an undirected pair only once, heroes that appear solely as
# hero2 are missed — confirm against the dataset.
data = (
    df.groupBy("hero1")
    .agg(f.collect_set("hero2").alias("connection"))
    .withColumnRenamed("hero1", "hero")
    # Count while "connection" is still an array. Deriving the count later by
    # splitting the joined string on "," is wrong whenever a hero name itself
    # contains a comma (it would be counted as several connections), and it
    # reports 1 instead of 0 for an empty set.
    .withColumn("connection_size", f.size(f.col("connection")))
    # The CSV writer cannot store array columns, so flatten the set into a
    # single comma-separated string (kept for display / export only).
    .withColumn("connection", f.concat_ws(",", f.col("connection")))
)
data.show()

# Use one location for both the write and the read-back; a relative "output"
# on the write side only matches the absolute path on the read side when the
# working directory and default filesystem happen to line up.
csv_file_path = "file:///home/jovyan/work/output"

# coalesce(1) reduces the result to a single partition so a single part file
# is written. mode("overwrite") lets the cell be re-run: the default save
# mode raises an error when the target path already exists.
data.coalesce(1).write.mode("overwrite").option("header", True).csv(csv_file_path)

# Load the exported result back; inferSchema restores connection_size as an
# integer column so it sorts numerically.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(csv_file_path)
)

# Rank heroes by number of distinct connections, most connected first.
# The secondary sort on "hero" makes the order deterministic when several
# heroes share the same connection count.
df = df.orderBy(f.desc("connection_size"), f.asc("hero"))
df.show()

# Take the first row's hero name; `df` is expected to be sorted already so
# that the most connected hero comes first.
most_popular_hero = df.select("hero").first()
if most_popular_hero is None:
    # first() returns None for an empty DataFrame; say so explicitly instead
    # of failing with an AttributeError on `.hero`.
    print("No heroes found: the ranked DataFrame is empty.")
else:
    print(most_popular_hero.hero)