In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("marvel_most_popular_hero").master("local[*]").getOrCreate()

In [2]:
from pyspark.sql.types import StructField, StructType, IntegerType, StringType

schema = StructType([
    StructField("hero_id", IntegerType(), True),
    StructField("name", StringType(), True)
])

In [3]:
names = spark.read.csv("Marvel_Names", sep = " ", schema= schema)
lines = spark.read.text("Marvel_Graph")

In [4]:
from pyspark.sql.functions import split, sum, col, size, desc, min

connections = lines.withColumn("id", split(lines["value"], ' ')[0]) 
connections = connections.withColumn("connection_count", size(split(lines["value"]," "))-1) 
connections = connections.select("id", "connection_count")
connections = connections.groupBy("id").agg(sum("connection_count").alias("connection_count")).orderBy(desc("connection_count"))

inner_join = connections.join(names, names.hero_id == connections.id).orderBy(desc("connection_count"))

inner_join = inner_join.select("hero_id", "name", "connection_count")

inner_join.show(10, truncate = False)



+-------+--------------------+----------------+
|hero_id|name                |connection_count|
+-------+--------------------+----------------+
|859    |CAPTAIN AMERICA     |1937            |
|5306   |SPIDER-MAN/PETER PAR|1745            |
|2664   |IRON MAN/TONY STARK |1532            |
|5716   |THING/BENJAMIN J. GR|1429            |
|6306   |WOLVERINE/LOGAN     |1397            |
|3805   |MR. FANTASTIC/REED R|1389            |
|2557   |HUMAN TORCH/JOHNNY S|1374            |
|4898   |SCARLET WITCH/WANDA |1348            |
|5736   |THOR/DR. DONALD BLAK|1292            |
|403    |BEAST/HENRY &HANK& P|1283            |
+-------+--------------------+----------------+
only showing top 10 rows



In [5]:
#all superheroes with minimum connections
min_connections = inner_join.agg(min(inner_join["connection_count"])).collect()
min_connections = int(min_connections[0][0])

min_connection_heroes = inner_join.filter(inner_join["connection_count"] == min_connections)

min_connection_heroes.show()


+-------+--------------------+----------------+
|hero_id|                name|connection_count|
+-------+--------------------+----------------+
|    467|        BERSERKER II|               1|
|    577|              BLARE/|               1|
|   3490|MARVEL BOY II/MARTIN|               1|
|   2139|      GIURESCU, RADU|               1|
|   3489|MARVEL BOY/MARTIN BU|               1|
|   1089|       CLUMSY FOULUP|               1|
|   1841|              FENRIS|               1|
|   4517|              RANDAK|               1|
|   5028|           SHARKSKIN|               1|
|    835|     CALLAHAN, DANNY|               1|
|   1408|         DEATHCHARGE|               1|
|   4784|                RUNE|               1|
|   4602|         RED WOLF II|               1|
|   4945|         SEA LEOPARD|               1|
|   6411|              ZANTOR|               1|
|   3014|JOHNSON, LYNDON BAIN|               1|
|   3298|          LUNATIK II|               1|
|   2911|                KULL|          