In [26]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql import types 

# Start (or reuse) the Spark session for this notebook.
# The application name is what shows up in the Spark UI and logs, so it is
# named after the Marvel hero graph analysed below.
spark = (
    SparkSession.builder
    .appName("MarvelHeroPopularity")
    .getOrCreate()
)

In [27]:
# Notebook display settings: with eager evaluation on, a DataFrame that is
# the last expression of a cell is rendered as a table.
eager_eval_options = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.repl.eagerEval.maxNumRows": 100,  # row cap for the rendered table
    "spark.sql.repl.eagerEval.truncate": 30,     # character cap per displayed cell
}
for option_key, option_value in eager_eval_options.items():
    spark.conf.set(option_key, option_value)

In [28]:
# Load the graph file as plain text: one row per line, in a single
# string column named "value".
graph_path = "../data/marvel_graph.txt"
graph_raw_df = spark.read.text(graph_path)
graph_raw_df

value
5988 748 1722 3752 4655 574...
5989 4080 4264 4446 3779 24...
5982 217 595 1194 3308 2940...
5983 1165 3836 4361 1282 71...
5980 2731 3712 1587 6084 24...
5981 3569 5353 4087 2653 20...
5986 2658 3712 2650 1265 13...
5987 2614 5716 1765 1818 29...
5984 590 4898 745 3805 2650...
5985 3233 2254 212 2023 272...


In [29]:
# Break every line into its space-separated tokens, kept as an array column.
parts_col = func.split(func.col("value"), " ").alias("parts")
graph_split_df = graph_raw_df.select(parts_col)
graph_split_df

parts
"[5988, 748, 1722, 3752, 465..."
"[5989, 4080, 4264, 4446, 37..."
"[5982, 217, 595, 1194, 3308..."
"[5983, 1165, 3836, 4361, 12..."
"[5980, 2731, 3712, 1587, 60..."
"[5981, 3569, 5353, 4087, 26..."
"[5986, 2658, 3712, 2650, 12..."
"[5987, 2614, 5716, 1765, 18..."
"[5984, 590, 4898, 745, 3805..."
"[5985, 3233, 2254, 212, 202..."


In [30]:
# Per input line: the first token is the hero id, the remaining tokens are
# the ids it is connected to on that line.
graph_df_tight = graph_split_df.select(
    func.col("parts")[0].cast("int").alias("hero_id"),
    # Splitting on " " produces empty-string tokens for a trailing space or a
    # doubled space. Remove them before counting instead of assuming exactly
    # one empty token per line, then subtract 1 for the hero's own id.
    func.expr("size(array_remove(parts, '')) - 1").alias("friends"),
)
graph_df_tight

hero_id,friends
5988,48
5989,40
5982,42
5983,14
5980,24
5981,17
5986,142
5987,81
5984,41
5985,19


In [31]:
# Collapse to one row per hero id by adding up its per-line friend counts
# (covers the case where the same id starts more than one line).
total_friends = func.sum(func.col("friends")).alias("friends")
graph_df = graph_df_tight.groupBy("hero_id").agg(total_friends)
graph_df

hero_id,friends
2142,222
3749,11
5803,7
1580,18
3918,497
496,45
6336,44
1342,19
833,8
1645,19


In [32]:
# Number of rows in graph_df, i.e. the number of distinct hero_id groups
# produced by the aggregation above.
graph_df.count()

6486

In [33]:
# Load the id -> name lookup file as raw text lines (single "value" column);
# parsing into columns happens in the next step.
names_path = "../data/marvel_names.txt"
hero_names_raw = spark.read.text(names_path)
hero_names_raw

value
"1 ""24-HOUR MAN/EMMANUEL"""
"2 ""3-D MAN/CHARLES CHAN"""
"3 ""4-D MAN/MERCURIO"""
"4 ""8-BALL/"""
"5 ""A"""
"6 ""A'YIN"""
"7 ""ABBOTT, JACK"""
"8 ""ABCISSA"""
"9 ""ABEL"""
"10 ""ABOMINATION/EMIL BLO"""


In [34]:
# Expected line layout: <digits> "<name>"
# Group 1 captures the numeric id, group 2 the text between the quotes.
rg = r'^(\d+) "(.+)"$'

hero_id_col = func.regexp_extract("value", rg, 1).try_cast("int").alias("hero_id")
hero_name_col = func.regexp_extract("value", rg, 2).alias("hero_name")

# A line that does not match yields an empty id string, which try_cast turns
# into null; dropna() then discards those rows.
hero_names_df = hero_names_raw.select(hero_id_col, hero_name_col).dropna()
hero_names_df

hero_id,hero_name
1,24-HOUR MAN/EMMANUEL
2,3-D MAN/CHARLES CHAN
3,4-D MAN/MERCURIO
4,8-BALL/
5,A
6,A'YIN
7,"ABBOTT, JACK"
8,ABCISSA
9,ABEL
10,ABOMINATION/EMIL BLO


In [35]:
# Register both DataFrames as temp views so they can be queried with SQL.
for view_name, view_df in (("friend", graph_df), ("hero", hero_names_df)):
    view_df.createOrReplaceTempView(view_name)

# Ten heroes with the highest friend totals, joined to their names.
top_10_query = """
SELECT h.hero_id, h.hero_name, f.friends
FROM friend f
JOIN hero h ON h.hero_id == f.hero_id
ORDER BY f.friends DESC
LIMIT 10
"""
top_10_popular_heroes = spark.sql(top_10_query)
top_10_popular_heroes

hero_id,hero_name,friends
859,CAPTAIN AMERICA,1933
5306,SPIDER-MAN/PETER PAR,1741
2664,IRON MAN/TONY STARK,1528
5716,THING/BENJAMIN J. GR,1426
6306,WOLVERINE/LOGAN,1394
3805,MR. FANTASTIC/REED R,1386
2557,HUMAN TORCH/JOHNNY S,1371
4898,SCARLET WITCH/WANDA,1345
5736,THOR/DR. DONALD BLAK,1289
403,BEAST/HENRY &HANK& P,1280


In [39]:
# The aggregate query returns a single row with a single field; take that
# field out as a plain Python value.
min_friends = spark.sql("SELECT min(friends) FROM friend").first()[0]
min_friends

0

In [41]:

# Heroes whose friend total equals the minimum found in the previous cell.
# min_friends is interpolated into the SQL text as a literal.
obscure_query = f"""
SELECT h.hero_id, h.hero_name
FROM friend f
JOIN hero h ON h.hero_id == f.hero_id
WHERE f.friends = {min_friends}
"""
obscure_heroes = spark.sql(obscure_query)
obscure_heroes

hero_id,hero_name
3490,MARVEL BOY II/MARTIN
4517,RANDAK
5028,SHARKSKIN
577,BLARE/
4784,RUNE
1089,CLUMSY FOULUP
2911,KULL
3489,MARVEL BOY/MARTIN BU
1841,FENRIS
3298,LUNATIK II
