# Data Engineering 2: Lab 06 - Solution
---------------

In [0]:
# Install graphframes as a library on your cluster first (use the jar from the zip archive)

from graphframes import *
from pyspark.sql.functions import desc

#TODO: replace these paths with your csv upload paths

celebrities = spark.read.option("inferSchema", "true").csv("/FileStore/tables/celebrities.csv", sep=",", header=True)
celebrities.printSchema()

followers = spark.read.option("inferSchema", "true").csv("/FileStore/tables/followers.csv", sep=",", header=True)
followers.printSchema()

In [0]:
# TODO: print 10 entries of the celebrities
display(celebrities.take(10))

# TODO: print 3 entries of the Followers == edge table
display(followers.take(3))

In [0]:
#TODO: build graph
celebrity_follower = GraphFrame(celebrities, followers)
celebrity_follower.vertices.printSchema()
celebrity_follower.edges.printSchema()
print(str(celebrity_follower.vertices.count()))

In [0]:
#TODO: print the age of the youngest celebrity
display(celebrity_follower.vertices.groupBy().min("age"))

#TODO: print the age of the oldest celebrity
oldest = celebrity_follower.vertices.groupBy().max("age")
display(oldest)

In [0]:
# Find out more info on the oldest person in the graph
# Note: we can use the collect() API to return the list of results (in this case, only 1) 
# so in order to get the actual value for maxAge we need to:

# TODO: 1. collect the results
results = oldest.collect()

# TODO: 2. get the first element in the list == a row
firstElem = results[0]

# TODO: 3. get the value of the field we need and convert it to a string. Note: it's better to address by name, but could also have done it by ID (ie "firstElem[0]")
maxAge = str(firstElem["max(age)"])

# TODO: 4. filter by age, where the age must be = the maximum age we just computed
display(celebrity_follower.vertices.filter("age=" + maxAge))

id,Name,Handle,Age,Profession
80,Dalai Lama,@DalaiLama,81,religious


In [0]:
# TODO: print the average age of all celebrities
avg = celebrity_follower.vertices.groupBy().avg("age")
display(avg)

In [0]:
# TODO: print the number of incoming edges per vertex
display(celebrity_follower.inDegrees)

In [0]:
# TODO: print the number of outgoing edges per vertex
display(celebrity_follower.outDegrees)

In [0]:
# TODO: print the most influential celebrity will have the biggest number of followers
# first we need to retrieve this value
maxNumFollowers = celebrity_follower.inDegrees.groupBy().max("inDegree")
display(maxNumFollowers)

In [0]:
# TODO: Find out more info on the most influential in the graph

# we use the same trick as above to get the value of the id for the most influential celebrity
# the collect() API always returns a list
# so we need to take the first element (row) and then extract from it the id
maxNumFollowersValue = maxNumFollowers.collect()[0]["max(inDegree)"]


mostFollowedCelebrityID = celebrity_follower.inDegrees.filter('inDegree=' + str(maxNumFollowersValue))
# we use a filter to return the id of the celebrity with that number of followers
display(mostFollowedCelebrityID)

In [0]:
# TODO: Get more information about the most followed celebrity. Join back (by ID) with the original celebrities dataframe and print the result.
mostFollowedCelebrity = celebrity_follower.vertices.join(mostFollowedCelebrityID, "id")
display(mostFollowedCelebrity)

In [0]:
# TODO: count the number of celebrities by profession and sort descending
display(celebrity_follower.vertices.groupBy("Profession").count().sort(desc("count")))

In [0]:
# TODO: print reciprocal relationships (like in the lecture)
motifs = celebrity_follower.find("(person1) - [follows1] -> (person2); (person2) - [follows2] -> (person1)")
display(motifs)

person1,follows1,person2,follows2
"List(8, Ellen DeGeneres, @TheEllenShow, 59, tv show host)","List(8, 1, follow)","List(1, Katy Perry, @katyperry, 32, singer)","List(1, 8, follow)"
"List(15, Ariana Grande, @ArianaGrande, 23, singer)","List(15, 1, follow)","List(1, Katy Perry, @katyperry, 32, singer)","List(1, 15, follow)"
"List(7, Justin Timberlake, @jtimberlake, 36, singer)","List(7, 1, follow)","List(1, Katy Perry, @katyperry, 32, singer)","List(1, 7, follow)"
"List(5, Lady Gaga, @ladygaga, 31, singer)","List(5, 1, follow)","List(1, Katy Perry, @katyperry, 32, singer)","List(1, 5, follow)"
"List(24, Kaka, @KAKA, 34, sports player)","List(24, 1, follow)","List(1, Katy Perry, @katyperry, 32, singer)","List(1, 24, follow)"
"List(20, Harry Styles, @Harry_Styles, 23, singer)","List(20, 1, follow)","List(1, Katy Perry, @katyperry, 32, singer)","List(1, 20, follow)"
"List(12, Jennifer Lopez, @JLO, 47, singer)","List(12, 1, follow)","List(1, Katy Perry, @katyperry, 32, singer)","List(1, 12, follow)"
"List(45, Conan O'Brien, @ConanOBrien, 53, tv show host)","List(45, 1, follow)","List(1, Katy Perry, @katyperry, 32, singer)","List(1, 45, follow)"
"List(18, P!nk, @Pink, 37, singer)","List(18, 1, follow)","List(1, Katy Perry, @katyperry, 32, singer)","List(1, 18, follow)"
"List(3, Barack Obama, @BarackObama, 55, politician)","List(3, 1, follow)","List(1, Katy Perry, @katyperry, 32, singer)","List(1, 3, follow)"


In [0]:
# TODO: print the motifs of the query before and filter for person1 = Lady Gaga
filtered = motifs.filter("person1.Name = 'Lady Gaga'")
display(filtered)

person1,follows1,person2,follows2
"List(5, Lady Gaga, @ladygaga, 31, singer)","List(5, 1, follow)","List(1, Katy Perry, @katyperry, 32, singer)","List(1, 5, follow)"
"List(5, Lady Gaga, @ladygaga, 31, singer)","List(5, 3, follow)","List(3, Barack Obama, @BarackObama, 55, politician)","List(3, 5, follow)"
"List(5, Lady Gaga, @ladygaga, 31, singer)","List(5, 4, follow)","List(4, Taylor Swift, @taylorswift13, 27, singer)","List(4, 5, follow)"
"List(5, Lady Gaga, @ladygaga, 31, singer)","List(5, 13, follow)","List(13, Shakira, @shakira, 40, singer)","List(13, 5, follow)"
"List(5, Lady Gaga, @ladygaga, 31, singer)","List(5, 14, follow)","List(14, Selena Gomez, @selenagomez, 24, singer)","List(14, 5, follow)"
"List(5, Lady Gaga, @ladygaga, 31, singer)","List(5, 19, follow)","List(19, Jimmy Fallon, @jimmyfallon, 42, tv show host)","List(19, 5, follow)"
"List(5, Lady Gaga, @ladygaga, 31, singer)","List(5, 36, follow)","List(36, Kevin Hart, @KevinHart4real, 37, actor)","List(36, 5, follow)"
"List(5, Lady Gaga, @ladygaga, 31, singer)","List(5, 38, follow)","List(38, Nicki Minaj, @NICKIMINAJ, 34, singer)","List(38, 5, follow)"
"List(5, Lady Gaga, @ladygaga, 31, singer)","List(5, 73, follow)","List(73, Snoop Dogg, @SnoopDogg, 45, singer)","List(73, 5, follow)"
"List(5, Lady Gaga, @ladygaga, 31, singer)","List(5, 86, follow)","List(86, Marcos Mion, @marcosmion, 37, tv show host)","List(86, 5, follow)"


In [0]:
# TODO: print the motifs of the query before and filter out all professions = singer of person2
filtered2 = motifs.filter("person1.Name = 'Lady Gaga'").filter("person2.Profession != 'singer'")
display(filtered2)

person1,follows1,person2,follows2
"List(5, Lady Gaga, @ladygaga, 31, singer)","List(5, 3, follow)","List(3, Barack Obama, @BarackObama, 55, politician)","List(3, 5, follow)"
"List(5, Lady Gaga, @ladygaga, 31, singer)","List(5, 19, follow)","List(19, Jimmy Fallon, @jimmyfallon, 42, tv show host)","List(19, 5, follow)"
"List(5, Lady Gaga, @ladygaga, 31, singer)","List(5, 36, follow)","List(36, Kevin Hart, @KevinHart4real, 37, actor)","List(36, 5, follow)"
"List(5, Lady Gaga, @ladygaga, 31, singer)","List(5, 86, follow)","List(86, Marcos Mion, @marcosmion, 37, tv show host)","List(86, 5, follow)"
