In [None]:
sc.addPyFile("/opt/homebrew/Cellar/apache-spark/3.1.2/libexec/jars/graphframes-0.8.0-spark3.0-s_2.12.jar")

In [None]:
from graphframes import *
from pyspark.sql.functions import *

In [26]:
# Vertics DataFrame
v = spark.createDataFrame([
  ("a", "Alice", 34),
  ("b", "Bob", 36),
  ("c", "Charlie", 37),
  ("d", "David", 29),
  ("e", "Esther", 32),
  ("f", "Fanny", 38),
  ("g", "Gabby", 60)
], ["id", "name", "age"])

# Edges DataFrame
e = spark.createDataFrame([
  ("a", "b", "friend"),
  ("b", "c", "follow"),
  ("c", "b", "follow"),
  ("f", "c", "follow"),
  ("e", "f", "follow"),
  ("e", "d", "friend"),
  ("d", "a", "friend"),
  ("a", "e", "friend"),
  ("g", "e", "follow")
], ["src", "dst", "relationship"])

# Create a GraphFrame
g = GraphFrame(v, e)

g.vertices.show(1)
g.edges.show(1)
# A GraphFrame has additional attributes
g.outDegrees.show(3) # number point out
g.inDegrees.show(3) # number point in
#g.inDegrees.explain()

+---+-----+---+
| id| name|age|
+---+-----+---+
|  a|Alice| 34|
+---+-----+---+
only showing top 1 row

+---+---+------------+
|src|dst|relationship|
+---+---+------------+
|  a|  b|      friend|
+---+---+------------+
only showing top 1 row

+---+---------+
| id|outDegree|
+---+---------+
|  g|        1|
|  f|        1|
|  e|        2|
+---+---------+
only showing top 3 rows

+---+--------+
| id|inDegree|
+---+--------+
|  f|       1|
|  e|       2|
|  d|       1|
+---+--------+
only showing top 3 rows



In [27]:
# A triplet view of the graph
g.triplets.show()
#g.triplets.explain()

+----------------+--------------+----------------+
|             src|          edge|             dst|
+----------------+--------------+----------------+
| {e, Esther, 32}|{e, f, follow}|  {f, Fanny, 38}|
|  {g, Gabby, 60}|{g, e, follow}| {e, Esther, 32}|
|  {a, Alice, 34}|{a, e, friend}| {e, Esther, 32}|
| {e, Esther, 32}|{e, d, friend}|  {d, David, 29}|
|  {f, Fanny, 38}|{f, c, follow}|{c, Charlie, 37}|
|    {b, Bob, 36}|{b, c, follow}|{c, Charlie, 37}|
|{c, Charlie, 37}|{c, b, follow}|    {b, Bob, 36}|
|  {a, Alice, 34}|{a, b, friend}|    {b, Bob, 36}|
|  {d, David, 29}|{d, a, friend}|  {a, Alice, 34}|
+----------------+--------------+----------------+



In [None]:
# g.vertices and g.edges are just DataFrames
# You can use any DataFrame API on them
g.edges.filter("src = 'a'").show(1)
g.edges.filter("src = 'a'").count()

In [None]:
# Count the number of followers of c.
# This queries the edge DataFrame.
print(g.edges.filter("relationship = 'follow' and dst = 'c'").count())

In [18]:
# Rewrite indegree
myInDegrees = g.edges.groupBy('dst').count()\
               .withColumnRenamed('dst', 'id').withColumnRenamed('count', 'inDegree')
myInDegrees.show()
#myInDegrees.explain()

+---+--------+
| id|inDegree|
+---+--------+
|  f|       1|
|  e|       2|
|  d|       1|
|  c|       2|
|  b|       2|
|  a|       1|
+---+--------+



In [19]:
print(g.inDegrees.storageLevel)

Serialized 1x Replicated


In [20]:
g.inDegrees.cache()

DataFrame[id: string, inDegree: int]

In [21]:
print(g.inDegrees.storageLevel)

Disk Memory Deserialized 1x Replicated


In [22]:
print(g.vertices.storageLevel)

Serialized 1x Replicated


In [None]:
g.cache()

In [23]:
print(g.vertices.storageLevel)
print(g.edges.storageLevel)

Serialized 1x Replicated
Serialized 1x Replicated


# MotifFinding

In [41]:
#g.triplets.show()

In [42]:
g.find("(a)-[]->(b); (b)-[]->(a)").show()

+----------------+----------------+
|               a|               b|
+----------------+----------------+
|{c, Charlie, 37}|    {b, Bob, 36}|
|    {b, Bob, 36}|{c, Charlie, 37}|
+----------------+----------------+



In [43]:
# Search for pairs of vertices with edges in both directions between them.
motifs = g.find("(a)-[]->(b); (b)-[]->(a)").filter('a.id < b.id')
motifs.show()

+------------+----------------+
|           a|               b|
+------------+----------------+
|{b, Bob, 36}|{c, Charlie, 37}|
+------------+----------------+



In [46]:
# Find triangles
triangles = g.find("(a)-[]->(b); (b)-[]->(c); (c)-[]->(a)")
triangles = triangles.filter("a.id < b.id AND a.id < c.id")
triangles.show()

+--------------+---------------+--------------+
|             a|              b|             c|
+--------------+---------------+--------------+
|{a, Alice, 34}|{e, Esther, 32}|{d, David, 29}|
+--------------+---------------+--------------+



In [47]:
# Negation
oneway = g.find("(a)-[]->(b); !(b)-[]->(a)")
oneway.show()

+---------------+----------------+
|              a|               b|
+---------------+----------------+
| {a, Alice, 34}| {e, Esther, 32}|
|{e, Esther, 32}|  {d, David, 29}|
| {a, Alice, 34}|    {b, Bob, 36}|
| {g, Gabby, 60}| {e, Esther, 32}|
|{e, Esther, 32}|  {f, Fanny, 38}|
| {f, Fanny, 38}|{c, Charlie, 37}|
| {d, David, 29}|  {a, Alice, 34}|
+---------------+----------------+



In [48]:
# Find vertices without incoming edges:
g.find("!()-[]->(a)").show()

+--------------+
|             a|
+--------------+
|{g, Gabby, 60}|
+--------------+



In [57]:
# More meaningful queries can be expressed by applying filters.
# Question: where is this filter applied?
g.find("(a)-[e]->(b); (b)-[]->(a)").show()#.filter("b.age > 36").show()
#g.find("(a)-[]->(b); (b)-[]->(a)").filter("b.age > 36").explain()

+----------------+--------------+----------------+
|               a|             e|               b|
+----------------+--------------+----------------+
|{c, Charlie, 37}|{c, b, follow}|    {b, Bob, 36}|
|    {b, Bob, 36}|{b, c, follow}|{c, Charlie, 37}|
+----------------+--------------+----------------+



In [58]:
# Find chains of 4 vertices such that at least 2 of the 3 edges are "friend" relationships.
# The when function is similar to the CASE WHEN in SQL

chain4 = g.find("(a)-[e1]->(b); (b)-[e2]->(c); (c)-[e3]->(d)").where('a!=d AND a!=c AND b!=d')

friendTo1 = lambda e: when(e['relationship'] == 'friend', 1).otherwise(0)

chain4.select('*',friendTo1(chain4['e1']).alias('f1'), \
                  friendTo1(chain4['e2']).alias('f2'), \
                  friendTo1(chain4['e3']).alias('f3')) \
      .where('f1 + f2 + f3 >= 2').select('a', 'b', 'c', 'd').show()

+---------------+---------------+---------------+----------------+
|              a|              b|              c|               d|
+---------------+---------------+---------------+----------------+
| {d, David, 29}| {a, Alice, 34}|{e, Esther, 32}|  {f, Fanny, 38}|
| {d, David, 29}| {a, Alice, 34}|   {b, Bob, 36}|{c, Charlie, 37}|
|{e, Esther, 32}| {d, David, 29}| {a, Alice, 34}|    {b, Bob, 36}|
| {g, Gabby, 60}|{e, Esther, 32}| {d, David, 29}|  {a, Alice, 34}|
+---------------+---------------+---------------+----------------+



In [59]:
# Select subgraph of users older than 30, and relationships of type "friend".
# Drop isolated vertices (users) which are not contained in any edges (relationships).

g1 = g.filterVertices("age > 30").filterEdges("relationship = 'friend'")\
      .dropIsolatedVertices()

g1.vertices.show()
g1.edges.show()

+---+------+---+
| id|  name|age|
+---+------+---+
|  e|Esther| 32|
|  b|   Bob| 36|
|  a| Alice| 34|
+---+------+---+

+---+---+------------+
|src|dst|relationship|
+---+---+------------+
|  a|  e|      friend|
|  a|  b|      friend|
+---+---+------------+



# Quiz

In [89]:
# Find Alice's two-hop neighbors' names, regardless of the edge type.
g.find("(a)-[]->(b); (b)-[]->(c); !(a)-[]->(c)")\
.where("a.id='a'")\
.select('c.name')\
.distinct().show()

+-------+
|   name|
+-------+
|  Fanny|
|Charlie|
|  David|
+-------+



In [87]:
# Redo the previous question, but exclude Alice's two-hop neighbors who have an edge back to Alice.
g.find("(a)-[]->(b); (b)-[]->(c); !(a)-[]->(c); (c)-[]->(a)")\
.where("a.id='a'")\
.select('c.name')\
.distinct().show()

+-----+
| name|
+-----+
|David|
+-----+



In [100]:
# Find all people who follow Charlie.
g.find("(a)-[e]->(b)")\
.where("b.id='c' and e.relationship='follow'")\
.select("a.name")\
.distinct().show()

+-----+
| name|
+-----+
|Fanny|
|  Bob|
+-----+



In [129]:
# Find all people who are being followed by at least 2 people.
g.find("(a)-[e]->(b)")\
.where("e.relationship='follow'")\
.select('b.name', 'e.src')\
.groupBy('name')\
.count()\
.where("count>=2").show()

+-------+-----+
|   name|count|
+-------+-----+
|Charlie|    2|
+-------+-----+

