In [47]:
import pixiedust
pixiedust.installPackage("graphframes:graphframes:0.1.0-spark1.6")

Package already installed: graphframes:graphframes:0.1.0-spark1.6


<maven.artifact.Artifact at 0x7f5d2615f4d0>

In [48]:
from graphframes import GraphFrame

In [49]:
# Vertex DataFrame
v = sqlContext.createDataFrame([
  ("a", "Alice", 34),
  ("b", "Bob", 36),
  ("c", "Charlie", 30),
  ("d", "David", 29),
  ("e", "Esther", 32),
  ("f", "Fanny", 36),
  ("g", "Gabby", 60)
], ["id", "name", "age"])

# Edge DataFrame
e = sqlContext.createDataFrame([
  ("a", "b", "friend"),
  ("b", "c", "follow"),
  ("c", "b", "follow"),
  ("f", "c", "follow"),
  ("e", "f", "follow"),
  ("e", "d", "friend"),
  ("d", "a", "friend"),
  ("a", "e", "friend")
], ["src", "dst", "relationship"])

In [50]:
# Create a GraphFrame
g = GraphFrame(v, e)

In [51]:
# take a look at the vertices (show)
g.vertices.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  a|  Alice| 34|
|  b|    Bob| 36|
|  c|Charlie| 30|
|  d|  David| 29|
|  e| Esther| 32|
|  f|  Fanny| 36|
|  g|  Gabby| 60|
+---+-------+---+



In [None]:
# take a look at the edges (show)
g.edges.show()

In [39]:
# find the youngest user in the group                 # g.vertices.groupBy().min("age").show()
g.vertices.groupBy().min("age").show()

+--------+
|min(age)|
+--------+
|      29|
+--------+



In [40]:
# how many follows are in the graph? 
numFollows = g.edges.filter("relationship = 'follow'").count()

print "Total number of follows: " + str(numFollows)

Total number of follows: 4


In [52]:
# Motif finding (DSL)
# (a) - [e] -> (b) 

# Ex. Find all the pairs of vertices with edges in both directions (find) 
g.find("(a)-[]->(b); (b)-[]->(a)").show()

# find (filter) only those where one of the nodes is older than 30
g.find("(a)-[]->(b); (b)-[]->(a)").filter("a.age > 30")

# more complex: a->b, b->c but !a->b                                               g.find("(a)-[]->(b); (b)-[]->(c); !(a)-[]->(c)").filter("a.name = 'Alice'").show()
g.find("(a)-[]->(b); (b)-[]->(c); !(a)-[]->(c)").show()

+--------------+--------------+
|             a|             b|
+--------------+--------------+
|[c,Charlie,30]|    [b,Bob,36]|
|    [b,Bob,36]|[c,Charlie,30]|
+--------------+--------------+

+--------------+--------------+--------------+
|             a|             b|             c|
+--------------+--------------+--------------+
| [e,Esther,32]|  [f,Fanny,36]|[c,Charlie,30]|
|[c,Charlie,30]|    [b,Bob,36]|[c,Charlie,30]|
|  [a,Alice,34]| [e,Esther,32]|  [f,Fanny,36]|
|  [a,Alice,34]|    [b,Bob,36]|[c,Charlie,30]|
|    [b,Bob,36]|[c,Charlie,30]|    [b,Bob,36]|
| [e,Esther,32]|  [d,David,29]|  [a,Alice,34]|
|  [a,Alice,34]| [e,Esther,32]|  [d,David,29]|
|  [d,David,29]|  [a,Alice,34]|    [b,Bob,36]|
|  [f,Fanny,36]|[c,Charlie,30]|    [b,Bob,36]|
|  [d,David,29]|  [a,Alice,34]| [e,Esther,32]|
+--------------+--------------+--------------+



In [46]:
# Select subgraph of users older than 30, and edges of type "friend"
v2 = g.vertices.filter("age > 30")
e2 = g.edges.filter("relationship = 'friend'")
g2 = GraphFrame(v2, e2)