In [1]:
import $ivy.`org.apache.spark::spark-sql:3.5.5`
import $ivy.`org.apache.spark::spark-graphx:3.5.5`
import $cp.`/opt/spark/jars/graphframes-0.8.3-spark3.5-s_2.12.jar`

[32mimport [39m[36m$ivy.$[39m
[32mimport [39m[36m$ivy.$[39m
[32mimport [39m[36m$cp.$[39m

In [2]:
import org.apache.log4j.{Level, Logger}
// Logging thresholds, applied in this order: everything under "org" is
// switched off, then "org.apache.spark" gets its own WARN threshold.
Seq("org" -> Level.OFF, "org.apache.spark" -> Level.WARN).foreach {
  case (loggerName, threshold) => Logger.getLogger(loggerName).setLevel(threshold)
}

[32mimport [39m[36morg.apache.log4j.{Level, Logger}[39m

In [3]:
import org.apache.spark.sql._

[32mimport [39m[36morg.apache.spark.sql._[39m

In [4]:
// Build the SparkSession used by every later cell of this notebook.
// getOrCreate() hands back an already-running session if there is one.
val spark = SparkSession
                .builder()
                // Local mode: Spark runs inside this kernel process, no cluster needed.
                .master("local[*]")
                .appName("GraphFrames")
                // Ask Spark to log at WARN; the startup output confirms it with
                // `Setting Spark log level to "WARN".`
                .config("spark.log.level", "WARN")
                .getOrCreate()

// Bring this session's implicits into scope for the following cells.
import spark.implicits._

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/03/13 13:50:02 INFO SparkContext: Running Spark version 3.5.5
25/03/13 13:50:02 INFO SparkContext: OS info Mac OS X, 15.3.2, aarch64
25/03/13 13:50:02 INFO SparkContext: Java version 11.0.26
25/03/13 13:50:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting Spark log level to "WARN".
25/03/13 13:50:03 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


[36mspark[39m: [32mSparkSession[39m = org.apache.spark.sql.SparkSession@1227edf
[32mimport [39m[36mspark.implicits._[39m

# Creating GraphFrames

In [5]:
import org.graphframes.GraphFrame

// Vertices: one (id, name, age) tuple per user.
val v = spark
  .createDataFrame(Seq(
    ("a", "Alice", 34),
    ("b", "Bob", 36),
    ("c", "Charlie", 30),
    ("d", "David", 29),
    ("e", "Esther", 32),
    ("f", "Fanny", 36),
    ("g", "Gabby", 60)
  ))
  .toDF("id", "name", "age")

// Edges: one (src, dst, relationship) tuple per relationship;
// src and dst hold vertex ids from the DataFrame above.
val e = spark
  .createDataFrame(Seq(
    ("a", "b", "friend"),
    ("b", "c", "follow"),
    ("c", "b", "follow"),
    ("f", "c", "follow"),
    ("e", "f", "follow"),
    ("e", "d", "friend"),
    ("d", "a", "friend"),
    ("a", "e", "friend")
  ))
  .toDF("src", "dst", "relationship")

// Assemble the GraphFrame from the vertex and edge DataFrames.
val g = GraphFrame(v, e)

[32mimport [39m[36morg.graphframes.GraphFrame[39m
[36mv[39m: [32mDataFrame[39m = [id: string, name: string ... 1 more field]
[36me[39m: [32mDataFrame[39m = [src: string, dst: string ... 1 more field]
[36mg[39m: [32mGraphFrame[39m = GraphFrame(v:[id: string, name: string ... 1 more field], e:[src: string, dst: string ... 1 more field])

# Basic GraphFrame queries

In [6]:
import org.graphframes.{examples,GraphFrame}
// Use the "friends" example graph provided by org.graphframes.examples.
val g: GraphFrame = examples.Graphs.friends

// Display the vertex DataFrame (the edge DataFrame is displayed in the next cell).
g.vertices.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  a|  Alice| 34|
|  b|    Bob| 36|
|  c|Charlie| 30|
|  d|  David| 29|
|  e| Esther| 32|
|  f|  Fanny| 36|
|  g|  Gabby| 60|
+---+-------+---+



[32mimport [39m[36morg.graphframes.{examples,GraphFrame}[39m
[36mg[39m: [32mGraphFrame[39m = GraphFrame(v:[id: string, name: string ... 1 more field], e:[src: string, dst: string ... 1 more field])

In [7]:
// Display the edge DataFrame: one row per relationship, from "src" to "dst".
g.edges.show()

+---+---+------------+
|src|dst|relationship|
+---+---+------------+
|  a|  b|      friend|
|  b|  c|      follow|
|  c|  b|      follow|
|  f|  c|      follow|
|  e|  f|      follow|
|  e|  d|      friend|
|  d|  a|      friend|
|  a|  e|      friend|
+---+---+------------+



In [8]:
import org.apache.spark.sql.DataFrame

// Get a DataFrame with columns "id" and "inDegree" (in-degree).
// Vertex "g", which is the destination of no edge, is absent from the output.
val vertexInDegrees: DataFrame = g.inDegrees

vertexInDegrees.show()

+---+--------+
| id|inDegree|
+---+--------+
|  b|       2|
|  c|       2|
|  f|       1|
|  d|       1|
|  a|       1|
|  e|       1|
+---+--------+



[32mimport [39m[36morg.apache.spark.sql.DataFrame[39m
[36mvertexInDegrees[39m: [32mDataFrame[39m = [id: string, inDegree: int]

In [9]:
// Find the youngest user's age in the graph.
// This queries the vertex DataFrame. groupBy() is called with no grouping
// columns, so min("age") is taken over all vertices and yields a single row.
g.vertices.groupBy().min("age").show()

+--------+
|min(age)|
+--------+
|      29|
+--------+



In [10]:
// Count the number of "follow" edges in the graph.
// This queries the edge DataFrame: the filter is given as a SQL expression
// string and count() returns the number of matching rows as a Long.
val numFollows = g.edges.filter("relationship = 'follow'").count()

[36mnumFollows[39m: [32mLong[39m = [32m4L[39m

In [11]:
import org.apache.spark.sql.Column
import org.apache.spark.sql.functions._
import org.graphframes.{examples,GraphFrame}

val g: GraphFrame = examples.Graphs.friends

// Find chains of 4 vertices joined by 3 consecutive edges: a -> b -> c -> d.
// Every named motif element (a, ab, b, bc, c, cd, d) becomes a struct column
// of the result. The same vertex may occur more than once in a chain
// (the output contains e -> d -> a -> e, for instance).
val chain4 = g.find("(a)-[ab]->(b); (b)-[bc]->(c); (c)-[cd]->(d)")

// Stateful query over the motif's elements; the state is a running count (cnt).
//  (a) The state-update rule, applied once per motif element.
/** Advances the running count of "friend" edges by one motif element.
  *
  * @param cnt          count accumulated so far, as a Column expression
  * @param relationship relationship value of the edge being examined
  * @return `cnt + 1` where `relationship` equals "friend", otherwise `cnt`
  */
def sumFriends(cnt: Column, relationship: Column): Column = {
  val isFriend = relationship === "friend"
  val bumped = cnt + 1
  when(isFriend, bumped).otherwise(cnt)
}
//  (b) Thread the counter through the motif's 3 edge columns, in order,
//      applying sumFriends once per edge; the count starts at the literal 0.
val condition = { Seq("ab", "bc", "cd")
  .map(edgeName => col(edgeName)("relationship"))
  .foldLeft(lit(0))((running, rel) => sumFriends(running, rel)) }
//  (c) Keep only the chains in which at least 2 of the 3 edges are "friend".
val chainWith2Friends2 = chain4.where(condition >= 2)
chainWith2Friends2.show()

+---------------+--------------+---------------+--------------+---------------+--------------+----------------+
|              a|            ab|              b|            bc|              c|            cd|               d|
+---------------+--------------+---------------+--------------+---------------+--------------+----------------+
|{e, Esther, 32}|{e, d, friend}| {d, David, 29}|{d, a, friend}| {a, Alice, 34}|{a, e, friend}| {e, Esther, 32}|
|{e, Esther, 32}|{e, d, friend}| {d, David, 29}|{d, a, friend}| {a, Alice, 34}|{a, b, friend}|    {b, Bob, 36}|
| {d, David, 29}|{d, a, friend}| {a, Alice, 34}|{a, e, friend}|{e, Esther, 32}|{e, d, friend}|  {d, David, 29}|
| {d, David, 29}|{d, a, friend}| {a, Alice, 34}|{a, e, friend}|{e, Esther, 32}|{e, f, follow}|  {f, Fanny, 36}|
| {d, David, 29}|{d, a, friend}| {a, Alice, 34}|{a, b, friend}|   {b, Bob, 36}|{b, c, follow}|{c, Charlie, 30}|
| {a, Alice, 34}|{a, e, friend}|{e, Esther, 32}|{e, d, friend}| {d, David, 29}|{d, a, friend}|  {a, Alic

[32mimport [39m[36morg.apache.spark.sql.Column[39m
[32mimport [39m[36morg.apache.spark.sql.functions._[39m
[32mimport [39m[36morg.graphframes.{examples,GraphFrame}[39m
[36mg[39m: [32mGraphFrame[39m = GraphFrame(v:[id: string, name: string ... 1 more field], e:[src: string, dst: string ... 1 more field])
[36mchain4[39m: [32mDataFrame[39m = [a: struct<id: string, name: string ... 1 more field>, ab: struct<src: string, dst: string ... 1 more field> ... 5 more fields]
defined [32mfunction[39m [36msumFriends[39m
[36mcondition[39m: [32mColumn[39m = CASE WHEN (cd[relationship] = friend) THEN (CASE WHEN (bc[relationship] = friend) THEN (CASE WHEN (ab[relationship] = friend) THEN (0 + 1) ELSE 0 END + 1) ELSE CASE WHEN (ab[relationship] = friend) THEN (0 + 1) ELSE 0 END END + 1) ELSE CASE WHEN (bc[relationship] = friend) THEN (CASE WHEN (ab[relationship] = friend) THEN (0 + 1) ELSE 0 END + 1) ELSE CASE WHEN (ab[relationship] = friend) THEN (0 + 1) ELSE 0 END END END


## Subgraphs

In [12]:
import org.graphframes.{examples,GraphFrame}
val g: GraphFrame = examples.Graphs.friends

// Build the subgraph in three steps:
//   1. keep only the users older than 30,
//   2. keep only the relationships of type "friend",
//   3. drop isolated vertices, i.e. users not contained in any remaining edge.
val g1 = { g
  .filterVertices("age > 30")
  .filterEdges("relationship = 'friend'")
  .dropIsolatedVertices() }

[32mimport [39m[36morg.graphframes.{examples,GraphFrame}[39m
[36mg[39m: [32mGraphFrame[39m = GraphFrame(v:[id: string, name: string ... 1 more field], e:[src: string, dst: string ... 1 more field])
[36mg1[39m: [32mGraphFrame[39m = GraphFrame(v:[id: string, name: string ... 1 more field], e:[src: string, dst: string ... 1 more field])

In [13]:
import org.graphframes.{examples,GraphFrame}
val g: GraphFrame = examples.Graphs.friends

// Select subgraph based on edges "e" of type "follow"
// pointing from a younger user "a" to an older user "b".
// The motif "(a)-[e]->(b)" produces one row per edge, with struct columns
// a (source vertex), e (edge) and b (destination vertex).
val paths = { g.find("(a)-[e]->(b)")
  .filter("e.relationship = 'follow'")
  .filter("a.age < b.age") }

// "paths" contains vertex info. Extract the edges.
val e2 = paths.select("e.src", "e.dst", "e.relationship")
// In Spark 1.5+, the user may simplify this call:
//  val e2 = paths.select("e.*")

// Construct the subgraph from the original vertices and the selected edges,
// then drop the vertices that none of those edges touch.
val g2 = GraphFrame(g.vertices, e2).dropIsolatedVertices()

[32mimport [39m[36morg.graphframes.{examples,GraphFrame}[39m
[36mg[39m: [32mGraphFrame[39m = GraphFrame(v:[id: string, name: string ... 1 more field], e:[src: string, dst: string ... 1 more field])
[36mpaths[39m: [32mDataset[39m[[32mRow[39m] = [a: struct<id: string, name: string ... 1 more field>, e: struct<src: string, dst: string ... 1 more field> ... 1 more field]
[36me2[39m: [32mDataFrame[39m = [src: string, dst: string ... 1 more field]
[36mg2[39m: [32mGraphFrame[39m = GraphFrame(v:[id: string, name: string ... 1 more field], e:[src: string, dst: string ... 1 more field])

In [14]:
// Vertices that remain in the subgraph g2 after dropping isolated ones.
g2.vertices.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  b|    Bob| 36|
|  c|Charlie| 30|
|  e| Esther| 32|
|  f|  Fanny| 36|
+---+-------+---+



In [15]:
// Edges of the subgraph g2: "follow" edges from a younger to an older user.
g2.edges.show()

+---+---+------------+
|src|dst|relationship|
+---+---+------------+
|  c|  b|      follow|
|  e|  f|      follow|
+---+---+------------+



# Graph algorithms

## Breadth-first search (BFS)

In [16]:
import org.graphframes.{examples,GraphFrame}
val g: GraphFrame = examples.Graphs.friends

// Search from "Esther" for users of age < 32.
// Each result row is one path: a "from" vertex, the edge(s) followed (e0, ...),
// and a "to" vertex.
val paths = g.bfs.fromExpr("name = 'Esther'").toExpr("age < 32").run()
paths.show()

// Specify edge filters or max path lengths.
// Here edges with relationship "friend" are excluded and maxPathLength is 3.
// The result is neither assigned nor shown; the notebook only echoes it
// as an auto-named value (res16_4).
{ g.bfs.fromExpr("name = 'Esther'").toExpr("age < 32")
  .edgeFilter("relationship != 'friend'")
  .maxPathLength(3).run()
}

+---------------+--------------+--------------+
|           from|            e0|            to|
+---------------+--------------+--------------+
|{e, Esther, 32}|{e, d, friend}|{d, David, 29}|
+---------------+--------------+--------------+



[32mimport [39m[36morg.graphframes.{examples,GraphFrame}[39m
[36mg[39m: [32mGraphFrame[39m = GraphFrame(v:[id: string, name: string ... 1 more field], e:[src: string, dst: string ... 1 more field])
[36mpaths[39m: [32mDataFrame[39m = [from: struct<id: string, name: string ... 1 more field>, e0: struct<src: string, dst: string ... 1 more field> ... 1 more field]
[36mres16_4[39m: [32mDataFrame[39m = [from: struct<id: string, name: string ... 1 more field>, e0: struct<src: string, dst: string ... 1 more field> ... 3 more fields]

## Strongly connected components

In [17]:
import org.graphframes.{examples,GraphFrame}
val g: GraphFrame = examples.Graphs.friends

// Compute strongly connected components with maxIter set to 10.
// The result is a DataFrame of the vertices with an added "component" column;
// vertices that share a component value belong to the same component.
val result = g.stronglyConnectedComponents.maxIter(10).run()
// Order by component so that members of one component are listed together.
result.select("id", "component").orderBy("component").show()

+---+-------------+
| id|    component|
+---+-------------+
|  g| 146028888064|
|  f| 412316860416|
|  a| 670014898176|
|  e| 670014898176|
|  d| 670014898176|
|  b|1047972020224|
|  c|1047972020224|
+---+-------------+



[32mimport [39m[36morg.graphframes.{examples,GraphFrame}[39m
[36mg[39m: [32mGraphFrame[39m = GraphFrame(v:[id: string, name: string ... 1 more field], e:[src: string, dst: string ... 1 more field])
[36mresult[39m: [32mDataFrame[39m = [id: string, name: string ... 2 more fields]

## Label Propagation Algorithm (LPA)

In [18]:
import org.graphframes.{examples,GraphFrame}
val g: GraphFrame = examples.Graphs.friends

// Run the Label Propagation Algorithm with maxIter set to 5.
// The result is a DataFrame of the vertices with an added "label" column
// holding the label assigned to each vertex.
val result = g.labelPropagation.maxIter(5).run()
result.select("id", "label").show()

25/03/13 13:50:26 WARN CacheManager: Asked to cache already cached data.


+---+-------------+
| id|        label|
+---+-------------+
|  b|1047972020224|
|  e|1460288880640|
|  a|1460288880640|
|  f|1460288880640|
|  g| 146028888064|
|  d|1460288880640|
|  c|1382979469312|
+---+-------------+



[32mimport [39m[36morg.graphframes.{examples,GraphFrame}[39m
[36mg[39m: [32mGraphFrame[39m = GraphFrame(v:[id: string, name: string ... 1 more field], e:[src: string, dst: string ... 1 more field])
[36mresult[39m: [32mDataFrame[39m = [id: string, name: string ... 2 more fields]

## PageRank

In [19]:
import org.graphframes.{examples,GraphFrame}
val g: GraphFrame = examples.Graphs.friends

// Run PageRank until convergence to tolerance "tol".
// The result is a new GraphFrame whose vertices carry a "pagerank" column
// and whose edges carry a "weight" column.
val results = g.pageRank.resetProbability(0.15).tol(0.01).run()
// Display resulting pageranks and final edge weights
// Note that the displayed pagerank may be truncated, e.g., missing the E notation.
// In Spark 1.5+, you can use show(truncate=false) to avoid truncation.
results.vertices.select("id", "pagerank").show()
results.edges.select("src", "dst", "weight").show()

// Run PageRank for a fixed number of iterations.
// (This result is computed but not displayed in this cell.)
val results2 = g.pageRank.resetProbability(0.15).maxIter(10).run()

// Run PageRank personalized for vertex "a"
// (This result is computed but not displayed in this cell.)
val results3 = g.pageRank.resetProbability(0.15).maxIter(10).sourceId("a").run()

25/03/13 13:50:29 WARN CacheManager: Asked to cache already cached data.


+---+-------------------+
| id|           pagerank|
+---+-------------------+
|  b|  2.655507832863289|
|  e|0.37085233187676075|
|  a|0.44910633706538744|
|  f| 0.3283606792049851|
|  g| 0.1799821386239711|
|  d| 0.3283606792049851|
|  c| 2.6878300011606218|
+---+-------------------+

+---+---+------+
|src|dst|weight|
+---+---+------+
|  f|  c|   1.0|
|  e|  f|   0.5|
|  e|  d|   0.5|
|  d|  a|   1.0|
|  c|  b|   1.0|
|  b|  c|   1.0|
|  a|  e|   0.5|
|  a|  b|   0.5|
+---+---+------+



[32mimport [39m[36morg.graphframes.{examples,GraphFrame}[39m
[36mg[39m: [32mGraphFrame[39m = GraphFrame(v:[id: string, name: string ... 1 more field], e:[src: string, dst: string ... 1 more field])
[36mresults[39m: [32mGraphFrame[39m = GraphFrame(v:[id: string, name: string ... 2 more fields], e:[src: string, dst: string ... 2 more fields])
[36mresults2[39m: [32mGraphFrame[39m = GraphFrame(v:[id: string, name: string ... 2 more fields], e:[src: string, dst: string ... 2 more fields])
[36mresults3[39m: [32mGraphFrame[39m = GraphFrame(v:[id: string, name: string ... 2 more fields], e:[src: string, dst: string ... 2 more fields])

## Shortest paths

In [20]:
import org.graphframes.{examples,GraphFrame}
val g: GraphFrame = examples.Graphs.friends

// For every vertex, compute shortest-path distances to the landmark vertices
// "a" and "d". The added "distances" column maps landmark id -> distance;
// it is an empty map for vertices that list no landmark (b, c, f and g in the output).
val results = g.shortestPaths.landmarks(Seq("a", "d")).run()
results.select("id", "distances").show()

25/03/13 13:50:38 WARN CacheManager: Asked to cache already cached data.


+---+----------------+
| id|       distances|
+---+----------------+
|  b|              {}|
|  e|{a -> 2, d -> 1}|
|  a|{a -> 0, d -> 2}|
|  f|              {}|
|  g|              {}|
|  d|{a -> 1, d -> 0}|
|  c|              {}|
+---+----------------+



[32mimport [39m[36morg.graphframes.{examples,GraphFrame}[39m
[36mg[39m: [32mGraphFrame[39m = GraphFrame(v:[id: string, name: string ... 1 more field], e:[src: string, dst: string ... 1 more field])
[36mresults[39m: [32mDataFrame[39m = [id: string, name: string ... 2 more fields]

## Triangle count

In [21]:
import org.graphframes.{examples,GraphFrame}
val g: GraphFrame = examples.Graphs.friends

// Run the triangle-count algorithm; the result is a DataFrame of the
// vertices with an added "count" column (1 for a, d and e here, 0 otherwise).
val results = g.triangleCount.run()
results.select("id", "count").show()

25/03/13 13:50:41 WARN CacheManager: Asked to cache already cached data.


+---+-----+
| id|count|
+---+-----+
|  a|    1|
|  b|    0|
|  c|    0|
|  d|    1|
|  e|    1|
|  f|    0|
|  g|    0|
+---+-----+



[32mimport [39m[36morg.graphframes.{examples,GraphFrame}[39m
[36mg[39m: [32mGraphFrame[39m = GraphFrame(v:[id: string, name: string ... 1 more field], e:[src: string, dst: string ... 1 more field])
[36mresults[39m: [32mDataFrame[39m = [count: bigint, id: string ... 2 more fields]

# Saving and loading GraphFrames

In [22]:
import org.graphframes.{examples,GraphFrame}
val g: GraphFrame = examples.Graphs.friends

// Save vertices and edges as Parquet to some location.
// "vertices" and "edges" are relative paths; mode("overwrite") replaces
// whatever an earlier run wrote there, so the cell can be re-executed.
g.vertices.write.mode("overwrite").parquet("vertices")
g.edges.write.mode("overwrite").parquet("edges")

// Load the vertices and edges back.
val sameV = spark.read.parquet("vertices")
val sameE = spark.read.parquet("edges")

// Create an identical GraphFrame.
val sameG = GraphFrame(sameV, sameE)

[32mimport [39m[36morg.graphframes.{examples,GraphFrame}[39m
[36mg[39m: [32mGraphFrame[39m = GraphFrame(v:[id: string, name: string ... 1 more field], e:[src: string, dst: string ... 1 more field])
[36msameV[39m: [32mDataFrame[39m = [id: string, name: string ... 1 more field]
[36msameE[39m: [32mDataFrame[39m = [src: string, dst: string ... 1 more field]
[36msameG[39m: [32mGraphFrame[39m = GraphFrame(v:[id: string, name: string ... 1 more field], e:[src: string, dst: string ... 1 more field])

# Message passing via AggregateMessages

In [23]:
import org.graphframes.{examples,GraphFrame}
import org.graphframes.lib.AggregateMessages
val g: GraphFrame = examples.Graphs.friends

// We will use AggregateMessages utilities later, so name it "AM" for short.
val AM = AggregateMessages

// For each user, sum the ages of the adjacent users.
// Messages are sent along every edge in both directions, so "adjacent"
// covers both the users an edge points to and the users it comes from.
// AM.dst("age") / AM.src("age") refer to the "age" column of an edge's
// destination / source vertex.
// NOTE: `sum` comes from org.apache.spark.sql.functions._, which is imported
// in an earlier cell (In [11]); this cell relies on that import.
val msgToSrc = AM.dst("age")
val msgToDst = AM.src("age")
val agg = { g.aggregateMessages
  .sendToSrc(msgToSrc)  // send destination user's age to source
  .sendToDst(msgToDst)  // send source user's age to destination
  .agg(sum(AM.msg).as("summedAges")) } // sum up ages, stored in AM.msg column
  
// Vertex "g" has no edges, so it receives no message and is absent from the output.
agg.show()

+---+----------+
| id|summedAges|
+---+----------+
|  a|        97|
|  b|        94|
|  c|       108|
|  f|        62|
|  e|        99|
|  d|        66|
+---+----------+



[32mimport [39m[36morg.graphframes.{examples,GraphFrame}[39m
[32mimport [39m[36morg.graphframes.lib.AggregateMessages[39m
[36mg[39m: [32mGraphFrame[39m = GraphFrame(v:[id: string, name: string ... 1 more field], e:[src: string, dst: string ... 1 more field])
[36mAM[39m: [32mAggregateMessages[39m.type = org.graphframes.lib.AggregateMessages$@60c1b2a8
[36mmsgToSrc[39m: [32mColumn[39m = dst[age]
[36mmsgToDst[39m: [32mColumn[39m = src[age]
[36magg[39m: [32mDataFrame[39m = [id: string, summedAges: bigint]

# GraphX-GraphFrame conversions

In [24]:
import org.apache.spark.graphx.Graph
import org.apache.spark.sql.Row
import org.graphframes.{examples,GraphFrame}
val g: GraphFrame = examples.Graphs.friends

// Convert to GraphX
// Both type parameters of the resulting Graph are Spark SQL Row,
// hence the Graph[Row, Row] annotation.
val gx: Graph[Row, Row] = g.toGraphX

25/03/13 13:50:46 WARN CacheManager: Asked to cache already cached data.


[32mimport [39m[36morg.apache.spark.graphx.Graph[39m
[32mimport [39m[36morg.apache.spark.sql.Row[39m
[32mimport [39m[36morg.graphframes.{examples,GraphFrame}[39m
[36mg[39m: [32mGraphFrame[39m = GraphFrame(v:[id: string, name: string ... 1 more field], e:[src: string, dst: string ... 1 more field])
[36mgx[39m: [32mGraph[39m[[32mRow[39m, [32mRow[39m] = org.apache.spark.graphx.impl.GraphImpl@144c07fa