In [None]:
import $ivy.`org.apache.spark::spark-sql:3.5.3`
import $ivy.`org.apache.spark::spark-graphx:3.5.3`
import $cp.`/opt/spark/jars/graphframes-0.8.3-spark3.5-s_2.12.jar`

In [None]:
import org.apache.log4j.{Level, Logger}
// Logger levels for this session: everything under "org" is switched off,
// while the "org.apache.spark" loggers are set to WARN.
Seq(
  "org" -> Level.OFF,
  "org.apache.spark" -> Level.WARN
).foreach { case (loggerName, level) =>
  Logger.getLogger(loggerName).setLevel(level)
}

In [None]:
import org.apache.spark.sql._

In [None]:
// Create (or reuse) the session for this notebook: local mode, application name "GraphFrames".
val spark: SparkSession =
  SparkSession.builder().appName("GraphFrames").master("local[*]").getOrCreate()

import spark.implicits._

# Creating GraphFrames

In [None]:
import org.graphframes.GraphFrame

// NOTE: `toDF` on a local Seq is provided by `spark.implicits._`, imported right
// after the SparkSession is created.

// Vertices: one (id, name, age) row per user.
val v = Seq(
  ("a", "Alice", 34),
  ("b", "Bob", 36),
  ("c", "Charlie", 30),
  ("d", "David", 29),
  ("e", "Esther", 32),
  ("f", "Fanny", 36),
  ("g", "Gabby", 60)
).toDF("id", "name", "age")

// Edges: one (src, dst, relationship) row per directed link between vertex ids.
val e = Seq(
  ("a", "b", "friend"),
  ("b", "c", "follow"),
  ("c", "b", "follow"),
  ("f", "c", "follow"),
  ("e", "f", "follow"),
  ("e", "d", "friend"),
  ("d", "a", "friend"),
  ("a", "e", "friend")
).toDF("src", "dst", "relationship")

// Assemble the GraphFrame from the vertex and edge DataFrames.
val g = GraphFrame(v, e)

# Basic GraphFrame queries

In [None]:
import org.graphframes.{examples,GraphFrame}
// Load the "friends" example graph from the GraphFrames examples package.
val g: GraphFrame = examples.Graphs.friends

// Display the vertex DataFrame (the edge DataFrame is displayed in the next cell).
g.vertices.show()

In [None]:
// Display the edge DataFrame of the graph.
g.edges.show()

In [None]:
import org.apache.spark.sql.DataFrame

// Get a DataFrame with columns "id" and "inDegree" (the number of incoming edges).
// Vertices without any incoming edge do not appear in this DataFrame.
val vertexInDegrees: DataFrame = g.inDegrees

vertexInDegrees.show()

In [None]:
// Youngest user's age: a global "min" aggregate over the "age" column
// of the vertex DataFrame.
g.vertices.agg("age" -> "min").show()

In [None]:
// Number of edges whose relationship is "follow", counted on the edge DataFrame.
val numFollows: Long =
  g.edges
    .where("relationship = 'follow'")
    .count()

In [None]:
import org.apache.spark.sql.Column
import org.apache.spark.sql.functions._
import org.graphframes.{examples,GraphFrame}

val g: GraphFrame = examples.Graphs.friends

// Motif query: chains of 4 vertices a -> b -> c -> d joined by edges ab, bc and cd.
val chain4 = g.find("(a)-[ab]->(b); (b)-[bc]->(c); (c)-[cd]->(d)")

/**
 * State update for the stateful sequence query below.
 *
 * @param cnt          running count of "friend" edges seen so far
 * @param relationship the "relationship" value of the next edge in the motif
 * @return `cnt + 1` when the edge is a "friend" edge, otherwise `cnt` unchanged
 */
def sumFriends(cnt: Column, relationship: Column): Column =
  when(relationship === "friend", cnt + 1).otherwise(cnt)

// Thread the counter through the three edges of the motif, starting from 0.
val condition =
  Seq("ab", "bc", "cd")
    .map(edgeName => col(edgeName)("relationship"))
    .foldLeft(lit(0))(sumFriends)

// Keep only the chains in which at least 2 of the 3 edges are "friend" edges.
val chainWith2Friends2 = chain4.where(condition >= 2)
chainWith2Friends2.show()

## Subgraphs

In [None]:
import org.graphframes.{examples,GraphFrame}
val g: GraphFrame = examples.Graphs.friends

// Subgraph built in three steps:
//   1. keep users older than 30,
//   2. keep relationships of type "friend",
//   3. drop users that are no longer part of any relationship.
val g1 = g
  .filterVertices("age > 30")
  .filterEdges("relationship = 'friend'")
  .dropIsolatedVertices()

In [None]:
import org.graphframes.{examples,GraphFrame}
val g: GraphFrame = examples.Graphs.friends

// Match every edge "e" from a vertex "a" to a vertex "b", then keep only the
// "follow" edges that point from a younger user to an older one.
val paths = g
  .find("(a)-[e]->(b)")
  .where("e.relationship = 'follow'")
  .where("a.age < b.age")

// "paths" also carries the vertex columns; keep just the edge fields.
// (Shorthand alternative: paths.select("e.*"))
val e2 = paths.select("e.src", "e.dst", "e.relationship")

// Build the subgraph from the original vertices and the selected edges,
// discarding vertices that none of those edges touch.
val g2 = GraphFrame(g.vertices, e2).dropIsolatedVertices()

In [None]:
// Display the vertices that remain in the subgraph g2.
g2.vertices.show()

In [None]:
// Display the edges of the subgraph g2.
g2.edges.show()

# Graph algorithms

## Breadth-first search (BFS)

In [None]:
import org.graphframes.{examples,GraphFrame}
val g: GraphFrame = examples.Graphs.friends

// Breadth-first search starting at the user named "Esther" and ending at users of age < 32.
val paths = g.bfs
  .fromExpr("name = 'Esther'")
  .toExpr("age < 32")
  .run()
paths.show()

// Same search, but ignoring "friend" edges and capping the path length at 3.
// The resulting DataFrame is the value of this cell; it is not bound to a name.
g.bfs
  .fromExpr("name = 'Esther'")
  .toExpr("age < 32")
  .edgeFilter("relationship != 'friend'")
  .maxPathLength(3)
  .run()

## Strongly connected components

In [None]:
import org.graphframes.{examples,GraphFrame}
val g: GraphFrame = examples.Graphs.friends

// Strongly connected components, limited to 10 iterations.
val result = g.stronglyConnectedComponents
  .maxIter(10)
  .run()

// One row per vertex with its assigned component, sorted by component.
result
  .select("id", "component")
  .sort("component")
  .show()

## Label Propagation Algorithm (LPA)

In [None]:
import org.graphframes.{examples,GraphFrame}
val g: GraphFrame = examples.Graphs.friends

// Label propagation, limited to 5 iterations.
val result = g.labelPropagation
  .maxIter(5)
  .run()

// One row per vertex with the label it was assigned.
result
  .select("id", "label")
  .show()

## PageRank

In [None]:
import org.graphframes.{examples,GraphFrame}
val g: GraphFrame = examples.Graphs.friends

// Variant 1: iterate until convergence to the tolerance given by "tol".
val results = g.pageRank
  .resetProbability(0.15)
  .tol(0.01)
  .run()

// Show the resulting pageranks and the final edge weights.
// The displayed pagerank may be truncated (e.g. missing the E notation);
// use show(truncate = false) to avoid that.
results.vertices.select("id", "pagerank").show()
results.edges.select("src", "dst", "weight").show()

// Variant 2: a fixed number of iterations instead of a tolerance.
val results2 = g.pageRank
  .resetProbability(0.15)
  .maxIter(10)
  .run()

// Variant 3: PageRank personalized for vertex "a".
val results3 = g.pageRank
  .resetProbability(0.15)
  .maxIter(10)
  .sourceId("a")
  .run()

// Variant 4: PageRank personalized for each of the vertices "a", "b", "c", "d", run in parallel.
val results4 = g.parallelPersonalizedPageRank
  .resetProbability(0.15)
  .maxIter(10)
  .sourceIds(Array("a", "b", "c", "d"))
  .run()

## Shortest paths

In [None]:
import org.graphframes.{examples,GraphFrame}
val g: GraphFrame = examples.Graphs.friends

// Shortest paths from every vertex to the landmark vertices "a" and "d".
val results = g.shortestPaths
  .landmarks(Seq("a", "d"))
  .run()

// One row per vertex with its "distances" column.
results
  .select("id", "distances")
  .show()

## Triangle count

In [None]:
import org.graphframes.{examples,GraphFrame}
val g: GraphFrame = examples.Graphs.friends

// Triangle count per vertex.
val results = g.triangleCount
  .run()

// One row per vertex with its "count" column.
results
  .select("id", "count")
  .show()

# Saving and loading GraphFrames

In [None]:
import org.graphframes.{examples,GraphFrame}
val g: GraphFrame = examples.Graphs.friends

// Persist both DataFrames as Parquet under the paths "vertices" and "edges".
// Mode "overwrite" replaces whatever was written there before.
g.vertices.write.mode("overwrite").format("parquet").save("vertices")
g.edges.write.mode("overwrite").format("parquet").save("edges")

// Read the two Parquet datasets back in.
val sameV = spark.read.format("parquet").load("vertices")
val sameE = spark.read.format("parquet").load("edges")

// Rebuild the GraphFrame from the reloaded vertices and edges.
val sameG = GraphFrame(sameV, sameE)

# Message passing via AggregateMessages

In [None]:
import org.graphframes.{examples,GraphFrame}
import org.graphframes.lib.AggregateMessages
val g: GraphFrame = examples.Graphs.friends

// Short alias for the AggregateMessages utilities used below.
val AM = AggregateMessages

// Goal: for each user, sum the ages of the adjacent users.
// Each edge sends the destination's age to its source and the source's age
// to its destination.
val msgToSrc = AM.dst("age")
val msgToDst = AM.src("age")

// The messages arriving at a vertex are available in the AM.msg column;
// summing them gives "summedAges".
val agg = g.aggregateMessages
  .sendToSrc(msgToSrc)
  .sendToDst(msgToDst)
  .agg(sum(AM.msg).as("summedAges"))

agg.show()

# GraphX-GraphFrame conversions

In [None]:
import org.apache.spark.graphx.Graph
import org.apache.spark.sql.Row
import org.graphframes.{examples,GraphFrame}
// Load the "friends" example graph from the GraphFrames examples package.
val g: GraphFrame = examples.Graphs.friends

// Convert to GraphX
// Per the annotated type, both the vertex and the edge attributes of the
// resulting GraphX graph are Spark SQL Rows.
val gx: Graph[Row, Row] = g.toGraphX