In [None]:
import $ivy.`org.apache.spark::spark-core:3.5.3`
import $ivy.`org.apache.spark::spark-graphx:3.5.3`

In [None]:
import org.apache.log4j.{Level, Logger}
// Logging levels, applied in order: the "org" logger is switched off and the
// "org.apache.spark" logger is set to WARN.
Seq("org" -> Level.OFF, "org.apache.spark" -> Level.WARN).foreach {
  case (loggerName, level) => Logger.getLogger(loggerName).setLevel(level)
}

In [None]:
import org.apache.spark._
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD

In [None]:
// Spark configuration for this notebook: application name "GraphX", master "local[*]".
val conf = new SparkConf().setAppName("GraphX").setMaster("local[*]")

// getOrCreate returns the context that is already running (if any) instead of failing,
// so this cell can be re-executed without restarting the kernel.
val sc = SparkContext.getOrCreate(conf)

# Создаём граф

In [None]:
// Vertex RDD: each element is (vertex id, (username, role)).
val users: RDD[(VertexId, (String, String))] = sc.parallelize(Seq(
  (3L, ("rxin", "student")),
  (7L, ("jgonzal", "postdoc")),
  (5L, ("franklin", "prof")),
  (2L, ("istoica", "prof")),
  (4L, ("peter", "student"))
))

// Edge RDD: Edge(source id, destination id, relationship label).
// Vertex 0 is used as a destination below although it has no entry in `users`.
val relationships: RDD[Edge[String]] = sc.parallelize(Seq(
  Edge(3L, 7L, "collab"),
  Edge(5L, 3L, "advisor"),
  Edge(2L, 5L, "colleague"),
  Edge(5L, 7L, "pi"),
  Edge(4L, 0L, "student"),
  Edge(5L, 0L, "colleague")
))

// Default attribute for users that are referenced by a relationship but missing from `users`.
val defaultUser: (String, String) = ("John Doe", "Missing")

// Assemble the initial property graph from the vertices, the edges and the default attribute.
val graph: Graph[(String, String), String] = Graph(users, relationships, defaultUser)

# Операции с графами

## Information about the Graph

In [None]:
// Print the edge count and the vertex count, one per line.
println(
  Seq(s"numEdges: ${graph.numEdges}", s"numVertices: ${graph.numVertices}").mkString("\n")
)

In [None]:
// Collect the in-degree entries into a local array.
// `collect()` is an action, so it is called with explicit parentheses.
graph.inDegrees.collect()

In [None]:
// Collect the out-degree entries into a local array.
// `collect()` is an action, so it is called with explicit parentheses.
graph.outDegrees.collect()

In [None]:
// Collect the degree entries into a local array.
// `collect()` is an action, so it is called with explicit parentheses.
graph.degrees.collect()

## Views of the graph as collections

In [None]:
// Print every vertex of the graph, one per line.
graph.vertices.collect().foreach(println)

In [None]:
// Collect the vertices into a local array so the notebook displays it as the cell result.
graph.vertices.collect()

In [None]:
// Print every edge of the graph, one per line.
graph.edges.collect().foreach(println)

In [None]:
// Collect the edges into a local array so the notebook displays it as the cell result.
graph.edges.collect()

In [None]:
// Print every triplet (an edge together with its source and destination attributes).
graph.triplets.collect().foreach(println)

In [None]:
// Collect the triplets into a local array so the notebook displays it as the cell result.
graph.triplets.collect()

In [None]:
// Render each edge as "<source username> is the <relationship> of <destination username>"
// and print the sentences one per line.
graph.triplets
  .map(t => s"${t.srcAttr._1} is the ${t.attr} of ${t.dstAttr._1}")
  .collect()
  .foreach(println)

## Structural Operators

In [None]:
// Drop the vertices whose role is "Missing", together with the edges connected to them.
// The vertex id is not needed by the predicate, hence the `_`.
val validGraph = graph.subgraph(vpred = (_, attr) => attr._2 != "Missing")

In [None]:
// Removing user 0 disconnects users 4 and 5 in the valid subgraph; print its remaining vertices.
validGraph.vertices.collect().foreach(println)

In [None]:
// Render each remaining edge as "<source username> is the <relationship> of
// <destination username>" and print the sentences one per line.
validGraph.triplets
  .map(t => s"${t.srcAttr._1} is the ${t.attr} of ${t.dstAttr._1}")
  .collect()
  .foreach(println)

In [None]:
// Import the directions by name. The wildcard `EdgeDirection._` would also bring the value
// `EdgeDirection.Either` into scope and shadow the `scala.Either` companion in later cells;
// refer to that direction as `EdgeDirection.Either` when it is needed.
import org.apache.spark.graphx.EdgeDirection.{Both, In, Out}

// Collect, for each vertex, the neighbor ids gathered in the `In` direction.
graph.collectNeighborIds(In).collect()

# Graph Algorithms

## PageRank

GraphX includes an example social network dataset that we can run PageRank on. A set of users is given in ``data/graphx/users.txt``, and a set of relationships between users is given in ``data/graphx/followers.txt``; the code below reads both files from ``/opt/spark/data/graphx/``. We compute the PageRank of each user as follows:

In [None]:
import org.apache.spark.graphx.GraphLoader

// Read the follower edge list into a graph.
val graph = GraphLoader.edgeListFile(sc, "/opt/spark/data/graphx/followers.txt")

// Run PageRank with a tolerance of 0.0001 and keep only the per-vertex ranks.
val ranks = graph.pageRank(tol = 0.0001).vertices

// Parse users.txt: comma-separated, field 0 is the numeric user id, field 1 the username.
val users = sc.textFile("/opt/spark/data/graphx/users.txt").map { line =>
  val parts = line.split(",")
  parts(0).toLong -> parts(1)
}

// Join on the user id, then keep only the (username, rank) pairs.
val ranksByUsername = users.join(ranks).values

// Show one (username, rank) pair per line.
println(ranksByUsername.collect().mkString("\n"))

## Connected Components

GraphX contains an implementation of the connected components algorithm in the ConnectedComponents object, and we compute the connected components of the example social network dataset from the PageRank section as follows:

In [None]:
import org.apache.spark.graphx.GraphLoader

// Same follower graph as in the PageRank example.
val graph = GraphLoader.edgeListFile(sc, "/opt/spark/data/graphx/followers.txt")

// Run connected components and keep the per-vertex result.
val cc = graph.connectedComponents().vertices

// Parse users.txt: comma-separated, field 0 is the numeric user id, field 1 the username.
val users = sc.textFile("/opt/spark/data/graphx/users.txt").map { line =>
  val parts = line.split(",")
  parts(0).toLong -> parts(1)
}

// Join on the user id and keep (username, component); the pattern variable is named
// `component` so that it does not shadow the `cc` RDD defined above.
val ccByUsername = users.join(cc).map {
  case (_, (username, component)) => (username, component)
}

// Show one (username, component) pair per line.
println(ccByUsername.collect().mkString("\n"))

## Triangle Counting

We compute the triangle count of the social network dataset from the PageRank section. Note that TriangleCount requires the edges to be in canonical orientation (srcId < dstId) and the graph to be partitioned using Graph.partitionBy.

In [None]:
import org.apache.spark.graphx.{GraphLoader, PartitionStrategy}

// Load the follower edges in canonical orientation and partition the graph, as required
// for triangle counting.
val graph = GraphLoader
  .edgeListFile(sc, "/opt/spark/data/graphx/followers.txt", canonicalOrientation = true)
  .partitionBy(PartitionStrategy.RandomVertexCut)

// Per-vertex triangle counts.
val triCounts = graph.triangleCount().vertices

// Parse users.txt: comma-separated, field 0 is the numeric user id, field 1 the username.
val users = sc.textFile("/opt/spark/data/graphx/users.txt").map { line =>
  val parts = line.split(",")
  parts(0).toLong -> parts(1)
}

// Join on the user id, then keep only the (username, triangle count) pairs.
val triCountByUsername = users.join(triCounts).values

// Show one (username, triangle count) pair per line.
println(triCountByUsername.collect().mkString("\n"))

# Pregel

We can use the Pregel operator to express computations such as single-source shortest paths, as in the following example.

In [None]:
import org.apache.spark.graphx.{Graph, VertexId}
import org.apache.spark.graphx.util.GraphGenerators

// A generated graph with 100 vertices; the edge attributes are converted to Double so that
// they can serve as distances.
val graph: Graph[Long, Double] =
  GraphGenerators.logNormalGraph(sc, numVertices = 100).mapEdges(e => e.attr.toDouble)

val sourceId: VertexId = 42 // The ultimate source

// Initialize the graph such that all vertices except the source have distance infinity.
val initialGraph =
  graph.mapVertices((id, _) => if (id == sourceId) 0.0 else Double.PositiveInfinity)

val sssp = initialGraph.pregel(Double.PositiveInfinity)(
  // Vertex program: keep the smaller of the current distance and the received one.
  (_, dist, newDist) => math.min(dist, newDist),
  // Send message: propose a distance to the destination only when the path through the
  // source vertex is strictly shorter than the distance the destination already holds.
  triplet => {
    val candidate = triplet.srcAttr + triplet.attr
    if (candidate < triplet.dstAttr) Iterator((triplet.dstId, candidate))
    else Iterator.empty
  },
  // Merge message: of two proposals for the same vertex, keep the smaller one.
  (a, b) => math.min(a, b)
)

// Show one (vertex id, distance) pair per line.
println(sssp.vertices.collect().mkString("\n"))