In [1]:
# Inspect the SparkContext.
# NOTE(review): `sc` is used here before the cell further down that assigns it,
# so it presumably comes from the kernel start-up (e.g. a pyspark shell) — confirm.
sc

In [2]:
# Inspect the SparkSession.
# NOTE(review): `spark` is used here before the cell further down that builds it,
# so it presumably comes from the kernel start-up — confirm.
spark

# 构造GraphFrame

- 顶点 Vertex:  需要有一个 `id`列
- 边 Edge: 需要有 `src`（边的源顶点 ID）和 `dst`（边的目标顶点 ID）两列

In [5]:
from graphframes import *

In [7]:
from pyspark.sql import SparkSession

# Build the SparkSession for this notebook, or reuse the one that is already
# running: getOrCreate() returns an existing session when there is one.
spark = (
    SparkSession.builder
    .appName("graphframe")
    .master('local[4]')
    .getOrCreate()
)
# Keep a handle on the underlying SparkContext.
sc = spark.sparkContext

In [3]:
# Show the master URL of the running SparkContext.
# NOTE(review): the recorded output is 'local[*]', not the 'local[4]' requested
# by the builder cell; the execution counts suggest this cell ran first — confirm.
sc.master

'local[*]'

In [6]:
# Vertex DataFrame: one row per person, keyed by the `id` column.
vertex_rows = [
    ("a", "Alice", 34),
    ("b", "Bob", 36),
    ("c", "Charlie", 30),
    ("d", "David", 29),
    ("e", "Esther", 32),
    ("f", "Fanny", 36),
    ("g", "Gabby", 60),
]
v = spark.createDataFrame(vertex_rows, schema=['id', 'name', 'age'])

# Edge DataFrame: one row per directed edge from `src` to `dst`.
edge_rows = [
    ("a", "b", "friend"),
    ("b", "c", "follow"),
    ("c", "b", "follow"),
    ("f", "c", "follow"),
    ("e", "f", "follow"),
    ("e", "d", "friend"),
    ("d", "a", "friend"),
    ("a", "e", "friend"),
]
e = spark.createDataFrame(edge_rows, schema=['src', 'dst', 'relationship'])

# Combine both DataFrames into a GraphFrame.
g = GraphFrame(v, e)

# 基本的查询

In [7]:
# Vertices of the graph.
g.vertices.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  a|  Alice| 34|
|  b|    Bob| 36|
|  c|Charlie| 30|
|  d|  David| 29|
|  e| Esther| 32|
|  f|  Fanny| 36|
|  g|  Gabby| 60|
+---+-------+---+



In [8]:
# Edges of the graph.
g.edges.show()

+---+---+------------+
|src|dst|relationship|
+---+---+------------+
|  a|  b|      friend|
|  b|  c|      follow|
|  c|  b|      follow|
|  f|  c|      follow|
|  e|  f|      follow|
|  e|  d|      friend|
|  d|  a|      friend|
|  a|  e|      friend|
+---+---+------------+



In [9]:
# In-degree of each vertex; the result is a DataFrame (see the type shown below).
inDegrees = g.inDegrees
type(inDegrees)

pyspark.sql.dataframe.DataFrame

In [10]:
inDegrees.columns  # column names of the in-degree DataFrame

['id', 'inDegree']

In [11]:
# In the recorded output, a vertex without incoming edges ("g") has no row.
inDegrees.show()

+---+--------+
| id|inDegree|
+---+--------+
|  f|       1|
|  e|       1|
|  d|       1|
|  c|       2|
|  b|       2|
|  a|       1|
+---+--------+



In [12]:
# Out-degree of each vertex.
outDegrees = g.outDegrees
outDegrees.show()

+---+---------+
| id|outDegree|
+---+---------+
|  f|        1|
|  e|        2|
|  d|        1|
|  c|        1|
|  b|        1|
|  a|        2|
+---+---------+



In [13]:
# Youngest user's age: aggregate the whole vertex DataFrame with min(age).
g.vertices.agg({'age': 'min'}).show()

+--------+
|min(age)|
+--------+
|      29|
+--------+



In [14]:
# Number of edges whose relationship is "follow".
follow_edges = g.edges.where("relationship = 'follow'")
numFollows = follow_edges.count()
numFollows

4

# 模式发现（Motif finding）

`(a)-[ab]->(b); (b)-[bc]->(c)`

- (x) 表示顶点  \[xy\]表示边
- (b) 表示中间顶点
- (a) 出发顶点
- (c) 目的顶点
- \[ab\] 表示从出发顶点 a 指向中间顶点 b 的边
- \[bc\] 表示从中间顶点 b 指向目的顶点 c 的边

In [16]:
# Find pairs of vertices that are connected by edges in both directions.
motifs = g.find("(a)-[e]->(b); (b)-[e2]->(a)")
motifs.show()

+----------------+--------------+----------------+--------------+
|               a|             e|               b|            e2|
+----------------+--------------+----------------+--------------+
|[c, Charlie, 30]|[c, b, follow]|    [b, Bob, 36]|[b, c, follow]|
|    [b, Bob, 36]|[b, c, follow]|[c, Charlie, 30]|[c, b, follow]|
+----------------+--------------+----------------+--------------+



In [17]:
# The motif search result is a regular DataFrame.
type(motifs)

pyspark.sql.dataframe.DataFrame

In [18]:
# Keep only the bidirectional pairs whose `b` vertex is older than 30.
older_b = motifs.where("b.age > 30")
older_b.show()

+----------------+--------------+------------+--------------+
|               a|             e|           b|            e2|
+----------------+--------------+------------+--------------+
|[c, Charlie, 30]|[c, b, follow]|[b, Bob, 36]|[b, c, follow]|
+----------------+--------------+------------+--------------+



In [19]:
# Two-hop paths: from vertex a to vertex b, then on to vertex c.
# Be aware that names do not identify distinct elements:
# a and c may refer to the same vertex.
motifs = g.find("(a)-[e]->(b); (b)-[e2]->(c)")
motifs.show()

+----------------+--------------+----------------+--------------+----------------+
|               a|             e|               b|            e2|               c|
+----------------+--------------+----------------+--------------+----------------+
|  [a, Alice, 34]|[a, e, friend]| [e, Esther, 32]|[e, f, follow]|  [f, Fanny, 36]|
|  [d, David, 29]|[d, a, friend]|  [a, Alice, 34]|[a, e, friend]| [e, Esther, 32]|
|  [a, Alice, 34]|[a, e, friend]| [e, Esther, 32]|[e, d, friend]|  [d, David, 29]|
| [e, Esther, 32]|[e, f, follow]|  [f, Fanny, 36]|[f, c, follow]|[c, Charlie, 30]|
|[c, Charlie, 30]|[c, b, follow]|    [b, Bob, 36]|[b, c, follow]|[c, Charlie, 30]|
|  [a, Alice, 34]|[a, b, friend]|    [b, Bob, 36]|[b, c, follow]|[c, Charlie, 30]|
|  [f, Fanny, 36]|[f, c, follow]|[c, Charlie, 30]|[c, b, follow]|    [b, Bob, 36]|
|    [b, Bob, 36]|[b, c, follow]|[c, Charlie, 30]|[c, b, follow]|    [b, Bob, 36]|
|  [d, David, 29]|[d, a, friend]|  [a, Alice, 34]|[a, b, friend]|    [b, Bob, 36]|
| [e

In [21]:
# Keep only the paths whose first and last vertices are different.
motifs.filter("a.id != c.id").show()

+---------------+--------------+----------------+--------------+----------------+
|              a|             e|               b|            e2|               c|
+---------------+--------------+----------------+--------------+----------------+
| [a, Alice, 34]|[a, e, friend]| [e, Esther, 32]|[e, f, follow]|  [f, Fanny, 36]|
| [d, David, 29]|[d, a, friend]|  [a, Alice, 34]|[a, e, friend]| [e, Esther, 32]|
| [a, Alice, 34]|[a, e, friend]| [e, Esther, 32]|[e, d, friend]|  [d, David, 29]|
|[e, Esther, 32]|[e, f, follow]|  [f, Fanny, 36]|[f, c, follow]|[c, Charlie, 30]|
| [a, Alice, 34]|[a, b, friend]|    [b, Bob, 36]|[b, c, follow]|[c, Charlie, 30]|
| [f, Fanny, 36]|[f, c, follow]|[c, Charlie, 30]|[c, b, follow]|    [b, Bob, 36]|
| [d, David, 29]|[d, a, friend]|  [a, Alice, 34]|[a, b, friend]|    [b, Bob, 36]|
|[e, Esther, 32]|[e, d, friend]|  [d, David, 29]|[d, a, friend]|  [a, Alice, 34]|
+---------------+--------------+----------------+--------------+----------------+



In [22]:
# Edges a->b for which no reverse edge b->a exists.
motifs = g.find("(a)-[]->(b); !(b)-[]->(a)")
motifs.show()

+---------------+----------------+
|              a|               b|
+---------------+----------------+
| [a, Alice, 34]| [e, Esther, 32]|
|[e, Esther, 32]|  [d, David, 29]|
|[e, Esther, 32]|  [f, Fanny, 36]|
| [a, Alice, 34]|    [b, Bob, 36]|
| [f, Fanny, 36]|[c, Charlie, 30]|
| [d, David, 29]|  [a, Alice, 34]|
+---------------+----------------+



In [23]:
from pyspark.sql.functions import col, lit, when
from pyspark.sql.types import IntegerType

In [25]:
# lit() creates a literal (constant) column; print its documentation.
# help() is plain Python, so this also works outside IPython and the text
# is captured in the cell output.
help(lit)

In [24]:
# Chains of four vertices connected by three edges: a->b->c->d.
chain4 = g.find("(a)-[ab]->(b); (b)-[bc]->(c); (c)-[cd]->(d)")
chain4.show()

+----------------+--------------+----------------+--------------+----------------+--------------+----------------+
|               a|            ab|               b|            bc|               c|            cd|               d|
+----------------+--------------+----------------+--------------+----------------+--------------+----------------+
|  [d, David, 29]|[d, a, friend]|  [a, Alice, 34]|[a, e, friend]| [e, Esther, 32]|[e, f, follow]|  [f, Fanny, 36]|
| [e, Esther, 32]|[e, d, friend]|  [d, David, 29]|[d, a, friend]|  [a, Alice, 34]|[a, e, friend]| [e, Esther, 32]|
|  [d, David, 29]|[d, a, friend]|  [a, Alice, 34]|[a, e, friend]| [e, Esther, 32]|[e, d, friend]|  [d, David, 29]|
|  [a, Alice, 34]|[a, e, friend]| [e, Esther, 32]|[e, f, follow]|  [f, Fanny, 36]|[f, c, follow]|[c, Charlie, 30]|
|  [f, Fanny, 36]|[f, c, follow]|[c, Charlie, 30]|[c, b, follow]|    [b, Bob, 36]|[b, c, follow]|[c, Charlie, 30]|
|    [b, Bob, 36]|[b, c, follow]|[c, Charlie, 30]|[c, b, follow]|    [b, Bob, 36

In [29]:
from functools import reduce

In [32]:
# Keep the chains in which at least 2 of the 3 edges are "friend" relationships.
# The running count is built as a Column expression, one edge at a time.


def sumFriends(cnt, relationship):
    """Return `cnt + 1` where `relationship` equals "friend", else `cnt`."""
    return when(relationship == "friend", cnt + 1).otherwise(cnt)


# Start from a literal 0 and fold in the three edge columns of the motif.
condition = lit(0)
for edge_name in ["ab", "bc", "cd"]:
    condition = sumFriends(condition, col(edge_name).relationship)

# Apply the resulting condition as a filter on the 4-vertex chains.
chainWith2Friends2 = chain4.where(condition >= 2)
chainWith2Friends2.show()

+---------------+--------------+---------------+--------------+---------------+--------------+----------------+
|              a|            ab|              b|            bc|              c|            cd|               d|
+---------------+--------------+---------------+--------------+---------------+--------------+----------------+
| [d, David, 29]|[d, a, friend]| [a, Alice, 34]|[a, e, friend]|[e, Esther, 32]|[e, f, follow]|  [f, Fanny, 36]|
|[e, Esther, 32]|[e, d, friend]| [d, David, 29]|[d, a, friend]| [a, Alice, 34]|[a, e, friend]| [e, Esther, 32]|
| [d, David, 29]|[d, a, friend]| [a, Alice, 34]|[a, e, friend]|[e, Esther, 32]|[e, d, friend]|  [d, David, 29]|
| [d, David, 29]|[d, a, friend]| [a, Alice, 34]|[a, b, friend]|   [b, Bob, 36]|[b, c, follow]|[c, Charlie, 30]|
|[e, Esther, 32]|[e, d, friend]| [d, David, 29]|[d, a, friend]| [a, Alice, 34]|[a, b, friend]|    [b, Bob, 36]|
| [a, Alice, 34]|[a, e, friend]|[e, Esther, 32]|[e, d, friend]| [d, David, 29]|[d, a, friend]|  [a, Alic

# Subgraphs


In [25]:
# Build a subgraph: filter the vertices, then the edges, and finally drop the
# isolated vertices (those left without any edge).
g1 = (
    g.filterVertices('age > 30')
    .filterEdges("relationship='friend'")
    .dropIsolatedVertices()
)

In [26]:
# Vertices remaining in the filtered subgraph.
g1.vertices.show()

+---+------+---+
| id|  name|age|
+---+------+---+
|  e|Esther| 32|
|  b|   Bob| 36|
|  a| Alice| 34|
+---+------+---+



In [27]:
# Edges remaining in the filtered subgraph.
g1.edges.show()

+---+---+------------+
|src|dst|relationship|
+---+---+------------+
|  a|  e|      friend|
|  a|  b|      friend|
+---+---+------------+



In [28]:
# Select the edges "e" of type "follow" that point from a younger user "a"
# to an older user "b"; these will define the subgraph.
paths = (
    g.find("(a)-[e]->(b)")
    .where("e.relationship='follow'")
    .where("a.age < b.age")
)

paths.show()

+----------------+--------------+--------------+
|               a|             e|             b|
+----------------+--------------+--------------+
| [e, Esther, 32]|[e, f, follow]|[f, Fanny, 36]|
|[c, Charlie, 30]|[c, b, follow]|  [b, Bob, 36]|
+----------------+--------------+--------------+



In [29]:
# "paths" contains vertex info as well. Extract only the edge columns.
# Explicit alternative: paths.select("e.src", "e.dst", "e.relationship")
e2 = paths.select("e.*")
e2.show()

+---+---+------------+
|src|dst|relationship|
+---+---+------------+
|  e|  f|      follow|
|  c|  b|      follow|
+---+---+------------+



In [30]:
# Construct the subgraph from all original vertices and the selected edges only.
g2 = GraphFrame(g.vertices, e2)

In [31]:
# g2 still lists every original vertex, including those without an edge in e2.
g2.vertices.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  a|  Alice| 34|
|  b|    Bob| 36|
|  c|Charlie| 30|
|  d|  David| 29|
|  e| Esther| 32|
|  f|  Fanny| 36|
|  g|  Gabby| 60|
+---+-------+---+



In [32]:
# Edges of the subgraph.
g2.edges.show()

+---+---+------------+
|src|dst|relationship|
+---+---+------------+
|  e|  f|      follow|
|  c|  b|      follow|
+---+---+------------+



In [46]:
# Drop the vertices that have no edge in g2 and show the ones that remain.
g2.dropIsolatedVertices().vertices.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  f|  Fanny| 36|
|  e| Esther| 32|
|  c|Charlie| 30|
|  b|    Bob| 36|
+---+-------+---+



# Graph algorithms
## Breadth-first search (BFS)

查找从一个顶点（或一组顶点）到另一个顶点（或一组顶点）的最短路径。

In [50]:
# Print the documentation of GraphFrame.bfs. help() is plain Python, so this
# also works outside IPython and the text is captured in the cell output.
# Signature: g.bfs(fromExpr, toExpr, edgeFilter=None, maxPathLength=10)
help(g.bfs)

In [33]:
# Breadth-first search starting at "Esther" and ending at users of age < 32.
# The first two positional arguments are fromExpr and toExpr.
paths = g.bfs("name = 'Esther'", "age < 32")
paths.show()

+---------------+--------------+--------------+
|           from|            e0|            to|
+---------------+--------------+--------------+
|[e, Esther, 32]|[e, d, friend]|[d, David, 29]|
+---------------+--------------+--------------+



In [34]:
# Same search, restricted to non-"friend" edges and to paths of at most 3 edges.
g.bfs(
    fromExpr="name = 'Esther'",
    toExpr="age < 32",
    edgeFilter="relationship != 'friend'",
    maxPathLength=3,
).show()

+---------------+--------------+--------------+--------------+----------------+
|           from|            e0|            v1|            e1|              to|
+---------------+--------------+--------------+--------------+----------------+
|[e, Esther, 32]|[e, f, follow]|[f, Fanny, 36]|[f, c, follow]|[c, Charlie, 30]|
+---------------+--------------+--------------+--------------+----------------+



## Connected components

In [35]:
# Set the Spark checkpoint directory before running connectedComponents().
# NOTE(review): the GraphFrames guide says the default connected-components
# algorithm requires a checkpoint directory — confirm for the installed version.
sc.setCheckpointDir('./chpt')

In [36]:
# Compute the connected components; every vertex gets a `component` id.
# In the recorded output "g" is alone and all other vertices share one component.
result = g.connectedComponents()
result.select("id", "component").orderBy("component").show()

+---+------------+
| id|   component|
+---+------------+
|  g|146028888064|
|  a|412316860416|
|  f|412316860416|
|  b|412316860416|
|  d|412316860416|
|  e|412316860416|
|  c|412316860416|
+---+------------+



## Strongly connected components

In [58]:
# Compute the strongly connected components with at most 10 iterations.
# In the recorded output the groups are {g}, {f}, {a, d, e} and {b, c}.
result = g.stronglyConnectedComponents(maxIter=10)
result.select("id", "component").orderBy("component").show()

+---+-------------+
| id|    component|
+---+-------------+
|  g| 146028888064|
|  f| 412316860416|
|  a| 670014898176|
|  d| 670014898176|
|  e| 670014898176|
|  b|1047972020224|
|  c|1047972020224|
+---+-------------+



## Label Propagation Algorithm (LPA)
标签传播算法

https://blog.csdn.net/App_12062011/article/details/91353424

https://www.cnblogs.com/LittleHann/p/10699988.html

In [59]:
# Run label propagation for 5 iterations; every vertex gets a `label` column.
result = g.labelPropagation(maxIter=5)
result.select('id', 'label').show()

+---+-------------+
| id|        label|
+---+-------------+
|  g| 146028888064|
|  b|1047972020224|
|  e| 412316860416|
|  a| 670014898176|
|  f| 670014898176|
|  d| 670014898176|
|  c|1382979469312|
+---+-------------+



## PageRank

In [62]:
# Run PageRank until convergence to tolerance "tol".
results = g.pageRank(resetProbability=0.15, tol=0.01)
# The result is a GraphFrame with a new vertex column "pagerank" and a new edge column "weight".

results.vertices.select("id", "pagerank").show()

+---+-------------------+
| id|           pagerank|
+---+-------------------+
|  g| 0.1799821386239711|
|  b|  2.655507832863289|
|  e|0.37085233187676075|
|  a|0.44910633706538744|
|  f| 0.3283606792049851|
|  d| 0.3283606792049851|
|  c| 2.6878300011606218|
+---+-------------------+



In [63]:
# Show the "weight" column that PageRank added to the edges.
results.edges.select("src", "dst", "weight").show()

+---+---+------+
|src|dst|weight|
+---+---+------+
|  a|  b|   0.5|
|  b|  c|   1.0|
|  e|  f|   0.5|
|  e|  d|   0.5|
|  c|  b|   1.0|
|  a|  e|   0.5|
|  f|  c|   1.0|
|  d|  a|   1.0|
+---+---+------+



In [65]:
# Run PageRank for a fixed number of iterations (10) instead of a tolerance.
results2 = g.pageRank(resetProbability=0.15, maxIter=10)
results2.vertices.show()

+---+-------+---+-------------------+
| id|   name|age|           pagerank|
+---+-------+---+-------------------+
|  g|  Gabby| 60|0.17073170731707318|
|  b|    Bob| 36| 2.7025217677349773|
|  e| Esther| 32| 0.3613490987992571|
|  a|  Alice| 34| 0.4485115093698443|
|  f|  Fanny| 36|0.32504910549694244|
|  d|  David| 29|0.32504910549694244|
|  c|Charlie| 30| 2.6667877057849627|
+---+-------+---+-------------------+



In [67]:
# Run PageRank personalized for vertex "a" (sourceId), again for 10 iterations.
# In the recorded output "g" gets a pagerank of 0.0.
results3 = g.pageRank(resetProbability=0.15, maxIter=10, sourceId='a')
results3.vertices.show()

+---+-------+---+-------------------+
| id|   name|age|           pagerank|
+---+-------+---+-------------------+
|  g|  Gabby| 60|                0.0|
|  b|    Bob| 36| 0.3366143039702568|
|  e| Esther| 32|0.07657840357273027|
|  a|  Alice| 34|0.17710831642683564|
|  f|  Fanny| 36|0.03189213697274781|
|  d|  David| 29|0.03189213697274781|
|  c|Charlie| 30| 0.3459147020846817|
+---+-------+---+-------------------+



In [68]:
# Run personalized PageRank for the source vertices "a", "b", "c" and "d" in parallel.
results4 = g.parallelPersonalizedPageRank(
    resetProbability=0.15,
    sourceIds=["a", 'b', 'c', 'd'],
    maxIter=10,
)

In [71]:
# Show the untruncated `pageranks` vector of every vertex.
# NOTE(review): each vector has 4 entries, presumably one per id in sourceIds — confirm.
results4.vertices.show(truncate=False)

+---+-------+---+--------------------------------------------------------------------------------------------+
|id |name   |age|pageranks                                                                                   |
+---+-------+---+--------------------------------------------------------------------------------------------+
|g  |Gabby  |60 |(4,[0,1,2,3],[0.0,0.0,0.0,0.0])                                                             |
|b  |Bob    |36 |(4,[0,1,2,3],[0.3366143039702568,0.6309963479403321,0.36900365205966795,0.3188063856578461])|
|e  |Esther |32 |(4,[0,1,2,3],[0.07657840357273027,0.0,0.0,0.06378427394549561])                             |
|a  |Alice  |34 |(4,[0,1,2,3],[0.17710831642683564,0.0,0.0,0.15315680714546054])                             |
|f  |Fanny  |36 |(4,[0,1,2,3],[0.03189213697274781,0.0,0.0,0.027108316426835637])                            |
|d  |David  |29 |(4,[0,1,2,3],[0.03189213697274781,0.0,0.0,0.17710831642683564])                             |
|

## Shortest paths
计算从每个顶点到给定的地标（landmark）顶点集合的最短路径。
注意：计算时会考虑边的方向。

("a", "b", "friend"),
      ("b", "c", "follow"),
      ("c", "b", "follow"),
      ("f", "c", "follow"),
      ("e", "f", "follow"),
      ("e", "d", "friend"),
      ("d", "a", "friend"),
      ("a", "e", "friend")

In [37]:
# Shortest-path distances from every vertex to the landmark vertices "a" and "d".
results = g.shortestPaths(landmarks=['a', 'd'])
results.select("id", "distances").show()
# Example: the shortest distance from e to a is 2.

+---+----------------+
| id|       distances|
+---+----------------+
|  g|              []|
|  b|              []|
|  e|[d -> 1, a -> 2]|
|  a|[a -> 0, d -> 2]|
|  f|              []|
|  d|[d -> 0, a -> 1]|
|  c|              []|
+---+----------------+



## Triangle count
Computes the number of triangles passing through each vertex.

In [74]:
# Count the triangles passing through each vertex.
results = g.triangleCount()
results.select('id', 'count').show()
# The only triangle in this graph is e->d->a->e.

+---+-----+
| id|count|
+---+-----+
|  g|    0|
|  f|    0|
|  e|    1|
|  d|    1|
|  c|    0|
|  b|    0|
|  a|    1|
+---+-----+



# Saving and loading GraphFrames

In [None]:
# Locations of the saved vertex and edge DataFrames (placeholders: adjust to your cluster).
VERTICES_PATH = "hdfs://myLocation/vertices"
EDGES_PATH = "hdfs://myLocation/edges"

# Save vertices and edges as Parquet.
# mode("overwrite") keeps this cell re-runnable: the default save mode raises
# an error when the target path already exists.
g.vertices.write.mode("overwrite").parquet(VERTICES_PATH)
g.edges.write.mode("overwrite").parquet(EDGES_PATH)

# Load the vertices and edges back.
sameV = spark.read.parquet(VERTICES_PATH)
sameE = spark.read.parquet(EDGES_PATH)

# Create an identical GraphFrame.
sameG = GraphFrame(sameV, sameE)

# Message passing via AggregateMessages
- aggregateMessages: 在顶点之间发送消息，并汇总每个顶点的消息
- joins: 将消息聚合与原始图形合并


```
class graphframes.lib.AggregateMessages[source]
Collection of utilities usable with graphframes.GraphFrame.aggregateMessages().

dst
Reference for destination column, used for specifying messages.

edge
Reference for edge column, used for specifying messages.

static getCachedDataFrame(df)[source]
Create a new cached copy of a DataFrame.

This utility method is useful for iterative DataFrame-based algorithms. See Scala documentation for more details.

WARNING: This is NOT the same as DataFrame.cache().
The original DataFrame will NOT be cached.
msg
Reference for message column, used for specifying aggregation function.

src
Reference for source column, used for specifying messages.
```

In [38]:
# Print the documentation of GraphFrame.aggregateMessages. help() is plain Python,
# so this also works outside IPython and the text is captured in the cell output.
help(g.aggregateMessages)

In [39]:
from graphframes.lib import AggregateMessages as AM
from pyspark.sql.functions import sum as sqlsum

# For every vertex, sum the ages of the vertices it shares an edge with:
# each edge sends the destination's age to its source and the source's age
# to its destination, and the received messages are summed per vertex.
msgToSrc = AM.dst['age']
msgToDst = AM.src['age']
summed_ages = sqlsum(AM.msg).alias('summedAges')
agg = g.aggregateMessages(summed_ages, sendToSrc=msgToSrc, sendToDst=msgToDst)
agg.show()

+---+----------+
| id|summedAges|
+---+----------+
|  f|        62|
|  e|        99|
|  d|        66|
|  c|       108|
|  b|        94|
|  a|        97|
+---+----------+



TypeError: 'Column' object is not callable