# Graph analysis with GraphFrames

This notebook shows you how to use GraphFrames to perform graph analysis. 


Check PySpark installation

In [None]:
import findspark

# Initialise findspark before importing pyspark, so the import below resolves
# against the Spark installation that findspark locates.
findspark.init()

import pyspark

# Last expression of the cell: displays the Spark home directory that was found.
# (The original called find() twice; the first result was discarded.)
findspark.find()

'D:\\spark-3.1.2-bin-hadoop3.2'

In [2]:

import pyspark

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Run Spark with the 'local' master under the application name 'appName'.
conf = SparkConf().setAppName('appName').setMaster('local')

# getOrCreate() hands back the already-running context if there is one, so this
# cell can be re-run without failing with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc)

In [7]:
# Smoke test: distribute four integers, square each one, and collect the
# results back into a Python list (displayed as the cell output).
nums = sc.parallelize([1, 2, 3, 4])
nums.map(lambda value: value * value).collect()

[1, 4, 9, 16]

Create DataFrames

In [4]:
spark = SparkSession(sc)

# One tuple per person; the column names are given by the schema list below.
vertex_rows = [
    ('1', 'Carter', 'Derrick', 50),
    ('2', 'May', 'Derrick', 26),
    ('3', 'Mills', 'Jeff', 80),
    ('4', 'Hood', 'Robert', 65),
    ('5', 'Banks', 'Mike', 93),
    ('98', 'Berg', 'Tim', 28),
    ('99', 'Page', 'Allan', 16),
]
vertices = spark.createDataFrame(
    vertex_rows,
    schema=['id', 'name', 'firstname', 'age'],
)

# One tuple per directed relationship: (source id, destination id, relationship type).
edge_rows = [
    ('1', '2', 'friend'),
    ('2', '1', 'friend'),
    ('3', '1', 'friend'),
    ('1', '3', 'friend'),
    ('2', '3', 'follows'),
    ('3', '4', 'friend'),
    ('4', '3', 'friend'),
    ('5', '3', 'friend'),
    ('3', '5', 'friend'),
    ('4', '5', 'follows'),
    ('98', '99', 'friend'),
    ('99', '98', 'friend'),
]
edges = spark.createDataFrame(
    edge_rows,
    schema=['src', 'dst', 'type'],
)

In [5]:
import pyspark

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Register the GraphFrames jar with the running SparkContext before the notebook
# imports `graphframes`.
# The path is a raw string: in a normal string literal the sequences \s, \j and \g
# are invalid escapes, which newer Python versions warn about. The resulting path
# value is unchanged.
# NOTE(review): the jar name targets Spark 2.4 / Scala 2.11 while the install
# directory is spark-3.1.2 -- confirm this is the intended GraphFrames build.
sc.addPyFile(r'D:\spark-3.1.2-bin-hadoop3.2\jars\graphframes-0.8.1-spark2.4-s_2.11.jar')
spark = SparkSession(sc)

In [6]:
from functools import reduce
from pyspark.sql.functions import col, lit, when
from graphframes import *

Building the Graph

In [11]:
# Build the graph from the vertex and edge DataFrames created earlier.
g = GraphFrame(vertices, edges)

# Printing the GraphFrame gives a one-line summary of the vertex and edge schemas.
print(g)


GraphFrame(v:[id: string, name: string ... 2 more fields], e:[src: string, dst: string ... 1 more field])


In [12]:
# Show, in order: the vertex DataFrame, the edge DataFrame, and the degree
# table (the number of edges attached to each vertex).
for frame in (g.vertices, g.edges, g.degrees):
    frame.show()

+---+------+---------+---+
| id|  name|firstname|age|
+---+------+---------+---+
|  1|Carter|  Derrick| 50|
|  2|   May|  Derrick| 26|
|  3| Mills|     Jeff| 80|
|  4|  Hood|   Robert| 65|
|  5| Banks|     Mike| 93|
| 98|  Berg|      Tim| 28|
| 99|  Page|    Allan| 16|
+---+------+---------+---+

+---+---+-------+
|src|dst|   type|
+---+---+-------+
|  1|  2| friend|
|  2|  1| friend|
|  3|  1| friend|
|  1|  3| friend|
|  2|  3|follows|
|  3|  4| friend|
|  4|  3| friend|
|  5|  3| friend|
|  3|  5| friend|
|  4|  5|follows|
| 98| 99| friend|
| 99| 98| friend|
+---+---+-------+

+---+------+
| id|degree|
+---+------+
|  3|     7|
| 98|     2|
| 99|     2|
|  5|     3|
|  1|     4|
|  4|     3|
|  2|     3|
+---+------+



We can find the age of the youngest person in the graph:

In [13]:
# Aggregate over all vertices (groupBy() with no grouping columns) to get the
# minimum value of the "age" column.
youngest = g.vertices.groupBy().min("age")

# print() on a DataFrame shows only its schema, not its rows.
print(youngest)

# Reuse the DataFrame built above instead of constructing the same query a second time.
youngest.show()

DataFrame[min(age): bigint]
+--------+
|min(age)|
+--------+
|      16|
+--------+



In [14]:
# People older than 30.
g.vertices.where("age > 30").show()

# Vertices with at least two incoming edges, highest in-degree first.
g.inDegrees.where("inDegree >= 2").orderBy("inDegree", ascending=False).show()

# Keep only the "friend" edges. As the cell's last expression this displays the
# DataFrame's repr (its schema), not its rows.
g.edges.where('type == "friend"')

+---+------+---------+---+
| id|  name|firstname|age|
+---+------+---------+---+
|  1|Carter|  Derrick| 50|
|  3| Mills|     Jeff| 80|
|  4|  Hood|   Robert| 65|
|  5| Banks|     Mike| 93|
+---+------+---------+---+

+---+--------+
| id|inDegree|
+---+--------+
|  3|       4|
|  5|       2|
|  1|       2|
+---+--------+



DataFrame[src: string, dst: string, type: string]