### Graph Frames

In [1]:
import os
import sys
os.environ["PYSPARK_SUBMIT_ARGS"]='pyspark-shell'
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()
conf.set("spark.app.name", "natasha pritykovskaya Graph Frames") 

spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [3]:
from graphframes import *

In [4]:
vertices = spark.createDataFrame([
    ("1", "Alex", 28, "M", "MIPT"),
    ("2", "Emeli", 28, "F", "MIPT"),
    ("3", "Natasha", 27, "F", "SPbSU"),
    ("4", "Pavel", 30, "M", "MIPT"),
    ("5", "Oleg", 35, "M", "MIPT"),
    ("6", "Ivan", 30, "M", "MSU"),
    ("7", "Ilya", 29, "M", "MSU")], 
["id", "name", "age", "gender", "university"])

In [5]:
edges = spark.createDataFrame([
    ("1", "2", "friend"),
    ("2", "1", "friend"),
    ("1", "3", "friend"),
    ("3", "1", "friend"),
    ("1", "4", "friend"),
    ("4", "1", "friend"),
    ("2", "3", "friend"),
    ("3", "2", "friend"),
    ("2", "5", "friend"),
    ("5", "2", "friend"),
    ("3", "4", "friend"),
    ("4", "3", "friend"),
    ("3", "5", "friend"),
    ("5", "3", "friend"),
    ("3", "6", "friend"),
    ("6", "3", "friend"),
    ("3", "7", "friend"),
    ("7", "3", "friend")
], ["src", "dst", "relationship"])

<img src="pics/graph_graphframes.png" width=500/>

In [6]:
g = GraphFrame(vertices, edges)

In [7]:
g.vertices.show()

+---+-------+---+------+----------+
| id|   name|age|gender|university|
+---+-------+---+------+----------+
|  1|   Alex| 28|     M|      MIPT|
|  2|  Emeli| 28|     F|      MIPT|
|  3|Natasha| 27|     F|     SPbSU|
|  4|  Pavel| 30|     M|      MIPT|
|  5|   Oleg| 35|     M|      MIPT|
|  6|   Ivan| 30|     M|       MSU|
|  7|   Ilya| 29|     M|       MSU|
+---+-------+---+------+----------+



In [8]:
g.edges.show()

+---+---+------------+
|src|dst|relationship|
+---+---+------------+
|  1|  2|      friend|
|  2|  1|      friend|
|  1|  3|      friend|
|  3|  1|      friend|
|  1|  4|      friend|
|  4|  1|      friend|
|  2|  3|      friend|
|  3|  2|      friend|
|  2|  5|      friend|
|  5|  2|      friend|
|  3|  4|      friend|
|  4|  3|      friend|
|  3|  5|      friend|
|  5|  3|      friend|
|  3|  6|      friend|
|  6|  3|      friend|
|  3|  7|      friend|
|  7|  3|      friend|
+---+---+------------+



In [9]:
g.vertices.filter("age > 30").show()

+---+----+---+------+----------+
| id|name|age|gender|university|
+---+----+---+------+----------+
|  5|Oleg| 35|     M|      MIPT|
+---+----+---+------+----------+



In [10]:
g.inDegrees.filter("inDegree >= 2").show(10)

+---+--------+
| id|inDegree|
+---+--------+
|  3|       6|
|  5|       2|
|  1|       3|
|  4|       2|
|  2|       3|
+---+--------+



# Кол-во треугольников

In [11]:
g.triangleCount().show()

+-----+---+-------+---+------+----------+
|count| id|   name|age|gender|university|
+-----+---+-------+---+------+----------+
|    0|  7|   Ilya| 29|     M|       MSU|
|    3|  3|Natasha| 27|     F|     SPbSU|
|    1|  5|   Oleg| 35|     M|      MIPT|
|    0|  6|   Ivan| 30|     M|       MSU|
|    2|  1|   Alex| 28|     M|      MIPT|
|    1|  4|  Pavel| 30|     M|      MIPT|
|    2|  2|  Emeli| 28|     F|      MIPT|
+-----+---+-------+---+------+----------+



### Компоненты связности

In [12]:
edges = spark.createDataFrame([
    ("1", "2", "friend"),
    ("1", "4", "friend"),
    ("3", "5", "friend"),
    ("3", "6", "friend"),
], ["src", "dst", "relationship"])

<img src="pics/graph_small_amount_of_links.png" width=500/>

In [13]:
g = GraphFrame(vertices, edges)

In [15]:
sc = spark.

sc.setCheckpointDir("/user/natalya.pritykovskaya/")
result = g.connectedComponents()
result.select("id", "component").orderBy("component").show()

AttributeError: 'SparkSession' object has no attribute 'setCheckpointDir'

In [None]:
g = g.dropIsolatedVertices()

In [None]:
g.vertices.show()

### Page rank

In [None]:
vertices = spark.createDataFrame([
    ("1", "Alex", 28, "M", "MIPT"),
    ("2", "Emeli", 28, "F", "MIPT"),
    ("3", "Natasha", 27, "F", "SPbSU"),
    ("4", "Pavel", 30, "M", "MIPT"),
    ("5", "Oleg", 35, "M", "MIPT"),
    ("6", "Ivan", 30, "M", "MSU"),
    ("7", "Ilya", 29, "M", "MSU")], 
["id", "name", "age", "gender", "university"])

In [None]:
edges = spark.createDataFrame([
    ("1", "2", "friend"),
    ("2", "1", "friend"),
    ("1", "3", "friend"),
    ("3", "1", "friend"),
    ("1", "4", "friend"),
    ("4", "1", "friend"),
    ("2", "3", "friend"),
    ("3", "2", "friend"),
    ("2", "5", "friend"),
    ("5", "2", "friend"),
    ("3", "4", "friend"),
    ("4", "3", "friend"),
    ("3", "5", "friend"),
    ("5", "3", "friend"),
    ("3", "6", "friend"),
    ("6", "3", "friend"),
    ("3", "7", "friend"),
    ("7", "3", "friend")
], ["src", "dst", "relationship"])

In [None]:
g = GraphFrame(vertices, edges)

In [None]:
results = g.pageRank(resetProbability=0.15, tol=0.01)
results.vertices.show()

In [None]:
results.edges.show()

In [None]:
dir(g)

# Положим реальный граф

In [53]:
graphPath = "/lectures/lecture04/trainGraph"
usersToPredictPath = "/lectures/lecture04/prediction.csv"

In [54]:
from pyspark.sql.types import *

schema = StructType(fields=[
    StructField("user", IntegerType()),
    StructField("friendsString", StringType())
])

data = spark.read.format("csv") \
        .schema(schema) \
        .option("delimiter", "\t") \
        .load(graphPath) 

In [55]:
from pyspark.sql.functions import udf
from pyspark.sql.functions import col, explode, collect_list, sort_array, size, split


def cutStartEndBrackets(s):
    return s[2:-2]

cutStartEndBracketsUDF = udf(cutStartEndBrackets, StringType())

userFriend = \
    data.select(col("user"), split(cutStartEndBracketsUDF(col("friendsString")), "\),\(").alias("friendsMasks"))\
    .withColumn("friendMask", explode('friendsMasks'))\
    .withColumn("dst", split(col("friendMask"), ",")[0])\
    .withColumn("src", col("user"))\
    .select(col("src").cast("integer"), col("dst").cast("integer"))

userFriendSymmetric = \
    userFriend\
        .withColumn("tmp", col("src"))\
        .withColumn("src", col("dst"))\
        .withColumn("dst", col("tmp"))\
        .select("src", "dst")
        
edges = userFriend.unionAll(userFriendSymmetric)

vertices = edges.select(col("src").alias("id")).distinct()

In [56]:
from graphframes import * 

g = GraphFrame(vertices, edges)

In [29]:
g.inDegrees.filter("inDegree > 20").count()

319719

In [33]:
spark.stop()