# PageRank with GraphFrames
> À ouvrir avec un environnement Jupyter dans lequel le paquet `graphframes` est installé !

In [1]:
import os
import time

import findspark

findspark.init()
import pyspark
from graphframes import *
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.functions import lit, desc

In [2]:
# Get (or reuse) a SparkSession named 'PageRank' on the 'local' master.
spark = (
    SparkSession.builder
    .master('local')
    .appName('PageRank')
    .getOrCreate()
)

In [3]:
# Location of the tab-separated edge list (one "src<TAB>dst" pair per line).
# Set the PAGERANK_NETWORK_PATH environment variable to point the notebook at
# another file without editing this cell; when it is unset, the original
# author's local path is used so existing setups keep working.
PATH_NETWORK = os.environ.get(
    'PAGERANK_NETWORK_PATH',
    '/Users/rboyer/dev/pyspark/pagerank/data/test.txt',
)

In [4]:
# Timestamp taken before loading, plus the `beta` constant.
# NOTE(review): `beta` is presumably the damping factor (1 - resetProbability);
# neither `t1` nor `beta` is referenced in the cells visible here — confirm use.
t1 = time.time()
beta = 0.8

# Edge-list schema: two nullable string columns, "src" and "dst".
schema = StructType([
    StructField(column, StringType(), True)
    for column in ("src", "dst")
])

# Read the tab-separated edge list from PATH_NETWORK with the schema above.
transition_matrix = (
    spark.read
    .format('csv')
    .schema(schema)
    .options(header=None, sep='\t')
    .load(PATH_NETWORK)
)

# Build the vertex set: every distinct id that appears as an edge source...
initial_pr_source = transition_matrix \
    .select(transition_matrix['src'].alias('id')) \
    .distinct()
# ...or as an edge destination.
initial_pr_desti = transition_matrix \
    .select(transition_matrix['dst'].alias('id')) \
    .distinct()

# Merge both sides and drop ids that occur in both.
pagerank = initial_pr_source.union(initial_pr_desti).distinct()

In [5]:
# Print the first rows of the loaded edge list (up to 20, long cells truncated).
transition_matrix.show(n=20, truncate=True)

+---+---+
|src|dst|
+---+---+
|  1|  2|
|  1|  4|
|  1|  3|
|  2|  1|
|  2|  4|
|  3|  3|
|  4|  2|
|  4|  3|
|  1|  6|
|  5|  1|
+---+---+



In [6]:
# Print the distinct vertex ids (up to 20 rows, long cells truncated).
pagerank.show(n=20, truncate=True)

+---+
| id|
+---+
|  3|
|  5|
|  6|
|  1|
|  4|
|  2|
+---+



In [8]:
# Assemble the graph: `pagerank` supplies the vertices (column "id") and
# `transition_matrix` supplies the edges (columns "src" and "dst").
our_graph = GraphFrame(v=pagerank, e=transition_matrix)

In [None]:
# Run GraphFrames' PageRank on the graph, using a tolerance (`tol`) rather
# than a fixed iteration count.
# NOTE(review): tol=0.5 looks very loose for a convergence threshold — confirm
# this value is intentional.
page_ranks = our_graph.pageRank(
    resetProbability=0.20,
    tol=0.5,
)

In [None]:
# Show each vertex id next to its computed 'pagerank' value.
(
    page_ranks.vertices
    .select('id', 'pagerank')
    .show()
)