In [1]:
# Import the libraries we will need
import pandas as pd
import numpy as np

import findspark
findspark.init()

from pyspark.sql import *
import pyspark.sql.functions as F
from pyspark.sql.types import *

# create the Spark Session
spark = SparkSession.builder.appName("Q2").getOrCreate()

## PageRank

In [25]:
full = spark.sparkContext.textFile("hw3-bundle/hw3-bundle/pagerank_hits/data/graph-full.txt")
full = full.map(lambda x: (int(x.split()[0]), (int(x.split()[1])))).distinct() # (source, destination)
out_deg = full.map(lambda x: (x[0], 1)).reduceByKey(lambda v1, v2: v1 + v2).map(lambda x: (x[0], 1 / x[1]))
M = full.join(out_deg).map(lambda x: (x[1][0], x[0], x[1][1]))

beta = 0.8
n = 1000
r = np.ones(n) / n

for i in range(40):
    M_dot_r = M.map(lambda x: (x[0], x[2] * r[int(x[1] - 1)])).reduceByKey(lambda v1, v2: v1 + v2).sortByKey()
    M_dot_r_np = np.array(M_dot_r.map(lambda x: x[1]).collect())
    r = beta * M_dot_r_np + (1 - beta) / n

    print(f"At iteration {i}, top 5 scores are {np.sort(r)[-5:]}")

At iteration 0, top 5 scores are [0.00180945 0.00181908 0.00184313 0.00186635 0.0018759 ]
At iteration 1, top 5 scores are [0.00181202 0.00182504 0.00187849 0.00196126 0.00208872]
At iteration 2, top 5 scores are [0.00184116 0.00185567 0.0019266  0.00193022 0.00202381]
At iteration 3, top 5 scores are [0.00182881 0.00185143 0.00192724 0.00194234 0.00201246]
At iteration 4, top 5 scores are [0.00182763 0.00185305 0.00192584 0.00194528 0.0020213 ]
At iteration 5, top 5 scores are [0.00182735 0.00185279 0.00192504 0.00194323 0.00202065]
At iteration 6, top 5 scores are [0.00182736 0.00185251 0.00192537 0.00194333 0.00202037]
At iteration 7, top 5 scores are [0.00182738 0.0018526  0.00192548 0.00194333 0.00202035]
At iteration 8, top 5 scores are [0.00182739 0.00185262 0.00192545 0.00194335 0.0020203 ]
At iteration 9, top 5 scores are [0.00182738 0.00185263 0.00192545 0.00194334 0.00202029]
At iteration 10, top 5 scores are [0.00182737 0.00185263 0.00192545 0.00194334 0.00202029]
At iterat

In [26]:
# top 5
print(f"top 5 node ids and scores are {np.argpartition(r, -5)[-5:] + 1} and {np.partition(r, -5)[-5:]}")

top 5 node ids and scores are [285 243 965 263 537] and [0.00182737 0.00185263 0.00192545 0.00202029 0.00194334]


In [27]:
# bottom 5
print(f"bottom 5 node ids and scores are {np.argpartition(r, 5)[:5] + 1} and {np.partition(r, 5)[:5]}")

bottom 5 node ids and scores are [558  93 424  62 408] and [0.0003286  0.00035136 0.00035482 0.00035315 0.0003878 ]


## HITS

In [51]:
full = spark.sparkContext.textFile("hw3-bundle/hw3-bundle/pagerank_hits/data/graph-full.txt")
L = full.map(lambda x: (int(x.split()[0]), (int(x.split()[1])))).distinct() # (source, destination)
L_T = L.map(lambda x: (x[1], x[0]))

n = 1000
lam = 1
mu = 1

h = np.ones(n)

for i in range(40):
    L_T_h = L_T.map(lambda x: (x[0], h[int(x[1] - 1)])).reduceByKey(lambda v1, v2: v1 + v2).sortByKey()
    a = np.array(L_T_h.map(lambda x: x[1]).collect())
    a = a / a.max()

    L_a = L.map(lambda x: (x[0], a[int(x[1] - 1)])).reduceByKey(lambda v1, v2: v1 + v2).sortByKey()
    h = np.array(L_a.map(lambda x: x[1]).collect())
    h = h / h.max()

    print(f"At iteration {i}, top 5 hubbiness scored nodes are {np.argpartition(h, -5)[-5:] + 1}")
    print(f"At iteration {i}, top 5 hubbiness scores are {np.partition(h, -5)[-5:]}")
    print(f"At iteration {i}, bottom 5 hubbiness scored nodes are {np.argpartition(h, 5)[:5] + 1}")
    print(f"At iteration {i}, bottom 5 hubbiness scores are {np.partition(h, 5)[:5]}")

    print(f"At iteration {i}, top 5 authority scored nodes are {np.argpartition(a, -5)[-5:] + 1}")
    print(f"At iteration {i}, top 5 authority scores are {np.partition(a, -5)[-5:]}")
    print(f"At iteration {i}, bottom 5 authority scored nodes are {np.argpartition(a, 5)[:5] + 1}")
    print(f"At iteration {i}, bottom 5 authority scores are {np.partition(a, 5)[:5]}")

At iteration 0, top 5 hubbiness scored nodes are [444 472 234 155 840]
At iteration 0, top 5 hubbiness scores are [0.85714286 0.86813187 0.93406593 0.94505495 1.        ]
At iteration 0, bottom 5 hubbiness scored nodes are [141  23 835 539 890]
At iteration 0, bottom 5 hubbiness scores are [0.08791209 0.06593407 0.08791209 0.09340659 0.10989011]
At iteration 0, top 5 authority scored nodes are [533 893 146  16 502]
At iteration 0, top 5 authority scores are [0.9375 0.9375 1.     1.     1.    ]
At iteration 0, bottom 5 authority scored nodes are [408 424 558  93  62]
At iteration 0, bottom 5 authority scores are [0.125 0.125 0.125 0.125 0.125]
At iteration 1, top 5 hubbiness scored nodes are [389 155 234 840 472]
At iteration 1, top 5 hubbiness scores are [0.86108802 0.94488657 0.91572633 1.         0.86459199]
At iteration 1, bottom 5 hubbiness scored nodes are [141  23 835 539 890]
At iteration 1, bottom 5 hubbiness scores are [0.06811973 0.04483905 0.0626559  0.07025775 0.08540207]
A

In [52]:
# top 5
print(f"top 5 node ids and hubbiness scores are {np.argpartition(h, -5)[-5:] + 1} and {np.partition(h, -5)[-5:]}")
print(f"top 5 node ids and authority scores are {np.argpartition(a, -5)[-5:] + 1} and {np.partition(a, -5)[-5:]}")

top 5 node ids and hubbiness scores are [472 389 234 155 840] and [0.86328411 0.86341711 0.89866453 0.94996186 1.        ]
top 5 node ids and authority scores are [473 799  16 893 146] and [0.8998662  0.95101582 0.96355728 1.         0.92467036]


In [53]:
# bottom 5
print(f"bottom 5 node ids and hubbiness scores are {np.argpartition(h, 5)[:5] + 1} and {np.partition(h, 5)[:5]}")
print(f"bottom 5 node ids and authority scores are {np.argpartition(a, 5)[:5] + 1} and {np.partition(a, 5)[:5]}")

bottom 5 node ids and hubbiness scores are [ 23 835 539 141 889] and [0.04206685 0.05779059 0.06602659 0.06453118 0.07678414]
bottom 5 node ids and authority scores are [ 19 462 135  24 910] and [0.05608316 0.07544229 0.0665391  0.08171239 0.08571673]
