In [None]:
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=b651c85028de7c80aef2a6c277e73c741ccafdd0db842a7f82d32b39a7f407ef
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0
The following additional packages will be installed:
  libxtst6 openjdk-8-jre-headless
Suggested packages:
  openjdk-8-demo openjdk-8-source libnss-mdns fonts-dejavu-extra fonts-nanum fonts-ipafont-gothic
  fonts-ipafont-mincho fonts-wqy-microhei fonts-wqy-zenhei fonts-indic

In [None]:
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import *
import pandas as pd
import numpy as np

In [None]:
# initiate SparkContext
# getOrCreate() hands back the context that is already running (if any), so
# re-running this cell in a live kernel no longer fails the way constructing a
# second SparkContext does.
conf = SparkConf().setAppName("ChihaoShen").setMaster("local[*]")
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.getOrCreate()
# Last expression of the cell: display the session summary.
spark

In [None]:
# read data
# Each file is loaded lazily as an RDD of text lines via the SparkContext `sc`.
# NOTE(review): the paths assume Google Drive is already mounted at
# /content/drive; no visible cell performs the mount -- confirm.
graph_small = sc.textFile("/content/drive/MyDrive/CSC4008/ass/ass2/A3-data/graph-small.txt")
graph_full = sc.textFile("/content/drive/MyDrive/CSC4008/ass/ass2/A3-data/graph-full.txt")
# `data` is a second name bound to the same RDD as `graph_full`.
data = graph_full

In [None]:
# specify graph
# `n` sizes every per-node vector built in the cells below, and those vectors
# are indexed by (node id - 1), so it has to cover the ids found in `graph`.
n = 1000  # number of nodes
graph = graph_full

In [None]:
# PageRank
def set_matrix(x):
  """Turn one grouped adjacency record into a dense vector of link weights.

  Args:
    x: a pair `(key, (targets, count))` where `targets` is a list of integer
      positions and `count` is the number the weight is divided by.

  Returns:
    `(key, vector)` where `vector` has length `n` (module-level), holds
    `1 / count` at every position listed in `targets`, and 0 elsewhere.
  """
  source = x[0]
  targets, out_degree = x[1][0], x[1][1]
  column = np.zeros(n)
  # Fancy indexing writes the same weight to every listed position at once.
  column[targets] = 1 / out_degree
  return (source, column)

# change the starting node from 1 to 0
# Blank lines are dropped first: `"".split()` is an empty list and indexing it
# would abort the whole job.
edges = (
  graph.filter(lambda line: line.strip())
  .map(lambda line: line.split())
  .map(lambda x: (int(x[0]) - 1, int(x[1]) - 1))
  .distinct()  # a repeated edge is counted once
)
# Group by source node: (source, ([destinations], number_of_destinations)).
matrix = edges.map(lambda e: (e[0], ([e[1]], 1)))
matrix = matrix.reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1])).sortByKey()
# cache(): the iteration cell runs one Spark action per step on this RDD.
# Without persistence each of those jobs re-applies set_matrix (and re-walks
# the lineage) to rebuild the dense vectors.
matrix = matrix.map(set_matrix).cache()

In [None]:
# Iterative update r <- beta * (sum over records of vector * r[key]) + (1 - beta) / n,
# run for 40 rounds starting from the uniform vector 1/n.
beta = 0.8
r = np.full(n, 1/n)
for i in range(40):
  # Scale every record's vector by the current score of its key node ...
  weighted = matrix.map(lambda rec: rec[1] * r[rec[0]])
  # ... and add the scaled vectors together element-wise.
  r = weighted.reduce(lambda acc, vec: acc + vec)
  # Damp the result and add the same constant share to every node.
  r = beta * r + (1 - beta) / n

In [None]:
# argsort is ascending, so the last five positions carry the largest scores.
# Adding 1 converts the 0-based positions back to the file's 1-based ids, and
# the reversal puts the best node first.
top5 = (np.argsort(r)[-5:] + 1)[::-1]
print("The top 5 ids with the highest score are")
for i in range(5):
  node_id = top5[i]
  print(f"id: {node_id}; score: {r[node_id - 1]}")

The top 5 ids with the highest score are
id: 263; score: 0.0020202911815182184
id: 537; score: 0.0019433415714531497
id: 965; score: 0.0019254478071662631
id: 243; score: 0.001852634016241731
id: 285; score: 0.0018273721700645144


In [None]:
# The head of the ascending argsort holds the five smallest scores; +1 maps
# the 0-based positions back to 1-based ids.
ascending = np.argsort(r)
bottom5 = ascending[:5] + 1
print("The bottom 5 ids with the lowest score are")
for i in range(5):
  node_id = bottom5[i]
  print(f"id: {node_id}; score: {r[node_id - 1]}")

The bottom 5 ids with the lowest score are
id: 558; score: 0.0003286018525215297
id: 93; score: 0.00035135689375165774
id: 62; score: 0.00035314810510596274
id: 424; score: 0.0003548153864930145
id: 408; score: 0.00038779848719291705


In [None]:
# HITS
def set_matrix(x):
  """Turn one grouped adjacency record into a dense 0/1 indicator vector.

  NOTE: this rebinds the name `set_matrix` that the PageRank cell defined
  earlier in the notebook; from here on the name refers to this version.

  Args:
    x: a pair `(key, targets)` where `targets` is a list of integer positions.

  Returns:
    `(key, vector)` where `vector` has length `n` (module-level), holds 1 at
    every position listed in `targets`, and 0 elsewhere.
  """
  source, targets = x[0], x[1]
  row = np.zeros(n)
  # Fancy indexing marks every listed position in one assignment.
  row[targets] = 1
  return (source, row)

# change the starting node from 1 to 0
# Blank lines are dropped first: `"".split()` is an empty list and indexing it
# would abort the whole job.
edges = (
  graph.filter(lambda line: line.strip())
  .map(lambda line: line.split())
  .map(lambda x: (int(x[0]) - 1, int(x[1]) - 1))
  .distinct()  # a repeated edge is counted once
)
# Group by source node: (source, [destinations]), ordered by source id.
matrix = edges.map(lambda e: (e[0], [e[1]]))
matrix = matrix.reduceByKey(lambda x, y: x + y).sortByKey()
# cache(): the iteration cell runs two Spark actions per step on this RDD.
# Without persistence each of those jobs re-applies set_matrix (and re-walks
# the lineage) to rebuild the dense vectors.
matrix = matrix.map(set_matrix).cache()

In [None]:
# HITS iteration: alternate authority and hub updates for 40 rounds, rescaling
# each vector so that its largest entry is 1.
h = np.ones(n)
for i in range(40):
  # Authority step: add up every record's indicator vector weighted by the
  # current hub score of its key node.
  a = matrix.map(lambda x: x[1] * h[x[0]]).reduce(lambda x, y: x + y)
  a /= np.max(a)
  # Hub step: each key node's score is the sum of the authority scores it
  # points to. The key travels with the score and is used to place it, so the
  # result does not depend on the order of collect() or on every node owning a
  # record; nodes without a record keep a hub score of 0.
  hub_scores = matrix.map(lambda x: (x[0], x[1].dot(a))).collect()
  h = np.zeros(n)
  for node, score in hub_scores:
    h[node] = score
  h /= np.max(h)

In [None]:
# Ascending argsort: the tail holds the five largest hub scores. Shift to
# 1-based ids and flip so the strongest hub is printed first.
hub_order = np.argsort(h)
h_top5 = (hub_order[-5:] + 1)[::-1]
print("The top 5 ids with the highest hubbiness score are")
for i in range(5):
  node_id = h_top5[i]
  print(f"id: {node_id}; score: {h[node_id - 1]}")

The top 5 ids with the highest hubbiness score are
id: 840; score: 1.0
id: 155; score: 0.9499618624906541
id: 234; score: 0.8986645288972265
id: 389; score: 0.863417110184379
id: 472; score: 0.8632841092495217


In [None]:
# The first five entries of the ascending argsort are the weakest hubs;
# +1 turns the 0-based positions into 1-based ids.
h_bottom5 = 1 + np.argsort(h)[:5]
print("The bottom 5 ids with the lowest hubbiness score are")
for i in range(5):
  node_id = h_bottom5[i]
  print(f"id: {node_id}; score: {h[node_id - 1]}")

The bottom 5 ids with the lowest hubbiness score are
id: 23; score: 0.042066854890936534
id: 835; score: 0.057790593544330165
id: 141; score: 0.0645311764622518
id: 539; score: 0.06602659373418493
id: 889; score: 0.07678413939216454


In [None]:
# Take the five largest authority scores from the end of the ascending
# argsort, convert to 1-based ids, then reverse so the best comes first.
authority_order = np.argsort(a)
a_top5 = (authority_order[-5:] + 1)[::-1]
print("The top 5 ids with the highest authority score are")
for i in range(5):
  node_id = a_top5[i]
  print(f"id: {node_id}; score: {a[node_id - 1]}")

The top 5 ids with the highest authority score are
id: 893; score: 1.0
id: 16; score: 0.9635572849634397
id: 799; score: 0.9510158161074017
id: 146; score: 0.9246703586198443
id: 473; score: 0.899866197360405


In [None]:
# The head of the ascending argsort holds the five smallest authority scores;
# +1 maps the 0-based positions back to 1-based ids.
a_bottom5 = 1 + np.argsort(a)[:5]
print("The bottom 5 ids with the lowest authority score are")
for i in range(5):
  node_id = a_bottom5[i]
  print(f"id: {node_id}; score: {a[node_id - 1]}")

The bottom 5 ids with the lowest authority score are
id: 19; score: 0.056083163776076174
id: 135; score: 0.06653910487622795
id: 462; score: 0.075442286246419
id: 24; score: 0.08171239406816945
id: 910; score: 0.08571673456144878
