In [1]:
# TODO: Add comments and descriptions

# Coverage

# 0. Introduction

  # 0.1. Web/Graph Representation

# 1. Simplified PageRank

# 2. Dead-ends and Spider Traps

# 3. PageRank with Random Teleportation (Taxation)

# 4. Spam-Farms

# 5. TrustRank

# 6. Apply PageRank on a Real Dataset

# 7. One More Optimization

# References

## 0. Introduction

### 0.1. Web/Graph Representation

In [2]:
def toy_graph():
    G = dict()
    G[0] = [1, 2, 3]
    G[1] = [0, 3]
    G[2] = [0]
    G[3] = [1, 2]
    return G

## 1. Simplified Pagerank 

In [3]:
def pagerank(G, iteration_count=100):
    
    N = len(G.keys())
    next_rank_lst = [1/N for _ in range(N)]
    current_rank_lst = next_rank_lst[:]
    
    for i in range(iteration_count):
        current_rank_lst, next_rank_lst = next_rank_lst, current_rank_lst
        for j in range(N):
            next_rank_lst[j] = 0
        for node in G:
            if G[node]:
                contribution = current_rank_lst[node] / len(G[node])
                for edge in G[node]:
                    next_rank_lst[edge] += contribution
    
    return next_rank_lst

In [4]:
G = toy_graph()
rank_lst = pagerank(G)
print(rank_lst)
print(sum(rank_lst))

[0.3333333333333333, 0.2222222222222222, 0.2222222222222222, 0.2222222222222222]
1.0


## 2. Dead-ends and Spider Traps

### 2.1 Dead-ends

In [5]:
G = toy_graph()
G[2].clear()

In [6]:
rank_lst = pagerank(G)
print(rank_lst)
print(sum(rank_lst))

[3.4282767441682065e-15, 4.996463459841385e-15, 4.996463459841385e-15, 4.996463459841385e-15]
1.841766712369236e-14


### 2.2 Spider Traps

In [7]:
G = toy_graph()
G[2] = [2]

In [8]:
rank_lst = pagerank(G)
print(rank_lst)
print(sum(rank_lst))

[3.4282767441682065e-15, 4.996463459841385e-15, 0.9999999999999868, 4.996463459841385e-15]
1.0000000000000002


In [9]:
G = toy_graph()
G[2] = [4]
G[4] = [2]

In [10]:
rank_lst = pagerank(G)
print(rank_lst)
print(sum(rank_lst))

[2.742621395334566e-15, 3.997170767873109e-15, 0.4499999999999953, 3.997170767873109e-15, 0.5499999999999937]
0.9999999999999998


## 3. Pagerank with Random Teleportation

In [11]:
def pagerank(G, beta=0.85, iteration_count=100):
    
    N = len(G.keys())
    next_rank_lst = [1/N for _ in range(N)]
    current_rank_lst = next_rank_lst[:]
    
    for i in range(iteration_count):
        current_rank_lst, next_rank_lst = next_rank_lst, current_rank_lst
        for j in range(N):
            next_rank_lst[j] = (1 - beta) / N
        for node in G:
            if G[node]:
                contribution = beta * (current_rank_lst[node] / len(G[node]))
                for edge in G[node]:
                    next_rank_lst[edge] += contribution
        
        leakage_contribution = (1 - sum(next_rank_lst)) / N
        for j in range(N):
            next_rank_lst[j] += leakage_contribution
    
    return next_rank_lst

In [12]:
G = toy_graph()
G[2].clear()

In [13]:
rank_lst = pagerank(G, beta=0.8)
print(rank_lst)
print(sum(rank_lst))

[0.20833333333333334, 0.2638888888888889, 0.2638888888888889, 0.2638888888888889]
1.0


In [14]:
G = toy_graph()
G[2] = [2]

In [15]:
rank_lst = pagerank(G, beta=0.8)
print(rank_lst)
print(sum(rank_lst))

[0.10135135135135134, 0.12837837837837837, 0.6418918918918919, 0.12837837837837837]
1.0


## 4. Spam Farms

In [16]:
G = toy_graph()
G[2].clear()
for i in range(len(G.keys()), 100):
    G[i] = [2]
    G[2].append(i)

In [17]:
rank_lst = pagerank(G, beta=0.8)
print(rank_lst[:4])

[0.004054054054054079, 0.005135135135135165, 0.4409309308444677, 0.005135135135135165]


## 5. TrustRank

In [18]:
def pagerank(G, beta=0.85, iteration_count=100, teleport_lst=None):
    
    if not teleport_lst:
        teleport_lst = G.keys()
    
    N = len(G.keys())
    next_rank_lst = [1/N for _ in range(N)]
    current_rank_lst = next_rank_lst[:]
    
    teleport_lst_count = len(teleport_lst)
    
    for i in range(iteration_count):
        current_rank_lst, next_rank_lst = next_rank_lst, current_rank_lst
        for j in range(N):
            next_rank_lst[j] = 0
        for node in teleport_lst:
            next_rank_lst[node] = (1 - beta) / teleport_lst_count
        for node in G:
            if G[node]:
                contribution = beta * (current_rank_lst[node] / len(G[node]))
                for edge in G[node]:
                    next_rank_lst[edge] += contribution
        
        leakage_contribution = (1 - sum(next_rank_lst)) / N
        for j in range(N):
            next_rank_lst[j] += leakage_contribution
        
    return next_rank_lst

In [19]:
G = toy_graph()
G[2].clear()
for i in range(len(G.keys()), 100):
    G[i] = [2]
    G[2].append(i)

trust_lst = [1]
rank_lst = pagerank(G, beta=0.8, teleport_lst=trust_lst)
print(rank_lst[:4])

[0.11583011583011586, 0.28957528957528955, 0.2488202487173264, 0.14671814671814676]


## 6. Apply PageRank on a Real Dataset

In [20]:
# Download dataset

import gzip
import shutil
import urllib
import os

url = 'https://snap.stanford.edu/data/web-NotreDame.txt.gz'

filename = 'data.txt'
gzip_file = '%s.gz' % filename

# Download the dataset
is_download_required = not os.path.isfile(filename) 

if is_download_required:
    urllib.request.urlretrieve(url, gzip_file)
    
    with gzip.open(gzip_file, 'rb') as f_in:
        with open(filename, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

In [21]:
# Parse input file
G = dict()

with open(filename, 'r') as file:
    for line in file:
        if line[0] == '#':
            continue
        
        u, v = map(int, line.split())
        
        if u not in G:
            G[u] = list()
        
        G[u].append(v)

N = max(G.keys()) + 1
for i in range(N):
    if i not in G:
        G[i] = list()

In [22]:
rank_lst = pagerank(G)

In [23]:
rank_lst

[0.0054665477569531785,
 0.000479674316898441,
 0.00027508637392546846,
 0.00036790315479619837,
 0.0003595245504737839,
 0.00030499611604539067,
 0.00029261417107363043,
 0.00030023960041393184,
 0.0002836810796440879,
 0.0002861091929132,
 0.00027474915113494155,
 0.0003044988464747658,
 0.00029568396941778897,
 0.0003222889507037878,
 0.0007830841465847079,
 0.00038596375765807557,
 0.00044646728419846146,
 4.665982176571399e-05,
 4.643720778559399e-05,
 6.693632777419212e-05,
 6.853402163987906e-05,
 4.5990184796664115e-05,
 1.875088172277251e-05,
 3.488124437499418e-05,
 3.4551268475007655e-05,
 4.174618366275664e-05,
 1.875088172277251e-05,
 3.5055574204950165e-05,
 1.875088172277251e-05,
 4.6439407071194834e-05,
 1.875088172277251e-05,
 1.875088172277251e-05,
 2.2699774283056088e-05,
 5.2150947667285635e-05,
 2.6492447139288255e-05,
 0.00012556338459318751,
 3.436565390784405e-05,
 3.839842571652896e-05,
 3.10641964866535e-05,
 0.0002047046321651744,
 9.399087658886809e-05,
 6.9

## 7. One More Optimization

In [24]:
def pagerank(G, beta=0.85, iteration_count=100, teleport_lst=None, eps=1e-8):
    
    if not teleport_lst:
        teleport_lst = G.keys()
    
    N = len(G.keys())
    next_rank_lst = [1/N for _ in range(N)]
    current_rank_lst = next_rank_lst[:]
    
    teleport_lst_count = len(teleport_lst)
    
    for i in range(iteration_count):
        current_rank_lst, next_rank_lst = next_rank_lst, current_rank_lst
        for j in range(N):
            next_rank_lst[j] = 0
        for node in teleport_lst:
            next_rank_lst[node] = (1 - beta) / teleport_lst_count
        for node in G:
            if G[node]:
                contribution = beta * (current_rank_lst[node] / len(G[node]))
                for edge in G[node]:
                    next_rank_lst[edge] += contribution
        
        leakage_contribution = (1 - sum(next_rank_lst)) / N
        for j in range(N):
            next_rank_lst[j] += leakage_contribution
        
        total_diff = 0
        for c, n in zip(current_rank_lst, next_rank_lst):
            total_diff += abs(c - n)
        
        if total_diff < eps:
            return next_rank_lst
    
    return next_rank_lst

In [25]:
rank_lst = pagerank(G)

## References

 - Rajaraman, Anand, and Jeffrey David Ullman. Mining of massive datasets. Cambridge University Press, 2011.
