In [171]:
import numpy as np
import pandas as pd
import random

In [172]:
# Open the dataset and parse its header line: "<number of pages> <number of links>".
# The handle is intentionally left open here; the following cells keep reading
# from it and the matrix-building cell closes it.
file = open("hollins.dat", mode="r")
first_line = file.readline().strip().split(" ")
n_pages, n_links = int(first_line[0]), int(first_line[1])
print(f"Number of pages -> {n_pages}")
print(f"Number of links -> {n_links}")

Number of pages -> 6012
Number of links -> 23875


In [173]:
# Collect the URL of every page: consume the next n_pages lines of the
# already-open file and keep the second space-separated field of each one.

page_names = []
for n_read, line in enumerate(file, start=1):
    page_names.append(line.strip().split(" ")[1])
    if n_read >= n_pages:
        # stop here so the remaining lines are left unread in the file handle
        break
print(f"Size page_names -> {len(page_names)}")
print(f"Page 1 -> {page_names[0]}")

Size page_names -> 6012
Page 1 -> http://www1.hollins.edu/


In [174]:
# Link matrix A: for every remaining line "<p1> <p2>" of the file, set
# A[p2-1, p1-1] = 1, i.e. the first field selects the column and the second
# field selects the row (ids are shifted by one to get 0-based indices).

A = np.zeros((n_pages, n_pages), dtype=float)
for line in file:
    fields = line.strip().split(" ")
    col_page, row_page = int(fields[0]), int(fields[1])
    A[row_page - 1, col_page - 1] = 1
# every line has been consumed: the handle is no longer needed
file.close()

# sanity checks on the freshly built matrix
print(f"Shape A -> {A.shape}")
print(f"Sum of ones in A = {np.sum(A)} should be equal to n_links = {n_links}")
print(f"nan values -> {np.sum(np.isnan(A))}")

Shape A -> (6012, 6012)
Sum of ones in A = 23875.0 should be equal to n_links = 23875
nan values -> 0


In [None]:
# Each column should sum to 1.
# As long as there are dangling nodes (all-zero columns), I have to make them
# vote equally for every other page.

# Self-links are discarded FIRST: a page whose only outgoing link points to
# itself must be detected as dangling here, otherwise its column would become
# all zeros after the diagonal is cleared and the normalization below would
# produce NaN (0/0).
np.fill_diagonal(A, 0)

mask = np.sum(A, axis=0) == 0
print("Number of dangling nodes before -> ", mask.sum())
print(mask)

# A dangling page links to every page except itself.
A[:, mask] = 1
np.fill_diagonal(A, 0)  # in place, no dense n x n temporaries needed

# finally I can normalize the matrix (each column divided by its own sum)
A = A/np.sum(A, axis = 0)
print("Number of dangling nodes now -> ", (np.sum(A, axis=0) == 0).sum())

Number of dangling nodes before ->  3189
[False False  True ...  True  True  True]
Number of dangling nodes now ->  0


In [176]:
# Verify that A really is column-stochastic before using it.
print(f"Final A.shape -> {A.shape}")
column_sums = np.sum(A, axis = 0)
no_outgoing = A.shape[0]-np.count_nonzero(column_sums)
print(f"Number of pages with NO outgoing links -> {no_outgoing}")
# "No all-zero column" is not enough: every entry must be non-negative and
# every column must sum to 1. np.allclose is False for NaN sums, so a matrix
# damaged by a 0/0 normalization is rejected as well.
is_column_stochastic = bool(np.all(A >= 0)) and bool(np.allclose(column_sums, 1.0))
if (no_outgoing==0) and is_column_stochastic:
    print("The matrix is columns-stochastic!")
else:
    print("The matrix isn't columns-stochastic yet, revise your code")

Final A.shape -> (6012, 6012)
Number of pages with NO outgoing links -> 0
The matrix is columns-stochastic!


In [177]:
# Google matrix M = (1 - m) * A + m * S.
# S has every entry equal to 1/n, so each page also receives a small uniform
# share of score; this is what handles a web split into several subwebs.
S = np.ones(A.shape, dtype=float) / len(A)
# m is a free parameter; 0.15 is the value tried first (default)
m = 0.15
# convex combination of A and S
M = m * S + (1 - m) * A
print(f"Shape M -> {M.shape}")

Shape M -> (6012, 6012)


In [178]:
# find the highest ranked page
# computing every eigenvalue and eigenvector is not feasible because the matrix is far too big,
# so I have to apply the power method

def power_method(A, max_iterations=1000, tolerance=1e-10, rng=None):
    """Estimate the dominant eigenvalue of ``A`` and an eigenvector by power iteration.

    Starting from a random vector with unit Euclidean norm, the vector is
    repeatedly multiplied by ``A`` and re-normalized. The eigenvalue estimate
    is the Rayleigh quotient ``x @ (A @ x)`` of the current unit vector.

    Parameters
    ----------
    A : array-like, square, supporting ``A.shape`` and ``A @ x``
        Matrix whose dominant eigenpair is sought.
    max_iterations : int, default 1000
        Upper bound on the number of multiplications by ``A``.
    tolerance : float, default 1e-10
        The iteration stops once two consecutive eigenvalue estimates differ
        by less than this value.
    rng : numpy.random.Generator, optional
        Source of the random start vector. When omitted, the global NumPy
        random state (``np.random.rand``) is used.

    Returns
    -------
    eigenvalue : float
        Last eigenvalue estimate.
    x : numpy.ndarray
        Unit-norm vector the returned estimate was computed from.
    iterations : int
        Number of completed vector updates; the pass on which the stopping
        test succeeded is not counted.
    """
    n = A.shape[0]
    # x is initialized as a random vector of n elements
    x = np.random.rand(n) if rng is None else rng.random(n)
    x = x / np.linalg.norm(x)  # normalization of x

    eigenvalue = 0.0  # defined even when max_iterations == 0
    eigenvalue_old = 0
    iterations = 0

    for _ in range(max_iterations):
        # Multiply by A
        y = A @ x
        # Estimate eigenvalue (Rayleigh quotient, x has unit norm)
        eigenvalue = x @ y
        y_norm = np.linalg.norm(y)
        if y_norm == 0:
            # A maps x to the zero vector: normalizing would be 0/0 and every
            # later iterate would be NaN, so stop with the last valid vector.
            break
        # Check for convergence
        if np.abs(eigenvalue - eigenvalue_old) < tolerance:
            break
        # Normalize the vector
        x = y / y_norm
        eigenvalue_old = eigenvalue
        iterations += 1

    return eigenvalue, x, iterations

In [179]:
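# Alternative implementation: damped PageRank iteration applied to A directly (M is never formed).
# The eigenvalue is not computed here; the constant 1 is returned in its place.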
def my_power_method(A, m, max_iterations=1000, tolerance=1e-10, rng=None):
    """Run the damped PageRank iteration ``x <- (1 - m) * A x + m * s``.

    ``s`` is the uniform vector with entries ``1/n``. After every step ``x``
    is rescaled so that its entries sum to 1. The dense matrix
    ``(1 - m) * A + m * S`` is never built.

    Parameters
    ----------
    A : array-like, square, supporting ``A.shape`` and ``A @ x``
        Link matrix (intended to be column-stochastic).
    m : float
        Weight of the uniform vector ``s`` in every step.
    max_iterations : int, default 1000
        Upper bound on the number of steps.
    tolerance : float, default 1e-10
        The iteration stops once the 1-norm of the change between two
        consecutive vectors drops below this value.
    rng : numpy.random.Generator, optional
        Source of the random start vector. When omitted, the global NumPy
        random state (``np.random.rand``) is used.

    Returns
    -------
    eigenvalue : int
        Always the constant ``1``; it is not computed from the iteration.
    x : numpy.ndarray
        Final score vector, entries summing to 1.
    iterations : int
        Number of steps that did not yet satisfy the stopping test.
    """
    n = A.shape[0]
    # Initialize with a random vector whose entries sum to 1
    x = np.random.rand(n) if rng is None else rng.random(n)
    x = x / np.sum(x)
    s = np.ones(n) / n
    # loop-invariant part of the update
    teleport = m * s

    iterations = 0

    for _ in range(max_iterations):
        # x is rebound below, never modified in place, so no copy is needed
        x_old = x
        # PageRank iteration. The parentheses matter: `(1 - m) * A @ x` would
        # evaluate as `((1 - m) * A) @ x` and rescale the whole n x n matrix
        # on every pass.
        x = (1 - m) * (A @ x) + teleport
        x = x / np.sum(x)

        if np.linalg.norm(x - x_old, 1) < tolerance:
            break

        iterations += 1

    return 1, x, iterations

In [180]:
# Dominant eigenpair of the Google matrix M via the generic power method.
max_eigenvalue, eigenspace_basis, n_iterations = power_method(M)
# Swap in the line below to use the PageRank-specific iteration instead:
# max_eigenvalue, eigenspace_basis, n_iterations = my_power_method(A, m)
tol = 1e-7
print(max_eigenvalue)
# The dominant eigenvalue is expected to be 1; anything farther than tol from
# it (NaN included) is reported as a failed run.
if not (np.abs(max_eigenvalue-1) < tol):
    print(f"Power method did not fully converged in {n_iterations} iterations")
else:
    print(f"Power method converged correctly in {n_iterations} iterations")

# Rescale the eigenvector so that its entries sum to 1.
# The slice keeps the first n_pages entries; M is n_pages x n_pages, so the
# vector already has exactly n_pages entries and nothing is dropped.
eigenspace_basis = eigenspace_basis[:n_pages]
normalized = eigenspace_basis / eigenspace_basis.sum()

0.9999999999255273
Power method converged correctly in 105 iterations


In [181]:
# Best page: index of the largest score (shown 1-based, like the ids in the data file)
highest_ranked_page = normalized.argmax()
print(f"Highest ranked page is {highest_ranked_page+1} -> {np.max(normalized):.6f}")

# Leaderboard with the top_n_pages best pages
top_n_pages = 20
print(f"\nIf I want to see the top {top_n_pages} highest ranked pages:")
# sorting the negated scores in ascending order yields descending scores
top_indices = (-normalized).argsort()
for i in range(top_n_pages):
    print(f"{i+1} ({normalized[top_indices[i]]:.6f} points). {top_indices[i]+1} -> {page_names[top_indices[i]]}")

Highest ranked page is 2 -> 0.019879

If I want to see the top 20 highest ranked pages:
1 (0.019879 points). 2 -> http://www.hollins.edu/
2 (0.009288 points). 37 -> http://www.hollins.edu/admissions/visit/visit.htm
3 (0.008611 points). 38 -> http://www.hollins.edu/about/about_tour.htm
4 (0.008065 points). 61 -> http://www.hollins.edu/htdig/index.html
5 (0.008027 points). 52 -> http://www.hollins.edu/admissions/info-request/info-request.cfm
6 (0.007165 points). 43 -> http://www.hollins.edu/admissions/apply/apply.htm
7 (0.006583 points). 425 -> http://www.hollins.edu/academics/library/resources/web_linx.htm
8 (0.005989 points). 27 -> http://www.hollins.edu/admissions/admissions.htm
9 (0.005572 points). 28 -> http://www.hollins.edu/academics/academics.htm
10 (0.004453 points). 4023 -> http://www1.hollins.edu/faculty/saloweyca/clas%20395/Sculpture/sld001.htm
11 (0.004385 points). 29 -> http://www.hollins.edu/grad/coedgrad.htm
12 (0.003778 points). 5254 -> http://www1.hollins.edu/faculty/sa