In [2]:
import numpy as np
import operator

In [None]:
def build_link_matrix_A(filename):
    """
    Builds the dense, column-stochastic Link Matrix A from a .dat dataset file.

    File layout, as parsed by this function:
        1. A header line "N M". Only N (the number of nodes) is used.
        2. N lines that are skipped (the URL mapping).
        3. Edge lines of the form "Source_ID Target_ID" with 1-based IDs.

    Edge handling:
        - Lines that are not exactly two integer tokens are ignored.
        - Edges whose source or target lies outside 1..N are ignored.
        - Repeated (source, target) pairs are counted once.
      The out-degree n_j of page j is the number of distinct, in-range targets
      it links to, so every non-dangling column of A sums to exactly 1.

    Dangling nodes (pages with no usable outgoing edge) get their all-zero
    column replaced by the uniform value 1/N.

    Args:
        filename (str): The path to the .dat file.
    Returns:
        tuple: (A, N)
            - A (numpy.ndarray): The constructed N x N matrix, with
              A[i, j] = 1 / n_j when page j+1 links to page i+1.
            - N (int): The total number of nodes (pages).
        If the file cannot be read, the header is invalid, or N is 0,
        the function prints the reason (for read/parse errors) and
        returns (None, 0).
    """

    # A set collapses repeated edges so they cannot inflate the out-degree.
    edges = set()
    parsed_edge_count = 0

    # =========================================================================
    # --- PART 1: FILE READING AND PRE-PROCESSING ---
    # =========================================================================
    try:
        with open(filename, 'r') as file:

            # 1. Read Header: "N M" (Nodes, Edges). N sizes the matrix.
            header_line = file.readline().strip().split()
            if not header_line or len(header_line) < 2:
                raise ValueError("The file must start with a valid 'N M' line.")

            N = int(header_line[0])
            if N < 0:
                raise ValueError("The number of nodes N cannot be negative.")

            # 2. Skip the N URL mapping lines; only numeric IDs are needed.
            print(f"Skipping the first {N} URL mapping lines...")
            for _ in range(N):
                file.readline()

            # 3. Collect the edges (links).
            for line in file:
                parts = line.split()
                if len(parts) != 2:
                    continue
                try:
                    source_id = int(parts[0])
                    target_id = int(parts[1])
                except ValueError:
                    # Malformed line: skip it and keep reading.
                    continue

                parsed_edge_count += 1
                # Keep only edges that map to a valid matrix cell.
                if 1 <= source_id <= N and 1 <= target_id <= N:
                    edges.add((source_id, target_id))

            print(f"Read {parsed_edge_count} valid edges.")

    except (OSError, ValueError) as e:
        # OSError: the file cannot be opened/read.
        # ValueError: bad header or undecodable text.
        print(f"Error processing the file: {e}")
        return None, 0

    ignored_edge_count = parsed_edge_count - len(edges)
    if ignored_edge_count > 0:
        print(f"Ignored {ignored_edge_count} duplicate or out-of-range edges.")

    if N == 0:
        return None, 0

    # =========================================================================
    # --- PART 2: BASIC CONSTRUCTION OF A (The Link Matrix) ---
    # =========================================================================

    # out_degree[j] = number of distinct pages that page j+1 links to (n_j).
    out_degree = np.zeros(N, dtype=int)
    for source_id, _ in edges:
        out_degree[source_id - 1] += 1

    A = np.zeros((N, N))

    # Rule: A[i, j] = 1 / n_j  (j = source column, i = destination row).
    # IDs in the file are 1-based; matrix indices are 0-based.
    for source_id, target_id in edges:
        source_idx = source_id - 1
        A[target_id - 1, source_idx] = 1.0 / out_degree[source_idx]

    # =========================================================================
    # --- PART 3: APPLYING THE PATCH ---
    # =========================================================================
    # A dangling node has an all-zero column; replace it with the uniform 1/N.
    dangling = out_degree == 0
    A[:, dangling] = 1.0 / N
    dangling_count = int(dangling.sum())

    print(f"Patch applied to {dangling_count} dangling nodes out of {N} total.")
    print("The resulting matrix is now perfectly column-stochastic (column sums = 1).")

    print(f"Link matrix {N}x{N} ready for calculation.")
    return A, N

In [1]:
def calculate_pagerank(L_matrix, m, max_iter=200, tolerance=1e-7):
    """
    Compute the PageRank using the Power Method.

    This function implements the iterative formula:
    x(k+1) = (1 - m) * A * x(k) + m * s

    where s is the uniform vector with entries 1/N. The matrix
    M = (1 - m) * A + m * S is never built explicitly.

    Args:
        L_matrix (numpy.ndarray): The square Link Matrix A (N x N). Objects
            without a ``shape`` attribute (e.g. nested lists) are converted
            with ``np.asarray``.
        m (float): Teleportation probability in [0, 1]; (1-m) is the Damping Factor.
        max_iter (int): Maximum number of iterations to perform.
        tolerance (float): Convergence threshold on the L1 distance between
            two consecutive iterates. Iteration stops once it drops below this.

    Returns:
        tuple: (PageRank_eigenvector, number_of_iterations)
            - PageRank_eigenvector (numpy.ndarray): column vector of shape (N, 1).
            - number_of_iterations (int): iterations actually performed
              (0 when max_iter is 0; max_iter if convergence was not reached).

    Raises:
        ValueError: if L_matrix is None, not a non-empty square 2-D matrix,
            or if m is outside [0, 1].
    """

    # --- Input validation ----------------------------------------------------
    if L_matrix is None:
        raise ValueError("L_matrix is None: the link matrix was not built.")
    if not hasattr(L_matrix, "shape"):
        L_matrix = np.asarray(L_matrix, dtype=float)

    shape = L_matrix.shape
    if len(shape) != 2 or shape[0] != shape[1]:
        raise ValueError(f"L_matrix must be a square 2-D matrix, got shape {shape}.")

    # 1. Determine the size of the Web (N)
    n = shape[0]
    if n == 0:
        raise ValueError("L_matrix must contain at least one node.")
    if not 0 <= m <= 1:
        raise ValueError(f"m must be a probability in [0, 1], got {m}.")

    # 2. Teleportation vector (s): column vector where each element is 1/N.
    s = np.full((n, 1), 1 / n)

    # 3. Initial PageRank vector (x): all pages start with importance 1/N.
    x = s.copy()

    iterations = 0

    # --- START POWER METHOD ---
    for _ in range(max_iter):
        # x is rebound (not mutated) below, so no copy is needed here.
        x_prev = x

        # One matrix-vector product per iteration.
        x = (1 - m) * (L_matrix @ x_prev) + m * s
        iterations += 1

        # L1 distance between consecutive iterates.
        if np.sum(np.abs(x - x_prev)) < tolerance:
            break
    # --- END POWER METHOD ---

    return x, iterations

In [7]:
# ---------------------------------------------------------------------------
# Worked example 1: the 4-page web (Figure 2.1)
# ---------------------------------------------------------------------------
print("=============================================")
print("  EXECUTION: WEB WITH 4 PAGES (FIGURE 2.1) ")
print("=============================================")

# Link matrix: column j holds the link weights leaving page j+1.
# Every column of this matrix sums to 1.
A_4pages = np.array([
    [0.0, 0.0, 1.0, 0.5],
    [1/3, 0.0, 0.0, 0.0],
    [1/3, 0.5, 0.0, 0.5],
    [1/3, 0.5, 0.0, 0.0],
])

# Power method with teleportation probability m = 0.15
pagerank_scores_4pages, iterations_4pages = calculate_pagerank(A_4pages, m=0.15)

# Pair each 1-based page ID with its score, then rank by descending score
page_indices_4pages = np.arange(1, len(A_4pages) + 1)
results_4pages = list(zip(page_indices_4pages, pagerank_scores_4pages.ravel()))
results_4pages_sorted = sorted(
    results_4pages,
    key=lambda id_and_score: id_and_score[1],
    reverse=True,
)

# Report the ranking
for page_id, score in results_4pages_sorted:
    print(f"Pagina {page_id}: {score:.4f}")

print(f"Calculation completed in {iterations_4pages} iterations.")


# ---------------------------------------------------------------------------
# Worked example 2: the 5-page web (Figure 2.2)
# ---------------------------------------------------------------------------
print("\n\n=============================================")
print("  EXECUTION: WEB WITH 5 PAGES (FIGURE 2.2) ")
print("=============================================")

# Link matrix: the last row is all zeros, i.e. no page links to page 5.
A_5pages = np.array([
    [0.0, 1.0, 0.0, 0.0, 0.0],
    [1.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 1.0, 0.5],
    [0.0, 0.0, 1.0, 0.0, 0.5],
    [0.0, 0.0, 0.0, 0.0, 0.0],
])

# Power method with teleportation probability m = 0.15
pagerank_scores_5pages, iterations_5pages = calculate_pagerank(A_5pages, m=0.15)

# Pair each 1-based page ID with its score, then rank by descending score
page_indices_5pages = np.arange(1, len(A_5pages) + 1)
results_5pages = list(zip(page_indices_5pages, pagerank_scores_5pages.ravel()))
results_5pages_sorted = sorted(
    results_5pages,
    key=lambda id_and_score: id_and_score[1],
    reverse=True,
)

# Report the ranking
for page_id, score in results_5pages_sorted:
    print(f"Page {page_id}: {score:.4f}")

print(f"Calculation completed in {iterations_5pages} iterations.")

  EXECUTION: WEB WITH 4 PAGES (FIGURE 2.1) 
Pagina 1: 0.3682
Pagina 3: 0.2880
Pagina 4: 0.2021
Pagina 2: 0.1418
Calculation completed in 21 iterations.


  EXECUTION: WEB WITH 5 PAGES (FIGURE 2.2) 
Page 3: 0.2850
Page 4: 0.2850
Page 1: 0.2000
Page 2: 0.2000
Page 5: 0.0300
Calculation completed in 2 iterations.


In [8]:
# Calculation of PageRank on the hollins.dat dataset
filename = 'hollins.dat'
m=0.15 
top_k=10

# 1. CONSTRUCTION OF THE LINK MATRIX A 
A_matrix, N_nodes = build_link_matrix_A(filename)

# build_link_matrix_A signals failure by returning (None, 0). Stop here with a
# clear message instead of failing later with an unrelated error.
if A_matrix is None:
    raise RuntimeError(f"Could not build the link matrix from '{filename}'.")

# 2. CALCULATION OF PAGERANK
pagerank_scores, iterations = calculate_pagerank(A_matrix, m=m)
print(f"Calculation completed in {iterations} iterations.")

# 3. PREPARATION AND PRINTING OF RESULTS
# Pair each 1-based page ID with its score and rank by descending score.
page_ids = np.arange(1, N_nodes + 1)
results = list(zip(page_ids, pagerank_scores.flatten()))
results_sorted = sorted(results, key=operator.itemgetter(1), reverse=True)
print(f"\n--- TOP {top_k} Pages Ranking (out of {N_nodes} total) ---")

# Print the top K results
for rank, (page_id, score) in enumerate(results_sorted[:top_k], 1):
    print(f"Rank {rank}: Page ID {page_id} (Score: {score:.6f})")
print("-------------------------------------------------------")


# Sanity check: compare the lowest score with the theoretical lower bound m/N.
# Because x = (1 - m) * A * x + m * s with non-negative A and x, no page can score
# below m/N. The observed minimum is normally higher: every dangling column equals
# 1/N, so each page also receives an equal share of the dangling pages' score.
min_score = results_sorted[-1][1]
expected_min = m / N_nodes
print(f"Minimum score (last page): {min_score:.6f}")
print(f"Theoretical minimum score ({m}/N): {expected_min:.6f}")

Skipping the first 6012 URL mapping lines...
Read 23875 valid edges.
Patch applied to 3189 dangling nodes out of 6012 total.
The resulting matrix is now perfectly column-stochastic (column sums = 1).
Link matrix 6012x6012 ready for calculation.
Calculation completed in 71 iterations.

--- TOP 10 Pages Ranking (out of 6012 total) ---
Rank 1: Page ID 2 (Score: 0.019879)
Rank 2: Page ID 37 (Score: 0.009288)
Rank 3: Page ID 38 (Score: 0.008610)
Rank 4: Page ID 61 (Score: 0.008065)
Rank 5: Page ID 52 (Score: 0.008027)
Rank 6: Page ID 43 (Score: 0.007165)
Rank 7: Page ID 425 (Score: 0.006583)
Rank 8: Page ID 27 (Score: 0.005989)
Rank 9: Page ID 28 (Score: 0.005572)
Rank 10: Page ID 4023 (Score: 0.004452)
-------------------------------------------------------
Minimum score (last page): 0.000058
Theoretical minimum score (0.15/N): 0.000025
