In [31]:
import networkx as nx

# Location of the edge-list file on disk (relative to the notebook's working directory).
path = "web-Google.txt"

# Parse the file into a directed graph: every non-comment line is one
# "source<TAB>destination" pair and becomes a single directed edge.
G = nx.read_edgelist(
    path,
    create_using=nx.DiGraph,  # edges are directed links
    nodetype=int,             # node IDs are parsed as integers rather than strings
    delimiter="\t",           # fields are separated by a tab
    comments="#",             # lines starting with '#' are skipped
)

In [32]:
# Report the size of the loaded graph: node count first, edge count second,
# one per line.
print(
    f"Nodes: {G.number_of_nodes()}",
    f"Edges: {G.number_of_edges()}",
    sep="\n",
)

Nodes: 875713
Edges: 5105039


In [35]:
import statistics

# Baseline PageRank on the unmodified graph.
pr = nx.pagerank(
    G,
    alpha=0.85,    # damping factor
    max_iter=100,  # iteration cap for the power method
    tol=1e-6,      # convergence tolerance
)

# The attack target is the node with the smallest baseline score
# (the first such node in dict order if several tie).
target = min(pr, key=pr.get)
orig_score = pr[target]
print(f"Baseline → node {target} has PageRank {orig_score:.6e}")

# Degrees of the target in the original graph, for context.
print(f"Out-degree of target: {G.out_degree(target)}")
print(f"In-degree of target: {G.in_degree(target)}")

Baseline → node 6 has PageRank 2.828111e-07
Out-degree of target: 3
In-degree of target: 0


In [38]:
# Highest-ranked node under the baseline PageRank, together with its score.
max_node, max_score = max(pr.items(), key=lambda item: item[1])
print(f"Node with max PageRank: {max_node} (score = {max_score:.6f})")

# Degrees of that node in the original graph, for comparison with the target.
print(f"Out-degree of max_node: {G.out_degree(max_node)}")
print(f"In-degree of max_node: {G.in_degree(max_node)}")

Node with max PageRank: 163075 (score = 0.000952)
Out-degree of max_node: 36
In-degree of max_node: 4731


In [37]:
# Simulate a link-farm attack:
# add NUM_FARM_NODES brand-new nodes, each with exactly one outgoing edge
# pointing at target.
NUM_FARM_NODES = 1000  # size of the link farm

# Work on a copy so the original graph G stays untouched.
G_attack = G.copy()

# Fresh node IDs start right after the largest existing ID, so they cannot
# collide with any node already in the graph.
max_id = max(G_attack.nodes())
new_nodes = list(range(max_id + 1, max_id + 1 + NUM_FARM_NODES))

# add_edges_from creates the missing source nodes as it inserts each edge.
G_attack.add_edges_from((n, target) for n in new_nodes)

In [39]:
# PageRank on the attacked graph, with the same settings as the baseline run
# so the two scores are directly comparable.
pr_attack = nx.pagerank(
    G_attack,
    alpha=0.85,
    max_iter=100,
    tol=1e-6,
)

# Score of the attack target once the link farm is in place.
atk_score = pr_attack[target]
print(f"After attack → node {target} has PageRank {atk_score:.9f}")

After attack → node 6 has PageRank 0.000273794


In [43]:
# Highest-ranked node on the attacked graph.
max_node = max(pr_attack, key=pr_attack.get)
# Read the score from pr_attack, the same ranking the node was selected from.
# pr was computed on the pre-attack graph: it holds pre-attack scores and has
# no entry for the link-farm nodes, so indexing it here would report the wrong
# value or raise KeyError.
max_score = pr_attack[max_node]
print(f"Node with max PageRank: {max_node} (score = {max_score:.6f})")

Node with max PageRank: 163075 (score = 0.000952)


In [41]:
# Heuristic thresholds: a node is flagged when nobody links to it
# (in-degree <= in_thresh) and it links out to between 1 and out_thresh nodes.
in_thresh = 0   # largest in-degree still considered suspicious
out_thresh = 1  # largest out-degree still considered suspicious

# Collect every node of the attacked graph that matches the heuristic,
# in the graph's node iteration order.
spam_nodes = [
    node
    for node in G_attack
    if G_attack.in_degree(node) <= in_thresh
    and 1 <= G_attack.out_degree(node) <= out_thresh
]
len(spam_nodes)  # how many nodes were flagged

38564

In [49]:
# 1. Drop every outgoing edge of a flagged node.
#    The edges are enumerated from G_attack and removed from a copy, so
#    G_attack itself is left unchanged.
edges_to_remove = [
    (src, dst)
    for src in spam_nodes
    for dst in G_attack.successors(src)
]
G_prune = G_attack.copy()
G_prune.remove_edges_from(edges_to_remove)

# 2. Re-run ordinary PageRank on the pruned graph with the same settings
#    used for the baseline and attacked runs.
pr_prune = nx.pagerank(
    G_prune,
    alpha=0.85,
    max_iter=100,
    tol=1e-6,
)

Before defense → PR[6] = 2.737939e-04
After weighted defense → PR[6] = 3.644251e-07


In [51]:
# Compare the target's score on the attacked graph with its score after the
# flagged edges were pruned. pr_prune comes from a plain nx.pagerank call on
# the pruned graph, so the label says "pruning" rather than "weighted".
print(f"Before defense → PR[{target}] = {pr_attack[target]:.9f}")
print(f"After pruning defense → PR[{target}] = {pr_prune[target]:.9f}")

Before defense → PR[6] = 0.000273794
After weighted defense → PR[6] = 0.000000364
