## Google Web Graph Analysis

#### Dataset Information
- Dataset Name: Google Web Graph  
- Source: [Stanford SNAP](https://snap.stanford.edu/data/web-Google.html)  
- Description: A web graph in which nodes correspond to web pages and directed edges represent the hyperlinks between them.  
- Nodes: 875713  
- Edges: 5105039  
- Graph Type: Directed Graph  

#### Why This Dataset?
- Clear Data Structure: The dataset has a well-defined structure where nodes represent web pages and directed edges represent hyperlinks, making it suitable for network analysis.
- Ease of Network Construction: The direct relationships between web pages allow for straightforward graph representation, facilitating the creation of a web network.

In [7]:
import pandas as pd
import numpy as np
import networkx as nx

import plotly.graph_objects as go
import plotly.express as px

In [8]:
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# Record the installed plotly version alongside the notebook output.
print(__version__) 

# Initialise plotly's offline notebook mode before any figure is shown.
init_notebook_mode(connected=True)

5.24.1


In [9]:
# Edge-list file to load; a relative path, so it is resolved against the
# current working directory.
file_path = "web-Google.txt" 
# Collects one (source, target) tuple per edge while the file is parsed.
edges = []

In [10]:
# Parse the edge list: one "source target" pair of integer node ids per line.
# The encoding is given explicitly so parsing does not depend on the platform
# default.
with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        # Skip header comments ("# ...") and blank lines; a blank line would
        # otherwise fail the two-value unpacking below.
        if not line or line.startswith("#"):
            continue
        source, target = map(int, line.split())
        edges.append((source, target))

In [11]:
# Tabulate the parsed edge list: one row per directed edge.
edge_columns = ["Source", "Target"]
df_edges = pd.DataFrame(edges, columns=edge_columns)

In [12]:
# Preview the first rows to sanity-check the Source/Target columns.
df_edges.head()

Unnamed: 0,Source,Target
0,0,11342
1,0,824020
2,0,867923
3,0,891835
4,11342,0


In [13]:
# Build the directed graph: each row of the edge table becomes one
# Source -> Target edge, and the endpoints are added as nodes along the way.
edge_rows = df_edges.values
G = nx.DiGraph()
G.add_edges_from(edge_rows)

In [14]:
# Size of the full graph.
num_nodes, num_edges = G.number_of_nodes(), G.number_of_edges()
# Materialise the nodes reported by nx.isolates so they can be counted.
isolated_nodes = [*nx.isolates(G)]

In [15]:
# Report the graph statistics, one per line.
print(
    f"Total Nodes: {num_nodes}",
    f"Total Edges: {num_edges}",
    f"Isolated Nodes: {len(isolated_nodes)}",
    sep="\n",
)

Total Nodes: 875713
Total Edges: 5105039
Isolated Nodes: 0


In [209]:
import heapq
import random  # NOTE(review): not used in this cell; kept in case other cells rely on it.

# PageRank over the full graph (alpha is networkx's damping parameter).
pagerank_scores = nx.pagerank(G, alpha=0.85)

# Keep only the highest-ranked pages so the graph is small enough to draw.
# heapq.nlargest yields the same top-k, in the same descending order, as
# sorting every score and slicing, without sorting the whole score dict.
num_sample_nodes = 1000
sample_nodes = heapq.nlargest(
    num_sample_nodes, pagerank_scores, key=pagerank_scores.get
)

# Subgraph of G restricted to the selected nodes and the edges among them.
G_sub = G.subgraph(sample_nodes)

print(f"Subgraph Nodes: {G_sub.number_of_nodes()}")
print(f"Subgraph Edges: {G_sub.number_of_edges()}")


Subgraph Nodes: 1000
Subgraph Edges: 3342


In [210]:
# Compute a spring layout for the subgraph; pos maps each node to its
# coordinates. The fixed seed keeps the layout reproducible across runs.
pos = nx.spring_layout(G_sub, seed=42) 

In [212]:
# Node coordinates as parallel x / y lists, in G_sub.nodes() iteration order.
node_x = [pos[n][0] for n in G_sub.nodes()]
node_y = [pos[n][1] for n in G_sub.nodes()]

# Edge coordinates flattened into one polyline per axis: each edge contributes
# its two endpoints followed by None, which separates it from the next edge.
edge_x, edge_y = [], []
for src, dst in G_sub.edges():
    (x0, y0), (x1, y1) = pos[src], pos[dst]
    edge_x += [x0, x1, None]
    edge_y += [y0, y1, None]

In [213]:
# Single line trace carrying every edge segment; hover is disabled for edges.
edge_line_style = dict(width=0.5, color='#888')
edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    mode='lines',
    line=edge_line_style,
    hoverinfo='none',
)

In [214]:
# Marker trace for the subgraph nodes. Nodes are coloured by their degree in
# G_sub so the "Node Degree" colour bar reflects real data; the colorscale
# only takes effect when marker.color is a numeric array.
node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers',
    marker=dict(
        showscale=True,
        colorscale='Blues',
        # Iterates G_sub.nodes() in the same order used to fill node_x/node_y,
        # so each colour value lines up with its marker.
        color=[G_sub.degree(node) for node in G_sub.nodes()],
        size=10,
        colorbar=dict(thickness=15, title="Node Degree"),
    )
)

In [215]:
# Figure layout for the baseline plot. The title font is set through
# title.font: the legacy `titlefont` shortcut is deprecated and is rejected by
# newer plotly releases.
layout = go.Layout(
    title=dict(text="Google Web Graph Visualization", font=dict(size=16)),
    showlegend=False,
    hovermode="closest",
    margin=dict(b=0, l=0, r=0, t=0),
)

In [216]:
# Edges are listed first so the node markers are drawn on top of them.
baseline_traces = [edge_trace, node_trace]
fig = go.Figure(data=baseline_traces, layout=layout)
fig.show()

## Issues in the Current Visualization
- Nodes are too dense and stacked together, making it hard to read the structure.
- Color contrast could be improved to make edges and nodes more distinguishable.
- All nodes are the same size, which doesn’t highlight more important nodes.
- Hover info only shows coordinates, lacking meaningful data like Node ID or PageRank.

In [None]:
# improved graph plot

In [221]:
# PageRank recomputed on the subgraph only; this rebinds pagerank_scores, so
# from here on the scores are relative to G_sub rather than the full graph.
pagerank_scores = nx.pagerank(G_sub, alpha=0.85)

# Same seed as the baseline layout, now with an explicit k for spring_layout.
pos = nx.spring_layout(G_sub, k=0.3, seed=42)

# Marker sizes: each node's subgraph PageRank scaled by a constant factor.
scores_in_node_order = [pagerank_scores[n] for n in G_sub.nodes()]
node_size = np.array(scores_in_node_order) * 8000

# Node coordinates, in G_sub.nodes() iteration order.
node_points = [pos[n] for n in G_sub.nodes()]
node_x = [point[0] for point in node_points]
node_y = [point[1] for point in node_points]

# Edge coordinates: both endpoints of every edge, then a None separator so
# that consecutive edges are not joined into one continuous line.
edge_x = []
edge_y = []
for u, v in G_sub.edges():
    for px, py in (pos[u], pos[v]):
        edge_x.append(px)
        edge_y.append(py)
    edge_x.append(None)
    edge_y.append(None)

# Semi-transparent grey edges, slightly wider than in the baseline plot.
edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    mode='lines',
    hoverinfo='none',
    line={"width": 1, "color": "rgba(150,150,150,0.5)"},
)

# Per-node PageRank values and hover labels, both in G_sub.nodes() order so
# they line up with node_x / node_y.
node_colors = [pagerank_scores[n] for n in G_sub.nodes()]
hover_labels = [
    f"Node: {n}<br>PageRank: {pagerank_scores[n]:.5f}" for n in G_sub.nodes()
]

# Markers sized by node_size and coloured by PageRank; hovering shows only
# the prepared label text.
node_trace = go.Scatter(
    x=node_x,
    y=node_y,
    mode='markers',
    marker=dict(
        showscale=True,
        colorscale="Viridis",
        size=node_size,
        color=node_colors,
        colorbar=dict(thickness=15, title="PageRank Score"),
    ),
    hoverinfo="text",
    text=hover_labels,
)

# Layout for the improved plot. The top margin leaves room for the title, and
# the title font is set through title.font: the legacy `titlefont` shortcut is
# deprecated and is rejected by newer plotly releases.
layout = go.Layout(
    title=dict(text="Improved Google Web Graph Visualization", font=dict(size=16)),
    showlegend=False,
    hovermode="closest",
    margin=dict(b=0, l=0, r=0, t=40)
)

# Assemble the improved figure (edges underneath, nodes on top) and render it.
improved_traces = [edge_trace, node_trace]
fig = go.Figure(data=improved_traces, layout=layout)
fig.show()