# WikiSpeedAI Analysis - Network Metrics

This notebook analyzes JSON files generated by wikispeedai and calculates network metrics to compare Wikipedia vs LLM exploration.

In [4]:
import json
import os
import glob
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from pathlib import Path
from collections import defaultdict
import warnings

warnings.filterwarnings('ignore')

DATA_DIRECTORY = 'results'

def extract_metadata_from_path(file_path):
    """
    Extract temperature, starting page, and personality from file path.
    Expected structure: results/temp_X_Y_personality_NAME/StartingPage/file.json
    """
    parts = Path(file_path).parts
    
    metadata = {
        'temperature': None,
        'run': None,
        'starting_page': None,
        'personality': None
    }
    
    for i, part in enumerate(parts):
        # Extract temperature and personality from folder name like "temp_0_3_personality_baseline"
        if part.startswith('temp_'):
            # Match pattern: temp_X_Y_personality_NAME
            match = re.match(r'temp_(\d+)_(\d+)_personality_(.+)', part)
            if match:
                temp_int, temp_dec, personality = match.groups()
                metadata['temperature'] = f"{temp_int}.{temp_dec}"
                metadata['personality'] = personality
        
        # Starting page is the folder after temperature folder
        elif metadata['temperature'] and not metadata['starting_page']:
            # Check if this is not the filename
            if not part.endswith('.json'):
                metadata['starting_page'] = part
    
    return metadata

def load_json_files_with_metadata(directory_pattern='results'):
    """
    Load all JSON files with extracted metadata from path.
    """
    data_list = []
    
    json_pattern = f"{directory_pattern}/**/*.json"
    json_files = glob.glob(json_pattern, recursive=True)
    
    print(f"Searching for JSON files with pattern: {json_pattern}")
    print(f"Files found: {len(json_files)}")
    
    for json_file in json_files:
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
                
                metadata = extract_metadata_from_path(json_file)
                
                if isinstance(data, list):
                    for item in data:
                        if isinstance(item, dict):
                            item.update(metadata)
                            item['_file_path'] = json_file
                            data_list.append(item)
                elif isinstance(data, dict):
                    data.update(metadata)
                    data['_file_path'] = json_file
                    data_list.append(data)
                    
        except Exception as e:
            print(f"Error loading {json_file}: {e}")
    
    print(f"Successfully loaded {len(data_list)} records\n")
    return data_list

all_data = load_json_files_with_metadata(DATA_DIRECTORY)

# Display metadata summary
if all_data:
    df_temp = pd.DataFrame(all_data)
    print("Data summary:")
    print(f"Total records: {len(df_temp)}")
    print(f"\nTemperatures: {sorted(df_temp['temperature'].dropna().unique())}")
    print(f"Starting pages: {sorted(df_temp['starting_page'].dropna().unique())}")
    print(f"Personalities: {sorted(df_temp['personality'].dropna().unique())}")
    print(f"Runs: {sorted(df_temp['run'].dropna().unique())}")


Searching for JSON files with pattern: results/**/*.json
Files found: 312
Successfully loaded 312 records

Data summary:
Total records: 312

Temperatures: ['0.3', '1.5']
Starting pages: ['Albert_Einstein', 'Computer_Science']
Personalities: ['baseline', 'busybody', 'dancer', 'hunter']
Runs: []


## Load Wikipedia Links

Load the Wikipedia hyperlinks structure from the pickle file created by `wiki_links.py`.

In [3]:
import pickle

# Load Wikipedia links from pickle file
wikipedia_links_file = 'wikipedia_links.pkl'

try:
    with open(wikipedia_links_file, 'rb') as f:
        wikipedia_links = pickle.load(f)
    
    print(f"Loaded Wikipedia links for {len(wikipedia_links)} pages")
    print(f"Total links: {sum(len(links) for links in wikipedia_links.values())}")
    print(f"Average links per page: {sum(len(links) for links in wikipedia_links.values()) / len(wikipedia_links):.1f}")
    
except FileNotFoundError:
    print(f"File '{wikipedia_links_file}' not found!")
    print("Please run 'python wiki_links.py' first to download Wikipedia links.")
    wikipedia_links = None

Loaded Wikipedia links for 18 pages
Total links: 14179
Average links per page: 787.7


## Build Wikipedia Graph

Build a graph using the actual Wikipedia hyperlink structure.

In [57]:
def build_wikipedia_graph(wikipedia_links):
    """
    Build a directed graph from Wikipedia hyperlink structure.
    Args:
        wikipedia_links: Dictionary mapping page titles to lists of linked pages
    Returns:
        networkx.DiGraph: Directed graph with Wikipedia links
    """
    G = nx.DiGraph()
    
    if wikipedia_links is None:
        return G
    
    for source_page, target_pages in wikipedia_links.items():
        # Add source node
        if source_page not in G.nodes():
            G.add_node(source_page, page=source_page, source='wikipedia')
        
        # Add edges to all linked pages
        for target_page in target_pages:
            # Only add edges to pages that are also in our dataset
            if target_page in wikipedia_links:
                if target_page not in G.nodes():
                    G.add_node(target_page, page=target_page, source='wikipedia')
                
                # Add edge
                G.add_edge(source_page, target_page, weight=1, source='wikipedia')
    
    return G

# Build Wikipedia graph
G_wiki = build_wikipedia_graph(wikipedia_links)

print(f"Wikipedia Graph Statistics:")
print(f"Number of nodes (pages): {G_wiki.number_of_nodes()}")
print(f"Number of edges (links): {G_wiki.number_of_edges()}")
print(f"Graph is connected: {nx.is_weakly_connected(G_wiki) if G_wiki.number_of_nodes() > 0 else 'N/A'}")

if G_wiki.number_of_nodes() > 0:
    print(f"Average degree: {np.mean([d for n, d in G_wiki.degree()]):.2f}")
    print(f"Density: {nx.density(G_wiki):.4f}")

Wikipedia Graph Statistics:
Number of nodes (pages): 18
Number of edges (links): 86
Graph is connected: True
Average degree: 9.56
Density: 0.2810


## Build Graphs by Temperature and Personality

Build separate graphs for each combination of temperature and personality.

In [58]:
def build_graph_from_data(data_list):
    """
    Build a directed graph from navigation data.
    """
    G = nx.DiGraph()
    
    for data in data_list:
        path = data.get('path', [])
        
        for i in range(len(path) - 1):
            source = path[i]
            target = path[i + 1]
            
            if source not in G.nodes():
                G.add_node(source)
            if target not in G.nodes():
                G.add_node(target)
            
            if G.has_edge(source, target):
                G[source][target]['weight'] += 1
            else:
                G.add_edge(source, target, weight=1)
    
    return G

def group_data_by_conditions(data_list):
    """
    Group data by temperature and personality.
    Returns dict with keys like ('0.3', 'personality1').
    """
    grouped = defaultdict(list)
    
    for data in data_list:
        temp = data.get('temperature', 'unknown')
        personality = data.get('personality', 'unknown')
        key = (temp, personality)
        grouped[key].append(data)
    
    return dict(grouped)

# Group data and build graphs
grouped_data = group_data_by_conditions(all_data)
graphs = {}

print("Building graphs for each condition:")
for (temp, personality), data in grouped_data.items():
    graph_key = f"temp_{temp}_personality_{personality}"
    graphs[graph_key] = build_graph_from_data(data)
    print(f"  {graph_key}: {graphs[graph_key].number_of_nodes()} nodes, {graphs[graph_key].number_of_edges()} edges")

print(f"\nTotal graphs created: {len(graphs)}")


Building graphs for each condition:
  temp_0.3_personality_None: 22 nodes, 27 edges
  temp_1.5_personality_None: 32 nodes, 41 edges

Total graphs created: 2


## Calculate Metrics for All Conditions

Calculate network metrics for each temperature/personality combination and Wikipedia.

In [59]:
def calculate_graph_metrics(G):
    """
    Calculate all network metrics for a graph.
    Returns dictionary with metric values.
    """
    metrics = {}
    
    if G.number_of_nodes() == 0:
        return metrics
    
    # Basic metrics
    metrics['num_nodes'] = G.number_of_nodes()
    metrics['num_edges'] = G.number_of_edges()
    metrics['density'] = nx.density(G)
    metrics['avg_degree'] = np.mean([d for n, d in G.degree()])
    
    # Degree distribution
    in_degrees = [d for n, d in G.in_degree()]
    out_degrees = [d for n, d in G.out_degree()]
    metrics['avg_in_degree'] = np.mean(in_degrees)
    metrics['avg_out_degree'] = np.mean(out_degrees)
    metrics['max_in_degree'] = max(in_degrees)
    metrics['max_out_degree'] = max(out_degrees)
    
    # Clustering coefficient
    G_undirected = G.to_undirected()
    clustering = nx.clustering(G_undirected)
    metrics['avg_clustering'] = np.mean(list(clustering.values()))
    
    # Shortest path
    if nx.is_weakly_connected(G):
        try:
            metrics['avg_shortest_path'] = nx.average_shortest_path_length(G_undirected)
        except:
            metrics['avg_shortest_path'] = None
    else:
        components = list(nx.weakly_connected_components(G))
        if components:
            largest = max(components, key=len)
            subgraph = G.subgraph(largest).to_undirected()
            try:
                metrics['avg_shortest_path'] = nx.average_shortest_path_length(subgraph)
            except:
                metrics['avg_shortest_path'] = None
        else:
            metrics['avg_shortest_path'] = None
    
    # PageRank
    try:
        pagerank = nx.pagerank(G, alpha=0.85, max_iter=100)
        pr_values = list(pagerank.values())
        metrics['mean_pagerank'] = np.mean(pr_values)
        metrics['max_pagerank'] = max(pr_values)
    except:
        metrics['mean_pagerank'] = None
        metrics['max_pagerank'] = None
    
    return metrics

# Calculate metrics for all graphs
all_metrics = {}

print("Calculating metrics for LLM graphs:")
for graph_key, G in graphs.items():
    all_metrics[graph_key] = calculate_graph_metrics(G)
    print(f"  {graph_key}: completed")

# Calculate metrics for Wikipedia
print("\nCalculating metrics for Wikipedia:")
all_metrics['wikipedia'] = calculate_graph_metrics(G_wiki)
print("  Wikipedia: completed")


Calculating metrics for LLM graphs:
  temp_0.3_personality_None: completed
  temp_1.5_personality_None: completed

Calculating metrics for Wikipedia:
  Wikipedia: completed


In [60]:
# Create comparison DataFrame
rows = []

for graph_key, metrics in all_metrics.items():
    if 'temp_' in graph_key:
        # Parse LLM graph key
        parts = graph_key.split('_')
        temp_idx = parts.index('temp') + 1
        pers_idx = parts.index('personality') + 1
        
        row = {
            'Source': 'LLM',
            'Temperature': parts[temp_idx],
            'Personality': '_'.join(parts[pers_idx:]),
            **metrics
        }
    else:
        # Wikipedia row
        row = {
            'Source': 'Wikipedia',
            'Temperature': 'N/A',
            'Personality': 'N/A',
            **metrics
        }
    
    rows.append(row)

df_comparison = pd.DataFrame(rows)

# Reorder columns
first_cols = ['Source', 'Temperature', 'Personality', 'num_nodes', 'num_edges', 'density', 
              'avg_degree', 'avg_in_degree', 'avg_out_degree', 'avg_clustering', 
              'avg_shortest_path', 'mean_pagerank', 'max_pagerank']
df_comparison = df_comparison[first_cols + [c for c in df_comparison.columns if c not in first_cols]]

print("\nComparison DataFrame created with", len(df_comparison), "rows")



Comparison DataFrame created with 3 rows


In [61]:
# Save final comparison table
output_file = 'metrics_comparison.json'
df_comparison.to_json(output_file, orient='records', indent=2)

print("\nFINAL RESULTS")
print("=" * 120)

# Format numeric columns for better readability
df_display = df_comparison.copy()
for col in df_display.columns:
    if col not in ['Source', 'Temperature', 'Personality']:
        if df_display[col].dtype in ['float64', 'float32']:
            df_display[col] = df_display[col].apply(lambda x: f"{x:.6f}" if pd.notna(x) else "N/A")

# Display with better formatting
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 120)
pd.set_option('display.max_colwidth', 15)

print(df_display.to_string(index=False))

print("=" * 120)
print(f"\nResults saved to: {output_file}")
print("\nAnalysis completed!")



FINAL RESULTS
   Source Temperature Personality  num_nodes  num_edges  density avg_degree avg_in_degree avg_out_degree avg_clustering avg_shortest_path mean_pagerank max_pagerank  max_in_degree  max_out_degree
      LLM         0.3        None         22         27 0.058442   2.454545      1.227273       1.227273       0.284632          4.012987      0.045455     0.160142              4               7
      LLM         1.5        None         32         41 0.041331   2.562500      1.281250       1.281250       0.126811          3.635081      0.031250     0.164228              6               9
Wikipedia         N/A         N/A         18         86 0.281046   9.555556      4.777778       4.777778       0.556614          1.843137      0.055556     0.124733             10               9

Results saved to: metrics_comparison.json

Analysis completed!
