# The actual network

In [13]:
import networkx as nx
import os
import re


In [14]:
# Read the performer list: one name per line, surrounding whitespace removed,
# blank lines skipped
with open('rock_artists/PERFORMER_LIST.txt', 'r', encoding='utf-8') as f:
    performers = list(filter(None, map(str.strip, f)))

# Same names as a set, for fast membership tests when filtering links
performer_set = {*performers}

# Create mapping: sanitized filename -> original performer name
def sanitize_filename(filename):
    """Convert a name into the sanitized form used for page filenames.

    Per the original note this mirrors the sanitizer used by the download
    step, so the two must be kept in sync.

    Steps, in order:
      1. remove every parenthesised group together with adjacent whitespace
      2. replace / \\ : * ? " < > | . with '_' and delete '!'
      3. collapse runs of '_' into a single '_'
      4. trim leading and trailing '_'
    """
    without_parens = re.sub(r'\s*\([^)]*\)\s*', '', filename)
    # Single-pass character table; a value of '' deletes the character
    char_table = str.maketrans({
        '/': '_', '\\': '_', ':': '_', '*': '_', '?': '_',
        '"': '_', '<': '_', '>': '_', '|': '_', '!': '', '.': '_'
    })
    translated = without_parens.translate(char_table)
    collapsed = re.sub(r'_+', '_', translated)
    return collapsed.strip('_')

# Map each sanitized filename stem back to the performer name it came from.
# Different performers can sanitize to the same stem (e.g. names that differ
# only in a parenthesised qualifier). The mapping keeps the last performer for
# a stem, as before, but the clashes are recorded so they are no longer silent.
filename_to_performer = {}
filename_collisions = {}  # stem -> every performer name that sanitizes to it
for performer in performers:
    sanitized = sanitize_filename(performer.replace(' ', '_'))
    previous = filename_to_performer.get(sanitized)
    if previous is not None and previous != performer:
        filename_collisions.setdefault(sanitized, [previous]).append(performer)
    filename_to_performer[sanitized] = performer

print(f"Loaded {len(performers)} performers")
print(f"Created {len(filename_to_performer)} filename mappings")
if filename_collisions:
    print(f"Warning: {len(filename_collisions)} filename(s) shared by several performers (last one wins):")
    for stem, names in sorted(filename_collisions.items()):
        print(f"  {stem}: {', '.join(names)}")

Loaded 488 performers
Created 485 filename mappings


In [15]:
def count_words(content):
    """
    Approximate word count of wiki text with common markup stripped.

    The substitutions below are applied in order, then the remaining text is
    split on whitespace and the number of tokens is returned. This is a
    deliberately simple heuristic, not a wikitext parser.
    """
    substitutions = (
        # wiki templates {{...}} (body may not contain '}')
        (r'\{\{[^}]*\}\}', ''),
        # HTML tags (the text between an opening and closing tag is kept)
        (r'<[^>]+>', ''),
        # wiki links: [[Link|Text]] becomes Text, [[Link]] becomes Link
        (r'\[\[([^\]|]+\|)?([^\]]+)\]\]', r'\2'),
    )
    for pattern, replacement in substitutions:
        content = re.sub(pattern, replacement, content)
    return len(content.split())

def extract_wiki_links(content):
    r"""
    Extract the target page title of every wiki link in the content.

    Pattern explanation:
    - \[\[ matches opening [[
    - ([^\]|]+) captures everything except ] or |
    - (?:\|[^\]]+)? optionally matches |DisplayText (but we don't capture it)
    - \]\] matches closing ]]

    A section anchor is not part of the page title, so ``[[Page#Section|text]]``
    yields ``Page``. Links whose target is empty once the anchor and
    surrounding whitespace are removed (e.g. same-page links such as
    ``[[#Section]]``) are skipped.

    Returns a list of titles in order of appearance; duplicates are kept.
    """
    # The docstring is a raw string because it contains regex backslashes,
    # which would otherwise be invalid string escape sequences.
    pattern = r'\[\[([^\]|]+)(?:\|[^\]]+)?\]\]'
    titles = []
    for raw_target in re.findall(pattern, content):
        # Keep only the part before the first '#', i.e. the page title
        title = raw_target.split('#', 1)[0].strip()
        if title:
            titles.append(title)
    return titles

In [16]:
# Build the directed performer network:
#   - one node per page file, carrying its word count as `content_length`
#   - one edge page -> target for every distinct linked title that is in performer_set
G = nx.DiGraph()

pages_dir = 'rock_artists/pages'

print("Building network from all performer pages...")
print("This will take a couple minutes...\n")

processed = 0
edges_created = 0

# os.listdir() returns entries in arbitrary order; sorting keeps the node
# insertion order and the progress log identical from run to run.
for filename in sorted(os.listdir(pages_dir)):
    if not filename.endswith('.txt'):
        continue

    # Read the page
    filepath = os.path.join(pages_dir, filename)
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    # Figure out which performer this is: look the filename stem up in the
    # sanitized-name mapping, and fall back to the stem with underscores
    # turned into spaces when the mapping has no entry for it.
    base_filename = filename[:-4]  # Remove .txt
    performer_name = filename_to_performer.get(base_filename, base_filename.replace('_', ' '))

    # Add node with word count attribute (if the node already exists because
    # an earlier page linked to it, this just sets the attribute)
    word_count = count_words(content)
    G.add_node(performer_name, content_length=word_count)

    # Keep only links that point at a known performer
    all_links = extract_wiki_links(content)
    performer_links = [link for link in all_links if link in performer_set]

    # One edge per distinct target; sorted so edge insertion order does not
    # depend on set iteration order
    for target in sorted(set(performer_links)):
        G.add_edge(performer_name, target)
        edges_created += 1

    processed += 1
    if processed % 50 == 0:
        print(f"Processed {processed} pages, {edges_created} edges so far...")

print(f"\n{'='*60}")
print("Network built!")
print(f"Nodes: {G.number_of_nodes()}")
print(f"Edges: {G.number_of_edges()}")

Building network from all performer pages...
This will take a couple minutes...

Processed 50 pages, 711 edges so far...
Processed 100 pages, 1425 edges so far...
Processed 150 pages, 2200 edges so far...
Processed 200 pages, 2870 edges so far...
Processed 250 pages, 3725 edges so far...
Processed 300 pages, 4543 edges so far...
Processed 350 pages, 5331 edges so far...
Processed 400 pages, 6004 edges so far...
Processed 450 pages, 6789 edges so far...

Network built!
Nodes: 488
Edges: 7267


In [None]:
# Quick visualization of the whole network
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 12))

# Use spring layout (will take a moment to compute).
# The layout starts from random positions, so a fixed seed is passed to get
# the same picture on every run.
pos = nx.spring_layout(G, k=0.5, iterations=50, seed=42)

# Draw nodes (small size since there are many)
nx.draw_networkx_nodes(G, pos, node_size=20, node_color='lightblue', alpha=0.6)

# Draw edges (very thin and transparent, arrowheads omitted)
nx.draw_networkx_edges(G, pos, alpha=0.1, arrows=False, width=0.5)

plt.title(f"Rock Performers Network\n{G.number_of_nodes()} nodes, {G.number_of_edges()} edges")
plt.axis('off')
plt.tight_layout()
plt.show()

In [17]:
# Check for isolated nodes

# A node is isolated when its total degree (in-degree + out-degree) is zero;
# collect them in one pass over the degree view
isolated_nodes = [node for node, degree in G.degree() if degree == 0]

print(f"Isolated nodes (no connections): {len(isolated_nodes)}")
if len(isolated_nodes) > 0:
    print("\nIsolated performers:")
    # Show at most the first ten names, then summarise the remainder
    for node in isolated_nodes[:10]:
        print(f"  - {node}")
    if len(isolated_nodes) > 10:
        print(f"  ... and {len(isolated_nodes) - 10} more")

Isolated nodes (no connections): 4

Isolated performers:
  - The B-52's
  - Dr. Hook & the Medicine Show
  - Jet (Australian band)
  - Van Zant (band)


In [20]:
# Remove isolated nodes and keep the largest weakly connected component

# Drop the isolated performers found earlier (this modifies G in place)
G.remove_nodes_from(isolated_nodes)
print(f"After removing isolated nodes: {G.number_of_nodes()} nodes")

# Weak connectivity ignores edge direction. Pick the component with the most
# nodes and take an independent copy of the subgraph it induces.
components = nx.weakly_connected_components(G)
largest_wcc = max(components, key=len)
G_largest = G.subgraph(largest_wcc).copy()

print(f"\nLargest weakly connected component:")
print(f"  Nodes: {G_largest.number_of_nodes()}")
print(f"  Edges: {G_largest.number_of_edges()}")

# From here on, G refers to the largest component: the final network for analysis
G = G_largest
print(f"\nFinal network: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")

After removing isolated nodes: 484 nodes

Largest weakly connected component:
  Nodes: 484
  Edges: 7267

Final network: 484 nodes, 7267 edges


In [21]:
# Save the network and check that it can be read back

# GraphML keeps node attributes such as content_length
graphml_path = 'rock_artists/rock_network.graphml'
nx.write_graphml(G, graphml_path)
print("Saved network as GraphML")

# Basic summary of what was written
print("\nNetwork Summary:")
print(f"  Nodes: {G.number_of_nodes()}")
print(f"  Edges: {G.number_of_edges()}")
print(f"  Density: {nx.density(G):.4f}")
print(f"  Is directed: {G.is_directed()}")

# Round-trip check: reload the file and compare the node/edge counts by eye
G_test = nx.read_graphml(graphml_path)
print(f"\nVerification - loaded network has {G_test.number_of_nodes()} nodes, {G_test.number_of_edges()} edges")

Saved network as GraphML

Network Summary:
  Nodes: 484
  Edges: 7267
  Density: 0.0311
  Is directed: True

Verification - loaded network has 484 nodes, 7267 edges
