# The actual network

In [None]:
import networkx as nx
import os
import re


In [None]:
# Load performer list
with open('rock_artists/PERFORMER_LIST.txt', 'r', encoding='utf-8') as f:
    performers = [line.strip() for line in f if line.strip()]

performer_set = set(performers)

# Create mapping: sanitized filename -> original performer name
def sanitize_filename(filename):
    """Same function from download"""
    filename = re.sub(r'\s*\([^)]*\)\s*', '', filename)
    replacements = {
        '/': '_', '\\': '_', ':': '_', '*': '_', '?': '_',
        '"': '_', '<': '_', '>': '_', '|': '_', '!': '', '.': '_'
    }
    for old, new in replacements.items():
        filename = filename.replace(old, new)
    filename = re.sub(r'_+', '_', filename)
    return filename.strip('_')

filename_to_performer = {}
for performer in performers:
    sanitized = sanitize_filename(performer.replace(' ', '_'))
    filename_to_performer[sanitized] = performer

print(f"Loaded {len(performers)} performers")
print(f"Created {len(filename_to_performer)} filename mappings")

Loaded 488 performers


In [9]:
def count_words(content):
    """
    Count words in wiki content, excluding markup.
    Simple approach: just count whitespace-separated tokens that look like words.
    """
    # Remove wiki templates {{...}}
    content = re.sub(r'\{\{[^}]*\}\}', '', content)
    # Remove HTML tags
    content = re.sub(r'<[^>]+>', '', content)
    # Remove wiki links but keep text: [[Link|Text]] becomes Text
    content = re.sub(r'\[\[([^\]|]+\|)?([^\]]+)\]\]', r'\2', content)
    # Split by whitespace and count
    words = content.split()
    return len(words)

def extract_wiki_links(content):
    """
    Extract all wiki links from the content.
    Pattern explanation:
    - \[\[ matches opening [[
    - ([^\]|]+) captures everything except ] or |
    - (?:\|[^\]]+)? optionally matches |DisplayText (but we don't capture it)
    - \]\] matches closing ]]
    """
    pattern = r'\[\[([^\]|]+)(?:\|[^\]]+)?\]\]'
    links = re.findall(pattern, content)
    return [link.strip() for link in links]

In [12]:

# Create directed graph
G = nx.DiGraph()

pages_dir = 'rock_artists/pages'

print("Building network from all performer pages...")
print("This will take a couple minutes...\n")

processed = 0
edges_created = 0

for filename in os.listdir(pages_dir):
    if not filename.endswith('.txt'):
        continue
    
    # Read the page
    filepath = os.path.join(pages_dir, filename)
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()
    
    # Figure out which performer this is
    # filename is like "The_Beatles.txt", we need "The Beatles (band)" or similar
    # For now, let's use the filename without .txt as the node name
    performer_name = filename[:-4].replace('_', ' ')
    
    # Count words
    word_count = count_words(content)
    
    # Add node with word count attribute
    G.add_node(performer_name, content_length=word_count)
    
    # Extract and filter links
    all_links = extract_wiki_links(content)
    performer_links = [link for link in all_links if link in performer_set]
    
    # Add edges (remove duplicates with set())
    for target in set(performer_links):
        G.add_edge(performer_name, target)
        edges_created += 1
    
    processed += 1
    if processed % 50 == 0:
        print(f"Processed {processed} pages, {edges_created} edges so far...")

print(f"\n{'='*60}")
print(f"Network built!")
print(f"Nodes: {G.number_of_nodes()}")
print(f"Edges: {G.number_of_edges()}")

Building network from all performer pages...
This will take a couple minutes...

Processed 50 pages, 711 edges so far...
Processed 100 pages, 1425 edges so far...
Processed 150 pages, 2200 edges so far...
Processed 200 pages, 2870 edges so far...
Processed 250 pages, 3725 edges so far...
Processed 300 pages, 4543 edges so far...
Processed 350 pages, 5331 edges so far...
Processed 400 pages, 6004 edges so far...
Processed 450 pages, 6789 edges so far...

Network built!
Nodes: 584
Edges: 7267
