# Making the regex

In [1]:
# Cell 1: Imports and Setup
import requests
import re
import time
import os
from urllib.parse import unquote

# Create directory to store downloaded pages
os.makedirs('rock_artists', exist_ok=True)
print("Created/verified 'rock_artists' directory")

Created/verified 'rock_artists' directory


# Helper functions


In [4]:
# Cell 2: Helper function with User-Agent

def get_wikipedia_wikitext(page_title):
    """
    Fetch the wikitext content of a Wikipedia page using the API.
    """
    url = "https://en.wikipedia.org/w/api.php"
    
    # Wikipedia requires a User-Agent header
    headers = {
        'User-Agent': 'SocialGraphsProject/1.0 (Educational; DTU Course Project; Contact: student@dtu.dk)'
    }
    
    params = {
        'action': 'query',
        'prop': 'revisions',
        'rvprop': 'content',
        'rvslots': 'main',
        'titles': page_title,
        'format': 'json',
        'formatversion': 2
    }
    
    try:
        response = requests.get(url, params=params, headers=headers)
        response.raise_for_status()
        data = response.json()
        
        # Extract the wikitext content
        pages = data['query']['pages']
        if pages and len(pages) > 0:
            page = pages[0]
            if 'revisions' in page:
                content = page['revisions'][0]['slots']['main']['content']
                return content
        return None
    except Exception as e:
        print(f"Error fetching {page_title}: {e}")
        return None

print("Helper function defined with proper User-Agent")

Helper function defined with proper User-Agent


In [6]:
# Cell 3: Fetch the List Page

print("Fetching the list of mainstream rock performers...")
list_page_content = get_wikipedia_wikitext("List_of_mainstream_rock_performers")

if list_page_content:
    print("Successfully fetched the list page!")
    print(f"Page length: {len(list_page_content)} characters\n")
    
    # Save the list page for reference
    with open('rock_artists/LIST_PAGE.txt', 'w', encoding='utf-8') as f:
        f.write(list_page_content)
    print("Saved list page to 'rock_artists/LIST_PAGE.txt'")
    
    # Show first 2000 characters to see structure
    print("\nFirst 2000 characters of the page:")
    print("="*80)
    print(list_page_content[:2000])
else:
    print("Failed to fetch the list page!")

Fetching the list of mainstream rock performers...
Successfully fetched the list page!
Page length: 93419 characters

Saved list page to 'rock_artists/LIST_PAGE.txt'

First 2000 characters of the page:
{{short description|None}}

This is an alphabetical '''list of mainstream rock performers''' spanning all subgenres and fusions within the genre of [[rock music]].  Artists included are known for creating material predominantly within a style of rock music (Rockabilly, Rock & Roll,  Heavy Metal, Punk Rock, Alternative Rock, Classic Rock, Modern Rock, Indie Rock, etc.);  have enjoyed considerable success on singles or album charts;  recorded multiple songs that have endured or increased in popularity over time and continue to receive heavy airplay, streaming or downloads;  and garnered a significant following.  Solo artists are sorted by surname.

{{Compact ToC|q=Q|x=X|num=yes|center=yes|seealso=yes|nobreak=yes|refs=yes}}

==0-9==
{{div col|colwidth=30em}}
* [[10cc]]<ref>{{cite web |autho

In [7]:
# Cell 4: Extract performer links using regex

def extract_performer_links(wikitext):
    """
    Extract only the performer links from the list page.
    The performers are in bulleted lists: * [[Artist Name]]
    """
    # Pattern: Find lines starting with * followed by [[link]]
    # This captures the artist name from * [[Artist Name]]<ref>... format
    pattern = r'^\*\s*\[\[([^\]|]+)(?:\|[^\]]+)?\]\]'
    
    matches = re.findall(pattern, wikitext, re.MULTILINE)
    
    # Clean up - remove any remaining markup
    cleaned = [match.strip() for match in matches]
    
    return cleaned

# Extract performers
performers = extract_performer_links(list_page_content)

print(f"Found {len(performers)} performers")
print("\nFirst 30 performers:")
for i, performer in enumerate(performers[:30]):
    print(f"  {i+1}. {performer}")

Found 488 performers

First 30 performers:
  1. 10cc
  2. 10 Years (band)
  3. 3 Doors Down
  4. 311 (band)
  5. 38 Special (band)
  6. ABBA
  7. Accept (band)
  8. AC/DC
  9. Bryan Adams
  10. Aerosmith
  11. AFI (band)
  12. Air Supply
  13. The Alan Parsons Project
  14. Alice in Chains
  15. The All-American Rejects
  16. The Allman Brothers Band
  17. Alter Bridge
  18. Ambrosia (band)
  19. America (band)
  20. The Animals
  21. Adam Ant
  22. Anthrax (American band)
  23. April Wine
  24. Arcade Fire
  25. Arctic Monkeys
  26. Asia (band)
  27. Audioslave
  28. Avenged Sevenfold
  29. Awolnation
  30. The B-52's


In [8]:
# Cell 5: Save the performer list and verify

# Save to file for reference
with open('rock_artists/PERFORMER_LIST.txt', 'w', encoding='utf-8') as f:
    for performer in performers:
        f.write(f"{performer}\n")

print(f"Saved {len(performers)} performers to 'rock_artists/PERFORMER_LIST.txt'")
print(f"\nLast 20 performers:")
for i, performer in enumerate(performers[-20:], start=len(performers)-19):
    print(f"  {i}. {performer}")

Saved 488 performers to 'rock_artists/PERFORMER_LIST.txt'

Last 20 performers:
  469. Joe Walsh
  470. Warrant (American band)
  471. W.A.S.P. (band)
  472. Weezer
  473. Jack White
  474. The White Stripes
  475. White Zombie (band)
  476. Whitesnake
  477. The Who
  478. Paul McCartney and Wings
  479. Steve Winwood
  480. X (American band)
  481. X Ambassadors
  482. The Yardbirds
  483. Yes (band)
  484. Neil Young
  485. Frank Zappa
  486. Rob Zombie
  487. The Zombies
  488. ZZ Top


# Downloading all the pages

In [10]:
# Cell 6: Download all performer pages (IMPROVED VERSION)

def sanitize_filename(filename):
    """
    Convert a string to a safe filename by removing/replacing invalid characters.
    Also removes (band), (singer), etc. suffixes for cleaner filenames.
    """
    # Remove disambiguation suffixes like (band), (singer), (musician), etc.
    filename = re.sub(r'\s*\([^)]*\)\s*', '', filename)
    
    # Replace problematic characters with safe alternatives
    replacements = {
        '/': '_',
        '\\': '_',
        ':': '_',
        '*': '_',
        '?': '_',
        '"': '_',
        '<': '_',
        '>': '_',
        '|': '_',
        '!': '',
        '.': '_'
    }
    
    for old, new in replacements.items():
        filename = filename.replace(old, new)
    
    # Clean up multiple underscores and trailing/leading underscores
    filename = re.sub(r'_+', '_', filename)
    filename = filename.strip('_')
    
    return filename

def download_performer_pages(performers, output_dir='rock_artists/pages'):
    """
    Download Wikipedia pages for all performers and save them as text files.
    """
    # Create the pages subdirectory
    os.makedirs(output_dir, exist_ok=True)
    
    successful = 0
    failed = []
    
    print(f"Starting download of {len(performers)} performer pages...")
    print(f"Saving to: {output_dir}/")
    print("This may take several minutes...\n")
    
    for i, performer in enumerate(performers, 1):
        # Keep original name for API call (with spaces as underscores)
        page_title = performer.replace(' ', '_')
        
        # Fetch the page using the FULL original name (including (band) etc.)
        content = get_wikipedia_wikitext(page_title)
        
        if content:
            # Create clean filename (without (band) and special chars)
            safe_filename = sanitize_filename(page_title)
            filename = f"{output_dir}/{safe_filename}.txt"
            
            try:
                with open(filename, 'w', encoding='utf-8') as f:
                    f.write(content)
                successful += 1
                
                # Progress indicator every 50 pages
                if i % 50 == 0:
                    print(f"Progress: {i}/{len(performers)} pages downloaded")
            except Exception as e:
                print(f"  Error saving {performer}: {e}")
                failed.append(performer)
        else:
            failed.append(performer)
            print(f"  Failed to download: {performer}")
        
        # Be polite to Wikipedia's servers
        time.sleep(0.1)
    
    print(f"\n{'='*60}")
    print(f"Download complete!")
    print(f"  Successfully downloaded: {successful}")
    print(f"  Failed: {len(failed)}")
    
    if failed:
        print(f"\nFailed performers (first 15):")
        for performer in failed[:15]:
            print(f"  - {performer}")
        if len(failed) > 15:
            print(f"  ... and {len(failed) - 15} more")
    
    return successful, failed

# Run the download
successful_count, failed_list = download_performer_pages(performers)

Starting download of 488 performer pages...
Saving to: rock_artists/pages/
This may take several minutes...

Progress: 50/488 pages downloaded
Progress: 100/488 pages downloaded
Progress: 150/488 pages downloaded
Progress: 200/488 pages downloaded
Progress: 250/488 pages downloaded
Progress: 300/488 pages downloaded
Progress: 350/488 pages downloaded
Progress: 400/488 pages downloaded
Progress: 450/488 pages downloaded

Download complete!
  Successfully downloaded: 488
  Failed: 0


In [11]:
# Cell 8: Verify downloads and show examples

pages_dir = 'rock_artists/pages'
downloaded_files = [f for f in os.listdir(pages_dir) if f.endswith('.txt')]

print(f"Total performer pages downloaded: {len(downloaded_files)}")

# Check some specific cases we were worried about
test_cases = ['AC_DC.txt', 'Panic_at_the_Disco.txt', 'W_A_S_P.txt']
print("\nSpecial characters handled correctly:")
for test_file in test_cases:
    if test_file in downloaded_files:
        print(f"  ✓ {test_file}")
    else:
        print(f"  ✗ {test_file} - NOT FOUND")

# Show file size distribution
file_sizes = []
for filename in downloaded_files:
    file_path = f"{pages_dir}/{filename}"
    file_sizes.append(os.path.getsize(file_path))

print(f"\nFile size statistics:")
print(f"  Smallest: {min(file_sizes):,} bytes")
print(f"  Largest: {max(file_sizes):,} bytes")
print(f"  Average: {sum(file_sizes)//len(file_sizes):,} bytes")

# Show a sample of content from one file
print(f"\nSample content from 'The_Beatles.txt' (first 500 chars):")
print("="*80)
try:
    with open(f"{pages_dir}/The_Beatles.txt", 'r', encoding='utf-8') as f:
        print(f.read()[:500])
except FileNotFoundError:
    print("The Beatles file not found - showing first available file instead")
    sample_file = downloaded_files[0]
    with open(f"{pages_dir}/{sample_file}", 'r', encoding='utf-8') as f:
        print(f"Content from {sample_file}:")
        print(f.read()[:500])

Total performer pages downloaded: 485

Special characters handled correctly:
  ✓ AC_DC.txt
  ✓ Panic_at_the_Disco.txt
  ✓ W_A_S_P.txt

File size statistics:
  Smallest: 57 bytes
  Largest: 370,570 bytes
  Average: 86,354 bytes

Sample content from 'The_Beatles.txt' (first 500 chars):
{{Short description|English rock band (1960–1970)}}
{{About|the band|their eponymous album|The Beatles (album){{!}}''The Beatles'' (album)|other uses|Beatles (disambiguation)}}
{{Redirect-multi|2|Beatle|Fab Four|the insect|Beetle|other uses|Fab Four (disambiguation)}}
{{Featured article}}
{{Protection padlock|small=yes}}
{{Use British English|date=October 2024}}
{{Use dmy dates|date=October 2024}}
{{Infobox musical artist
| name              = The Beatles
| image             = The Fabs.JPG
| cap
