In [0]:
# Dependencies: uncomment the next line to install them into the kernel's environment.
# %pip install requests beautifulsoup4 pandas

## Import relevant libraries

In [0]:
import requests
import re
import pandas as pd
from html import unescape
import os

## Establish list of URLs and output folder

In [0]:
# Folder to save CSVs (one file per chart year is written here by the main loop).
# NOTE(review): this is an absolute path; adjust it when running outside the
# environment it was written for.
output_folder = "/Workspace/AUS vs US Music Taste/AUS-vs-US-Music-Taste-Data-Project/Data/Raw/Aria"
# Create the folder (and any missing parents) up front; exist_ok=True makes
# re-running this cell harmless when the folder already exists.
os.makedirs(output_folder, exist_ok=True)

# List of ARIA URLs: one year-end singles chart page per year, 2000 through 2025.
# The list is spelled out rather than generated because the URL shape is not
# regular: the year/month path segment varies (2011/05, 2011/08, then the
# following January) and the 2008 page carries an extra "_03" suffix.
# The main loop reads the chart year from the "aria-top-singles-of-YYYY" part.
urls = [
    "https://www.top100singles.net/2011/05/aria-top-singles-of-2000.html#show",
    "https://www.top100singles.net/2011/05/aria-top-singles-of-2001.html#show",
    "https://www.top100singles.net/2011/05/aria-top-singles-of-2002.html#show", 
    "https://www.top100singles.net/2011/05/aria-top-singles-of-2003.html#show",
    "https://www.top100singles.net/2011/05/aria-top-singles-of-2004.html#show",
    "https://www.top100singles.net/2011/05/aria-top-singles-of-2005.html#show", 
    "https://www.top100singles.net/2011/05/aria-top-singles-of-2006.html#show", 
    "https://www.top100singles.net/2011/05/aria-top-singles-of-2007.html#show", 
    "https://www.top100singles.net/2011/05/aria-top-singles-of-2008_03.html#show", 
    "https://www.top100singles.net/2011/05/aria-top-singles-of-2009.html#show", 
    "https://www.top100singles.net/2011/08/aria-top-singles-of-2010.html#show", 
    "https://www.top100singles.net/2012/01/aria-top-singles-of-2011.html#show", 
    "https://www.top100singles.net/2013/01/aria-top-singles-of-2012.html#show",
    "https://www.top100singles.net/2014/01/aria-top-singles-of-2013.html#show", 
    "https://www.top100singles.net/2015/01/aria-top-singles-of-2014.html#show", 
    "https://www.top100singles.net/2016/01/aria-top-singles-of-2015.html#show", 
    "https://www.top100singles.net/2017/01/aria-top-singles-of-2016.html#show",
    "https://www.top100singles.net/2018/01/aria-top-singles-of-2017.html#show", 
    "https://www.top100singles.net/2019/01/aria-top-singles-of-2018.html#show", 
    "https://www.top100singles.net/2020/01/aria-top-singles-of-2019.html#show", 
    "https://www.top100singles.net/2021/01/aria-top-singles-of-2020.html#show", 
    "https://www.top100singles.net/2022/01/aria-top-singles-of-2021.html#show", 
    "https://www.top100singles.net/2023/01/aria-top-singles-of-2022.html#show", 
    "https://www.top100singles.net/2024/01/aria-top-singles-of-2023.html#show", 
    "https://www.top100singles.net/2025/01/aria-top-singles-of-2024.html#show", 
    "https://www.top100singles.net/2026/01/aria-top-singles-of-2025.html#show"
]

## JavaScript array helper function

In [0]:
def extract_js_array(html, var_name):
    """Extract a JavaScript array literal from page source as a list of strings.

    Looks for the first ``var <var_name>=[ ... ];`` declaration in ``html``
    (whitespace around ``=`` is tolerated) and splits its contents on
    top-level commas.

    Args:
        html: Page source to search.
        var_name: Name of the JavaScript variable holding the array. It is
            matched literally (regex metacharacters are escaped).

    Returns:
        A list with one string per array element. For each element the
        surrounding whitespace and one pair of enclosing double quotes are
        removed, the escapes ``\\'`` and ``\\"`` are replaced by the bare quote
        character, and HTML entities are decoded. Returns ``[]`` when the
        declaration is not found or the array is empty.
    """
    pattern = rf"\bvar\s+{re.escape(var_name)}\s*=\s*\[(.*?)\];"
    match = re.search(pattern, html, re.DOTALL)
    if not match:
        return []
    array_text = match.group(1)

    def clean(raw):
        """Turn one raw array element into its decoded string value."""
        text = raw.strip()
        # Remove exactly one enclosing quote pair; stripping every quote would
        # also eat an escaped quote that legitimately ends the value.
        if len(text) >= 2 and text[0] == '"' and text[-1] == '"':
            text = text[1:-1]
        return unescape(text.replace("\\'", "'").replace('\\"', '"'))

    # Split on commas that are outside double-quoted strings.
    items = []
    current = []
    in_quotes = False
    escaped = False  # True when the previous character was an unconsumed backslash
    for c in array_text:
        if escaped:
            # Character after a backslash is literal; it must not toggle quoting.
            escaped = False
        elif c == '\\':
            escaped = True
        elif c == '"':
            in_quotes = not in_quotes
        elif c == ',' and not in_quotes:
            items.append(clean(''.join(current)))
            current = []
            continue
        current.append(c)

    # A trailing comma followed only by whitespace does not add an element.
    tail = ''.join(current)
    if tail.strip():
        items.append(clean(tail))
    return items

## Main loop 

In [0]:
# Download each chart page, pull the title/artist arrays out of its embedded
# JavaScript, and write one CSV per chart year into `output_folder`.
for url in urls:
    print(f"Processing {url} ...")
    # Without a timeout a single stalled connection would hang the whole run.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    html = response.text

    # Extract year from URL for file naming
    year_match = re.search(r'aria-top-singles-of-(\d{4})', url)
    year = year_match.group(1) if year_match else "unknown"

    # Extract arrays: the page variable `s` is read as titles, `t` as artists
    titles = extract_js_array(html, "s")
    artists = extract_js_array(html, "t")

    # Rows are paired by position, so only the common prefix can be used.
    # Report a mismatch instead of truncating silently.
    min_len = min(len(titles), len(artists))
    if len(titles) != len(artists):
        print(f"  Warning: {len(titles)} titles vs {len(artists)} artists for {year}; "
              f"keeping the first {min_len} rows")
    if min_len == 0:
        # Nothing was parsed; do not leave a header-only CSV behind.
        print(f"  Warning: no chart entries found for {year}; skipping CSV")
        continue

    # Build DataFrame
    df = pd.DataFrame({
        "rank": range(1, min_len+1),
        "title": titles[:min_len],
        "artist": artists[:min_len],
        # int() would raise on the "unknown" fallback, so store a missing value then
        "year": int(year) if year_match else None
    })

    # Save CSV
    output_path = os.path.join(output_folder, f"aria_top100_{year}.csv")
    df.to_csv(output_path, index=False)
    print(f"Saved CSV for {year} → {output_path}")