In [194]:
#from playwright.sync_api import sync_playwright
from playwright.async_api import async_playwright, expect, TimeoutError
import json
from pathlib import Path
import time
from datetime import date, timedelta
import asyncio
from urllib.parse import urljoin
import os
from urllib.parse import urlparse, parse_qs, unquote
import re

## 1. Run first - functions and user/pw info

In [195]:
# login info
# Credentials are taken from the AZIRA_USER / AZIRA_PW environment variables so
# that no secret is stored in the notebook. If a variable is missing or empty,
# the value is requested interactively (the password prompt does not echo).
# NOTE(review): a real password used to be hardcoded on this line - it should be
# rotated, since it exists in the notebook's history.
from getpass import getpass

user = os.environ.get("AZIRA_USER") or input("Azira email address: ")
pw = os.environ.get("AZIRA_PW") or getpass("Azira password: ")
url = 'https://pinnacle.azira.com/'


async def start_browser(headless=False):
    """Start Playwright, launch Chromium and open a single page.

    The browser context is created with a fixed geolocation
    (latitude 46.8721, longitude -113.9940) and the "geolocation"
    permission granted.

    Args:
        headless: Passed to ``chromium.launch``. Defaults to ``False``
            (a visible browser window), which is the original behaviour.

    Returns:
        dict with the keys ``"playwright"``, ``"browser"``, ``"context"``
        and ``"page"`` holding the corresponding Playwright objects.

    Raises:
        Whatever Playwright raises while launching. In that case anything
        already started is shut down before the exception propagates.
    """
    playwright = await async_playwright().start()
    browser = None

    try:
        browser = await playwright.chromium.launch(headless=headless)

        context = await browser.new_context(
            geolocation={"latitude": 46.8721, "longitude": -113.9940},
            permissions=["geolocation"],
        )

        page = await context.new_page()
    except BaseException:
        # Nothing is returned to the caller on failure, so nobody else could
        # ever close these: release them here instead of leaking processes.
        if browser is not None:
            await browser.close()
        await playwright.stop()
        raise

    return {
        "playwright": playwright,
        "browser": browser,
        "context": context,
        "page": page,
    }

async def login(page, user, pw):
    """Open the login page and submit the given credentials.

    Navigates ``page`` to the module-level ``url``, fills the
    "Email Address" and "Password" textboxes, then clicks the "Log in"
    button. It does not wait for the post-login page to finish loading.

    Args:
        page: Playwright page to drive.
        user: Value typed into the "Email Address" textbox.
        pw: Value typed into the "Password" textbox.
    """
    await page.goto(url)

    # Fill the two text fields in the order they appear in the form.
    for field_label, field_value in (("Email Address", user), ("Password", pw)):
        await page.get_by_role("textbox", name=field_label).fill(field_value)

    await page.get_by_role("button", name="Log in").click()

# get links on a page

def _state_from_job_name(job_name):
    """Return the last "_"-separated token of ``job_name`` that is not 'a' or 'b'.

    Returns ``None`` when every token is 'a' or 'b', so callers never see a
    value left over from a previously processed name.
    """
    for token in reversed(job_name.split("_")):
        if token not in ("a", "b"):
            return token
    return None


async def get_links(page, restrict_state=None):
    """Collect the dataset download links from the table on ``page``.

    Every ``table tr`` row that has a fourth cell is inspected. The first
    cell's text is used as the job id and the second cell's text as the job
    name. Anchors in the fourth cell whose text contains "Pin Dataset" or
    "Expanded Standard" and that have an ``href`` are collected.

    Args:
        page: Playwright page showing the table.
        restrict_state: If truthy, keep only rows whose job name resolves to
            exactly this value via ``_state_from_job_name`` (the last token
            of the name that is not 'a' or 'b'). Rows whose name has no such
            token are skipped.

    Returns:
        list of ``(job_id, job_name, link_text, href)`` tuples in table order.
    """
    links = []

    rows = page.locator("table tr")
    row_count = await rows.count()

    for i in range(row_count):
        row = rows.nth(i)
        links_cell = row.locator("td:nth-child(4)")

        # skip rows without a fourth cell (e.g. rows that contain no <td>)
        if await links_cell.count() == 0:
            continue

        # get row metadata
        job_id = await row.locator("td:nth-child(1)").inner_text()
        job_name = await row.locator("td:nth-child(2)").inner_text()

        # The state is recomputed for every row; previously it could be
        # undefined, or silently reuse the previous row's value.
        if restrict_state and _state_from_job_name(job_name) != restrict_state:
            continue

        anchors = links_cell.locator("a")
        link_count = await anchors.count()
        for j in range(link_count):
            link = anchors.nth(j)
            text = (await link.inner_text()).strip()   # "Pin Dataset" or "Expanded Standard Dataset"

            if "Pin Dataset" in text or "Expanded Standard" in text:
                text_link = await link.get_attribute("href")
                if text_link:
                    links.append((job_id, job_name, text, text_link))

    return links


def _safe(s: str) -> str:
    return "".join(c if c.isalnum() or c in "._- " else "_" for c in s).strip()

def filename_from_url(url):
    """Extract the file name announced in a URL's query string.

    Reads the ``response-content-disposition`` query parameter (a value of
    the form ``attachment; filename="..."``) and returns the file name in it.

    Args:
        url: URL whose query string is inspected.

    Returns:
        The file name, or ``None`` if the parameter is absent or empty or
        contains no non-empty ``filename=`` entry.
    """
    qs = parse_qs(urlparse(url).query)
    cd = qs.get("response-content-disposition", [None])[0]
    if not cd:
        return None

    # parse_qs has already percent-decoded the value; decoding it a second
    # time would corrupt names that contain a literal '%'.
    # Quoted form: everything between the quotes. Unquoted form: everything up
    # to the next ';' so that following parameters are not swallowed.
    m = re.search(r'filename\s*=\s*(?:"([^"]*)"|([^;]+))', cd)
    if not m:
        return None

    name = m.group(1) if m.group(1) is not None else m.group(2).strip()
    return name or None

async def download_one(context, item, out_dir='azira_downloads', timeout_ms=180_000):
    """Download a single link in its own page and save it under ``out_dir``.

    Args:
        context: Playwright browser context used to open a temporary page.
        item: 4-tuple ``(job_id, job_name, link_text, url)``; only the URL
            is used here.
        out_dir: Target directory; created if it does not exist.
        timeout_ms: Timeout passed to ``expect_download``.

    Returns:
        Path of the saved file: ``out_dir`` joined with the sanitised
        (``_safe``) file name suggested by the download.

    Raises:
        Any navigation error whose message does not contain
        "Download is starting", and anything raised while waiting for or
        saving the download. The temporary page is closed in every case.
    """
    _job_id, _job_name, _link_text, target_url = item

    os.makedirs(out_dir, exist_ok=True)

    tab = await context.new_page()
    try:
        async with tab.expect_download(timeout=timeout_ms) as pending:
            # A direct download URL does not navigate like a normal page, so
            # goto() may fail with "Download is starting"; that is expected.
            try:
                await tab.goto(target_url, wait_until="commit")  # minimal wait; no DOM needed
            except Exception as exc:
                is_expected = "Download is starting" in str(exc)
                if not is_expected:
                    raise

        download = await pending.value

        # Save under a sanitised version of the suggested file name.
        destination = os.path.join(out_dir, _safe(download.suggested_filename))
        await download.save_as(destination)
        return destination
    finally:
        await tab.close()


async def download_three_at_a_time(context, items, out_dir="azira_downloads", max_concurrent=3):
    """Download all ``items`` with a bounded number of concurrent downloads.

    Args:
        context: Playwright browser context handed to ``download_one``.
        items: Iterable of link tuples as accepted by ``download_one``.
        out_dir: Directory the files are saved to.
        max_concurrent: Maximum number of downloads running at the same time
            (default 3, the original fixed limit).

    Returns:
        ``(completed, failed)``: ``completed`` is the list of values returned
        by the successful ``download_one`` calls; ``failed`` is a list of
        ``(item, exception)`` pairs. Both follow the order of ``items``.
    """
    # Materialise once: items is iterated twice (to start the workers and to
    # pair results with their item), which would silently yield nothing the
    # second time for a generator.
    items = list(items)
    sem = asyncio.Semaphore(max_concurrent)

    async def worker(item):
        async with sem:
            return await download_one(context, item, out_dir=out_dir)

    results = await asyncio.gather(
        *(worker(item) for item in items),
        return_exceptions=True
    )

    completed = []
    failed = []

    for item, result in zip(items, results):
        # BaseException rather than Exception: a cancelled worker is reported
        # as asyncio.CancelledError, which does not derive from Exception.
        if isinstance(result, BaseException):
            failed.append((item, result))
        else:
            completed.append(result)

    return completed, failed



## 2. Login

In [196]:
# Launch the browser (start_browser) and sign in with the `user` / `pw` values
# defined in the setup cell. `page` stays a notebook-level variable: the later
# cells scrape the table and start downloads through it.
browser_handle = await start_browser()
page = browser_handle["page"]
await login(page, user, pw)

## 3. Click "Load More" until the end

In [197]:
# click "Load More" until the button is no longer visible
from playwright.async_api import expect, TimeoutError as PlaywrightTimeoutError

while True:
    load_more = page.get_by_role("button", name="Load More")

    if await load_more.is_visible():
        await load_more.click()
    else:
        print("end of the line")
        break
    # Wait for new rows to load before next iteration
    await page.wait_for_timeout(1000) 

end of the line


## 4. Get all links on the page (or only those for a single state)

In [198]:
all_links = await get_links(page, "Utah")

# filter for already downloaded files
new_links = []
for a in all_links:
    filename = filename_from_url(a[3])
    if not Path(f"azira_downloads/{filename}").exists():
        print(f"adding {filename} to download list")
        new_links.append(a)


adding 10057145_Salt_Lake_City_1_b_Utah_expanded_cel_cdl_report.zip to download list


In [199]:
# Display the links that passed the "not yet downloaded" filter in the previous cell.
new_links

[('10057145',
  'Salt_Lake_City_1_b_Utah',
  'Expanded Standard Dataset',
  'https://insights-api-prod-us-west-2-report.s3.amazonaws.com/reports/10057145/expanded_cel_cdl_report/10057145_Salt_Lake_City_1_b_Utah_expanded_cel_cdl_report.zip?response-content-disposition=attachment%3B%20filename%3D%2210057145_Salt_Lake_City_1_b_Utah_expanded_cel_cdl_report.zip%22&response-content-type=application%2Fx-gzip&AWSAccessKeyId=AKIASOM5CCFUB76XXDHE&Signature=ZBim6bE5YwEZ3DaCEHSdQouTppo%3D&Expires=1767039373')]

## 5. Download links

In [175]:
completed, failed = await download_three_at_a_time(
    page.context,
    all_links,
    out_dir="azira_downloads"
)

print("Completed:", len(completed))
print("Failed:", len(failed))

Future exception was never retrieved
future: <Future finished exception=TargetClosedError('Target page, context or browser has been closed\nCall log:\n  - waiting for locator("table tr").nth(45).locator("td:nth-child(0)")\n')>
playwright._impl._errors.TargetClosedError: Target page, context or browser has been closed
Call log:
  - waiting for locator("table tr").nth(45).locator("td:nth-child(0)")

Future exception was never retrieved
future: <Future finished exception=TargetClosedError('Target page, context or browser has been closed')>
playwright._impl._errors.TargetClosedError: Target page, context or browser has been closed


Completed: 50
Failed: 0


## Fix Colorado

In [None]:
download_list = []

# Folder containing one sub-folder per state. Set the OSM_FITNESS_DIR
# environment variable to use this cell on another machine; the default is the
# original hardcoded location.
base_dir = Path(os.environ.get("OSM_FITNESS_DIR", '/home/vince/Documents/SmartFires/osm_fitness'))
folders = [p for p in base_dir.iterdir() if p.is_dir()]

# ----REMOVE NEXT LINE TO DO ALL STATES-----
folders = [f for f in folders if 'Colorado' in f.name]

# Collect every *.geojson file found directly inside the selected folders.
for folder in folders:
    download_list.extend(folder.glob('*.geojson'))

[PosixPath('/home/vince/Documents/SmartFires/osm_fitness/Colorado_700k(45)/Denver_5.geojson'),
 PosixPath('/home/vince/Documents/SmartFires/osm_fitness/Colorado_700k(45)/Centennial_1.geojson'),
 PosixPath('/home/vince/Documents/SmartFires/osm_fitness/Colorado_700k(45)/Denver_4.geojson'),
 PosixPath('/home/vince/Documents/SmartFires/osm_fitness/Colorado_700k(45)/Commerce City.geojson'),
 PosixPath('/home/vince/Documents/SmartFires/osm_fitness/Colorado_700k(45)/Arvada_2.geojson'),
 PosixPath('/home/vince/Documents/SmartFires/osm_fitness/Colorado_700k(45)/Lakewood_1.geojson'),
 PosixPath('/home/vince/Documents/SmartFires/osm_fitness/Colorado_700k(45)/Denver_3.geojson'),
 PosixPath('/home/vince/Documents/SmartFires/osm_fitness/Colorado_700k(45)/Superior.geojson'),
 PosixPath('/home/vince/Documents/SmartFires/osm_fitness/Colorado_700k(45)/Aurora_1.geojson'),
 PosixPath('/home/vince/Documents/SmartFires/osm_fitness/Colorado_700k(45)/Greeley.geojson'),
 PosixPath('/home/vince/Documents/SmartF

In [141]:
# Re-scrape the job table, keeping only rows whose job name resolves to "Colorado".
# Note: this overwrites any `all_links` produced by an earlier cell.
all_links = await get_links(page, "Colorado")

In [162]:
# Unique job names from all_links with the final "_"-separated token removed
# (everything before the last underscore; empty string if there is none).
requested = list({link[1].rpartition("_")[0] for link in all_links})

In [158]:
# File names (without extension) of the collected geojson paths.
jsons = [geojson_path.stem for geojson_path in download_list]