In [None]:
# NOTE: this script unzips every downloaded ZIP archive and deletes the .zip afterwards.
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import glob
import datetime as dt
import geopandas as gpd
import zipfile
from concurrent.futures import ThreadPoolExecutor

# Set the USE_PYGEOS environment variable to '0' for this process.
# NOTE(review): geopandas has already been imported by the time this line runs,
# so the setting may come too late to influence it — confirm against the
# geopandas documentation and move it above the import if needed.
os.environ['USE_PYGEOS'] = '0'

def get_links(url):
    """Fetch ``url`` and return every ``<a>`` element in the response body.

    The request uses a 10-second timeout. The HTTP status code is not
    inspected, so the body of an error response is parsed like any other page.

    Args:
        url: Address of the page to fetch.

    Returns:
        The result of ``find_all('a')`` on the parsed page.
    """
    page = requests.get(url, timeout=10)
    return BeautifulSoup(page.content, 'html.parser').find_all('a')

def get_all_files(url):
    """Recursively collect file URLs from the directory listing at ``url``.

    Every anchor on the page after the first five is examined. An href whose
    joined URL ends with ``/`` is treated as a subdirectory and crawled
    recursively; anything else is recorded as a file URL.

    Args:
        url: Directory URL to crawl. Hrefs are appended to it verbatim, so it
            is expected to end with ``/``.

    Returns:
        A list with one entry per followed anchor: a URL string for a file,
        or a nested list (same structure) for a subdirectory. Use
        ``flatten_list`` to obtain a flat list of URLs.
    """
    out = []
    # The first five anchors are skipped, matching the top-level crawl —
    # presumably the listing's sort-header and parent-directory links;
    # verify against the server's index format.
    for link in get_links(url)[5:]:
        href = link.get('href')
        if not href:
            # An anchor without an href yields None (TypeError on
            # concatenation) and an empty href would point back at this
            # same page (endless recursion), so neither can be followed.
            continue
        subdir_url = url + href
        if subdir_url.endswith('/'):  # Directory: descend into it.
            out.append(get_all_files(subdir_url))
        else:  # File: record its download URL.
            out.append(subdir_url)
    return out

def flatten_list(lst):
    """Return a flat list containing the non-list items of ``lst`` in order.

    Nested ``list`` instances are expanded depth-first at any depth. Other
    containers (tuples, strings, ...) are kept as single items.

    Args:
        lst: An iterable whose items may themselves be lists.

    Returns:
        A new list with all nested lists expanded.
    """
    result = []
    for entry in lst:
        result += flatten_list(entry) if isinstance(entry, list) else [entry]
    return result

# Root listing to crawl; relative download paths are later computed against it.
url = 'https://ftp.wildfire.gov/public/incident_specific_data/rocky_mtn/'
# NOTE(review): `region` is not referenced anywhere else in the visible code.
region = 'rocky_mtn'
# Year directories to crawl. Entries are compared against the raw href values
# below, so they must match exactly, including the trailing slash.
fyeardirs = ['2020/']

# Collects one (possibly nested) list of file URLs per crawled directory.
allout = []
links = get_links(url)
# The first five anchors of each listing are skipped — presumably header and
# parent-directory links; verify against the server's index format.
for link in links[5:]:
    href = link.get('href')
    # Only descend into the selected year directories.
    if href not in fyeardirs:
        continue
    subdir_url = url + href
    sublinks = get_links(subdir_url)
    for sl in sublinks[5:]:
        # NOTE(review): `href` is reused here; an anchor without an href would
        # give None and make the concatenation below raise TypeError.
        href = sl.get('href')
        su = subdir_url + href
        # Force the 'IR/' subdirectory of each entry.
        # WARNING: this directory name is NOT consistent among regions and
        # across years, so check it before changing `url` or `fyeardirs`.
        irdir = su + 'IR/'
        allout.append(get_all_files(irdir))

# Collapse the nested per-directory lists into one flat list of file URLs
# and report how many were found.
allout = flatten_list(allout)
print(len(allout))

# Local root directory under which the downloaded files are saved.
# NOTE(review): hard-coded, user-specific Windows path — adjust it before
# running on another machine.
local_directory = 'C:\\Users\\magst\\Desktop\\NIROPS_Data'

def download_file(url, base_url, local_base_directory):
    """Download ``url`` beneath ``local_base_directory`` and unzip ZIP archives.

    The part of ``url`` that follows ``base_url`` becomes the path relative to
    ``local_base_directory``, so the remote directory structure is mirrored
    locally. If the local path ends with ``.zip`` the archive is extracted
    into its own directory and then deleted.

    Any failure (HTTP error status, network error, bad archive, filesystem
    error) is reported with ``print`` and not raised, so a single bad file
    does not abort a batch of downloads.

    Args:
        url: Full URL of the file to download.
        base_url: Prefix of ``url`` that is stripped to obtain the relative
            path.
        local_base_directory: Local directory under which the file is stored.

    Returns:
        None.
    """
    try:
        # Strip only the leading base URL; str.replace would also remove any
        # later occurrence of the same text inside the path.
        if url.startswith(base_url):
            relative_path = url[len(base_url):]
        else:
            relative_path = url
        # A leading '/' would make os.path.join discard local_base_directory.
        relative_path = relative_path.lstrip('/')
        local_path = os.path.join(local_base_directory, relative_path)

        # The context manager releases the streamed connection even when the
        # download fails part-way through.
        with requests.get(url, stream=True, timeout=30) as response:
            # Fail before touching the disk so an HTML error page is never
            # saved (or unzipped) as if it were the requested file.
            response.raise_for_status()

            # Create necessary directories
            os.makedirs(os.path.dirname(local_path), exist_ok=True)

            # 0 when the server sends no Content-Length; tqdm then shows a
            # running byte count instead of a percentage.
            total_size_in_bytes = int(response.headers.get('content-length', 0))
            block_size = 1024  # 1 Kibibyte
            with open(local_path, 'wb') as file, tqdm(
                total=total_size_in_bytes, unit='iB', unit_scale=True,
                desc=relative_path, leave=False) as progress_bar:
                for data in response.iter_content(block_size):
                    progress_bar.update(len(data))
                    file.write(data)

        # Unzip if the file is a ZIP file
        if local_path.endswith('.zip'):
            with zipfile.ZipFile(local_path, 'r') as zip_ref:
                zip_ref.extractall(os.path.dirname(local_path))
            os.remove(local_path)  # Remove the ZIP file after extraction
    except Exception as e:
        # Deliberate best-effort: report the failure and let the batch go on.
        print(f"Failed to download {url}. Reason: {e}")

def download_files(urls, base_url, local_base_directory):
    """Download every URL in ``urls`` using a pool of five worker threads.

    Each URL is handed to ``download_file`` together with ``base_url`` and
    ``local_base_directory``. The call returns once every submitted job has
    finished; an exception raised by a job is printed and does not stop the
    remaining jobs from being awaited.

    Args:
        urls: Iterable of file URLs to download.
        base_url: Passed through to ``download_file``.
        local_base_directory: Passed through to ``download_file``.

    Returns:
        None.
    """
    pool = ThreadPoolExecutor(max_workers=5)
    with pool:
        # Queue every job first so the workers stay busy while we wait.
        pending = []
        for target in urls:
            job = pool.submit(download_file, target, base_url, local_base_directory)
            pending.append(job)

        # Wait for each job in submission order.
        for job in pending:
            try:
                job.result()
            except Exception as e:
                print(f"Exception occurred during downloading: {e}")

# Download every collected URL concurrently, storing each file under
# `local_directory` at its path relative to `url`.
download_files(allout, url, local_directory)
# Printed after all downloads have been attempted; individual failures are
# only reported via print and do not prevent this message.
print("Download completed.")
