# Web File Downloader

This notebook provides utilities for downloading files from various web sources including:
- AWS S3 buckets (including public datasets like NOAA GHCN)
- HTTP/HTTPS URLs
- Multiple files in parallel

## Features:
1. S3 file downloads with progress tracking
2. HTTP download with resume support
3. Parallel downloads using Dask
4. File integrity verification (checksums)


In [1]:
# Import required libraries
import os
import hashlib
import requests
from pathlib import Path
from urllib.parse import urlparse
import time
from typing import List, Tuple, Optional
import s3fs
import pandas as pd
import dask
from dask.distributed import Client
from concurrent.futures import ThreadPoolExecutor, as_completed

print("Libraries imported successfully")


Libraries imported successfully


In [2]:
# Configuration
DOWNLOAD_DIR = '../weather_data/downloads'
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

print(f"Download directory: {DOWNLOAD_DIR}")

# Progress tracking
class ProgressTracker:
    def __init__(self, total_files: int):
        self.total_files = total_files
        self.completed = 0
        self.start_time = time.time()
    
    def update(self):
        self.completed += 1
        elapsed = time.time() - self.start_time
        rate = self.completed / elapsed if elapsed > 0 else 0
        remaining = (self.total_files - self.completed) / rate if rate > 0 else 0
        print(f"Progress: {self.completed}/{self.total_files} files "
              f"({100*self.completed/self.total_files:.1f}%) "
              f"Rate: {rate:.2f} files/sec "
              f"ETA: {remaining:.1f}s")
    
    def finish(self):
        elapsed = time.time() - self.start_time
        print(f"\\n✓ Completed {self.total_files} files in {elapsed:.2f} seconds "
              f"({self.total_files/elapsed:.2f} files/sec)")

print("Configuration set up")


Download directory: ../weather_data/downloads
Configuration set up


In [3]:
# Function: Download single HTTP file with resume support
def download_http_file(url: str, output_path: str, resume: bool = True) -> bool:
    """
    Download a file from HTTP/HTTPS URL with resume support.
    
    Args:
        url: URL to download from
        output_path: Local path to save file
        resume: Whether to resume interrupted downloads
        
    Returns:
        True if successful, False otherwise
    """
    try:
        # Check if partial download exists
        if resume and os.path.exists(output_path):
            file_size = os.path.getsize(output_path)
            headers = {'Range': f'bytes={file_size}-'}
            resume_download = True
        else:
            headers = {}
            resume_download = False
        
        # Download file
        response = requests.get(url, headers=headers, stream=True, timeout=30)
        response.raise_for_status()
        
        # Determine mode (append if resuming, write if new)
        mode = 'ab' if resume_download else 'wb'
        
        with open(output_path, mode) as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
        
        return True
    
    except Exception as e:
        print(f"Error downloading {url}: {e}")
        return False

print("HTTP download function defined")


HTTP download function defined


In [4]:
# Function: Download single S3 file
def download_s3_file(s3_path: str, output_path: str) -> bool:
    """
    Download a file from S3.
    
    Args:
        s3_path: S3 path (e.g., 's3://bucket/path/file.parquet')
        output_path: Local path to save file
        
    Returns:
        True if successful, False otherwise
    """
    try:
        # Create output directory if needed
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        
        # Setup S3 filesystem
        s3 = s3fs.S3FileSystem(anon=True)
        
        # Download file
        s3.get(s3_path, output_path)
        return True
    
    except Exception as e:
        print(f"Error downloading {s3_path}: {e}")
        return False

print("S3 download function defined")


S3 download function defined


In [5]:
# Function: Download multiple files in parallel
def download_files_parallel(
    file_list: List[Tuple[str, str]], 
    max_workers: int = 8,
    download_func=None
) -> List[bool]:
    """
    Download multiple files in parallel.
    
    Args:
        file_list: List of tuples (source_path, destination_path)
        max_workers: Maximum number of parallel workers
        download_func: Function to use for downloading (None = auto-detect)
        
    Returns:
        List of success flags
    """
    results = [False] * len(file_list)
    tracker = ProgressTracker(len(file_list))
    
    # Auto-detect download function if not provided
    if download_func is None:
        if file_list and file_list[0][0].startswith('s3://'):
            download_func = download_s3_file
        else:
            download_func = download_http_file
    
    # Download in parallel
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_idx = {
            executor.submit(download_func, source, dest): i
            for i, (source, dest) in enumerate(file_list)
        }
        
        for future in as_completed(future_to_idx):
            idx = future_to_idx[future]
            results[idx] = future.result()
            tracker.update()
    
    tracker.finish()
    return results

print("Parallel download function defined")


Parallel download function defined


In [6]:
# Example 1: Download files from NOAA GHCN S3 bucket
print("Example 1: Download NOAA GHCN weather files")

# Setup S3 filesystem
s3 = s3fs.S3FileSystem(anon=True)
bucket = 's3://noaa-ghcn-pds'

# List available years and elements
print("\\nDiscovering available files...")
years = s3.ls(f'{bucket}/parquet/by_year/')
print(f"Found {len(years)} years of data")

# Select a specific year and elements to download
target_year = 2020
elements = ['PRCP', 'TMAX', 'TMIN']

# Build list of files to download
files_to_download = []
for element in elements:
    element_path = f'{bucket}/parquet/by_year/YEAR={target_year}/ELEMENT={element}/'
    files = s3.glob(f'{element_path}*.parquet')
    files_to_download.extend([
        (f, f'{DOWNLOAD_DIR}/{target_year}/{element}/{os.path.basename(f)}')
        for f in files[:3]  # Limit to first 3 files per element
    ])

print(f"\\nFiles to download: {len(files_to_download)}")
print("\\nFirst few files:")
for i, (src, dst) in enumerate(files_to_download[:5]):
    print(f"  {i+1}. {os.path.basename(src)} -> {os.path.basename(dst)}")


Example 1: Download NOAA GHCN weather files
\nDiscovering available files...


Found 264 years of data


\nFiles to download: 9
\nFirst few files:
  1. a43d416f337746388bc65511b780bc6d_0.snappy.parquet -> a43d416f337746388bc65511b780bc6d_0.snappy.parquet
  2. a43d416f337746388bc65511b780bc6d_1.snappy.parquet -> a43d416f337746388bc65511b780bc6d_1.snappy.parquet
  3. a43d416f337746388bc65511b780bc6d_10.snappy.parquet -> a43d416f337746388bc65511b780bc6d_10.snappy.parquet
  4. a43d416f337746388bc65511b780bc6d_0.snappy.parquet -> a43d416f337746388bc65511b780bc6d_0.snappy.parquet
  5. a43d416f337746388bc65511b780bc6d_1.snappy.parquet -> a43d416f337746388bc65511b780bc6d_1.snappy.parquet


In [7]:
# Execute Example 1 download
print("\\nStarting downloads...")
results = download_files_parallel(files_to_download, max_workers=4, download_func=download_s3_file)

# Summary
successful = sum(results)
failed = len(results) - successful
print(f"\\nDownload Summary:")
print(f"  Successful: {successful}")
print(f"  Failed: {failed}")
print(f"  Success rate: {100*successful/len(results):.1f}%")


\nStarting downloads...


Progress: 1/9 files (11.1%) Rate: 0.96 files/sec ETA: 8.3s
Progress: 2/9 files (22.2%) Rate: 1.75 files/sec ETA: 4.0s
Progress: 3/9 files (33.3%) Rate: 2.57 files/sec ETA: 2.3s
Progress: 4/9 files (44.4%) Rate: 3.36 files/sec ETA: 1.5s


Progress: 5/9 files (55.6%) Rate: 3.37 files/sec ETA: 1.2s
Progress: 6/9 files (66.7%) Rate: 3.95 files/sec ETA: 0.8s
Progress: 7/9 files (77.8%) Rate: 4.58 files/sec ETA: 0.4s
Progress: 8/9 files (88.9%) Rate: 4.89 files/sec ETA: 0.2s


Progress: 9/9 files (100.0%) Rate: 4.70 files/sec ETA: 0.0s
\n✓ Completed 9 files in 1.91 seconds (4.70 files/sec)
\nDownload Summary:
  Successful: 9
  Failed: 0
  Success rate: 100.0%


In [8]:
# Example 2: Download HTTP files
print("Example 2: Download from HTTP URLs")

# Example URLs (replace with actual URLs you want to download)
http_urls = [
    "https://www.example.com/file1.csv",
    "https://www.example.com/file2.csv",
]

print("\\nNote: These are placeholder URLs. Replace with actual download URLs.")
print(f"Prepared {len(http_urls)} URLs for download")


Example 2: Download from HTTP URLs
\nNote: These are placeholder URLs. Replace with actual download URLs.
Prepared 2 URLs for download


In [9]:
# Example 3: Download entire dataset from S3 with filtering
print("Example 3: Download specific dataset subset")

def download_dataset_subset(
    bucket: str,
    year: int,
    elements: List[str],
    output_dir: str,
    max_files_per_element: int = None
):
    """
    Download a subset of files from S3 dataset.
    
    Args:
        bucket: S3 bucket path
        year: Year to download
        elements: List of elements to download
        output_dir: Local output directory
        max_files_per_element: Maximum files per element (None = all)
    """
    s3 = s3fs.S3FileSystem(anon=True)
    files_to_download = []
    
    for element in elements:
        element_path = f'{bucket}/parquet/by_year/YEAR={year}/ELEMENT={element}/'
        files = s3.glob(f'{element_path}*.parquet')
        
        if max_files_per_element:
            files = files[:max_files_per_element]
        
        files_to_download.extend([
            (f, f'{output_dir}/{year}/{element}/{os.path.basename(f)}')
            for f in files
        ])
    
    print(f"\\nDownloading {len(files_to_download)} files...")
    results = download_files_parallel(files_to_download, max_workers=8, download_func=download_s3_file)
    
    successful = sum(results)
    print(f"\\n✓ Successfully downloaded {successful}/{len(files_to_download)} files")
    return successful

print("Dataset subset download function defined")


Example 3: Download specific dataset subset
Dataset subset download function defined


In [10]:
# Example 4: File verification and checksum
print("Example 4: File integrity verification")

def calculate_checksum(filepath: str, algorithm: str = 'md5') -> str:
    """
    Calculate checksum of a file.
    
    Args:
        filepath: Path to file
        algorithm: Hash algorithm (md5, sha1, sha256)
        
    Returns:
        Hex digest of file
    """
    hash_algo = hashlib.new(algorithm)
    
    with open(filepath, 'rb') as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_algo.update(chunk)
    
    return hash_algo.hexdigest()

def verify_file_integrity(filepath: str, expected_hash: str, algorithm: str = 'md5') -> bool:
    """
    Verify file integrity against expected checksum.
    
    Args:
        filepath: Path to file
        expected_hash: Expected hash value
        algorithm: Hash algorithm
        
    Returns:
        True if checksums match, False otherwise
    """
    if not os.path.exists(filepath):
        return False
    
    actual_hash = calculate_checksum(filepath, algorithm)
    match = actual_hash.lower() == expected_hash.lower()
    
    if match:
        print(f"✓ {os.path.basename(filepath)}: Integrity verified")
    else:
        print(f"✗ {os.path.basename(filepath)}: Checksum mismatch!")
        print(f"  Expected: {expected_hash}")
        print(f"  Actual:   {actual_hash}")
    
    return match

print("File verification functions defined")


Example 4: File integrity verification
File verification functions defined


## Summary

This notebook provides a complete web file download solution:

1. **HTTP Downloads** - Resume support, chunk streaming
2. **S3 Downloads** - Public bucket access, batch processing
3. **Parallel Downloads** - Multi-threaded concurrent downloads
4. **Progress Tracking** - Real-time progress with ETA
5. **File Verification** - Checksum validation for integrity
6. **Dataset Tools** - High-level functions for subsetting datasets

### Key Features:
- Resume interrupted downloads
- Progress tracking with ETA
- Parallel downloads for speed
- Auto-detection of S3 vs HTTP
- File integrity verification

### Example Dataset Sources:
- NOAA GHCN: `s3://noaa-ghcn-pds`
- Any HTTP/HTTPS URL
- Public S3 buckets

Replace example URLs and file paths with your actual download targets.


In [11]:
# Download PeMS data with authentication
# NOTE: This requires valid PeMS credentials

pems_url = "https://pems.dot.ca.gov/?download=403199&dnode=Clearinghouse"
output_file = f"{DOWNLOAD_DIR}/caltrans_pems_data.csv"

print("Attempting to download Caltrans PeMS data...")
print(f"URL: {pems_url}")
print(f"Output: {output_file}")

# Option 1: Using requests with session (if you have credentials)
try:
    # Create a session to maintain cookies
    session = requests.Session()
    
    # If you have credentials, login first:
    # login_url = "https://pems.dot.ca.gov/?dnode=login"
    # login_data = {"username": "YOUR_USERNAME", "password": "YOUR_PASSWORD"}
    # session.post(login_url, data=login_data)
    
    # Then download the file
    response = session.get(pems_url, stream=True, timeout=60)
    
    if response.status_code == 200:
        with open(output_file, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
        print(f"✓ Successfully downloaded to {output_file}")
        print(f"  File size: {os.path.getsize(output_file):,} bytes")
    else:
        print(f"✗ Download failed with status code: {response.status_code}")
        print("  This usually means authentication is required")
        print(f"  Response: {response.text[:200]}")
        
except Exception as e:
    print(f"✗ Error: {e}")
    print("\\nThis download requires PeMS authentication.")
    print("Please:")
    print("1. Register at https://pems.dot.ca.gov/")
    print("2. Get approved (1-2 business days)")
    print("3. Add your credentials to the code above")


Attempting to download Caltrans PeMS data...
URL: https://pems.dot.ca.gov/?download=403199&dnode=Clearinghouse
Output: ../weather_data/downloads/caltrans_pems_data.csv


✓ Successfully downloaded to ../weather_data/downloads/caltrans_pems_data.csv
  File size: 22,336 bytes


In [12]:
# Alternative: Manual download instructions
print("="*60)
print("MANUAL DOWNLOAD INSTRUCTIONS")
print("="*60)
print("""
Since PeMS requires authentication, here are alternative approaches:

1. **Browser Download:**
   - Log in to https://pems.dot.ca.gov/
   - Navigate to the data clearinghouse
   - Use browser developer tools (F12) to find the direct download URL
   - Use the authentication cookies in a download script

2. **Use PeMS API (if available):**
   - Check if PeMS provides an API for automated access
   - Some agencies provide API access for research/academic use

3. **Contact PeMS Support:**
   - Request bulk data download access
   - They may provide alternative data access methods

4. **Alternative Data Sources:**
   - Check if similar traffic data is available in public repositories
   - Look for academic datasets with PeMS derivatives
""")
print("="*60)


MANUAL DOWNLOAD INSTRUCTIONS

Since PeMS requires authentication, here are alternative approaches:

1. **Browser Download:**
   - Log in to https://pems.dot.ca.gov/
   - Navigate to the data clearinghouse
   - Use browser developer tools (F12) to find the direct download URL
   - Use the authentication cookies in a download script

2. **Use PeMS API (if available):**
   - Check if PeMS provides an API for automated access
   - Some agencies provide API access for research/academic use

3. **Contact PeMS Support:**
   - Request bulk data download access
   - They may provide alternative data access methods

4. **Alternative Data Sources:**
   - Check if similar traffic data is available in public repositories
   - Look for academic datasets with PeMS derivatives



## Download Samples from All Repositories

Attempting to download a sample file from each data repository listed in repositories.md


In [13]:
# Download samples from each repository
repo_downloads = []

print("Setting up download tasks for each repository...\\n")

# 2. Chicago Crimes (Open Data Portal)
print("2. Chicago Crimes: Open Data Portal")
try:
    # Chicago Open Data has an API endpoint
    chicago_url = "https://data.cityofchicago.org/api/views/xguy-4ndq/rows.csv?accessType=DOWNLOAD"
    chicago_output = f"{DOWNLOAD_DIR}/chicago_crimes_2023_sample.csv"
    
    print(f"   URL: {chicago_url}")
    result = download_http_file(chicago_url, chicago_output, resume=True)
    repo_downloads.append(("Chicago Crimes", "http", result))
except Exception as e:
    print(f"   ✗ Failed: {e}")
    repo_downloads.append(("Chicago Crimes", "http", False))


Setting up download tasks for each repository...\n
2. Chicago Crimes: Open Data Portal
   URL: https://data.cityofchicago.org/api/views/xguy-4ndq/rows.csv?accessType=DOWNLOAD


In [14]:
# 3. US Accidents Dataset
print("\\n3. US Accidents Dataset (Kaggle)")
print("   Note: Requires Kaggle API credentials")
print("   Data available at: https://www.kaggle.com/datasets/sobhanmoosavi/us-accidents")
us_accidents_info = {
    "name": "US Accidents",
    "type": "kaggle",
    "url": "https://www.kaggle.com/datasets/sobhanmoosavi/us-accidents",
    "note": "Requires Kaggle API setup"
}
repo_downloads.append(("US Accidents", "kaggle", False))
print("   Status: Requires manual setup")


\n3. US Accidents Dataset (Kaggle)
   Note: Requires Kaggle API credentials
   Data available at: https://www.kaggle.com/datasets/sobhanmoosavi/us-accidents
   Status: Requires manual setup


In [15]:
# 4. DOE OEDI Data Lake (S3)
print("\\n4. DOE OEDI Data Lake (AWS Open Data)")
try:
    s3 = s3fs.S3FileSystem(anon=True)
    oedi_bucket = 's3://oedi-data-lake'
    
    # List available resources
    print(f"   Exploring: {oedi_bucket}")
    resources = s3.ls(f'{oedi_bucket}/', detail=False)[:10]
    print(f"   Found directories: {len(resources)}")
    
    # Try to find a sample file
    sample_files = []
    for resource in resources:
        if resource.endswith('/'):
            files = s3.glob(f'{resource}*.parquet')
            if files:
                sample_files.append(files[0])
                break
    
    if sample_files:
        print(f"   Found sample: {sample_files[0]}")
        oedi_output = f"{DOWNLOAD_DIR}/doe_oedi_sample.parquet"
        result = download_s3_file(sample_files[0], oedi_output)
        repo_downloads.append(("DOE OEDI", "s3_public", result))
    else:
        print("   No parquet files found in top-level directories")
        repo_downloads.append(("DOE OEDI", "s3_public", False))
        
except Exception as e:
    print(f"   ✗ Failed: {e}")
    repo_downloads.append(("DOE OEDI", "s3_public", False))


\n4. DOE OEDI Data Lake (AWS Open Data)
   Exploring: s3://oedi-data-lake
   Found directories: 10
   No parquet files found in top-level directories


In [16]:
# 5. Aurora Multi-Sensor Dataset (AWS Open Data)
print("\\n5. Aurora Multi-Sensor Dataset (AWS Open Data)")
try:
    s3 = s3fs.S3FileSystem(anon=True)
    aurora_bucket = 's3://aurora-opendata'
    
    print(f"   Exploring: {aurora_bucket}")
    resources = s3.ls(f'{aurora_bucket}/', detail=False)[:10]
    print(f"   Found directories: {len(resources)}")
    
    # Try to find a sample file
    sample_files = []
    for resource in resources:
        if resource.endswith('/'):
            # Check for common data formats
            for ext in ['.parquet', '.csv', '.json']:
                files = s3.glob(f'{resource}*{ext}')
                if files:
                    sample_files.append(files[0])
                    break
            if sample_files:
                break
    
    if sample_files:
        print(f"   Found sample: {sample_files[0]}")
        aurora_output = f"{DOWNLOAD_DIR}/aurora_msds_sample.{sample_files[0].split('.')[-1]}"
        result = download_s3_file(sample_files[0], aurora_output)
        repo_downloads.append(("Aurora MSDS", "s3_public", result))
    else:
        print("   No data files found in top-level directories")
        repo_downloads.append(("Aurora MSDS", "s3_public", False))
        
except Exception as e:
    print(f"   ✗ Failed: {e}")
    repo_downloads.append(("Aurora MSDS", "s3_public", False))


\n5. Aurora Multi-Sensor Dataset (AWS Open Data)
   Exploring: s3://aurora-opendata


   ✗ Failed: The specified bucket does not exist


In [17]:
# 6. Marine Energy Data Lake (AWS Open Data)
print("\\n6. Marine Energy Data Lake (AWS Open Data)")
try:
    s3 = s3fs.S3FileSystem(anon=True)
    marine_bucket = 's3://marine-energy-data'
    
    print(f"   Exploring: {marine_bucket}")
    resources = s3.ls(f'{marine_bucket}/', detail=False)[:10]
    print(f"   Found directories: {len(resources)}")
    
    # Try to find a sample file
    sample_files = []
    for resource in resources:
        if resource.endswith('/'):
            for ext in ['.parquet', '.csv', '.json', '.h5', '.nc']:
                files = s3.glob(f'{resource}*{ext}')
                if files:
                    sample_files.append(files[0])
                    break
            if sample_files:
                break
    
    if sample_files:
        print(f"   Found sample: {sample_files[0]}")
        marine_output = f"{DOWNLOAD_DIR}/marine_energy_sample.{sample_files[0].split('.')[-1]}"
        result = download_s3_file(sample_files[0], marine_output)
        repo_downloads.append(("Marine Energy", "s3_public", result))
    else:
        print("   No data files found")
        repo_downloads.append(("Marine Energy", "s3_public", False))
        
except Exception as e:
    print(f"   ✗ Failed: {e}")
    repo_downloads.append(("Marine Energy", "s3_public", False))


\n6. Marine Energy Data Lake (AWS Open Data)
   Exploring: s3://marine-energy-data
   Found directories: 3
   No data files found


In [18]:
# 7-11. Other repositories (require special access or have complex URLs)
print("\\n7-11. Additional Repositories")
print("="*60)

additional_repos = [
    ("NCVS Crime Victimization", "archive", "Requires ICPSR registration"),
    ("Caltrans PeMS", "web_login", "Requires PeMS registration"),
    ("End-Use Load Profiles", "openEI", "Complex data portal structure"),
    ("openDD Roundabout", "academic", "Academic dataset portal"),
    ("Zenseact ZOD", "registration", "Requires registration form"),
    ("VED Dataset", "github", "GitHub repository"),
    ("comma2k19", "github", "GitHub with large files"),
]

for name, repo_type, note in additional_repos:
    print(f"   {name} ({repo_type}): {note}")
    repo_downloads.append((name, repo_type, False))

print("="*60)


\n7-11. Additional Repositories
   NCVS Crime Victimization (archive): Requires ICPSR registration
   Caltrans PeMS (web_login): Requires PeMS registration
   End-Use Load Profiles (openEI): Complex data portal structure
   openDD Roundabout (academic): Academic dataset portal
   Zenseact ZOD (registration): Requires registration form
   VED Dataset (github): GitHub repository
   comma2k19 (github): GitHub with large files


In [19]:
# Summary of download attempts
print("\\n" + "="*60)
print("DOWNLOAD SUMMARY")
print("="*60)

# Count by status
successful = [r for r in repo_downloads if r[2]]
failed = [r for r in repo_downloads if not r[2]]

print(f"\\nTotal repositories attempted: {len(repo_downloads)}")
print(f"✓ Successful downloads: {len(successful)}")
print(f"✗ Failed/Not attempted: {len(failed)}")

if successful:
    print("\\n✓ Successfully Downloaded:")
    for name, repo_type, status in successful:
        print(f"   - {name} ({repo_type})")

print("\\n✗ Issues/Requirements:")
for name, repo_type, status in failed:
    print(f"   - {name} ({repo_type})")

print("\\n" + "="*60)
print("Note: Some repositories require registration, API keys, or special access.")
print("="*60)


DOWNLOAD SUMMARY
\nTotal repositories attempted: 12
✓ Successful downloads: 1
✗ Failed/Not attempted: 11
\n✓ Successfully Downloaded:
   - Chicago Crimes (http)
\n✗ Issues/Requirements:
   - US Accidents (kaggle)
   - DOE OEDI (s3_public)
   - Aurora MSDS (s3_public)
   - Marine Energy (s3_public)
   - NCVS Crime Victimization (archive)
   - Caltrans PeMS (web_login)
   - End-Use Load Profiles (openEI)
   - openDD Roundabout (academic)
   - Zenseact ZOD (registration)
   - VED Dataset (github)
   - comma2k19 (github)
Note: Some repositories require registration, API keys, or special access.
