In [None]:
# Serverless 3W Dataset Downloader Job
# This job downloads parquet files from the Petrobras 3W dataset to a Databricks volume

import requests
import os
import time
from pathlib import Path
import json

# Get config path from job parameters
# `dbutils` is provided by the Databricks runtime; "config.yaml" is the widget
# default used when the job does not supply a `config_path` parameter.
dbutils.widgets.text("config_path", "config.yaml")
config_path = dbutils.widgets.get("config_path")

print(f"Using config file: {config_path}")

# Simple config loader since we can't use the src modules
import yaml
# Explicit encoding so parsing does not depend on the cluster's default locale.
with open(config_path, 'r', encoding='utf-8') as file:
    # safe_load returns None for an empty document; normalise that to a dict.
    config = yaml.safe_load(file) or {}

# Fail early with an actionable message instead of an opaque KeyError/TypeError
# further down when the file is empty or the required keys are missing.
download_config = config.get('download') if isinstance(config, dict) else None
if not isinstance(download_config, dict) or 'base_url' not in download_config:
    raise ValueError(
        f"Config file {config_path!r} must define a 'download' section "
        "containing a 'base_url' key"
    )

# Override output directory to use volume
output_dir = "/Volumes/shm/3w/bronze/"
# Optional settings fall back to conservative defaults when absent.
max_files = download_config.get('max_files', 10)
max_dirs = download_config.get('max_dirs', 1)
base_url = download_config['base_url']
delay_seconds = download_config.get('delay_seconds', 0.1)

print(f"Output directory: {output_dir}")
print(f"Max files: {max_files}")
print(f"Max dirs: {max_dirs}")
print(f"Base URL: {base_url}")

# Download function
# A single Session is shared by all requests so connections can be reused.
session = requests.Session()

def get_directory_contents(path="dataset", timeout=30):
    """Get contents of a directory from GitHub API.

    Args:
        path: Repository path appended to the module-level ``base_url``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the job indefinitely.

    Returns:
        The decoded JSON list of entries on success. An empty list is
        returned (and the problem printed) when the request fails, the
        status code is not 200, the body is not valid JSON, or the payload
        is not a list.
    """
    url = f"{base_url}/{path}"

    try:
        response = session.get(url, timeout=timeout)
    except requests.RequestException as e:
        # Treat network failures the same way as HTTP errors: report and
        # return an empty listing so callers can carry on.
        print(f"Error fetching {url}: {e}")
        return []

    if response.status_code != 200:
        print(f"Error fetching {url}: {response.status_code}")
        return []

    try:
        contents = response.json()
    except ValueError as e:
        print(f"Error fetching {url}: invalid JSON ({e})")
        return []

    # Callers slice and iterate the result, so anything other than a list
    # (e.g. a single JSON object) is reported here rather than failing later.
    if not isinstance(contents, list):
        print(f"Error fetching {url}: expected a list, got {type(contents).__name__}")
        return []

    return contents

def download_file(download_url, local_path, timeout=60):
    """Download a single file.

    Args:
        download_url: URL to fetch.
        local_path: Destination path; missing parent directories are created.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True if the file already existed or was downloaded completely;
        False on a non-200 response or any error. Errors are printed, not
        raised.
    """
    started_writing = False
    try:
        # Ensure parent directory exists
        parent_dir = os.path.dirname(local_path)
        if parent_dir:  # a bare filename has no parent; makedirs('') would fail
            os.makedirs(parent_dir, exist_ok=True)

        # Check if file already exists
        if os.path.exists(local_path):
            return True

        print(f"Downloading: {os.path.basename(local_path)}")
        response = session.get(download_url, stream=True, timeout=timeout)
        try:
            if response.status_code != 200:
                print(f"✗ Failed to download {download_url}: {response.status_code}")
                return False

            started_writing = True
            with open(local_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        finally:
            # Streamed responses keep the connection open until closed.
            response.close()

        print(f"✓ Downloaded: {local_path}")
        return True
    except Exception as e:
        # Remove a partially written file; otherwise the "already exists"
        # check above would treat the truncated file as complete next run.
        if started_writing:
            try:
                os.remove(local_path)
            except OSError:
                pass
        print(f"✗ Error downloading {download_url}: {e}")
        return False

# Get all parquet files
print("Getting file list from 3W dataset...")
dataset_entries = get_directory_contents("dataset")

# Keep only directory entries *before* applying max_dirs. Slicing the raw
# listing first would let a non-directory entry use up the limit, leaving
# fewer directories (possibly none) to scan.
subdirs = [entry for entry in dataset_entries if entry.get("type") == "dir"]

if max_dirs and max_dirs > 0:
    subdirs = subdirs[:max_dirs]
    print(f"Limited to first {max_dirs} directories")

files = []
for subdir in subdirs:
    subdir_name = subdir["name"]
    print(f"Scanning directory: {subdir_name}")

    subdir_files = get_directory_contents(f"dataset/{subdir_name}")
    for file_info in subdir_files:
        if file_info.get("type") == "file" and file_info.get("name", "").endswith(".parquet"):
            files.append({
                "name": file_info["name"],
                "path": file_info["path"],
                "download_url": file_info["download_url"],
                "size": file_info["size"],
                "subdir": subdir_name,
            })

# A falsy or non-positive max_files means "no limit".
if max_files and max_files > 0:
    files = files[:max_files]
    print(f"Limited to first {max_files} files")

print(f"Will download {len(files)} parquet files")

# Download files
successful_downloads = 0
total_size = 0  # bytes fetched in this run; skipped files are not counted

for i, file_info in enumerate(files, 1):
    # Files are stored under <output_dir>/<subdir>/<name>.
    local_path = os.path.join(output_dir, file_info["subdir"], file_info["name"])

    if os.path.exists(local_path):
        # Already present from an earlier run: count as success, no request
        # made and no delay applied.
        print(f"[{i}/{len(files)}] Skipping existing file: {file_info['name']}")
        successful_downloads += 1
        continue

    print(f"[{i}/{len(files)}] Downloading {file_info['name']} ({file_info['size']} bytes)")

    if download_file(file_info["download_url"], local_path):
        successful_downloads += 1
        total_size += file_info["size"]

    # Pause between requests to stay polite to the remote server.
    time.sleep(delay_seconds)

print("\nDownload complete!")
print(f"Successfully downloaded: {successful_downloads}/{len(files)} files")
print(f"Total data size: {total_size / (1024*1024):.2f} MB")
print(f"Files saved to: {output_dir}")


# Download

Databricks job for downloading Petrobras 3W dataset files. The job reads its configuration from the YAML file named by the `config_path` job parameter (default: `config.yaml`) and downloads the 3W dataset according to the parameters in that file. It is designed to run as a Databricks job.

## Download Files

In [None]:
# Add the uploaded source directory to Python path
import sys
import os
sys.path.append('/Workspace/Users/scott.mckean@databricks.com/.bundle/hydrate/dev/files')

# Install dependencies
%pip install pyyaml requests pandas

In [None]:
# Project modules; these resolve only if the bundle's files directory is on
# sys.path.
from src.download import DatasetDownloader
from src.utils import DotConfig

# Get config path from job parameters
# `dbutils` is provided by the Databricks runtime; "config.yaml" is the widget
# default used when the job does not supply a `config_path` parameter.
dbutils.widgets.text("config_path", "config.yaml")
config_path = dbutils.widgets.get("config_path")

print(f"Using config file: {config_path}")

# DotConfig is a project helper; presumably it loads the file at config_path
# and exposes its sections as attributes — confirm against src/utils.
config = DotConfig(config_path)

# Override output directory to use volume for jobs
config.download.output_dir = "/Volumes/shm/3w/bronze/"

# Echo the effective settings so they appear in the job log.
print(f"Base URL: {config.download.base_url}")
print(f"Output directory: {config.download.output_dir}")
print(f"Max files: {config.download.max_files}")
print(f"Max dirs: {config.download.max_dirs}")

'https://api.github.com/repos/petrobras/3W/contents'

In [3]:
# Build the downloader from `config` and run the download.
# NOTE(review): the captured output below reports "Output directory:
# ./3w_dataset" although output_dir is overridden to the volume path earlier;
# the output may be stale, or the override may not reach the downloader — confirm.
downloader = DatasetDownloader(config)
downloader.download_dataset()

Configuration loaded from config
Output directory: ./3w_dataset
Max files setting: 10 (type: <class 'int'>)
Max directories setting: 1 (type: <class 'int'>)
Will limit to 10 files
Will limit to 1 directories
Getting file list from 3W dataset...
Found 14 total directories
Applying max_dirs limit: 1
Limited to first 1 directories
Scanning directory: 0
Found 594 total parquet files
Applying max_files limit: 10
Limited to first 10 files
Will download 10 parquet files
[1/10] Skipping existing file: WELL-00001_20170201010207.parquet
[2/10] Skipping existing file: WELL-00001_20170201060114.parquet
[3/10] Skipping existing file: WELL-00001_20170201110124.parquet
[4/10] Skipping existing file: WELL-00001_20170201160311.parquet
[5/10] Skipping existing file: WELL-00001_20170201210228.parquet
[6/10] Skipping existing file: WELL-00001_20170202020343.parquet
[7/10] Skipping existing file: WELL-00001_20170202070239.parquet
[8/10] Skipping existing file: WELL-00001_20170218000146.parquet
[9/10] Skipp