downloads dbc file and converts to parquet

https://stackoverflow.com/questions/57205531/python-how-to-download-multiple-files-in-parallel-using-multiprocessing-pool

In [5]:
from tqdm import tqdm
import os
from pysus.online_data import SIA
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

In [6]:

# Function to download SIA data for a specific year, month, and UF
def download_sia_data(year, month, uf):
    try:
        print(f"Downloading data for UF: {uf}, Year: {year}, Month: {month}")
        SIA.download([uf], [year], [month], groups= data_group, data_dir= dbfs_raw_path)
    except Exception as e:
        print(f"Failed to download data for UF: {uf}, Year: {year}, Month: {month}: {str(e)}")
        
# Parallel download function with progress tracking
def download_sia_data_parallel(ufs, years, months):

        # Record the start time of the job
    start_time = time.time()


    # Ensure the destination folder exists
    if not os.path.exists(dbfs_raw_path):
        os.makedirs(dbfs_raw_path)
    
    # Calculate total tasks
    total_tasks = len(ufs) * len(years) * len(months)

    # Initialize error counter
    error_count = 0

    # Create a ThreadPoolExecutor
    with ThreadPoolExecutor(max_workers=100) as executor, tqdm(total=total_tasks) as progress_bar:
        # Using a list to store download tasks
        futures = [
            executor.submit(download_sia_data, year, month, uf)
            for uf in ufs for year in years for month in months
        ]

        # Process the tasks as they are completed
        for future in as_completed(futures):
            result = future.result()

            # Check if the task was successful
            if result is None:  # Assume None represents failure
                error_count += 1

            # Update the progress bar for each completed task
            progress_bar.update(1)

    # Print summary of download errors
    print(f"All downloads completed, errors: {error_count}")

    # Record the end time of the job
    end_time = time.time()
    # Calculate the total execution time
    total_time = end_time - start_time
    print(f"Total execution time: {total_time:.2f} seconds")

# Example usage
ufs = ['ac', 'al', 'ap', 'am','ba', 'ce', 'df', 'es', 'go', 'ma', 'mt', 'ms', 'mg', 'pa', 'pb', 'pr', 'pe', 'pi', 'rj', 'rn', 'rs', 'ro', 'rr', 'sc', 'sp', 'se', 'to']
years = [2019]
months = [1] #list(range(1, 13))
data_group = ['PA']
# Path to store the raw data in DBFS
dbfs_raw_path = "./tmp3"

# Call the parallel download function
download_sia_data_parallel(ufs, years, months)

  0%|          | 0/27 [00:00<?, ?it/s]

Downloading data for UF: ac, Year: 2019, Month: 1
Downloading data for UF: al, Year: 2019, Month: 1
Downloading data for UF: ap, Year: 2019, Month: 1
Downloading data for UF: am, Year: 2019, Month: 1
Downloading data for UF: ba, Year: 2019, Month: 1
Downloading data for UF: ce, Year: 2019, Month: 1
Downloading data for UF: df, Year: 2019, Month: 1
Downloading data for UF: es, Year: 2019, Month: 1
Downloading data for UF: go, Year: 2019, Month: 1
Downloading data for UF: ma, Year: 2019, Month: 1
Downloading data for UF: mt, Year: 2019, Month: 1
Downloading data for UF: ms, Year: 2019, Month: 1
Downloading data for UF: mg, Year: 2019, Month: 1
Downloading data for UF: pa, Year: 2019, Month: 1
Downloading data for UF: pb, Year: 2019, Month: 1
Downloading data for UF: pr, Year: 2019, Month: 1
Downloading data for UF: pe, Year: 2019, Month: 1
Downloading data for UF: pi, Year: 2019, Month: 1
Downloading data for UF: rj, Year: 2019, Month: 1
Downloading data for UF: rn, Year: 2019, Month: 1




[A[A
[A

[A[A
[A


[A[A[A



[A[A[A[A




[A[A[A[A[A

[A[A





[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A






[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A











[A[A[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A



[A[A[A[A



[A[A[A[A














[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A
















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A




[A[A[A[A[A
[A


[A[A[A


[A[A[A

















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A





[A[A[A[A[A[A







[A[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[



PAPB1901.dbc:   5%|▌         | 861k/15.8M [00:32<31:33, 7.88kB/s][A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A














[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A

[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A











[A[A[A[A[A[A[A[A[A[A[A[A
















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A



[A[A[A[A



[A[A[A[A









[A[A[A[A[A[A[A[A[A[A















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A














[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A

[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A













blast printf code: 2














[A[A[A[A[A[A[A[A[A[A[A[A[A











[A[A[A[A[A[A[A[A[A[A[A[A











[A[A[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A






[A[A[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A














[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A
















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A



[A[A[A[A











[A[A[A[A[A[A[A[A[A[A[A[A






[A[A[A[A[A[A[A

[A[A















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A












PARR1901.parquet: 100%|██████████| 38.2k/38.2k [00:20<00:00, 1.87kB/s]











[A[A[A[A[A[A[A[A[A[A[A
















[A[A[A[A[A[A[A[A[A[A[A[A[A














[A[A[A[A[A[A[A[A[A[A[A















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A






[A[A[A[A[A[A[A






[A[A[A[A[A[A[A
















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A










PAPE1901.parquet: 100%|██████████| 6.66k/6.66k [00:04<00:00, 1.49kB/s]

















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A







 44%|████▍     | 12/27 [02:58<03:50, 15.36s/it]
















[A[A[A[A[

blast printf code: 2

PAMG1901.dbc:  93%|█████████▎| 95.1M/102M [05:16<00:18, 386kB/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A
















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[ASuccess













[A[A[A[A[A[A[A[A[A[A[A[A[A




PARS1901.dbf: 100%|██████████| 1.00/1.00 [00:20<00:00, 20.4s/B][A












[A[A[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A
















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A
















[A[A[A[A[A[

Failed to download data for UF: df, Year: 2019, Month: 1: local variable 'output' referenced before assignment













[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A

Failed to download data for UF: ba, Year: 2019, Month: 1: local variable 'output' referenced before assignment















[A[A[A[A[A[A[A[A[A[A[A[A[A










PAAL1901.dbc:   0%|          | 0.00/11.5M [17:45<?, ?B/s]
 93%|█████████▎| 25/27 [18:00<03:26, 103.06s/it]

Failed to download data for UF: al, Year: 2019, Month: 1: local variable 'output' referenced before assignment















[A[A[A[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[

All downloads completed, errors: 27
Total execution time: 1433.02 seconds



