In [1]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import multiprocessing
from functools import partial

File_Path_str = os.path.expanduser('~/DJAv4')
DJA_Root_Url_str = "https://s3.amazonaws.com/msaexp-nirspec/extractions"
os.makedirs(File_Path_str, exist_ok=True)
DJA_v4_Catalog_Path_str = './DJAv4Catalog.csv'
DJA_v4_Catalog_DataFrame = pd.read_csv(DJA_v4_Catalog_Path_str)

def Download_FITS(Index_In_DJA_v4_DataFrame_int, DJA_v4_DataFrame, DJA_Root_Url_str, File_Path_str):
    """
    Download FITS files from the DJA v4 catalog provided.
    Parameters
    ----------
    Index_In_DJA_v4_DataFrame_int : int
        Index of the row in the DataFrame to download.
    DJA_v4_DataFrame : pd.DataFrame
        DataFrame containing the DJA v4 catalog.
    DJA_Root_Url_str : str
        Root URL for the DJA v4 catalog.
    File_Path_str : str
        Path to save the downloaded FITS files.
    Returns
    -------
    int
        0 for success, -1 for failure
    """
    # Get the row from the DataFrame
    try:
        Object_Catalog = DJA_v4_DataFrame.iloc[Index_In_DJA_v4_DataFrame_int]
        Object_Root_str = Object_Catalog.root
        Object_FileName_str = Object_Catalog.file

        # Create the full file path
        Fits_File_Root_Path_str = os.path.join(File_Path_str, Object_Root_str)
        Fits_File_Full_Path_str = os.path.join(Fits_File_Root_Path_str, Object_FileName_str)
        Fits_File_Url_str = f"{DJA_Root_Url_str}/{Object_Root_str}/{Object_FileName_str}"

        # Create directory if it doesn't exist
        os.makedirs(Fits_File_Root_Path_str, exist_ok=True)

        # Check if file already exists
        if os.path.exists(Fits_File_Full_Path_str):
            return 0

        # Download the FITS file
        result = os.system(f'wget -q -P {Fits_File_Root_Path_str} {Fits_File_Url_str}')

        if result == 0:
            return 0
        else:
            print(f"wget failed with return code: {result}")
            return -1

    except Exception as e:
        print(f"Error downloading file at index {Index_In_DJA_v4_DataFrame_int}: {e}")
        return -1

def main():
    File_Path_str = os.path.expanduser('~/DJAv4')
    DJA_Root_Url_str = "https://s3.amazonaws.com/msaexp-nirspec/extractions"
    DJA_v4_Catalog_Path_str = './DJAv4Catalog.csv'
    DJA_v4_Catalog_DataFrame = pd.read_csv(DJA_v4_Catalog_Path_str)
    DJA_v4_Catalog_DataFrame = DJA_v4_Catalog_DataFrame.sort_values(by='root')

    print(f"\n{'='*60}")
    print(f"Downloading data")
    print(f"{'='*60}\n")

    total_files_int = len(DJA_v4_Catalog_DataFrame)
    print(f"Total files to download: {total_files_int}")

    # Use multiprocessing to download files in parallel
    num_processes_int = int(multiprocessing.cpu_count() * 0.8)
    print(f"Using {num_processes_int} processes for parallel downloading.")

    pool = multiprocessing.Pool(processes=num_processes_int)

    # Create partial function with the fixed parameters
    download_func = partial(Download_FITS,
                           DJA_v4_DataFrame=DJA_v4_Catalog_DataFrame,
                           DJA_Root_Url_str=DJA_Root_Url_str,
                           File_Path_str=File_Path_str)

    results = list(tqdm(pool.imap(download_func, range(total_files_int)),
                       total=total_files_int,
                       desc="Downloading FITS files"))

    pool.close()
    pool.join()

    successful_downloads = sum(1 for result in results if result == 0)
    failed_downloads = sum(1 for result in results if result == -1)

    print(f"\n{'='*60}")
    print(f"Download completed.")
    print(f"Total files downloaded successfully: {successful_downloads}")
    print(f"Total files failed to download: {failed_downloads}")
    print(f"Total files in catalog: {total_files_int}")
    print(f"Download rate: {successful_downloads/total_files_int*100:.2f}%")
    print(f"Failed rate: {failed_downloads/total_files_int*100:.2f}%")
    print(f"{'='*60}\n")

if __name__ == "__main__":
    main()


Downloading data

Total files to download: 67099
Using 57 processes for parallel downloading.


Downloading FITS files:  10%|▉         | 6574/67099 [07:01<1:04:21, 15.67it/s]

wget failed with return code: 2048


Downloading FITS files:  10%|▉         | 6644/67099 [07:18<21:34:14,  1.28s/it]

wget failed with return code: 2048


Downloading FITS files:  11%|█         | 7244/67099 [07:51<15:27, 64.53it/s]   

wget failed with return code: 2048
wget failed with return code: 2048


Downloading FITS files:  70%|███████   | 47208/67099 [58:51<18:59, 17.46it/s]   

wget failed with return code: 2048


Downloading FITS files:  70%|███████   | 47235/67099 [58:59<30:42, 10.78it/s]

wget failed with return code: 2048
wget failed with return code: 2048
wget failed with return code: 2048


Downloading FITS files:  70%|███████   | 47274/67099 [59:01<25:51, 12.78it/s]

wget failed with return code: 2048


Downloading FITS files:  71%|███████▏  | 47823/67099 [59:19<13:03, 24.59it/s]

wget failed with return code: 2048


Downloading FITS files:  71%|███████▏  | 47889/67099 [59:24<35:40,  8.98it/s]

wget failed with return code: 2048


Downloading FITS files:  71%|███████▏  | 47891/67099 [59:24<41:07,  7.78it/s]

wget failed with return code: 2048


Downloading FITS files:  71%|███████▏  | 47893/67099 [59:27<2:24:14,  2.22it/s]

wget failed with return code: 2048


Downloading FITS files:  71%|███████▏  | 47899/67099 [59:33<3:54:03,  1.37it/s]

wget failed with return code: 2048
wget failed with return code: 2048


Downloading FITS files:  71%|███████▏  | 47904/67099 [59:51<12:33:16,  2.35s/it]

wget failed with return code: 2048


Downloading FITS files:  72%|███████▏  | 48000/67099 [1:00:01<1:01:22,  5.19it/s]

wget failed with return code: 2048
wget failed with return code: 2048
wget failed with return code: 2048


Downloading FITS files: 100%|██████████| 67099/67099 [1:26:06<00:00, 12.99it/s]  


Download completed.
Total files downloaded successfully: 67080
Total files failed to download: 19
Total files in catalog: 67099
Download rate: 99.97%
Failed rate: 0.03%






In [3]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import multiprocessing
from functools import partial

File_Path_str = os.path.expanduser('~/DJAv4')
DJA_Root_Url_str = "https://s3.amazonaws.com/msaexp-nirspec/extractions"
os.makedirs(File_Path_str, exist_ok=True)
DJA_v4_Catalog_Path_str = './DJAv4Catalog.csv'
DJA_v4_Catalog_DataFrame = pd.read_csv(DJA_v4_Catalog_Path_str)
DJA_v4_Catalog_Path_str = './DJAv4Catalog.csv'
DJA_v4_Catalog_DataFrame = pd.read_csv(DJA_v4_Catalog_Path_str)
DJA_v4_Catalog_DataFrame = DJA_v4_Catalog_DataFrame.sort_values(by='root')