In [1]:
import multiprocessing
import os
import posixpath
import shutil
import subprocess
from functools import partial

import numpy as np
import pandas as pd
from tqdm import tqdm

def _Fetch_If_Missing(File_URL, File_Path, FITS_File_Name):
    """
    Download File_URL into the directory File_Path unless the file is already there.

    Returns True when File_Path/FITS_File_Name already exists or wget exits with
    status 0, and False when wget exits with a non-zero status. Raises OSError
    (e.g. FileNotFoundError) when wget itself cannot be started.
    """
    if os.path.exists(os.path.join(File_Path, FITS_File_Name)):
        return True
    # An argument list (no shell) keeps spaces and shell metacharacters in the
    # path or URL from being re-interpreted, unlike a concatenated command string.
    Completed = subprocess.run(['wget', '-q', '-P', File_Path, File_URL])
    return Completed.returncode == 0


def _Grating_Configurations(Grating):
    """
    Map a lower-cased grating name to the configuration strings to download.

    Returns a list of (label, configuration) pairs; the label only prefixes the
    diagnostic output printed when a download raises.
    """
    Disperser_Name = Grating[1:-1]  # name without its first and last character
    if Disperser_Name == '140':
        # Both filter variants are attempted for this disperser.
        return [('Grating_070', Grating + '-f070lp'), ('Grating_100', Grating + '-f100lp')]
    if Disperser_Name == '235':
        return [('Grating', Grating + '-f170lp')]
    if Disperser_Name == '395':
        return [('Grating', Grating + '-f290lp')]
    # NOTE(review): any other name is used as-is, without a filter suffix — confirm intended.
    return [('Grating', Grating)]


def Download_Full_Spectra_FITS(File_Index_In_DJA_Catalog, File_CSVDataFrame, File_Path, DJA_Root_Path):
    """
    Download the FITS file of one catalog entry, plus its grating spectra, into File_Path.

    Parameters
    ----------
    File_Index_In_DJA_Catalog
        Index used to look the entry up in the `nGr`, `root`, `file` and
        `Grating` columns of the catalog.
    File_CSVDataFrame
        Catalog table exposing those four columns as attributes.
    File_Path
        Directory the files are written to; files already present are skipped.
    DJA_Root_Path
        Base URL; each file is fetched from <DJA_Root_Path>/<root>/<file name>.

    Returns
    -------
    int
        0 when the entry's own file is present afterwards (already there or
        wget exited with status 0); -1 when that download failed or any
        exception was raised. Grating files are derived by replacing
        'prism-clear' in the file name and are fetched best-effort: a non-zero
        wget exit status for them does not change the result.
        NOTE(review): presumably not every derived grating/filter file exists
        on the server, which is why their wget status is not treated as an
        error — confirm.
    """
    try:
        Spectra_NGrating = File_CSVDataFrame.nGr[File_Index_In_DJA_Catalog]

        FITS_File_Root_Name = File_CSVDataFrame.root[File_Index_In_DJA_Catalog]
        FITS_File_Name = File_CSVDataFrame.file[File_Index_In_DJA_Catalog]

        # URLs always use '/', so join with posixpath rather than the OS-specific os.path.
        File_URL = posixpath.join(DJA_Root_Path, FITS_File_Root_Name, FITS_File_Name)

        Primary_File_OK = _Fetch_If_Missing(File_URL, File_Path, FITS_File_Name)
        if not Primary_File_OK:
            print(f'Process {multiprocessing.current_process().name}: wget failed for: {File_URL}')

        if Spectra_NGrating:
            # split() without an argument drops the empty tokens that repeated
            # spaces would otherwise produce (each one triggered a bogus download).
            for Grating in File_CSVDataFrame.Grating[File_Index_In_DJA_Catalog].split():
                Grating = Grating.lower()
                Configurations = _Grating_Configurations(Grating)

                for Label, Configuration in Configurations:
                    # Computed before the try so the handler below can always print them.
                    Grating_FITS_File_Name = FITS_File_Name.replace('prism-clear', Configuration)
                    Grating_FITS_File_URL = posixpath.join(DJA_Root_Path, FITS_File_Root_Name, Grating_FITS_File_Name)

                    try:
                        _Fetch_If_Missing(Grating_FITS_File_URL, File_Path, Grating_FITS_File_Name)
                    except Exception as e:
                        print(f'Process {multiprocessing.current_process().name}: Error downloading grating files for: {FITS_File_Name}')
                        print('Grating:', Grating if len(Configurations) > 1 else Configuration)
                        print(f'{Label}_FITS_File_Name:', Grating_FITS_File_Name)
                        print(f'{Label}_FITS_File_URL:', Grating_FITS_File_URL)
                        print('Error:', e)
                        return -1

        return 0 if Primary_File_OK else -1
    except Exception as e:
        print(f'Process {multiprocessing.current_process().name}: Unexpected error for index {File_Index_In_DJA_Catalog}')
        print('Error:', e)
        return -1

def main():
    """
    Empty ./DJAData, then download every entry of ./DJACatalog.csv in parallel.

    Any existing content of the data directory is deleted first. Each catalog
    row is handed to Download_Full_Spectra_FITS in a process pool, and the
    number of 0 (success) and -1 (failure) return values is printed at the end.
    """
    File_Path = './DJAData'
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(File_Path, exist_ok=True)

    Existing_Entries = os.listdir(File_Path)
    if Existing_Entries:
        print('Data already exists, deleting...')
        for Entry_Name in Existing_Entries:
            Entry_Path = os.path.join(File_Path, Entry_Name)
            # Delete through the standard library instead of a shell-built `rm -rf`,
            # so unusual file names cannot be re-interpreted by a shell.
            try:
                if os.path.isdir(Entry_Path) and not os.path.islink(Entry_Path):
                    shutil.rmtree(Entry_Path)
                else:
                    os.remove(Entry_Path)
            except OSError as e:
                # Keep going, like `rm -rf` did, but say what could not be removed.
                print(f'Could not delete {Entry_Path}: {e}')
        print('Data deleted.')
    else:
        print('Data folder is empty, proceeding to download data...')

    print('Downloading data...')
    # Download the data
    File_CSVDataFrame = pd.read_csv('./DJACatalog.csv')
    DJA_Root_Path = 'https://s3.amazonaws.com/msaexp-nirspec/extractions/'

    total_files = len(File_CSVDataFrame)
    print(f'Found {total_files} files to download')

    # Use 90% of the available CPU cores (at least one) so the machine is not
    # fully saturated by the download workers.
    CPU_Fraction = 0.9
    num_processes = max(1, int(multiprocessing.cpu_count() * CPU_Fraction))
    print(f'Using {num_processes} processes for parallel downloading')

    # Bind the fixed arguments so that only the catalog index varies per task.
    # NOTE(review): the bound DataFrame travels with the tasks sent to the
    # workers; if that becomes a bottleneck, consider a pool initializer.
    download_partial = partial(
        Download_Full_Spectra_FITS,
        File_CSVDataFrame=File_CSVDataFrame,
        File_Path=File_Path,
        DJA_Root_Path=DJA_Root_Path
    )

    # The context manager terminates the workers if anything below raises,
    # so an interrupted run does not leave the pool behind.
    with multiprocessing.Pool(processes=num_processes) as pool:
        # tqdm shows progress while the results are collected in catalog order.
        results = list(tqdm(
            pool.imap(download_partial, range(total_files)),
            total=total_files,
            desc="Downloading FITS files"
        ))

        # Normal shutdown: stop accepting work and wait for the workers to exit.
        pool.close()
        pool.join()

    # Summarize the per-entry return codes.
    successful_downloads = results.count(0)
    failed_downloads = results.count(-1)

    print(f"Download completed. Successfully downloaded: {successful_downloads}, Failed: {failed_downloads}")

# Entry-point guard: start the download only when this code runs as the main
# module, not when it is merely imported (as multiprocessing workers may do
# when they are started with the "spawn" method).
if __name__ == "__main__":
    main()

Data folder is empty, proceeding to download data...
Downloading data...
Found 8974 files to download
Using 64 processes for parallel downloading


Downloading FITS files: 100%|██████████| 8974/8974 [14:38<00:00, 10.21it/s]  

Download completed. Successfully downloaded: 8974, Failed: 0





In [2]:
import os
# Count the entries that ended up in the download directory.
len(os.listdir('./DJAData'))

18507

In [9]:
# `File_CSVDataFrame` is only bound inside main(), so re-read the catalog here
# instead of relying on a variable left over in the kernel; this keeps the cell
# working after Restart & Run All.
len(pd.read_csv('./DJACatalog.csv').jname)

8974