## GZ2イメージデータサンプルのダウンロード(Mr指定)

In [1]:
import os, shutil
import sys, time
import numpy as np
import pandas as pd
from astropy.io import fits
import urllib.request, urllib.parse
from bs4 import BeautifulSoup
from datetime import datetime

# Working directory that holds the catalog and receives the downloaded
# samples (customize for your environment).
galaxy_data_directory = '/home/satoshi/Galaxy/'
# Cutout width/height in pixels. Kept as a string because it is used verbatim
# in the request parameters and in the dataset directory name.
data_size = '256'
# Multiplier applied to each galaxy's integer-truncated PETROR90_R to obtain
# the 'scale' request parameter.
scale_factor = 0.01
# Upper bound (inclusive) on PETROMAG_MR_SIMPLE_BIN for a galaxy to be eligible.
max_mr = 60
# Number of images to download per class.
sample_size = 10
# Catalog carrying the 12-class Hubble classification.
gz2_catalog_v2 = os.path.join(galaxy_data_directory,
                              'gz2_catalog_hubble_12class.fits')
# Directory that receives the sample images. Its name encodes the settings
# above. str(scale_factor)[2:] drops the leading "0." (0.01 -> "01"), so it
# only yields a sensible tag for values whose str() has the form "0.xx".
sample_directory = os.path.join(
    galaxy_data_directory,
    '_'.join(['dataset_12class', str(sample_size), data_size,
              str(scale_factor)[2:], 'Mr' + str(max_mr)]))
# makedirs(exist_ok=True) so that re-running this cell (or a missing parent
# directory) does not abort with FileExistsError / FileNotFoundError.
os.makedirs(sample_directory, exist_ok=True)


In [2]:
def _fetch_cutout(url, attempts=3, timeout=5):
    """Return the body downloaded from *url*, or None if every attempt fails.

    The request is tried up to *attempts* times, each with a *timeout* in
    seconds. The last error is printed when no attempt succeeds, so that a
    failed download is visible instead of being silently ignored.
    """
    last_error = None
    for _ in range(attempts):
        try:
            with urllib.request.urlopen(url, timeout=timeout) as res:
                return res.read()
        except Exception as e:
            # Best-effort download: remember the error and retry.
            last_error = e
    print('download failed: ', url, '  ', last_error)
    return None


# For every Hubble class in the catalog, pick `sample_size` random galaxies
# with PETROMAG_MR_SIMPLE_BIN <= max_mr and save their JPEG cutouts under
# sample_directory/<class name>/.
with fits.open(gz2_catalog_v2) as hdul:
    data = hdul[1].data
    print('category_catalog_hubble rows = ', len(data))
    galaxy_class = np.array(data.field('HUBBLE_CLASS'))
    unique_galaxy_class = pd.Series(galaxy_class).unique()

    print(unique_galaxy_class)
    print(len(unique_galaxy_class), ' class')

    galaxy_mr = np.array(data.field('PETROMAG_MR_SIMPLE_BIN'))

    # Download the data of each class
    print('max_mr = ', max_mr)
    for class_name in unique_galaxy_class:

        # Row indices of this class within the Mr limit (vectorized mask
        # instead of a Python loop over every catalog row).
        candidates = np.flatnonzero(
            (galaxy_class == class_name) & (galaxy_mr <= max_mr))

        # Check the population BEFORE sampling: Series.sample(n=...) raises
        # ValueError when n exceeds the population, so a check placed after
        # the sampling could never be reached.
        if len(candidates) < sample_size:
            print(class_name, '  total  ', len(candidates))
            print(class_name, '  data shortage')
            continue

        selected_rows = pd.Series(candidates).sample(n=sample_size).values
        print(class_name, '  total  ', len(candidates), '  ',
              len(selected_rows), '  selected')

        # Per-class directory; exist_ok so that a re-run does not abort.
        class_dataset_directory = os.path.join(sample_directory, class_name)
        os.makedirs(class_dataset_directory, exist_ok=True)
        print('download start: ', datetime.now())

        for row_index in selected_rows:
            row = data[row_index]

            ra = str(row.field('ra'))
            dec = str(row.field('dec'))
            dr7objid = row.field('dr7objid')
            petroR90_r = row.field('PETROR90_R')
            redshift_bin = row.field('REDSHIFT_SIMPLE_BIN')
            mr_bin = row.field('PETROMAG_MR_SIMPLE_BIN')
            petroR50_bin = row.field('PETROR50_R_KPC_SIMPLE_BIN')

            # The radius is truncated to an integer both for the scale and
            # for the file name, as in the original notebook.
            scale = str(scale_factor * int(petroR90_r))
            galaxy_params = {
                'ra': ra, 'dec': dec, 'scale': scale,
                'width': data_size, 'height': data_size,
            }

            # SDSS DR7 ImageCutout service
            dr7_imagecutout_url = (
                'http://skyservice.pha.jhu.edu/DR7/ImgCutout/getjpeg.aspx?'
                + urllib.parse.urlencode(galaxy_params))

            jpeg_data = _fetch_cutout(dr7_imagecutout_url)

            if jpeg_data:
                # JPEG image file name
                sample_filename = os.path.join(
                    class_dataset_directory,
                    '_'.join([class_name, str(redshift_bin),
                              str(int(petroR90_r)), str(mr_bin),
                              str(petroR50_bin), str(dr7objid)]) + '.jpeg')

                with open(sample_filename, 'wb') as f:
                    f.write(jpeg_data)
            else:
                # Never write stale bytes left over from a previous galaxy
                # under this galaxy's file name; skip it instead.
                print(class_name, '  skipped dr7objid  ', dr7objid)

            # Throttle requests to the image service.
            time.sleep(1.0)

print('completed: ', datetime.now())

category_catalog_hubble rows =  239100
['Sa' 'E3' 'E0' 'Sb' 'SBb' 'S0' 'E7' 'SB0' 'Edgeon' 'SBa' 'SBc' 'Sc']
12  class
max_mr =  60
Sa   total   25324    10   selected
download start:  2018-08-16 14:43:32.468236
E3   total   28154    10   selected
download start:  2018-08-16 14:43:49.889307
E0   total   30658    10   selected
download start:  2018-08-16 14:44:06.432850
Sb   total   11868    10   selected
download start:  2018-08-16 14:44:23.323613
SBb   total   13819    10   selected
download start:  2018-08-16 14:44:39.260688
S0   total   13154    10   selected
download start:  2018-08-16 14:44:55.540671
E7   total   3351    10   selected
download start:  2018-08-16 14:45:12.072362
SB0   total   3459    10   selected
download start:  2018-08-16 14:45:28.203591
Edgeon   total   10198    10   selected
download start:  2018-08-16 14:45:44.362303
SBa   total   16404    10   selected
download start:  2018-08-16 14:46:00.014858
SBc   total   3603    10   selected
download start:  2018-08-16