## GZ2イメージデータサンプルのダウンロード(random select)

In [1]:
import os, shutil
import sys, time
import numpy as np
import pandas as pd
from astropy.io import fits
import urllib.request, urllib.parse
from bs4 import BeautifulSoup
from datetime import datetime

# Working directory that holds the catalog and receives the downloads
# (must be customized per machine).
galaxy_data_directory = '/home/satoshi/Galaxy/'
# Cutout size in pixels. Kept as a string because it is concatenated into the
# dataset directory name and passed as the width/height request parameters.
data_size = '256'
# scale_factor: multiplied by int(PETROR90_R) to obtain the cutout `scale`.
scale_factor = 0.01
# Maximum redshift.
# NOTE(review): not referenced by the cells visible here - confirm it is used elsewhere.
max_redshift = 5
# Number of samples to download per class.
sample_size = 10
# Catalog with the new (12-class) categories.
gz2_catalog_v2 = galaxy_data_directory + 'gz2_catalog_hubble_12class.fits'
# Directory that receives the sample data. str(scale_factor)[2:] drops the
# leading "0." (0.01 -> "01") so the name contains no dot.
sample_directory = (
    galaxy_data_directory
    + 'dataset_12class_' + str(sample_size)
    + '_' + data_size
    + '_' + str(scale_factor)[2:]
)
# exist_ok=True lets this cell be re-run without raising FileExistsError.
os.makedirs(sample_directory, exist_ok=True)


In [2]:
def _fetch_jpeg(url, attempts=3, timeout=5):
    """Download *url* and return the response body as bytes, or None on failure.

    The request is tried up to *attempts* times with a *timeout* (seconds) on
    each try. Errors are printed rather than raised so that a single failing
    cutout does not abort the whole download run.
    """
    for attempt in range(1, attempts + 1):
        try:
            with urllib.request.urlopen(url, timeout=timeout) as res:
                return res.read()
        except Exception as exc:  # best-effort download: report, then retry
            print('download failed  attempt ', attempt, '/', attempts, '  ', exc)
    return None


with fits.open(gz2_catalog_v2) as hdul:
    data = hdul[1].data
    print('category_catalog_hubble rows = ', len(data))
    galaxy_class = np.array(data.field('HUBBLE_CLASS'))
    galaxy_class_series = pd.Series(galaxy_class)
    unique_galaxy_class = galaxy_class_series.unique()

    print(unique_galaxy_class)
    print(len(unique_galaxy_class), ' class')

    # Not used below; kept so the name stays defined for any later cell.
    galaxy_redshift = np.array(data.field('REDSHIFT_SIMPLE_BIN'))

    # Download a random sample of images for every class.
    for class_name in unique_galaxy_class:

        # Row indices of all catalog entries belonging to this class.
        galaxy_list_series = pd.Series(np.flatnonzero(galaxy_class == class_name))

        # Check the population size *before* sampling: Series.sample() raises
        # ValueError when asked for more rows than exist.
        if len(galaxy_list_series) < sample_size:
            print(class_name, '  total  ', len(galaxy_list_series))
            print(class_name, '  data shortage')
            continue

        galaxy_list_series_selected = galaxy_list_series.sample(n=sample_size).values
        print(class_name, '  total  ', len(galaxy_list_series), '  ', len(galaxy_list_series_selected), '  selected')

        # Per-class output directory; exist_ok allows re-running the cell.
        class_dataset_directory = sample_directory + '/' + class_name
        os.makedirs(class_dataset_directory, exist_ok=True)
        print('download start: ', datetime.now())

        for row_index in galaxy_list_series_selected:
            row = data[row_index]

            ra = str(row.field('ra'))
            dec = str(row.field('dec'))
            dr7objid = row.field('dr7objid')
            petroR90_r = row.field('PETROR90_R')
            redshift_bin = row.field('REDSHIFT_SIMPLE_BIN')
            mr_bin = row.field('PETROMAG_MR_SIMPLE_BIN')
            petroR50_bin = row.field('PETROR50_R_KPC_SIMPLE_BIN')

            # Cutout scale is proportional to the (truncated) PETROR90_R value.
            scale = str(scale_factor * int(petroR90_r))
            galaxy_params = {
                'ra': ra, 'dec': dec, 'scale': scale, 'width': data_size, 'height': data_size,
            }
            p = urllib.parse.urlencode(galaxy_params)

            # SDSS DR7 ImageCutout service
            dr7_imagecutout_url = 'http://skyservice.pha.jhu.edu/DR7/ImgCutout/getjpeg.aspx?' + p

            # JPEG file name: class, redshift bin, int(PETROR90_R), magnitude
            # bin, PETROR50 bin and the DR7 object id.
            sample_filename = (
                class_dataset_directory + '/' + class_name
                + '_' + str(redshift_bin)
                + '_' + str(int(petroR90_r))
                + '_' + str(mr_bin)
                + '_' + str(petroR50_bin)
                + '_' + str(dr7objid) + '.jpeg'
            )

            jpeg_data = _fetch_jpeg(dr7_imagecutout_url)
            if jpeg_data is None:
                # Never write stale bytes from a previous galaxy under this name.
                print('download skipped: ', sample_filename)
            else:
                with open(sample_filename, 'wb') as f:
                    f.write(jpeg_data)

            # Pause between requests to the cutout service.
            time.sleep(1.0)

print('completed: ', datetime.now())

category_catalog_hubble rows =  239100
['Sa' 'E3' 'E0' 'Sb' 'SBb' 'S0' 'E7' 'SB0' 'Edgeon' 'SBa' 'SBc' 'Sc']
12  class
Sa   total   35062    10   selected
download start:  2018-08-16 15:28:42.256406
E3   total   47166    10   selected
download start:  2018-08-16 15:28:59.955399
E0   total   40015    10   selected
download start:  2018-08-16 15:29:16.720340
Sb   total   18527    10   selected
download start:  2018-08-16 15:29:32.727700
SBb   total   17086    10   selected
download start:  2018-08-16 15:29:49.382351
S0   total   18209    10   selected
download start:  2018-08-16 15:30:05.294511
E7   total   10489    10   selected
download start:  2018-08-16 15:30:21.544940
SB0   total   5023    10   selected
download start:  2018-08-16 15:30:37.826985
Edgeon   total   19606    10   selected
download start:  2018-08-16 15:30:53.789239
SBa   total   18415    10   selected
download start:  2018-08-16 15:31:09.741944
SBc   total   5057    10   selected
download start:  2018-08-16 15:31:25.74