## GZ2フルデータセットからML対象データセットを作成（制約条件サンプリング）

In [1]:
import os, shutil
import sys, time
import numpy as np
import pandas as pd
from astropy.io import fits
import urllib.request, urllib.parse
from bs4 import BeautifulSoup
import pickle
from datetime import datetime

# Working directory definition (must be customized for your environment).
galaxy_data_directory = '/home/satoshi/Galaxy/'

# New-category catalog.
# Paths are built with os.path.join so that they stay correct whether or not
# galaxy_data_directory is written with a trailing slash.
gz2_catalog_v2 = os.path.join(galaxy_data_directory, 'gz2_catalog_hubble_12class.fits')


# Base directory of the (full) dataset.
dataset_directory_base = os.path.join(galaxy_data_directory, 'dataset_12class_full_256_01')

# Constraints: the settings below follow [Hart et al. (2016)] http://mnras.oxfordjournals.org/content/461/4/3663
# Redshift_bin limits; redshift: 0.03 ~ 0.085
min_redshift = 2
max_redshift = 7
# Mr_bin limit; Mr: -21 (Mr_bin=60) ~ -24 (Mr_bin=0)
max_mr = 60

# Base directory that receives the constrained sample.
sample_directory_base = os.path.join(galaxy_data_directory, 'dataset_12class_limited_256_01')
# exist_ok=True: re-running this cell must not fail just because the
# directory was already created by a previous run.
os.makedirs(sample_directory_base, exist_ok=True)


In [2]:
# Read the catalog table and keep the columns used for sampling as plain
# NumPy arrays, then report which class labels occur in it.
with fits.open(gz2_catalog_v2) as hdul:
    data = hdul[1].data

    # One array per catalog column, unpacked in the order listed below.
    dr7objid, redshift_bin, mr_bin, fname, galaxy_class = (
        np.array(data.field(column_name))
        for column_name in (
            'dr7objid',
            'REDSHIFT_SIMPLE_BIN',
            'PETROMAG_MR_SIMPLE_BIN',
            'FILE_NAME',
            'HUBBLE_CLASS',
        )
    )

    # Class label of every galaxy, indexed by its dr7objid.
    galaxy_class_series = pd.Series(data=galaxy_class, index=dr7objid)
    # Distinct class labels present in the catalog.
    unique_galaxy_class = set(galaxy_class_series.values)

    print(unique_galaxy_class)
    print(len(unique_galaxy_class), ' class')
    

{'E3', 'E0', 'Edgeon', 'E7', 'SB0', 'SBb', 'SBa', 'Sa', 'Sb', 'S0', 'Sc', 'SBc'}
12  class


#### Redshift, Mrについての制約条件を満たすデータをすべてサンプリングする

In [3]:
# Copy every image that satisfies the redshift / Mr constraints from the full
# dataset into the per-class directories of the constrained sample.

# The redshift and Mr constraints do not depend on the class, so evaluate them
# once for the whole catalog instead of once per class.
within_limits = (
    (redshift_bin >= min_redshift)
    & (redshift_bin <= max_redshift)
    & (mr_bin <= max_mr)
)

for class_name in unique_galaxy_class:

    # Catalog row indices of this class that pass the constraints. A vectorized
    # mask replaces a Python-level scan of the whole catalog for each class.
    galaxy_limited_list = pd.Series(
        np.flatnonzero((galaxy_class == class_name) & within_limits)
    )

    print(class_name, '  ', len(galaxy_limited_list), '  selected')

    src_directory = os.path.join(dataset_directory_base, class_name)
    dst_directory = os.path.join(sample_directory_base, class_name)
    # exist_ok=True: allow this cell to be re-run after a partial or earlier run.
    os.makedirs(dst_directory, exist_ok=True)

    for galaxy_index in galaxy_limited_list:
        src = os.path.join(src_directory, fname[galaxy_index])
        dst = os.path.join(dst_directory, fname[galaxy_index])
        shutil.copyfile(src, dst)

print('completed')

E3    13986   selected
E0    11678   selected
Edgeon    6921   selected
E7    1873   selected
SB0    1434   selected
SBb    7165   selected
SBa    7377   selected
Sa    13633   selected
Sb    6194   selected
S0    4532   selected
Sc    1208   selected
SBc    1873   selected
completed
