##  GZ2フルデータセットからML対象データセットを作成（赤方偏移&光度制限データセット）

In [1]:
import os, shutil
import sys, time
import numpy as np
import pandas as pd
from astropy.io import fits
import urllib.request, urllib.parse
from bs4 import BeautifulSoup
import pickle
from datetime import datetime

# Working directory (customize for your environment).
galaxy_data_directory = '/home/satoshi/Galaxy/'

# Every derived path below is built with os.path.join so that the result is
# correct whether or not galaxy_data_directory ends with a path separator.

# New-category catalog.
gz2_catalog_v2 = os.path.join(galaxy_data_directory, 'gz2_catalog_hubble_12class.fits')


# Dataset base directory.
dataset_directory_base = os.path.join(galaxy_data_directory, 'dataset_12class_full_256_01')

# Constraints: the settings below follow
# [Hart et al. (2016)] http://mnras.oxfordjournals.org/content/461/4/3663
# Redshift_bin limit  redshift: 0.03 ~ 0.085
min_redshift = 2
max_redshift = 7
# Mr_bin limit  Mr : -21(Mr_bin=60) ~  -24(Mr_bin=0)
max_mr = 60

# Directory for the redshift- and magnitude-limited sample.
# os.mkdir raises FileExistsError if the directory already exists, so an
# earlier sample is never silently mixed with a new one.
sample_directory_base = os.path.join(galaxy_data_directory, 'dataset_12class_limited_256_01')
os.mkdir(sample_directory_base)


In [2]:
with fits.open(gz2_catalog_v2) as hdul:
    # The catalog table is taken from HDU 1 of the FITS file.
    data = hdul[1].data

    def _column(column_name):
        """Return the named catalog column copied into a NumPy array."""
        return np.array(data.field(column_name))

    # Per-row columns used by the selection step.
    dr7objid = _column('dr7objid')
    redshift_bin = _column('REDSHIFT_SIMPLE_BIN')
    mr_bin = _column('PETROMAG_MR_SIMPLE_BIN')
    fname = _column('FILE_NAME')
    galaxy_class = _column('HUBBLE_CLASS')

    # Class label of every row, indexed by its dr7objid.
    galaxy_class_series = pd.Series(galaxy_class, index=dr7objid)

    # Sorted list of the distinct class labels present in the catalog.
    unique_galaxy_class = sorted(galaxy_class_series.unique())
    print(unique_galaxy_class)
    print(len(unique_galaxy_class), ' class  ', data.shape[0])
    

['E0', 'E3', 'E7', 'Edgeon', 'S0', 'SB0', 'SBa', 'SBb', 'SBc', 'Sa', 'Sb', 'Sc']
12  class   239100


#### Redshift, Mrについての制約条件を満たすデータをすべてサンプリングする

In [3]:
# Select every catalog row that satisfies the redshift-bin and Mr-bin limits
# and copy its image file into a per-class directory under
# sample_directory_base, then print the per-class counts and percentages.

# The redshift / Mr limits do not depend on the class, so this boolean mask
# is computed once instead of being re-evaluated row by row for each class.
within_limits = ((redshift_bin >= min_redshift)
                 & (redshift_bin <= max_redshift)
                 & (mr_bin <= max_mr))

galaxy_limited_list_count = 0
class_count_list = []
for class_name in unique_galaxy_class:

    # Row indices of the galaxies of this class that pass the limits.
    selected_rows = np.flatnonzero((galaxy_class == class_name) & within_limits)
    galaxy_limited_list = pd.Series(selected_rows)

    galaxy_limited_list_count += len(galaxy_limited_list)
    class_count_list.append([class_name, len(galaxy_limited_list)])

    src_class_directory = os.path.join(dataset_directory_base, class_name)
    dst_class_directory = os.path.join(sample_directory_base, class_name)
    # os.mkdir raises FileExistsError if the class directory already exists.
    os.mkdir(dst_class_directory)

    for file_name in fname[selected_rows]:
        src = os.path.join(src_class_directory, file_name)
        dst = os.path.join(dst_class_directory, file_name)
        shutil.copyfile(src, dst)

print(galaxy_limited_list_count, '  selected')
for class_count in class_count_list:
    # Guard against division by zero when no galaxy passed the limits.
    if galaxy_limited_list_count:
        class_ratio = int(class_count[1]) / galaxy_limited_list_count * 100
    else:
        class_ratio = 0
    print(class_count[0], '  ', class_count[1], '    ', int(class_ratio), '%')

print('completed')


77874   selected
E0    11678      14 %
E3    13986      17 %
E7    1873      2 %
Edgeon    6921      8 %
S0    4532      5 %
SB0    1434      1 %
SBa    7377      9 %
SBb    7165      9 %
SBc    1873      2 %
Sa    13633      17 %
Sb    6194      7 %
Sc    1208      1 %
completed
