## ImageDataGenerator用データセットアップ(訓練用2000+検証用1000+テスト用1000=4000件)

In [1]:
import os, shutil
import sys, time
import numpy as np
import pandas as pd
from astropy.io import fits
import urllib.request, urllib.parse
from bs4 import BeautifulSoup
import pickle
from datetime import datetime

# 作業用ディレクトリ定義（要カスタマイズ）
galaxy_data_directory = '/home/satoshi/Galaxy/'

# 新カテゴリカタログ
gz2_catalog_v2 = galaxy_data_directory + 'gz2_catalog_12class.fits'

# サンプルデータセットベースディレクトリ
sample_directory_base = galaxy_data_directory + 'dataset_12class_selected_256_01_4000'

# ImageDataGenerator用訓練データディレクトリ
train_generator_directory = galaxy_data_directory + 'train'
print(train_generator_directory)
os.mkdir(train_generator_directory)

# ImageDataGenerator用検証データディレクトリ
validation_generator_directory = galaxy_data_directory + 'validation'
print(validation_generator_directory)
os.mkdir(validation_generator_directory)

# ImageDataGenerator用テストデータディレクトリ
test_generator_directory = galaxy_data_directory + 'test'
print(test_generator_directory)
os.mkdir(test_generator_directory)


/home/satoshi/Galaxy/train
/home/satoshi/Galaxy/validation
/home/satoshi/Galaxy/test


#### それぞれのカテゴリについて、訓練データ2000件、検証データ1000件、テストデータ1000件のデータセットを構成する

In [2]:
# データセットに含まれるカテゴリ（クラス）ディレクトリの種類
category_directory = os.listdir(sample_directory_base)
print('category_directory: ', len(category_directory), '  ', category_directory)

# データセットに含まれるすべてのカテゴリについて
for category in category_directory:
    # カテゴリに含まれるgalaxyデータのファイル名を取得
    galaxy_fnames = np.array(os.listdir(sample_directory_base + '/' + category))
    print(category, '  ', len(galaxy_fnames))
    # カテゴリに含まれるgalaxyデータから、訓練、検証、テスト用に使用するデータ4000(2000+1000+1000)件をランダムに選択する
    galaxy_list= [i for i in range(len(galaxy_fnames))]
    selected_indexes = pd.Series(galaxy_list).sample(n = 4000).values
   
    # 訓練データセット(2000件/カテゴリ)の作成　全カテゴリでは2000 x 12= 24000件
    train_data_fnames = galaxy_fnames[selected_indexes[0:2000]]
    print('train_data_fnames  ', len(train_data_fnames))
    train_generator_category_directory = train_generator_directory + '/' + category
    os.mkdir(train_generator_category_directory)
    for train_data_fname in train_data_fnames:
        src = os.path.join(sample_directory_base + '/' + category, train_data_fname)
        dst = os.path.join(train_generator_category_directory, train_data_fname)
        shutil.copyfile(src, dst)
            
    # 検証データセット(1000件/カテゴリ)の作成
    validation_data_fnames = galaxy_fnames[selected_indexes[2000:3000]]
    print('validation_data_fnames  ', len(validation_data_fnames))
    validation_generator_category_directory = validation_generator_directory + '/' + category
    os.mkdir(validation_generator_category_directory)
    for validation_data_fname in validation_data_fnames:
        src = os.path.join(sample_directory_base + '/' + category, validation_data_fname)
        dst = os.path.join(validation_generator_category_directory, validation_data_fname)
        shutil.copyfile(src, dst)
            
    # テストデータセット(1000件/カテゴリ)の作成
    test_data_fnames = galaxy_fnames[selected_indexes[3000:4000]]
    print('test_data_fnames  ', len(test_data_fnames))
    test_generator_category_directory = test_generator_directory + '/' + category
    os.mkdir(test_generator_category_directory)
    for test_data_fname in test_data_fnames:
        src = os.path.join(sample_directory_base + '/' + category, test_data_fname)
        dst = os.path.join(test_generator_category_directory, test_data_fname)
        shutil.copyfile(src, dst)
    

category_directory:  12    ['Sc', 'Edgeon', 'E0', 'Sb', 'S0', 'SBa', 'SB0', 'Sa', 'E3', 'SBb', 'E7', 'SBc']
Sc    4000
train_data_fnames   2000
validation_data_fnames   1000
test_data_fnames   1000
Edgeon    4000
train_data_fnames   2000
validation_data_fnames   1000
test_data_fnames   1000
E0    4000
train_data_fnames   2000
validation_data_fnames   1000
test_data_fnames   1000
Sb    4000
train_data_fnames   2000
validation_data_fnames   1000
test_data_fnames   1000
S0    4000
train_data_fnames   2000
validation_data_fnames   1000
test_data_fnames   1000
SBa    4000
train_data_fnames   2000
validation_data_fnames   1000
test_data_fnames   1000
SB0    4000
train_data_fnames   2000
validation_data_fnames   1000
test_data_fnames   1000
Sa    4000
train_data_fnames   2000
validation_data_fnames   1000
test_data_fnames   1000
E3    4000
train_data_fnames   2000
validation_data_fnames   1000
test_data_fnames   1000
SBb    4000
train_data_fnames   2000
validation_data_fnames   1000
test_data

#### カテゴリ別訓練、検証、テストデータ件数

In [3]:
generator_directory = [train_generator_directory, validation_generator_directory, test_generator_directory]
for directory in generator_directory:
    print(directory)
    for category in category_directory:
        data_counts = len(os.listdir(directory + '/' + category))
        print(category, '  ', data_counts)


/home/satoshi/Galaxy/train
Sc    2000
Edgeon    2000
E0    2000
Sb    2000
S0    2000
SBa    2000
SB0    2000
Sa    2000
E3    2000
SBb    2000
E7    2000
SBc    2000
/home/satoshi/Galaxy/validation
Sc    1000
Edgeon    1000
E0    1000
Sb    1000
S0    1000
SBa    1000
SB0    1000
Sa    1000
E3    1000
SBb    1000
E7    1000
SBc    1000
/home/satoshi/Galaxy/test
Sc    1000
Edgeon    1000
E0    1000
Sb    1000
S0    1000
SBa    1000
SB0    1000
Sa    1000
E3    1000
SBb    1000
E7    1000
SBc    1000
