In [1]:
import pandas as pd
import numpy as np
import os, sys, math, pickle
from PIL import Image
import cv2
import random as rn

import tensorflow as tf
from tensorflow.image import resize_images

In [2]:
# Reproducibility and GPU session setup: fix the seeds of every random
# number source used by the notebook and hand Keras a TensorFlow session
# whose GPU memory allocation grows on demand.
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# assigning it here presumably only affects child processes -- confirm.
os.environ['PYTHONHASHSEED'] = '0'
# Setting the seed for numpy-generated random numbers
np.random.seed(37)
# Setting the seed for python random numbers
rn.seed(1254)
# Setting the graph-level random seed.
tf.set_random_seed(89)
# Let GPU memory usage grow automatically (allow_growth=True)
gpu_options = tf.GPUOptions(allow_growth=True)
sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
# Set the Session used by Keras
tf.keras.backend.set_session(sess)

In [3]:
# Class index -> human-readable label name.  The index of each label is its
# position in the list below, counting from 0.
name_label_dict = dict(enumerate([
    'Nucleoplasm',
    'Nuclear membrane',
    'Nucleoli',
    'Nucleoli fibrillar center',
    'Nuclear speckles',
    'Nuclear bodies',
    'Endoplasmic reticulum',
    'Golgi apparatus',
    'Peroxisomes',
    'Endosomes',
    'Lysosomes',
    'Intermediate filaments',
    'Actin filaments',
    'Focal adhesion sites',
    'Microtubules',
    'Microtubule ends',
    'Cytokinetic bridge',
    'Mitotic spindle',
    'Microtubule organizing center',
    'Centrosome',
    'Lipid droplets',
    'Plasma membrane',
    'Cell junctions',
    'Mitochondria',
    'Aggresome',
    'Cytosol',
    'Cytoplasmic bodies',
    'Rods & rings',
]))

# Total number of label classes (28), taken from the mapping itself.
LABEL_NUM = len(name_label_dict)

In [4]:
# Base name for the model / submission artefacts.  Kept as a literal rather
# than derived with os.path.basename(__file__).split('.')[0].
NAME = "data_preprocessing"

# Every location below is resolved relative to the current working directory.
PATH = os.getcwd()

# Input data.
TRAIN = os.path.join(PATH, 'data', 'train')
TEST = os.path.join(PATH, 'data', 'test')
LABELS = os.path.join(PATH, 'data', 'train.csv')
SAMPLE = os.path.join(PATH, 'data', 'sample_submission.csv')

# Outputs.
PREPROCESSED = os.path.join(PATH, 'preprocessed_data')
MODEL = os.path.join(PATH, 'model', NAME + '.h5')
RESULT = os.path.join(PATH, 'result', NAME + '_submission.csv')

In [6]:
# Read the label table; the code below uses its 'Id' column and its 'Target'
# column, the latter holding space-separated integer class indices.
data = pd.read_csv(LABELS)

# One record per row: the sample's path prefix inside TRAIN (the image
# loaders append the per-channel file suffix) and its labels as an int array.
train_dataset_info = np.array([
    {
        'path': os.path.join(TRAIN, sample_id),
        'labels': np.array([int(token) for token in target_tokens]),
    }
    for sample_id, target_tokens in zip(data['Id'], data['Target'].str.split(' '))
])

In [7]:
train_dataset_info

array([{'path': 'C:\\Users\\OneDay\\Downloads\\ML2018FALL\\final\\Human_Protein_Atlas_Image_classification\\data\\train\\00070df0-bbc3-11e8-b2bc-ac1f6b6435d0', 'labels': array([16,  0])},
       {'path': 'C:\\Users\\OneDay\\Downloads\\ML2018FALL\\final\\Human_Protein_Atlas_Image_classification\\data\\train\\000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0', 'labels': array([7, 1, 2, 0])},
       {'path': 'C:\\Users\\OneDay\\Downloads\\ML2018FALL\\final\\Human_Protein_Atlas_Image_classification\\data\\train\\000a9596-bbc4-11e8-b2bc-ac1f6b6435d0', 'labels': array([5])},
       ...,
       {'path': 'C:\\Users\\OneDay\\Downloads\\ML2018FALL\\final\\Human_Protein_Atlas_Image_classification\\data\\train\\fff189d8-bbab-11e8-b2ba-ac1f6b6435d0', 'labels': array([7])},
       {'path': 'C:\\Users\\OneDay\\Downloads\\ML2018FALL\\final\\Human_Protein_Atlas_Image_classification\\data\\train\\fffdf7e0-bbc4-11e8-b2bc-ac1f6b6435d0', 'labels': array([25,  2, 21])},
       {'path': 'C:\\Users\\OneDay\\Downloads\\ML2

In [22]:
# The image loaders read files named "<path>_<colour>.png", so the text
# before the first underscore of each file name is taken as the sample id.
# This list still contains one entry per file (i.e. repeated ids).
test_dataset_namelist = [test_name.split('_')[0] for test_name in os.listdir(TEST)]

# os.listdir() returns entries in arbitrary order, so comparing each name
# with the previous one is not a reliable way to drop duplicates.  Collapse
# through a set and sort to get a unique, deterministic id order.
test_name_unique = sorted(set(test_dataset_namelist))

# One record per unique sample: its path prefix inside TEST and its id.
test_dataset_info = np.array([
    {'path': os.path.join(TEST, name), 'name': name}
    for name in test_name_unique
])

# Persist the ordered id list next to the preprocessed arrays.  Create the
# output directory if needed and close the file handle deterministically.
os.makedirs(PREPROCESSED, exist_ok=True)
with open(os.path.join(PREPROCESSED, 'test_name.pickle'), 'wb') as name_file:
    pickle.dump(test_name_unique, name_file)
# To reload later:
#   with open(os.path.join(PREPROCESSED, 'test_name.pickle'), 'rb') as name_file:
#       test_name = pickle.load(name_file)

test_dataset_info

array([{'path': 'C:\\Users\\OneDay\\Downloads\\ML2018FALL\\final\\Human_Protein_Atlas_Image_classification\\data\\test\\00008af0-bad0-11e8-b2b8-ac1f6b6435d0', 'name': '00008af0-bad0-11e8-b2b8-ac1f6b6435d0'},
       {'path': 'C:\\Users\\OneDay\\Downloads\\ML2018FALL\\final\\Human_Protein_Atlas_Image_classification\\data\\test\\0000a892-bacf-11e8-b2b8-ac1f6b6435d0', 'name': '0000a892-bacf-11e8-b2b8-ac1f6b6435d0'},
       {'path': 'C:\\Users\\OneDay\\Downloads\\ML2018FALL\\final\\Human_Protein_Atlas_Image_classification\\data\\test\\0006faa6-bac7-11e8-b2b7-ac1f6b6435d0', 'name': '0006faa6-bac7-11e8-b2b7-ac1f6b6435d0'},
       ...,
       {'path': 'C:\\Users\\OneDay\\Downloads\\ML2018FALL\\final\\Human_Protein_Atlas_Image_classification\\data\\test\\ffecb8a4-bad4-11e8-b2b8-ac1f6b6435d0', 'name': 'ffecb8a4-bad4-11e8-b2b8-ac1f6b6435d0'},
       {'path': 'C:\\Users\\OneDay\\Downloads\\ML2018FALL\\final\\Human_Protein_Atlas_Image_classification\\data\\test\\fff03816-bad5-11e8-b2b9-ac1f6b6435d0

In [19]:
class data_generator:
    """Namespace of static helpers that load per-channel PNGs into uint8 batches.

    Each sample is addressed by a path prefix; its channels are read from
    ``<prefix>_red.png``, ``<prefix>_green.png``, ``<prefix>_blue.png`` and,
    for mode ``'RGBY'``, ``<prefix>_yellow.png``.  The channels are stacked
    along a new last axis.

    The ``shape`` argument is only used to size the output batch: the loaders
    do not resize anything, so every image on disk must already match
    ``shape`` or the assignment into the batch raises.
    """

    # Width of the multi-hot label vectors produced by create_train.
    NUM_LABELS = 28

    @staticmethod
    def _loader_for(mode):
        """Return the image loader for ``mode`` ('RGBY' or 'RGB').

        The loader is looked up on the class at call time.

        Raises:
            ValueError: if ``mode`` is not a supported mode.
        """
        if mode == 'RGBY':
            return data_generator.RGBY
        if mode == 'RGB':
            return data_generator.RGB
        # Fail loudly instead of printing and handing back an uninitialised
        # buffer as if it contained image data.
        raise ValueError("Unexpected mode! Got {!r}, expected 'RGBY' or 'RGB'.".format(mode))

    @staticmethod
    def _stack_channels(path, colors):
        """Read ``<path>_<color>.png`` for every color and stack them on the last axis."""
        channels = []
        for color in colors:
            # The context manager closes the file handle as soon as the pixel
            # data has been copied into an array.
            with Image.open(path + '_' + color + '.png') as channel_image:
                channels.append(np.array(channel_image))
        return np.stack(channels, -1).astype(np.uint8)

    @staticmethod
    def create_train(dataset_info, train_size, shape, mode):
        """Build a randomly ordered batch of training images and multi-hot labels.

        Args:
            dataset_info: indexable collection of records with a ``'path'``
                (path prefix of the sample) and ``'labels'`` (integer class
                indices) entry.
            train_size: number of samples to draw.
            shape: ``(height, width, channels)`` of one image.
            mode: ``'RGBY'`` or ``'RGB'``.

        Returns:
            Tuple ``(images, labels)``: ``images`` is a uint8 array of shape
            ``(train_size,) + shape`` and ``labels`` a uint8 array of shape
            ``(train_size, NUM_LABELS)`` with ones at each sample's classes.

        Raises:
            ValueError: if ``mode`` is not supported.
        """
        load_image = data_generator._loader_for(mode)

        # Draw without replacement whenever enough samples exist, so asking
        # for the whole dataset yields every sample exactly once rather than
        # a bootstrap with duplicates and omissions.  Replacement is used only
        # when more samples are requested than are available.
        oversampling = train_size > len(dataset_info)
        random_indexes = np.random.choice(
            len(dataset_info), train_size, replace=oversampling)

        batch_images = np.empty(
            (train_size, shape[0], shape[1], shape[2]), dtype=np.uint8)
        batch_labels = np.zeros(
            (train_size, data_generator.NUM_LABELS), dtype=np.uint8)
        for i, idx in enumerate(random_indexes):
            sample = dataset_info[idx]
            batch_images[i] = load_image(sample['path'], shape)
            # Multi-hot encoding: set every class index listed for the sample.
            batch_labels[i][sample['labels']] = 1

        return batch_images, batch_labels

    @staticmethod
    def create_test(dataset_info, test_size, shape, mode):
        """Load every record of ``dataset_info``, in order, into one image batch.

        Args:
            dataset_info: iterable of records with a ``'path'`` (path prefix
                of the sample) and ``'name'`` entry.
            test_size: number of rows allocated for the batch.  If
                ``dataset_info`` holds fewer records, the trailing rows are
                left uninitialised; if it holds more, indexing past the
                buffer raises.
            shape: ``(height, width, channels)`` of one image.
            mode: ``'RGBY'`` or ``'RGB'``.

        Returns:
            Tuple ``(images, names)``: ``images`` is a uint8 array of shape
            ``(test_size,) + shape`` and ``names`` the list of ``'name'``
            values in the same order.

        Raises:
            ValueError: if ``mode`` is not supported.
        """
        load_image = data_generator._loader_for(mode)

        batch_images = np.empty(
            (test_size, shape[0], shape[1], shape[2]), dtype=np.uint8)
        batch_names = []
        for i, entry in enumerate(dataset_info):
            batch_images[i] = load_image(entry['path'], shape)
            batch_names.append(entry['name'])

        return batch_images, batch_names

    @staticmethod
    def RGBY(path, shape):
        """Load the red, green, blue and yellow channel files as one uint8 array.

        ``shape`` is accepted for signature compatibility but not used.
        """
        return data_generator._stack_channels(
            path, ('red', 'green', 'blue', 'yellow'))

    @staticmethod
    def RGB(path, shape):
        """Load the red, green and blue channel files as one uint8 array.

        ``shape`` is accepted for signature compatibility but not used.
        """
        return data_generator._stack_channels(path, ('red', 'green', 'blue'))
                


In [None]:
IMAGE_LENGTH = 512
IMAGE_WIDTH = 512
CHANNEL_NUM = 4
# The TRAIN directory is counted in groups of four files per sample
# (presumably one PNG per colour channel -- the 'RGBY' loader reads four).
TRAIN_SIZE = len(os.listdir(TRAIN)) // 4

# create train datagen
train_x, train_y = data_generator.create_train(
    dataset_info=train_dataset_info,
    train_size=TRAIN_SIZE,
    shape=(IMAGE_LENGTH, IMAGE_WIDTH, CHANNEL_NUM),
    mode='RGBY',
)

# Only the last expression of a cell is echoed, so bare expressions in the
# middle of the cell show nothing; print the sanity checks explicitly.
print('train_x:', train_x.shape, train_x.dtype)
print('train_y:', train_y.shape, train_y.dtype)

# Persist the arrays for later notebooks.
np.save(os.path.join(PREPROCESSED, 'train_RGBY_original_x.npy'), train_x)
np.save(os.path.join(PREPROCESSED, 'train_RGBY_original_y.npy'), train_y)

In [21]:
# Image geometry for the four-channel (red, green, blue, yellow) test array.
IMAGE_LENGTH = 512
IMAGE_WIDTH = 512
CHANNEL_NUM = 4
# The TEST directory is counted in groups of four files per sample.
TEST_SIZE = len(os.listdir(TEST)) // 4

# Load every test sample in 'RGBY' mode and store the stacked array on disk.
test_x, test_name = data_generator.create_test(
    dataset_info=test_dataset_info,
    test_size=TEST_SIZE,
    shape=(IMAGE_LENGTH, IMAGE_WIDTH, CHANNEL_NUM),
    mode='RGBY',
)
np.save(os.path.join(PREPROCESSED, 'test_RGBY_original_x.npy'), test_x)

In [None]:
# Image geometry for the three-channel (red, green, blue) test array.
IMAGE_LENGTH = 512
IMAGE_WIDTH = 512
CHANNEL_NUM = 3
# The TEST directory is counted in groups of four files per sample, even
# though only three of them are read in 'RGB' mode.
TEST_SIZE = len(os.listdir(TEST)) // 4

# Load every test sample in 'RGB' mode and store the stacked array on disk.
test_x, test_name = data_generator.create_test(
    dataset_info=test_dataset_info,
    test_size=TEST_SIZE,
    shape=(IMAGE_LENGTH, IMAGE_WIDTH, CHANNEL_NUM),
    mode='RGB',
)
np.save(os.path.join(PREPROCESSED, 'test_RGB_original_x.npy'), test_x)