In [1]:
import numpy as np
import pandas as pd
import pickle
import os
import shutil

# Directory that receives the cached artifacts written by this notebook
# (pickle files and HDF5 feature files).
tmp_dir = 'cache/'
# Create it up front so the later writes do not fail on a fresh checkout.
if not os.path.isdir(tmp_dir):
    os.makedirs(tmp_dir)

In [2]:
# Load the training and validation listings. Both files are space-separated,
# have no header row, and are given the column names id / label / url.
train = pd.read_table('data/data_train_image.txt', sep=' ', header=None, na_filter=False)
val = pd.read_table('data/val.txt', sep=' ', header=None, na_filter=False)
for frame in (train, val):
    frame.columns = ['id', 'label', 'url']

# Report the number of rows in each split.
print(u'训练集中共有数据%d个' % len(train))
print(u'验证集中共有数据%d个' % len(val))

训练集中共有数据8210个
验证集中共有数据10551个


In [4]:
# Merge train and val, then add a "num" column holding the 1-based occurrence
# number of each row's id, so that repeated ids can be handled further down.
data = pd.concat([train, val])
# A stable sort keeps rows sharing an id in concatenation order (train rows
# before val rows), so num == 1 always marks the first occurrence. The default
# sort algorithm gives no such guarantee.
data = data.sort_values(by=['id'], kind='mergesort').reset_index(drop=True)
data['num'] = data.groupby('id').cumcount() + 1

data = data[['id', 'label', 'num']]
data = data.sort_values(by=['label'], kind='mergesort').reset_index(drop=True)

# Ids that occur more than once, and every row (all occurrences) for those ids.
repeated_ids = data[data.num > 1]['id']
print(u'数据集中共有数据%d个, 其中有重复数据%d个' % (data.shape[0], repeated_ids.shape[0]))
repeated_df = data[data.id.isin(repeated_ids.values)]

数据集中共有数据18761个, 其中有重复数据75个


In [5]:
# Per the original note, a repeated id comes with two different labels; the
# label of the first occurrence is kept. The id and both label values are
# saved to repeated_df.pkl.
first_occurrence = repeated_df[repeated_df.num == 1][['id', 'label']]
second_occurrence = repeated_df[repeated_df.num == 2][['id', 'label']]
repeated_df = first_occurrence.merge(second_occurrence, on='id', how='left')
with open(tmp_dir + 'repeated_df.pkl', 'wb') as f:
    pickle.dump(repeated_df, f)

# Keep only the first occurrence of every id; the helper column is no longer needed.
data = data[data.num == 1].drop('num', axis=1)
print(u'删除重复图片%d张' % len(repeated_df))

删除重复图片75张


In [6]:
def rmrf_mkdir(dirname):
    """Replace whatever exists at ``dirname`` with a fresh, empty directory.

    An existing directory tree is removed recursively; an existing regular
    file or symbolic link is unlinked. Missing parent directories are created.

    Args:
        dirname: Path of the directory to (re)create.

    Raises:
        OSError: If the existing entry cannot be removed or the directory
            cannot be created.
    """
    # shutil.rmtree refuses symlinks and fails on plain files, so unlink those.
    if os.path.islink(dirname) or os.path.isfile(dirname):
        os.remove(dirname)
    elif os.path.isdir(dirname):
        shutil.rmtree(dirname)
    # makedirs (unlike mkdir) also creates any missing parent directories.
    os.makedirs(dirname)

In [97]:
# Arrange the images into one sub-directory per label so that Keras can read
# them directly from disk.
rmrf_mkdir('data/train2')
for label in set(data.label):
    os.mkdir('data/train2/' + str(label))
for image_id, label in zip(data['id'], data['label']):
    file_name = image_id + '.jpg'
    shutil.copyfile('data/alldata/' + file_name, 'data/train2/' + str(label) + '/' + file_name)

# The test images are copied into a single sub-directory named "test".
rmrf_mkdir('data/test2')
shutil.copytree('data/test', 'data/test2/test')

'data/test2/test'

In [7]:
from keras.models import *
from keras.layers import *
from keras.applications import *
from keras.preprocessing.image import *
import h5py

Using TensorFlow backend.


In [8]:
# Export feature vectors
def _split_generator_filename(fname):
    """Split a generator filename ``<dir>/<name>.<ext>`` into ``(dir, name)``."""
    directory, base_name = os.path.split(fname)
    return directory, os.path.splitext(base_name)[0]


def write_gap(MODEL, model_name, image_size, lambda_func=None, batch_size=16):
    """Run a pre-trained network over the image folders and save pooled features.

    The network is built without its top layers, followed by a global average
    pooling layer. Its predictions for ``data/train2`` and ``data/test2`` are
    written to ``tmp_dir + 'gap_<model_name>.h5'`` as the datasets ``train``,
    ``test`` and ``label`` (the generator's class indices for the train images).

    If ``tmp_dir + 'data_info.pkl'`` does not exist yet, it is created with the
    train ids/labels, the test ids and the mapping from label to class index,
    all derived from the generators' file names.

    Args:
        MODEL: Callable that builds the base network; it is invoked with
            ``input_tensor``, ``weights='imagenet'`` and ``include_top=False``.
        model_name: Name used in the output file name.
        image_size: ``(width, height)`` of the network input.
        lambda_func: Optional function wrapped in a ``Lambda`` layer and applied
            to the input tensor before the base network.
        batch_size: Number of images per generator batch.
    """
    width, height = image_size[0], image_size[1]
    input_tensor = Input((height, width, 3))
    x = input_tensor
    if lambda_func:
        x = Lambda(lambda_func)(x)

    base_model = MODEL(input_tensor=x, weights='imagenet', include_top=False)
    model = Model(base_model.input, GlobalAveragePooling2D()(base_model.output))

    # Keras reads target_size as (height, width), the same order as the Input
    # shape above, so pass it explicitly instead of the (width, height) tuple.
    target_size = (height, width)
    gen = ImageDataGenerator()
    train_generator = gen.flow_from_directory(
        'data/train2', target_size, shuffle=False, batch_size=batch_size)
    test_generator = gen.flow_from_directory(
        'data/test2', target_size, shuffle=False, batch_size=batch_size, class_mode=None)

    info_path = tmp_dir + 'data_info.pkl'
    if os.path.exists(info_path):
        print(u'data_info.pkl文件已存在')
    else:
        # Train file names look like "<label>/<id>.<ext>".
        train_labels = []
        train_ids = []
        for fname in train_generator.filenames:
            label_dir, image_id = _split_generator_filename(fname)
            train_labels.append(int(label_dir))
            train_ids.append(str(image_id))
        train_info = pd.DataFrame({'id': train_ids, 'label': train_labels})

        test_ids = [str(_split_generator_filename(fname)[1])
                    for fname in test_generator.filenames]
        # NOTE(review): this is a one-element Series whose 'id' entry is the
        # whole list, unlike train_info, which is a DataFrame. Kept as-is so
        # existing readers of data_info.pkl keep working; confirm it is intended.
        test_info = pd.Series({'id': test_ids})

        # Directory names are the original labels; map them to class indices.
        label_dict = {int(key): value
                      for key, value in train_generator.class_indices.items()}

        data_info = {'train': train_info, 'test': test_info, 'label': label_dict}
        with open(info_path, 'wb') as f:
            pickle.dump(data_info, f)
        print(u'data_info.pkl文件写入成功')

    # Integer ceiling division, so the final partial batch is always included
    # and the step count is an int on every Python version.
    train_steps = (train_generator.samples + batch_size - 1) // batch_size
    test_steps = (test_generator.samples + batch_size - 1) // batch_size
    train = model.predict_generator(train_generator, train_steps)
    test = model.predict_generator(test_generator, test_steps)

    # Open in 'w' mode explicitly: the default mode is read-only in h5py >= 3,
    # and append mode fails on a re-run because the datasets already exist.
    with h5py.File(tmp_dir + 'gap_%s.h5' % model_name, 'w') as h:
        h.create_dataset('train', data=train)
        h.create_dataset('test', data=test)
        h.create_dataset('label', data=train_generator.classes)

In [None]:
# Extract features with every pre-trained network, one after another.
# Each entry is (constructor, name, input size, optional preprocessing function).
# NOTE(review): ResNet50, VGG16 and VGG19 are run without a preprocessing
# function - confirm this is intended.
gap_jobs = [
    (ResNet50, 'ResNet50', (224, 224), None),
    (InceptionV3, 'InceptionV3', (299, 299), inception_v3.preprocess_input),
    (Xception, 'Xception', (299, 299), xception.preprocess_input),
    (VGG16, 'VGG16', (224, 224), None),
    (VGG19, 'VGG19', (224, 224), None),
]
for model_cls, model_name, image_size, preprocess in gap_jobs:
    write_gap(model_cls, model_name, image_size, preprocess)