# 1. Preprocess-TruncatedImages
在使用keras预训练的model提取特征的时候，有的图片报错：StopIteration: image file is truncated (85 bytes not processed)，会打断程序。所以，这里把truncated图片删除。

## 导入包

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline
from IPython.display import display

import os
import time
import zipfile
import pickle
from PIL import Image
import shutil

import tqdm
import multiprocessing

# 项目文件夹

In [2]:
cwd = os.getcwd()
input_folder = os.path.join(cwd, 'input')
output_folder = os.path.join(cwd, 'output')
model_folder = os.path.join(cwd, 'model')

org_train_folder = os.path.join(input_folder, 'org_train')
org_test_folder = os.path.join(input_folder, 'org_test')
train_folder = os.path.join(input_folder, 'data_train')
val_folder = os.path.join(input_folder, 'data_val')
test_folder = os.path.join(input_folder, 'data_test')
test_sub_folder = os.path.join(test_folder, 'test')

# 搜索并删除truncated图片

In [3]:
def search_truncated_images(target_folder):
    print(target_folder)
    percent_count = 1000
    t0 = time.time()
    sub_folders = os.listdir(target_folder)
    sub_folders.sort()
    for c in sub_folders:
        print(c, end='  ')
        class_folder = os.path.join(target_folder, c)
        count = 0
        for image_name in os.listdir(class_folder):
            image_file = os.path.join(class_folder, image_name)
            try:
                img = Image.open(image_file)
                img = img.resize((229, 229))
            except Exception as ex:
                print('%s: %s' % (image_file, ex))
                os.remove(image_file)
            count += 1
            if count % percent_count == 0:
                print(int(count/percent_count), end='  ')
    t1 = time.time()
    print('Spend time: {0} s'.format(t1-t0))

In [4]:
# search_truncated_images(train_folder)
# search_truncated_images(test_folder)

In [5]:
def get_all_images(target_folder):
    sub_folder_names = os.listdir(target_folder)
    sub_folder_names.sort()
    sub_folders = []
    for name in sub_folder_names:
        sub_folders.append(os.path.join(target_folder, name))
    sub_folders.sort()
    
    image_files = []
    for folder in sub_folders:
        for image_name in os.listdir(folder):
            image_files.append(os.path.join(folder, image_name))
    print('len(image_files)=%d' % len(image_files))
    return image_files

train_image_files = get_all_images(train_folder)
val_image_files = get_all_images(val_folder)
test_image_files = get_all_images(test_folder)

len(image_files)=1193694
len(image_files)=24362
len(image_files)=115619


In [6]:
def check_image(image_file):
    result = None
    try:
#         result = image_file
        img = Image.open(image_file)
        img = img.resize((229, 229))
    except Exception as ex:
        result = '%s: %s' % (image_file, ex)
        os.remove(image_file)
    return result

In [7]:
def search_truncated_images_parallel(target_images):
    t0 = time.time()
    print(target_images[:3])
    print('total=%d' % len(target_images))
    cores_count = multiprocessing.cpu_count()
    print('cores_count=%d' % cores_count)

    pool = multiprocessing.Pool(processes=cores_count)
#     pool.map(check_image, target_images)
    for i in tqdm.tqdm(pool.imap_unordered(check_image, target_images), total=len(target_images)):
        if i is not None:
            print(i)
    pool.close()
    pool.terminate()
    t1 = time.time()
    print('spend time: %.2f s' % (t1-t0))

In [8]:
search_truncated_images_parallel(train_image_files)
search_truncated_images_parallel(val_image_files)
search_truncated_images_parallel(test_image_files)

['/data1/kaggle/landmark-recognition-challenge/input/data_train/00000/473e80b9005fcb77.jpg', '/data1/kaggle/landmark-recognition-challenge/input/data_train/00000/464ccf9c677c1c8c.jpg', '/data1/kaggle/landmark-recognition-challenge/input/data_train/00000/22e28089dac709f0.jpg']
total=1193694
cores_count=4


 19%|█▉        | 225990/1193694 [1:14:51<5:20:31, 50.32it/s]

/data1/kaggle/landmark-recognition-challenge/input/data_train/02975/989cece8afd99098.jpg: cannot identify image file '/data1/kaggle/landmark-recognition-challenge/input/data_train/02975/989cece8afd99098.jpg'


 73%|███████▎  | 866071/1193694 [4:46:28<1:48:22, 50.39it/s]

/data1/kaggle/landmark-recognition-challenge/input/data_train/10045/15f0ac5892156ac3.jpg: image file is truncated (40 bytes not processed)


 89%|████████▊ | 1058705/1193694 [5:49:52<44:36, 50.43it/s]  

/data1/kaggle/landmark-recognition-challenge/input/data_train/12718/fd86977aeb91d108.jpg: cannot identify image file '/data1/kaggle/landmark-recognition-challenge/input/data_train/12718/fd86977aeb91d108.jpg'


100%|██████████| 1193694/1193694 [6:33:58<00:00, 50.50it/s]


spend time: 23638.66 s
['/data1/kaggle/landmark-recognition-challenge/input/data_val/00000/4e8ab93c1620e8a3.jpg', '/data1/kaggle/landmark-recognition-challenge/input/data_val/00000/90187c0b6f3fa112.jpg', '/data1/kaggle/landmark-recognition-challenge/input/data_val/00000/e8f5d139190cf632.jpg']
total=24362
cores_count=4


100%|██████████| 24362/24362 [08:05<00:00, 50.19it/s]


spend time: 485.49 s
['/data1/kaggle/landmark-recognition-challenge/input/data_test/test/3d8d7d0946abc715.jpg', '/data1/kaggle/landmark-recognition-challenge/input/data_test/test/48f52f2d6ac1ce30.jpg', '/data1/kaggle/landmark-recognition-challenge/input/data_test/test/97ab0a1fc0daa989.jpg']
total=115619
cores_count=4


100%|██████████| 115619/115619 [37:32<00:00, 51.33it/s]


spend time: 2252.79 s
