# 1. Preprocess-TruncatedImages
在使用 Keras 预训练模型（model）提取特征时，部分图片会报错：StopIteration: image file is truncated (85 bytes not processed)，从而中断程序。因此，这里先找出被截断（truncated）的图片并将其删除。

## Import PKGs

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline
from IPython.display import display

import os
import time
import zipfile
import pickle
from PIL import Image
import shutil

from tqdm import tqdm
import multiprocessing

## Run name

In [2]:
# Build a unique identifier for this run: <project>_<step>_<timestamp>.
project_name = 'ic_furniture2018'
step_name = 'Preprocess-TruncatedImages'
# With no time tuple given, strftime formats the current local time.
time_str = time.strftime("%Y%m%d_%H%M%S")
run_name = '_'.join((project_name, step_name, time_str))
print('run_name: ' + run_name)

run_name: ic_furniture2018_Preprocess-TruncatedImages_20180329_123409


## Project folders

In [3]:
# All project paths are resolved relative to the current working directory.
cwd = os.getcwd()
(input_folder, output_folder, model_folder,
 feature_folder, post_pca_feature_folder, log_folder) = (
    os.path.join(cwd, name)
    for name in ('input', 'output', 'model', 'feature', 'post_pca_feature', 'log')
)
# One line per folder; the tab padding keeps the paths aligned in the output.
print('\n'.join([
    'input_folder: \t\t\t%s' % input_folder,
    'output_folder: \t\t\t%s' % output_folder,
    'model_folder: \t\t\t%s' % model_folder,
    'feature_folder: \t\t%s' % feature_folder,
    'post_pca_feature_folder: \t%s' % post_pca_feature_folder,
    'log_folder: \t\t\t%s' % log_folder,
]))

# Original ("org_*") and working ("data_*") image folders under the input folder.
org_train_folder, org_val_folder, org_test_folder = (
    os.path.join(input_folder, name)
    for name in ('org_train', 'org_val', 'org_test')
)
train_folder, val_folder, test_folder = (
    os.path.join(input_folder, name)
    for name in ('data_train', 'data_val', 'data_test')
)
test_sub_folder = os.path.join(test_folder, 'test')

# Create the post-PCA feature folder on first use.
if not os.path.exists(post_pca_feature_folder):
    os.mkdir(post_pca_feature_folder)
    print('Create folder: %s' % post_pca_feature_folder)

# Annotation files (JSON) for each split.
train_json_file, val_json_file, test_json_file = (
    os.path.join(input_folder, name)
    for name in ('train.json', 'validation.json', 'test.json')
)
print('\n'.join([
    '\ntrain_json_file: \t\t%s' % train_json_file,
    'val_json_file: \t\t\t%s' % val_json_file,
    'test_json_file: \t\t%s' % test_json_file,
]))

# CSV files for each split.
train_csv_file, val_csv_file, test_csv_file = (
    os.path.join(input_folder, name)
    for name in ('train.csv', 'validation.csv', 'test.csv')
)
print('\n'.join([
    '\ntrain_csv_file: \t\t%s' % train_csv_file,
    'val_csv_file: \t\t\t%s' % val_csv_file,
    'test_csv_file: \t\t\t%s' % test_csv_file,
]))

sample_submission_csv_file = os.path.join(input_folder, 'sample_submission_randomlabel.csv')
print('\nsample_submission_csv_file: \t%s' % sample_submission_csv_file)

input_folder: 			/data1/kaggle/imaterialist-challenge-furniture-2018/input
output_folder: 			/data1/kaggle/imaterialist-challenge-furniture-2018/output
model_folder: 			/data1/kaggle/imaterialist-challenge-furniture-2018/model
feature_folder: 		/data1/kaggle/imaterialist-challenge-furniture-2018/feature
post_pca_feature_folder: 	/data1/kaggle/imaterialist-challenge-furniture-2018/post_pca_feature
log_folder: 			/data1/kaggle/imaterialist-challenge-furniture-2018/log

train_json_file: 		/data1/kaggle/imaterialist-challenge-furniture-2018/input/train.json
val_json_file: 			/data1/kaggle/imaterialist-challenge-furniture-2018/input/validation.json
test_json_file: 		/data1/kaggle/imaterialist-challenge-furniture-2018/input/test.json

train_csv_file: 		/data1/kaggle/imaterialist-challenge-furniture-2018/input/train.csv
val_csv_file: 			/data1/kaggle/imaterialist-challenge-furniture-2018/input/validation.csv
test_csv_file: 			/data1/kaggle/imaterialist-challenge-furniture-2018/input/test.csv


## Find and delete truncated images

In [4]:
def search_truncated_images(target_folder):
    """Sequentially scan the class sub-folders of a folder and delete unreadable images.

    Every entry of every sub-folder of ``target_folder`` is opened with PIL and
    resized; if that raises, the error is printed and the file is deleted.
    Progress is printed as the sub-folder name followed by a counter that
    advances once per ``percent_count`` files.

    Args:
        target_folder: Folder whose sub-folders contain the image files.
            Entries of ``target_folder`` that are not directories are skipped.

    Returns:
        None. Results are reported on stdout only.
    """
    print(target_folder)
    percent_count = 1000  # print one progress tick per this many files
    t0 = time.time()
    for c in sorted(os.listdir(target_folder)):
        class_folder = os.path.join(target_folder, c)
        if not os.path.isdir(class_folder):
            # A stray file next to the class folders would make os.listdir raise.
            continue
        print(c, end='  ')
        for count, image_name in enumerate(os.listdir(class_folder), start=1):
            image_file = os.path.join(class_folder, image_name)
            try:
                # The context manager releases the file handle even when
                # decoding fails, so the file is closed before it is removed.
                with Image.open(image_file) as img:
                    # Resizing forces the pixel data to be decoded.
                    img.resize((229, 229))
            except Exception as ex:
                # Deliberately broad: any image that cannot be processed is dropped.
                print('%s: %s' % (image_file, ex))
                try:
                    os.remove(image_file)
                except OSError as remove_error:
                    # Keep scanning even if this one file cannot be deleted.
                    print('%s: could not delete: %s' % (image_file, remove_error))
            if count % percent_count == 0:
                print(count // percent_count, end='  ')
    t1 = time.time()
    print('Spend time: {0} s'.format(t1-t0))

In [5]:
def get_all_images(target_folder):
    """Collect the paths of all files found one level below ``target_folder``.

    Sub-folders are visited in sorted name order; the files inside each
    sub-folder keep the order returned by ``os.listdir``. The number of
    collected paths is printed.

    Args:
        target_folder: Folder whose sub-folders contain the image files.
            Entries of ``target_folder`` that are not directories are skipped.

    Returns:
        A list of full paths, one per entry of every sub-folder.
    """
    image_files = []
    for name in sorted(os.listdir(target_folder)):
        folder = os.path.join(target_folder, name)
        if not os.path.isdir(folder):
            # A stray file next to the class folders would make os.listdir raise.
            continue
        image_files.extend(
            os.path.join(folder, image_name) for image_name in os.listdir(folder)
        )
    print('len(image_files)=%d' % len(image_files))
    return image_files

# Gather the file paths of each split, in train / val / test order;
# every call prints how many files it found.
train_image_files, val_image_files, test_image_files = map(
    get_all_images, (train_folder, val_folder, test_folder)
)

len(image_files)=191261
len(image_files)=6301
len(image_files)=12652


In [6]:
def check_image(image_file):
    """Try to decode one image; delete the file and report the error on failure.

    Args:
        image_file: Path of the image file to check.

    Returns:
        None when the image opens and resizes without raising. Otherwise a
        '<path>: <error>' message; the file has been deleted unless the
        message also says that deleting it failed.
    """
    try:
        # The context manager releases the file handle even when decoding
        # fails, so the file is closed before it is removed.
        with Image.open(image_file) as img:
            # Resizing forces the pixel data to be decoded.
            img.resize((229, 229))
    except Exception as ex:
        # Deliberately broad: any image that cannot be processed is dropped.
        result = '%s: %s' % (image_file, ex)
        try:
            os.remove(image_file)
        except OSError as remove_error:
            # Report instead of raising, so one undeletable file does not
            # abort the caller's whole scan.
            result += ' (could not delete: %s)' % remove_error
        return result
    return None

In [7]:
def search_truncated_images_parallel(target_images):
    """Run ``check_image`` over all given files using one worker process per CPU core.

    Prints the first three paths, the total count, the core count, every
    non-None message returned by ``check_image``, and the elapsed time. A tqdm
    progress bar advances once per checked file.

    Args:
        target_images: List of image file paths to check.

    Returns:
        None. Results are reported on stdout only.
    """
    t0 = time.time()
    print(target_images[:3])
    print('total=%d' % len(target_images))
    cores_count = multiprocessing.cpu_count()
    print('cores_count=%d' % cores_count)

    # The with-block guarantees the workers are stopped even if the loop
    # raises; without it every call would leave its worker processes behind.
    with multiprocessing.Pool(processes=cores_count) as pool:
        with tqdm(total=len(target_images)) as t:
            # Results arrive in completion order, not input order.
            for i in pool.imap_unordered(check_image, target_images):
                if i is not None:
                    print(i)
                t.update(1)
        # All results are consumed: let the workers exit cleanly.
        pool.close()
        pool.join()

    t1 = time.time()
    print('spend time: %.2f s' % (t1-t0))

In [8]:
# Check each split in turn: train, then validation, then test.
for split_image_files in (train_image_files, val_image_files, test_image_files):
    search_truncated_images_parallel(split_image_files)

['/data1/kaggle/imaterialist-challenge-furniture-2018/input/data_train/001/90357_1.jpg', '/data1/kaggle/imaterialist-challenge-furniture-2018/input/data_train/001/90938_1.jpg', '/data1/kaggle/imaterialist-challenge-furniture-2018/input/data_train/001/90711_1.jpg']
total=191261
cores_count=4


100%|██████████| 191261/191261 [20:18<00:00, 156.91it/s]

spend time: 1218.94 s
['/data1/kaggle/imaterialist-challenge-furniture-2018/input/data_val/001/4740_1.jpg', '/data1/kaggle/imaterialist-challenge-furniture-2018/input/data_val/001/4851_1.jpg', '/data1/kaggle/imaterialist-challenge-furniture-2018/input/data_val/001/5768_1.jpg']
total=6301
cores_count=4



100%|██████████| 6301/6301 [00:40<00:00, 156.58it/s]


spend time: 40.26 s
['/data1/kaggle/imaterialist-challenge-furniture-2018/input/data_test/test/8362.jpg', '/data1/kaggle/imaterialist-challenge-furniture-2018/input/data_test/test/4226.jpg', '/data1/kaggle/imaterialist-challenge-furniture-2018/input/data_test/test/12692.jpg']
total=12652
cores_count=4


100%|██████████| 12652/12652 [01:20<00:00, 157.87it/s]

spend time: 80.16 s



