## 机器学习进阶（S13）毕业项目 - 猫狗大战

## 项目：识别图片中是猫还是狗
---
* 此项目是kaggle平台上的一个比赛项目，最终的要求是提供一个模型来识别图片中的对象是猫还是狗。
* 这里会使用深度学习中的卷积神经网络来构建模型。
* 最总要求测试评分进入 kaggle Public Leaderboard 前10%。

## 步骤 1: 数据探索

#### 准备

In [3]:
import os
import shutil
from keras.models import *
from keras.layers import *
from keras.applications import *
from keras.preprocessing.image import *
from tqdm import tqdm
import numpy as np
import pandas as pd  
import matplotlib.pyplot as plt                        
%matplotlib inline

#### 载入数据

In [4]:
# 读取训练集
train_files = os.listdir('train')
print(train_files[0:10])

['dog.9220.jpg', 'dog.4153.jpg', 'cat.4113.jpg', 'dog.7547.jpg', 'cat.7459.jpg', 'dog.10028.jpg', 'cat.11954.jpg', 'cat.7368.jpg', 'dog.3505.jpg', 'cat.12057.jpg']


#### 寻找异常

In [5]:
# 定义ImageNet中识别猫狗的分类 https://blog.csdn.net/zhangjunbob/article/details/53258524
dogs = [
 'n02085620','n02085782','n02085936','n02086079','n02086240','n02086646','n02086910','n02087046'
,'n02087394','n02088094','n02088238','n02088364','n02088466','n02088632','n02089078','n02089867'
,'n02089973','n02090379','n02090622','n02090721','n02091032','n02091134','n02091244','n02091467'
,'n02091635','n02091831','n02092002','n02092339','n02093256','n02093428','n02093647','n02093754'
,'n02093859','n02093991','n02094114','n02094258','n02094433','n02095314','n02095570','n02095889'
,'n02096051','n02096177','n02096294','n02096437','n02096585','n02097047','n02097130','n02097209'
,'n02097298','n02097474','n02097658','n02098105','n02098286','n02098413','n02099267','n02099429'
,'n02099601','n02099712','n02099849','n02100236','n02100583','n02100735','n02100877','n02101006'
,'n02101388','n02101556','n02102040','n02102177','n02102318','n02102480','n02102973','n02104029'
,'n02104365','n02105056','n02105162','n02105251','n02105412','n02105505','n02105641','n02105855'
,'n02106030','n02106166','n02106382','n02106550','n02106662','n02107142','n02107312','n02107574'
,'n02107683','n02107908','n02108000','n02108089','n02108422','n02108551','n02108915','n02109047'
,'n02109525','n02109961','n02110063','n02110185','n02110341','n02110627','n02110806','n02110958'
,'n02111129','n02111277','n02111500','n02111889','n02112018','n02112137','n02112350','n02112706'
,'n02113023','n02113186','n02113624','n02113712','n02113799','n02113978']

cats=['n02123045','n02123159','n02123394','n02123597','n02124075','n02125311','n02127052']


# 采用InceptionResNetV2预测
# 参考 https://keras.io/zh/applications/#applications



#存储不能识别为猫狗的异常图片
my_unknow_images = []
from keras.applications.inception_resnet_v2 import *
# # 输入：图片路径
# # 输出：预测概率，格式(class, description, probability)
# #       例如 [(u'n02504013', u'Indian_elephant', 0.82658225), (u'n01871265', u'tusker', 0.1122357)]
def my_predict(img_path_list):
    model = InceptionResNetV2(weights='imagenet')
    for image_path in img_path_list:
        img = image.load_img(image_path, target_size=(299, 299))
        x = image.img_to_array(img)
        x = np.expand_dims(x, axis=0)
        x = preprocess_input(x)

        preds = model.predict(x)
        result = decode_predictions(preds, top=10)[0]
        if not is_cat_dog(result):
            my_unknow_images.append(image_path)

# 根据模型预测的结果查看是否属于猫狗的范围
def is_cat_dog(preds_result):
    my_labels = dogs[:]
    my_labels[len(my_labels):len(my_labels)] = cats
    a = [x[0] in my_labels for x in preds_result]
#     print(sum(a))
    return sum(a) > 0        

# iscatdog = is_cat_dog([(u'n02504013', u'Indian_elephant', 0.82658225), 
#                        (u'n01871265', u'tusker', 0.1122357), 
#                        (u'n02123394', u'test', 0.0000000)])
# print(iscatdog)

my_predict(['train/cat.0.jpg'])
print(my_unknow_images)

[]


#### 显示异常图片

In [7]:
# plt.figure(figsize=(16, 16))
# plt.title('aaa')
# plt.axis('off')
# plt.imshow('train/cat.0.jpg')

#### 处理异常图片

In [8]:
# 把异常图片从train移动到unknown文件夹
def move_images_to(files, dest_path):
    if not os.path.exists(dest_path):
        os.mkdir(dest_path)
        
    for file in files:
        shutil.move("t1/" + file, dest_path)
    
# files = ['cat.3.jpg','cat.5.jpg']
# move_images_to(files, 'unknow')


#### 准备训练集，验证集，测试集

In [11]:
from operator import is_not
from functools import partial
from sklearn.model_selection import train_test_split

train_files = os.listdir('train')
test_files = os.listdir('test')

# 参考 http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
my_train_files, my_valid_files = train_test_split(train_files,test_size=0.2,random_state = 0)
print('my train data count:', len(my_train_files), 'my valid data count:', len(my_valid_files))

# my train data 细分为cat和dog
train_cat_list = [file if file.split('.')[0] == 'cat' else None for file in my_train_files]
train_cat_list = list(filter(None.__ne__, train_cat_list))
# train_cat_list = filter(partial(is_not, None), train_cat_list)

train_dog_list = [file if file.split('.')[0] == 'dog' else None for file in my_train_files]
train_dog_list = list(filter(None.__ne__, train_dog_list))
# train_dog_list = filter(partial(is_not, None), train_dog_list)

print('train cat count:', len(train_cat_list))
print('train dog count:', len(train_dog_list))

print(train_cat_list[0:10])
print(train_dog_list[0:10])

# my valid data 细分为cat， dog
valid_cat_list = [file if file.split('.')[0] == 'cat' else None for file in my_valid_files]
valid_cat_list = list(filter(None.__ne__, valid_cat_list))

valid_dog_list = [file if file.split('.')[0] == 'dog' else None for file in my_valid_files]
valid_dog_list = list(filter(None.__ne__, valid_dog_list))

print('valid cat count:', len(valid_cat_list))
print('valid dog count:', len(valid_dog_list))

print(valid_cat_list[0:10])
print(valid_dog_list[0:10])




my train data count: 20000 my valid data count: 5000
train cat count: 9985
train dog count: 10015
['cat.1261.jpg', 'cat.7111.jpg', 'cat.6767.jpg', 'cat.2073.jpg', 'cat.9640.jpg', 'cat.6320.jpg', 'cat.3348.jpg', 'cat.10001.jpg', 'cat.9011.jpg', 'cat.1886.jpg']
['dog.6268.jpg', 'dog.3490.jpg', 'dog.5809.jpg', 'dog.118.jpg', 'dog.3911.jpg', 'dog.1130.jpg', 'dog.6036.jpg', 'dog.5422.jpg', 'dog.4428.jpg', 'dog.573.jpg']
valid cat count: 2515
valid dog count: 2485
['cat.6557.jpg', 'cat.327.jpg', 'cat.65.jpg', 'cat.243.jpg', 'cat.10629.jpg', 'cat.4179.jpg', 'cat.11125.jpg', 'cat.8824.jpg', 'cat.7439.jpg', 'cat.4687.jpg']
['dog.5907.jpg', 'dog.5663.jpg', 'dog.7991.jpg', 'dog.1576.jpg', 'dog.1397.jpg', 'dog.2656.jpg', 'dog.11533.jpg', 'dog.4108.jpg', 'dog.371.jpg', 'dog.8936.jpg']


#### 文件目录分类

In [20]:
def my_mkdir(dirname):
    if os.path.exists(dirname):
        shutil.rmtree(dirname)
    os.mkdir(dirname)

my_mkdir('train_classify')
my_mkdir('train_classify/cat')
my_mkdir('train_classify/dog')
my_mkdir('valid_classify')
my_mkdir('valid_classify/cat')
my_mkdir('valid_classify/dog')
my_mkdir('test_nolabel')

# my_mkdir('test_kaggle')
# os.symlink('../test/', 'test_kaggle/test')

for file in train_cat_list:
    os.symlink('../../train/' + file, 'train_classify/cat/' + file)
    
for file in train_dog_list:
    os.symlink('../../train/' + file, 'train_classify/dog/' + file)

for file in valid_cat_list:
    os.symlink('../../train/' + file, 'valid_classify/cat/' + file)

for file in valid_cat_list:
    os.symlink('../../train/' + file, 'valid_classify/dog/' + file)
    
for file in test_files:
    os.symlink('../test/' + file, 'test_nolabel/' + file)

# 对测试集预处理
X_test = []
for file in tqdm(test_files):
    img = image.load_img('test_nolabel/'+file, target_size=(224, 224))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    X_test.append(x)
    
    
gen = ImageDataGenerator() # 参考 https://keras.io/zh/preprocessing/image/#imagedatagenerator
train_generator = gen.flow_from_directory(directory="train_classify", 
                                          target_size=(224, 224), 
                                          shuffle=False, 
                                          batch_size=32,
                                          class_mode="categorical")

valid_generator = gen.flow_from_directory(directory='valid_classify',
                                               target_size=(224, 224),
                                               shuffle=False, 
                                               batch_size=32,
                                               class_mode='categorical')

test_generator = gen.flow_from_directory(directory="test_nolabel",
                                         target_size=(224, 224), 
                                         shuffle=False, 
                                         batch_size=32, 
                                         class_mode=None)



100%|██████████| 12500/12500 [01:09<00:00, 180.95it/s]


Found 20000 images belonging to 2 classes.
Found 5030 images belonging to 2 classes.
Found 0 images belonging to 0 classes.


## 步骤 2: 模型探索

#### ResNet50探索

In [29]:
from keras.callbacks import ModelCheckpoint  

# 构建不带分类器的预训练模型
base_model = ResNet50(input_tensor=Input((224, 224, 3)), weights='imagenet', include_top=False)
# 添加全局平均池化层
x = GlobalAveragePooling2D()(base_model.output)
# 添加Dropout
x = Dropout(0.5)(x)
# 添加一个分类器
predictions = Dense(1, activation='sigmoid')(x)
# 构建我们需要训练的完整模型
model = Model(inputs=base_model.input, outputs=predictions)
# 编译模型
model.compile(optimizer='adadelta', loss='binary_crossentropy', metrics=['accuracy'])

# model.fit(X_train, y_train, batch_size=128, nb_epoch=8, validation_split=0.2)
# X_train, y_train = shuffle(X_train, y_train)

#训练模型并保存具有最佳验证loss的模型权重
checkpointer = ModelCheckpoint(filepath='saved_models/weights.best.ResNet50.hdf5', 
                               verbose=1, save_best_only=True)
model.fit_generator(train_generator,
                    epochs=5,
                    validation_data=valid_generator,
                    callbacks=[checkpointer])


ModuleNotFoundError: No module named 'keras'

In [30]:
# 加载具有最佳验证loss的模型权重
model.load_weights('saved_models/weights.best.ResNet50.hdf5')

y_pred = model.predict(X_test, verbose=1)
# 将每个预测值限制到了 [0.005, 0.995] 个区间内，这个原因很简单，kaggle 官方的评估标准是 LogLoss，
# 对于预测正确的样本，0.995 和 1 相差无几，但是对于预测错误的样本，0 和 0.005 的差距非常大，是 15 和 2 的差别。
# https://docs.scipy.org/doc/numpy/reference/generated/numpy.clip.html
y_pred = y_pred.clip(min=0.005, max=0.995)

NameError: name 'model' is not defined

In [31]:
df = pd.read_csv("sample_submission.csv")

for i, fname in enumerate(test_generator.filenames):
    index = int(fname[fname.rfind('/')+1:fname.rfind('.')])
    df.set_value(index-1, 'label', y_pred[i])

df.to_csv('pred.csv', index=None)
df.head(10)

NameError: name 'pd' is not defined

#### InceptionV3探索

#### Xception探索

## 步骤 3: 模型搭建

上面单独使用了Resnet50, InceptionV3, Exception模型进行尝试，成绩都不是太理想；
下面会综合三个模型的特征来搭建一个全新的模型进行预测。

## 步骤 4: 模型训练

## 步骤 5: 模型调参

## 步骤 6: 模型评估

### 预测测试集

## 步骤 7: 可视化