In [1]:
import h5py
import numpy as np
import pandas as pd
import pickle
from sklearn.utils import shuffle
from keras.utils import np_utils

from keras.models import *
from keras.layers import *
from keras.optimizers import RMSprop
import xgboost as xgb
from sklearn import svm

tmp_dir = 'cache/'
np.random.seed = 2017
SEED = 2017

Using TensorFlow backend.


In [2]:
# 载入特征向量
def load_data():
    X_train = []
    X_test = []
    for filename in ['gap_ResNet50.h5', 'gap_InceptionV3.h5', 'gap_Xception.h5']:
        with h5py.File(tmp_dir + filename, 'r') as h:
            X_train.append(np.array(h['train']))
            X_test.append(np.array(h['test']))
            y_train = np.array(h['label'])
    X_train = np.concatenate(X_train, axis=1)
    X_test = np.concatenate(X_test, axis=1)
    return(X_train, y_train, X_test)

In [3]:
def load_info():
    with open(tmp_dir + 'data_info.pkl', 'rb') as f:
        data_info = pickle.load(f)
    train_info = data_info['train']
    test_info = data_info['test']
    label_dict = data_info['label']
    label_dict_reverse = {value: key for key, value in label_dict.items()}
    return(train_info, test_info, label_dict, label_dict_reverse)

In [4]:
# 数据预处理函数，对神经网络和xgboost分别将whether_to_categorical设置为True和False
def preproc(X_train, train_df, label_dict, whether_to_categorical=True):
    df = train_df.copy()
    df['label'] = train_df['label'].map(label_dict)
    if whether_to_categorical:
        y_train = np_utils.to_categorical(df.label)
    else:
        y_train = df.label.values
    X_train, y_train = shuffle(X_train, y_train)
    return(X_train, y_train)

In [5]:
# 全连接层预测函数
def cnn_predict(X_train, y_train, X_test, epochs=45):
    input_tensor = Input(X_train.shape[1:])
    x = Dropout(0.7)(input_tensor)
    x = Dense(100, activation = 'sigmoid')(x)
    model = Model(input_tensor, x)
    model.compile(optimizer=RMSprop(lr = 1e-4), loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, batch_size=256, epochs=epochs, validation_split=0.2)
    preds = model.predict(X_test, verbose=1)
    return(preds)

In [6]:
# xgboost预测函数，参数待优化
def xgb_predict(X_train, y_train, X_test):
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=SEED)
    dtrain = xgb.DMatrix(X_train, label = y_train)
    dtest = xgb.DMatrix(X_val, label = y_val)
    param = {'learning_rate': 0.1, 'n_estimators': 300, 'max_depth': 5,
             'min_child_weight': 5, 'gamma': 1, 'subsample': 0.8, 'colsample_bytree': 0.8,
             'scale_pos_weight': 1, 'eta': 0.05, 'silent': 1, 'objective': 'multi:softprob', 'num_class': 100}
    num_round = 100
    plst = list(param.items())
    evallist = [(dtrain, 'train'), (dtest, 'eval')]
    bst = xgb.train(plst, dtrain, num_round, evallist)
        
    dtest = xgb.DMatrix(X_test)
    preds = bst.predict(dtest).reshape(X_test.shape[0], 100)
    return(preds)

In [7]:
def svm_predict(X_train, y_train, X_test):
    clf = svm.SVC(decision_function_shape='ovr')
    clf.fit(X_train, y_train)
    preds = clf.decision_function(X_test)
    return(preds)

In [8]:
# 对预测结果进行后处理，将label映射回134类
def postproc(preds, label_dict):
    preds = np.argmax(preds, axis=1)
    preds = pd.Series(preds)
    label_dict_reverse = {value: key for key, value in label_dict.items()}
    preds = preds.map(label_dict_reverse)
    return(preds.values)

In [9]:
# 移除数据函数
def remove_ids(X_train, train_df, rm_ids):
    """移除id在rm_ids列表中的数据"""
    remains = (1 - train_df.id.isin(rm_ids)).astype(bool).values
    X_remains = X_train[remains]
    remains_df = train_df[remains]
    return(X_remains, remains_df)

# label修改函数
def modifi_labels(train_df, modifi_df):
    """对train_df中的部分label进行修改，以modifi_df中的label为准"""
    df = pd.merge(train_df, modifi_df, on='id', how='inner')
    df = df[df.label_x != df.label_y]
    train = train_df.set_index(['id'])
    for i, row in df.iterrows():
        train.loc[row.id, 'label'] = row.label_y
    train = train.reset_index()
    return(train)

In [10]:
# 对repeated_df中重复的数据进行预测，将预测结果存入prediction_of_repeated.pkl文件
# 若预测值与之前的已有的两个label都不相同，删除之；若与其中一个相同，修改为预测值
def modifi_repeated(X_data, data_info, label_dict, epochs = 45):
    dump_file = tmp_dir + 'prediction_of_repeated.pkl'
    if os.path.exists(dump_file):
        with open(dump_file, 'rb') as f:
            pred_df = pickle.load(f)
    else:
        repeated_file = 'cache/repeated_df.pkl'
        with open(repeated_file, 'rb') as f:
            repeated = pickle.load(f)
        remains = data_info.id.isin(repeated.id.values)
        X_test = X_data[remains]
        test_info = data_info[remains]
        X_train, train_info = remove_ids(X_data, data_info, repeated.id.values)
        X_train, y_train = preproc(X_train, train_info, label_dict)
        y_preds = cnn_predict(X_train, y_train, X_test, epochs)
        test_info['label'] = postproc(y_preds, label_dict)
        pred_df = pd.merge(test_info, repeated, on='id', how='left')
        with open(dump_file, 'wb') as f:
            pickle.dump(pred_df, f)
    
    rm_ids = pred_df[(pred_df.label != pred_df.label_x) & (pred_df.label != pred_df.label_y)]['id']
    modifi_df = pred_df[(pred_df.label == pred_df.label_x) | (pred_df.label == pred_df.label_y)]
    X_data, data_info = remove_ids(X_data, data_info, rm_ids.values)
    data_info = modifi_labels(data_info, modifi_df[['id', 'label']])
    return(X_data, data_info)

In [11]:
# 对所有数据进行交叉预测，返回预测概率
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
def cross_predict(n_splits = 10, epochs=70):
    dump_file = tmp_dir + 'cross_predict.pkl'
    if os.path.exists(dump_file):
        with open(dump_file, 'rb') as f:
            df = pickle.load(f)
    else:
        train_info, test_info, label_dict, label_dict_reverse = load_info()
        X_train, y_train, X_test = load_data()
        X, y_df = shuffle(X_train, train_info)
        kf = KFold(n_splits = n_splits)
        y_preds = []
        for train, test in kf.split(X):
            print(u'\n第%d次交叉预测' % (len(y_preds) + 1))
            X_train, X_test, y_train_df, y_test_df = X[train], X[test], y_df.iloc[train,:], y_df.iloc[test,:]
            X_train, y_train = preproc(X_train, y_train_df, label_dict)
            y_pred = cnn_predict(X_train, y_train, X_test, epochs)
            y_preds.append(y_pred)
        y_preds = np.concatenate(y_preds)
    
        preds = []
        max_probas = []
        for ys in y_preds:
            pred = np.argmax(ys)
            total = sum(ys)
            ys = ys / total
            max_proba = max(ys)
            preds.append(pred)
            max_probas.append(max_proba)
        df = y_df.copy()
        df['pred'] = preds
        df['max_proba'] = max_probas
        df['pred'] = df['pred'].map(label_dict_reverse).astype(int)
    
        wrong_rate = df[df.label != df.pred].shape[0] / df.shape[0]
        print(u'%d折交叉预测错误率: %f' % (n_splits, wrong_rate))
        with open(dump_file, 'wb') as f:
            pickle.dump(df, f)
    return(df)

In [12]:
# 依据cross_predict函数预测的概率
# 若概率大于modifi_point，说明该数据可能标注错误，修改为预测的lable
# 若概率小于rm_point, 可能不是狗的图片或狗的特征不明显的图片，删除它们
def remove_and_modifi(X_train, train_info, modifi_point = 0.9, rm_point = 0.5):
    cv_df = cross_predict()
    modifi_df = cv_df[(cv_df.label != cv_df.pred) & (cv_df.max_proba > modifi_point)]
    rm_ids = cv_df[(cv_df.label != cv_df.pred) & (cv_df.max_proba < rm_point)]['id']
    print(u'修改阈值为%f, 修改数据%d/%d个' % (modifi_point, modifi_df.shape[0], X_train.shape[0]))
    print(u'删除阈值为%f, 删除数据%d/%d个' % (rm_point, rm_ids.shape[0], X_train.shape[0]))
    modifi_df = modifi_df[['id', 'pred']]
    modifi_df.columns = ['id', 'label']
    train_info = modifi_labels(train_info, modifi_df)
    X_train, train_info = remove_ids(X_train, train_info, rm_ids)
    
    return(X_train, train_info)

In [13]:
# 测试函数，使用remove_and_modifi函数对数据进行删除和修改label后，测试其泛化表现
def remove_and_modifi_val(model, X_train, train_info, label_dict, modifi_point = 1, rm_point = 0):
    X_train, X_val, train_info, val_info = train_test_split(X_train, train_info, test_size=0.3, random_state=SEED)
    if (modifi_point < 1) or (rm_point > 0):
        X_train, train_info = remove_and_modifi(X_train, train_info, modifi_point, rm_point)
    if model == 'cnn':
        X_train, y_train = preproc(X_train, train_info, label_dict, True)
        y_preds = cnn_predict(X_train, y_train, X_val, 70)
    elif model == 'xgb':
        X_train, y_train = preproc(X_train, train_info, label_dict, False)
        y_preds = xgb_predict(X_train, y_train, X_val)
    val_info['pred'] = postproc(y_preds, label_dict)
    wrong_num = val_info[val_info.label != val_info.pred].shape[0]
    print(u'\n在验证集上错误率为%f' % (wrong_num / val_info.shape[0]))

In [20]:
train_info, test_info, label_dict, label_dict_reverse = load_info()
X_train, y_train, X_test = load_data()
print(u'修改重复数据前有数据%d个' % X_train.shape[0])
X_train, train_info = modifi_repeated(X_train, train_info, label_dict, 70)
print(u'修改重复数据后有数据%d个' % X_train.shape[0])
X_train, train_info = shuffle(X_train, train_info)

修改重复数据前有数据18686个
修改重复数据后有数据18673个


In [15]:
# 不断调整modifi_point和rm_point的取值进行交叉训练，
# 得出在modifi_point=0.88, rm_point=0.3时，线下得分最高
remove_and_modifi_val('cnn', X_train, train_info, label_dict, 0.88, 0.35)

修改阈值为0.880000, 修改数据754/13071个
删除阈值为0.350000, 删除数据200/13071个
Train on 10346 samples, validate on 2587 samples
Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70
Epoch 24/70
Epoch 25/70
Epoch 26/70
Epoch 27/70
Epoch 28/70
Epoch 29/70
Epoch 30/70
Epoch 31/70
Epoch 32/70
Epoch 33/70
Epoch 34/70
Epoch 35/70
Epoch 36/70
Epoch 37/70
Epoch 38/70
Epoch 39/70
Epoch 40/70
Epoch 41/70
Epoch 42/70
Epoch 43/70
Epoch 44/70
Epoch 45/70
Epoch 46/70
Epoch 47/70
Epoch 48/70
Epoch 49/70
Epoch 50/70
Epoch 51/70
Epoch 52/70
Epoch 53/70
Epoch 54/70
Epoch 55/70
Epoch 56/70
Epoch 57/70
Epoch 58/70
Epoch 59/70
Epoch 60/70
Epoch 61/70
Epoch 62/70
Epoch 63/70


Epoch 64/70
Epoch 65/70
Epoch 66/70
Epoch 67/70
Epoch 68/70
Epoch 69/70
Epoch 70/70
在验证集上错误率为0.198322


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


In [19]:
# 将提取后的特征向量，使用全连接层训练、和xgboost训练后进行融合
X_train, X_val, train_info, val_info = train_test_split(X_train, train_info, test_size=0.3, random_state=SEED)
X_train, train_info = remove_and_modifi(X_train, train_info, modifi_point = 0.88, rm_point = 0.3)

X_train1, y_train1 = preproc(X_train, train_info, label_dict, True)
X_train2, y_train2 = preproc(X_train, train_info, label_dict, False)

pred1 = cnn_predict(X_train1, y_train1, X_val, 70)
pred2 = xgb_predict(X_train2, y_train2, X_val)

pred3 = pred1.copy()
for i in range(len(pred1)):
    total = sum(pred1[i])
    ys = pred1[i] / total
    pred3[i] = ys

preds = pred3*0.55 + pred2*0.45
val_info['pred'] = postproc(preds, label_dict)
wrong_num = val_info[val_info.label != val_info.pred].shape[0]
print(u'\n在验证集上错误率为%f' % (wrong_num / val_info.shape[0]))

修改阈值为0.880000, 修改数据754/13071个
删除阈值为0.300000, 删除数据103/13071个
Train on 10399 samples, validate on 2600 samples
Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70
Epoch 24/70
Epoch 25/70
Epoch 26/70
Epoch 27/70
Epoch 28/70
Epoch 29/70
Epoch 30/70
Epoch 31/70
Epoch 32/70
Epoch 33/70
Epoch 34/70
Epoch 35/70
Epoch 36/70
Epoch 37/70
Epoch 38/70
Epoch 39/70
Epoch 40/70
Epoch 41/70
Epoch 42/70
Epoch 43/70
Epoch 44/70
Epoch 45/70
Epoch 46/70
Epoch 47/70
Epoch 48/70
Epoch 49/70
Epoch 50/70
Epoch 51/70
Epoch 52/70
Epoch 53/70
Epoch 54/70
Epoch 55/70
Epoch 56/70
Epoch 57/70
Epoch 58/70
Epoch 59/70
Epoch 60/70
Epoch 61/70
Epoch 62/70
Epoch 63/70
Epoch 64/70


Epoch 65/70
Epoch 66/70
Epoch 67/70
Epoch 68/70
Epoch 69/70
Epoch 70/70
[1]	train-merror:0.407627	eval-merror:0.425385
[2]	train-merror:0.304869	eval-merror:0.336667
[3]	train-merror:0.249698	eval-merror:0.297692
[4]	train-merror:0.209474	eval-merror:0.269744
[5]	train-merror:0.178921	eval-merror:0.253333
[6]	train-merror:0.156061	eval-merror:0.242308
[7]	train-merror:0.141224	eval-merror:0.235897
[8]	train-merror:0.126278	eval-merror:0.226923
[9]	train-merror:0.115397	eval-merror:0.224872
[10]	train-merror:0.102649	eval-merror:0.220769
[11]	train-merror:0.094846	eval-merror:0.214103
[12]	train-merror:0.085944	eval-merror:0.213333
[13]	train-merror:0.080009	eval-merror:0.208462
[14]	train-merror:0.073854	eval-merror:0.20641
[15]	train-merror:0.069568	eval-merror:0.204615
[16]	train-merror:0.062754	eval-merror:0.20359
[17]	train-merror:0.057918	eval-merror:0.202308
[18]	train-merror:0.052313	eval-merror:0.202308
[19]	train-merror:0.049236	eval-merror:0.200769
[20]	train-merror:0.047038	

In [21]:
# 将提取后的特征向量，使用全连接层训练、和xgboost训练后进行融合
X_train, train_info = remove_and_modifi(X_train, train_info, modifi_point = 0.88, rm_point = 0.3)

X_train1, y_train1 = preproc(X_train, train_info, label_dict, True)
X_train2, y_train2 = preproc(X_train, train_info, label_dict, False)

pred1 = cnn_predict(X_train1, y_train1, X_test, 70)
pred2 = xgb_predict(X_train2, y_train2, X_test)

pred3 = pred1.copy()
for i in range(len(pred1)):
    total = sum(pred1[i])
    ys = pred1[i] / total
    pred3[i] = ys

preds = pred3*0.55 + pred2*0.45
preds = postproc(preds, label_dict)
submission = pd.DataFrame({'label': preds, 'id':test_info.id})

from datetime import datetime
submission.to_csv('sub/sub_%s_3.txt' % datetime.now().strftime('%Y-%m-%d'),
                  sep='\t', columns=['label', 'id'], header=False, index=False)
submission.head(10)

修改阈值为0.880000, 修改数据754/18673个
删除阈值为0.300000, 删除数据103/18673个
Train on 14856 samples, validate on 3714 samples
Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70
Epoch 24/70
Epoch 25/70
Epoch 26/70
Epoch 27/70
Epoch 28/70
Epoch 29/70
Epoch 30/70
Epoch 31/70
Epoch 32/70
Epoch 33/70
Epoch 34/70
Epoch 35/70
Epoch 36/70
Epoch 37/70
Epoch 38/70
Epoch 39/70
Epoch 40/70
Epoch 41/70
Epoch 42/70
Epoch 43/70
Epoch 44/70
Epoch 45/70
Epoch 46/70
Epoch 47/70
Epoch 48/70
Epoch 49/70
Epoch 50/70
Epoch 51/70
Epoch 52/70
Epoch 53/70
Epoch 54/70
Epoch 55/70
Epoch 56/70
Epoch 57/70
Epoch 58/70
Epoch 59/70
Epoch 60/70
Epoch 61/70
Epoch 62/70
Epoch 63/70


Epoch 64/70
Epoch 65/70
Epoch 66/70
Epoch 67/70
Epoch 68/70
Epoch 69/70
Epoch 70/70
[1]	train-merror:0.368259	eval-merror:0.39221
[2]	train-merror:0.287791	eval-merror:0.332795
[3]	train-merror:0.238018	eval-merror:0.293664
[4]	train-merror:0.199477	eval-merror:0.274098
[5]	train-merror:0.171859	eval-merror:0.258122
[6]	train-merror:0.151242	eval-merror:0.247532
[7]	train-merror:0.135395	eval-merror:0.236582
[8]	train-merror:0.123009	eval-merror:0.2303
[9]	train-merror:0.112778	eval-merror:0.223299
[10]	train-merror:0.102316	eval-merror:0.219171
[11]	train-merror:0.09293	eval-merror:0.215581
[12]	train-merror:0.08693	eval-merror:0.209478
[13]	train-merror:0.080314	eval-merror:0.20858
[14]	train-merror:0.074467	eval-merror:0.205529
[15]	train-merror:0.067621	eval-merror:0.203016
[16]	train-merror:0.064005	eval-merror:0.204452
[17]	train-merror:0.059158	eval-merror:0.20158
[18]	train-merror:0.055927	eval-merror:0.200682
[19]	train-merror:0.051927	eval-merror:0.198349
[20]	train-merror:0.

Unnamed: 0,id,label
0,17059665971055292353,36
1,24731922354122785580,45
2,41880133422932602063,78
3,10227905702912884051,26
4,10314919462813127030,34
5,23575883972237003912,88
6,14437285902022328737,67
7,28951333633781516925,68
8,15519661372044502403,38
9,15730698511977047833,42


In [26]:
import os
import shutil
def rmrf_mkdir(dirname):
    if os.path.exists(dirname):
        shutil.rmtree(dirname)
    os.mkdir(dirname)
def create_wrong_classify(train_df, predict_df):
    files_dir = 'data/wrong_classify/train/'
    rmrf_mkdir(files_dir)
    label_name = pd.read_table('data/label_name.txt', header=None, sep=' ', na_filter=False)
    label_name.columns = ['name', 'label']
    label_name['name'] = label_name['name'].apply(lambda x: x.split('---')[-1])\
                                           .apply(lambda x: x.split('|')[0]).apply(lambda x: x.split('/')[0])
    label_set = set(train_df.label)
    remains = label_name.label.isin(label_set)
    label_name = label_name[remains]
    for i, x in label_name.iterrows():
        target_dir = files_dir + str(x['label']) + '_' + x['name']
        os.mkdir(target_dir)
        os.mkdir(target_dir + '/wrong')
    train_df = pd.merge(train_df, label_name, on='label', how='left')
    train_df = pd.merge(train_df, predict_df[['id', 'pred', 'max_proba']], on='id', how='left')
    for i, x in train_df.iterrows():
        target_dir = files_dir + str(x['label']) + '_' + x['name'] + '/'
        shutil.copyfile('data/alldata/' + x.id + '.jpg',
                         target_dir + x.id + '.jpg')
        if x.label != x.pred:
            shutil.move(target_dir + x.id + '.jpg',
                            target_dir + 'wrong/' + x.id + '_' + str(int(x.pred)) + '_' + '%.2e'%x.max_proba + '.jpg')

In [27]:
# create_wrong_classify函数为辅助函数，将预测错误的图片分在不同的文件夹，方便手动寻找预测错误的规律
cv_df = cross_predict()
create_wrong_classify(train_info, cv_df)