# import

In [1]:
import pickle
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
from skimage.feature import local_binary_pattern
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn import preprocessing

In [2]:
# Shared image size: passed as the output size of the augmentation warps
# (cv2.warpAffine) and used as the HOG descriptor window size. The HOG block
# and cell sizes are derived from it as fractions.
# NOTE(review): presumably the pickled images are already this size - confirm.
pic_shape = (200, 200)

# 数据集导入

In [3]:
# Load the pickled train / test datasets. Each is indexed below by string keys
# ('data' here; 'target', 'dict' and 'file_name' in later cells).
# The files are opened in `with` blocks so the handles are closed
# deterministically instead of being left to the garbage collector.
# SECURITY NOTE: pickle.load executes arbitrary code embedded in the file;
# only load .pkl files that were produced locally / come from a trusted source.
with open('my_train.pkl', 'rb') as train_file:
    train_dataset = pickle.load(train_file)
with open('my_test.pkl', 'rb') as test_file:
    test_dataset = pickle.load(test_file)
print('训练集长度:', len(train_dataset['data']), '测试集长度:', len(test_dataset['data']))

训练集长度: 4750 测试集长度: 794


## data augmentation

In [4]:
# Offline augmentation of the training set. For every ORIGINAL image five
# variants are appended (with the same label), in this order:
#   flip around both axes, horizontal flip, rotation by 45, 90 and 135 degrees.
# CAUTION: the cell appends to the very lists it reads from, so running it a
# second time augments the already-augmented data again.

# getRotationMatrix2D(center, angle_in_degrees, scale); the matrices do not
# depend on the image, so they are built once up front.
rotation_center = (int(pic_shape[0]*0.5), int(pic_shape[1]*0.5))
rotation_matrices = [
    cv2.getRotationMatrix2D(rotation_center, angle, 1)
    for angle in (45, 90, 135)
]

# Snapshot the length first: only the images present before augmentation are
# visited, even though the lists grow while we iterate.
original_count = len(train_dataset['data'])
for idx in range(original_count):
    source_img = train_dataset['data'][idx]
    label = train_dataset['target'][idx]

    # Flip augmentation: flipCode -1 flips both axes, 1 flips horizontally.
    variants = [cv2.flip(source_img, -1), cv2.flip(source_img, 1)]
    # Rotation augmentation around the image centre, output size pic_shape.
    variants += [
        cv2.warpAffine(source_img, matrix, pic_shape)
        for matrix in rotation_matrices
    ]

    train_dataset['data'].extend(variants)
    train_dataset['target'].extend([label] * len(variants))

print('数据拓展结束')

## mask（植物分割掩码与锐化）

In [5]:
def create_mask_for_plant(image):
    """Return a binary mask selecting the green-ish pixels of ``image``.

    The image is converted with ``COLOR_BGR2HSV`` (so it is treated as BGR),
    thresholded to hue 60 +/- 35 (60 is green on OpenCV's 0-179 hue scale),
    saturation 100-255 and value 50-255, and the result is morphologically
    closed with an 11x11 elliptical kernel to fill small holes.

    Args:
        image: colour image accepted by ``cv2.cvtColor(..., COLOR_BGR2HSV)``.

    Returns:
        The single-channel mask produced by ``cv2.inRange`` after closing.
    """
    hue_tolerance = 35
    hsv_low = np.array([60 - hue_tolerance, 100, 50])
    hsv_high = np.array([60 + hue_tolerance, 255, 255])

    hsv_image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    raw_mask = cv2.inRange(hsv_image, hsv_low, hsv_high)

    closing_kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (11, 11))
    return cv2.morphologyEx(raw_mask, cv2.MORPH_CLOSE, closing_kernel)


def segment_plant(image):
    """Keep only the pixels of ``image`` selected by the plant mask.

    Pixels outside the mask returned by ``create_mask_for_plant`` are set to
    zero (``cv2.bitwise_and`` of the image with itself under that mask).

    Args:
        image: colour image, same requirements as ``create_mask_for_plant``.

    Returns:
        The masked image.
    """
    plant_mask = create_mask_for_plant(image)
    return cv2.bitwise_and(image, image, mask=plant_mask)


def sharpen_image(image):
    """Sharpen ``image`` by subtracting a blurred copy (unsharp masking).

    Computes ``1.5 * image - 0.5 * blurred`` via ``cv2.addWeighted``, where
    ``blurred`` is a Gaussian blur with sigma 3.

    Args:
        image: image accepted by ``cv2.GaussianBlur``.

    Returns:
        The sharpened image.
    """
    # ksize (0, 0) lets OpenCV derive the kernel size from sigma.
    blurred = cv2.GaussianBlur(image, (0, 0), 3)
    return cv2.addWeighted(image, 1.5, blurred, -0.5, 0)

## 特征提取

In [6]:
# HOG geometry, all derived from pic_shape:
#   window = whole image, block = 20% of each side, cell = 10% of each side.
# The block stride equals the block size, so blocks do not overlap.
winSize = pic_shape
blockSize = tuple(int(side * 0.2) for side in pic_shape)
blockStride = tuple(int(side * 0.2) for side in pic_shape)
cellSize = tuple(int(side * 0.1) for side in pic_shape)
nbins = 4  # orientation bins per cell
hog = cv2.HOGDescriptor(winSize, blockSize, blockStride, cellSize, nbins)

# ORB detector capped at 50 keypoints; the feature-extraction cell pads the
# descriptor matrix to a fixed (50, 32) shape based on this cap.
orb = cv2.ORB_create(nfeatures=50)

In [7]:
# Feature extraction: HOG, LBP histogram, ORB descriptors and a tiny grayscale
# thumbnail are computed for every train and test image with the SAME code
# path, so the two feature sets cannot drift apart.
winStride = (8, 8)
padding = (8, 8)


def _extract_features(img_data):
    """Compute the four feature vectors for a single image.

    The image is segmented (``segment_plant``), sharpened (``sharpen_image``)
    and converted to grayscale; every feature is computed on that grayscale.

    Args:
        img_data: one image from ``train_dataset['data']`` /
            ``test_dataset['data']`` (treated as BGR by the helpers).

    Returns:
        Tuple ``(hog_feature, lbp_hist, orb_feature, gray_feature)`` of 1-D
        arrays.
    """
    image_segmented = segment_plant(img_data)
    image_sharpen = sharpen_image(image_segmented)
    gray = cv2.cvtColor(image_sharpen, cv2.COLOR_BGR2GRAY)

    # Down-sampled 20x20 grayscale image, flattened.
    gray_feature = cv2.resize(gray, (20, 20)).reshape((-1,))

    # LBP code histogram (256 bins, normalised to a density).
    # Only `density=True` is passed: the old `normed` keyword was deprecated
    # and later removed from np.histogram, where it raises TypeError.
    # NOTE(review): the histogram range ends at this image's own max LBP code,
    # so bin edges can differ between images - confirm this is intended.
    lbp = local_binary_pattern(gray, P=8, R=3)
    lbp_hist, _ = np.histogram(lbp.reshape((-1,)), density=True, bins=256,
                               range=(0, lbp.max()))

    # ORB descriptors, forced to a fixed (50, 32) matrix:
    #   - no keypoints at all (descriptors is None) -> all zeros
    #   - fewer than 50 rows -> zero-padded at the bottom
    #   - more than 50 rows  -> truncated to the first 50
    _, descriptors = orb.detectAndCompute(gray, None)
    if descriptors is None:
        orb_matrix = np.zeros((50, 32))
    else:
        descriptors = descriptors[:50]
        orb_matrix = np.pad(descriptors,
                            ((0, 50 - descriptors.shape[0]), (0, 0)),
                            'constant')
    assert orb_matrix.shape == (50, 32)

    # HOG over the grayscale image, flattened to one vector.
    hog_feature = hog.compute(gray, winStride, padding).reshape((-1,))

    return hog_feature, lbp_hist, orb_matrix.reshape((-1,)), gray_feature


def _extract_feature_set(images):
    """Run ``_extract_features`` over ``images`` with a progress bar.

    Returns:
        Four lists ``(hog, lbp, orb, gray)``, each with one 1-D array per image.
    """
    hog_list, lbp_list, orb_list, gray_list = [], [], [], []
    for img_data in tqdm(images):
        hog_feature, lbp_hist, orb_feature, gray_feature = _extract_features(img_data)
        hog_list.append(hog_feature)
        lbp_list.append(lbp_hist)
        orb_list.append(orb_feature)
        gray_list.append(gray_feature)
    return hog_list, lbp_list, orb_list, gray_list


(train_HOG_feature, train_LBP_feature,
 train_ORB_feature, train_GRAY_feature) = _extract_feature_set(train_dataset['data'])

print('其中HOG特征维度',train_HOG_feature[0].shape)
print('其中LBP特征维度',train_LBP_feature[0].shape)
print('其中ORB特征维度',train_ORB_feature[0].shape)
print('其中GRAY特征维度',train_GRAY_feature[0].shape)

(test_HOG_feature, test_LBP_feature,
 test_ORB_feature, test_GRAY_feature) = _extract_feature_set(test_dataset['data'])
# 特征提取



  lbp_hist,_=np.histogram(lbp.reshape((-1,)), normed=True, density=True, bins=256, range=(0, max_bins))
100%|██████████████████████████████████████████████████████████████████████████████| 4750/4750 [00:49<00:00, 96.78it/s]
  lbp_hist,_ = np.histogram(lbp.reshape((-1,)), normed=True, density=True, bins=256, range=(0, max_bins))
  2%|█▏                                                                              | 12/794 [00:00<00:07, 104.60it/s]

其中HOG特征维度 (3600,)
其中LBP特征维度 (256,)
其中ORB特征维度 (1600,)
其中GRAY特征维度 (400,)


100%|███████████████████████████████████████████████████████████████████████████████| 794/794 [00:07<00:00, 100.28it/s]


In [8]:
# Feature normalisation (min-max scaling, per feature column).
#
# Two fixes compared with the previous version of this cell:
#   1. The scaled output of fit_transform() is now what gets stored.
#      Previously the result was passed straight back through
#      inverse_transform(), which restores the original values, so the
#      features left this cell un-normalised.
#   2. Each scaler is fitted on the TRAINING features only and then applied to
#      the test features with transform(). Fitting separate scalers on the
#      test set would map train and test onto different scales.
# Consequence: training columns lie in [0, 1]; test values may fall slightly
# outside that range when they exceed the training min/max.

train_mm_HOG = preprocessing.MinMaxScaler()
train_HOG_feature = train_mm_HOG.fit_transform(train_HOG_feature)
test_HOG_feature = train_mm_HOG.transform(test_HOG_feature)

train_mm_LBP = preprocessing.MinMaxScaler()
train_LBP_feature = train_mm_LBP.fit_transform(train_LBP_feature)
test_LBP_feature = train_mm_LBP.transform(test_LBP_feature)

train_mm_ORB = preprocessing.MinMaxScaler()
train_ORB_feature = train_mm_ORB.fit_transform(train_ORB_feature)
test_ORB_feature = train_mm_ORB.transform(test_ORB_feature)

train_mm_GRAY = preprocessing.MinMaxScaler()
train_GRAY_feature = train_mm_GRAY.fit_transform(train_GRAY_feature)
test_GRAY_feature = train_mm_GRAY.transform(test_GRAY_feature)

print('特征归一化完成')

特征归一化完成


In [9]:
# Feature fusion: concatenate the selected per-image feature blocks column-wise
# into one matrix per split.
# Current selection: HOG + LBP + ORB. Other combinations that were tried:
# HOG + LBP + ORB + GRAY, HOG + ORB, ORB only, HOG only - edit the two tuples
# below (keeping them in the same order) to switch.
train_blocks = (train_HOG_feature, train_LBP_feature, train_ORB_feature)
test_blocks = (test_HOG_feature, test_LBP_feature, test_ORB_feature)

train_feature = np.hstack([np.array(block) for block in train_blocks])
test_feature = np.hstack([np.array(block) for block in test_blocks])

print('train综合特征维度', train_feature.shape)
print('test综合特征维度', test_feature.shape)
print('特征提取结束')

train综合特征维度 (4750, 5456)
test综合特征维度 (794, 5456)
特征提取结束


In [10]:
from sklearn.decomposition import KernelPCA
# Dimensionality reduction with an RBF kernel PCA.
# Train and test rows are stacked (train first) and embedded together;
# `train_len` records where the training rows end so the result can be split
# back afterwards.
# NOTE(review): in the visible cells `newData` is never assigned back to
# train_feature / test_feature (that split is commented out in the next cell),
# so the classifier below still sees the un-reduced features.
print('数据降维开始')
n_components = 2000
train_len = train_feature.shape[0]
data = np.vstack((train_feature, test_feature))
# A t-SNE embedding was tried here as an alternative:
# pca_tsne = TSNE(n_components=n_components)
# newData = pca_tsne.fit_transform(data)
sklearn_kpca = KernelPCA(kernel="rbf", gamma=15, n_components=n_components)
newData = sklearn_kpca.fit_transform(data)
print(newData.shape)

In [11]:
# train_feature=newData[0:train_len]
# assert train_len==len(train_feature)
# test_feature=newData[train_len:]


In [12]:
# #特征数据存储
# train_feature_dist=train_dataset.copy()
# train_feature_dist['data']=train_feature
# pickle.dump(train_feature_dist,open('/Users/mataoxun/code/python/pythonProject3/train_feature.pkl','wb'))
# test_feature_dist=test_dataset.copy()
# test_feature_dist['data']=test_feature
# pickle.dump(test_feature_dist,open('/Users/mataoxun/code/python/pythonProject3/test_feature.pkl','wb'))
# print('特征数据存储结束')





In [14]:
# Classification. Two alternative models that were tried are kept commented
# out for reference; the active model is XGBoost.

# # SVM classifier
# from sklearn import svm
# print('SVM分类开始')
# model = svm.SVC()
# model.fit(train_feature, train_dataset['target'])
# predicted = model.predict(test_feature)

# # Random-forest classifier
# print('随机森林分类开始')
# from sklearn.ensemble import RandomForestClassifier
# model = RandomForestClassifier()
# model.fit(train_feature, train_dataset['target'])
# predicted = model.predict(test_feature)

# XGBoost classifier (active): fit on the fused training features, then
# predict a label for every test row.
print('Xgboost分类开始')
from xgboost import XGBClassifier
train_labels = train_dataset['target']
model = XGBClassifier(max_depth=5)
model.fit(train_feature, train_labels)
predicted = model.predict(test_feature)

print('测试集分类预测结束')

# Draft of a 5-fold cross-validation helper (disabled).
# from sklearn.model_selection import KFold, cross_val_score
# def f1_kf(my_model):
#     kf = KFold(5, shuffle=True, random_state=50).get_n_splits(train_feature)
#     result_list= np.sqrt(-cross_val_score(my_model, train_feature, train_dataset['target'], scoring="f1", cv = kf))
#     return(result_list)
# f1_kf(model)

Xgboost分类开始




测试集分类预测结束


In [15]:
# Build the submission file: start from the sample submission, overwrite the
# 'file' column with the test file names, translate every predicted label to
# its species name through train_dataset['dict'], and write submission.csv
# without the index column.
sub = pd.read_csv('sample_submission.csv')
label_to_species = train_dataset['dict']
sub['file'] = test_dataset['file_name']
sub['species'] = [label_to_species[label] for label in predicted]
sub.to_csv('submission.csv', index=False)