In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import

In [None]:
!pip install fishervector

In [None]:
from cv2 import SIFT_create, imread

import numpy as np
import pandas as pd
import pickle
import gc

from fishervector import FisherVectorGMM
from xgboost import XGBClassifier

from sklearn.preprocessing import LabelEncoder, Normalizer
# from sklearn.preprocessing import Normalizer
from sklearn.model_selection import GridSearchCV

In [None]:
# Read the training labels (one row per image_id).
train_df = pd.read_csv('../input/plant-pathology-2020-fgvc7/train.csv')

# Reshape to long format (image_id, variable, value) and keep only the rows whose value is
# not 0, so that `variable` names the class of each image.
# NOTE(review): assumes every non-id column of train.csv is a 0/1 class indicator and that
# each image has exactly one non-zero indicator - confirm against the dataset.
train_label = train_df.melt(['image_id'])
train_label = train_label[train_label['value'] != 0]

# Attach the class name to every image row; the helper `value` column is no longer needed.
train_df = (
    train_df
    .merge(train_label, on='image_id')
    .drop(columns=['value'])
    .rename(columns={'variable': 'class'})
)

# Encode the class names as integer labels; `le` is kept so the mapping can be inverted later.
le = LabelEncoder()
train_df['label'] = le.fit_transform(train_df['class'])
train_df

# SIFT + Fisher Vector

In [None]:
# SIFT extractor shared by the descriptor-extraction loop below.
# NOTE(review): contrastThreshold is set explicitly to 0.075 - presumably to discard weak,
# low-contrast keypoints; confirm the intent against the OpenCV SIFT documentation.
sift = SIFT_create(contrastThreshold=0.075)

In [None]:
# Extract SIFT descriptors for every training image, in the row order of train_df.
# sift_des[i] holds the descriptor array of the i-th image, or None when SIFT produced no
# descriptors for it; none_count tallies those images.
sift_des = []
none_count = 0

for image_name in train_df['image_id']:
    image_path = f'../input/plant-pathology-2020-fgvc7/images/{image_name}.jpg'
    img = imread(image_path)
    if img is None:
        # imread reports a missing or unreadable file by returning None instead of raising;
        # fail here with the offending path rather than with an opaque OpenCV error below.
        raise FileNotFoundError(f'Could not read image: {image_path}')

    # Detect the keypoints and compute their descriptors in a single pass over the image.
    kp, des = sift.detectAndCompute(img, None)

    if des is None:
        none_count += 1

    # Keep the None placeholder so that sift_des stays aligned with the rows of train_df.
    sift_des.append(des)

print(f'#None descriptor: {none_count}')

In [None]:
# Persist the raw per-image descriptor list (None placeholders included) so the expensive
# extraction step does not have to be repeated.
with open('sift_des.pickle', 'wb') as f:
    pickle.dump(sift_des, f, pickle.HIGHEST_PROTOCOL)

# Pool the descriptors of all images into a single 2-D matrix (one descriptor per row).
# Images without descriptors are stored as None and must be skipped here: np.vstack cannot
# stack a None entry together with the 2-D descriptor arrays.
train_sift = np.vstack([des for des in sift_des if des is not None])

# Add a leading axis of length 1, i.e. (1, n_descriptors, descriptor_dim); this is the array
# that is handed to the GMM fit.
train_sift = train_sift[np.newaxis, ...]

In [None]:
# Fit the Fisher-vector GMM on the pooled SIFT descriptors.
# NOTE(review): [10, 30, 60] and 'fv_gmm.pickle' are passed positionally to fit_by_bic -
# presumably the candidate numbers of mixture components compared by BIC and the path the
# fitted model is dumped to; confirm against the fishervector package documentation.
fv_gmm = FisherVectorGMM().fit_by_bic(train_sift, [10, 30, 60], 'fv_gmm.pickle')

In [None]:
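# Optional shortcut: uncomment the lines below to reload the pickled descriptors and the
# fitted GMM from the '../input/app-cq18/' dataset instead of recomputing them above.
# NOTE: pickle.load can execute arbitrary code - only load pickle files you trust.
# with open('../input/app-cq18/sift_des.pickle', 'rb') as f:
#     sift_des = pickle.load(f)

# with open('../input/app-cq18/fv_gmm.pickle', 'rb') as f:
#     fv_gmm = pickle.load(f)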

In [None]:
# Encode the descriptor set of each image as one flattened Fisher-vector row.
# Entries of sift_des that are None (or empty) mark images for which SIFT found no
# descriptors; they cannot be passed to the GMM and are filled in afterwards.
fv_rows = [
    None if des is None or len(des) == 0
    else np.asarray(fv_gmm.predict(des[np.newaxis, ...])).reshape(1, -1)
    for des in sift_des
]

encoded_rows = [row for row in fv_rows if row is not None]
if not encoded_rows:
    raise ValueError('No image has SIFT descriptors; cannot build Fisher vectors.')

# Images without descriptors get an all-zero Fisher vector, so every row of train_df keeps a
# matching feature row and the labels stay aligned.
blank_row = np.zeros_like(encoded_rows[0])
train_fv = np.concatenate([blank_row if row is None else row for row in fv_rows])

# Guard against a descriptor list that does not belong to train_df (e.g. a stale pickle).
if train_fv.shape[0] != train_df.shape[0]:
    raise ValueError(
        f'Got {train_fv.shape[0]} Fisher vectors for {train_df.shape[0]} training rows.'
    )

train_fv.shape

In [None]:
# Rescale every Fisher-vector row with sklearn's Normalizer. copy=False lets the
# transform work on the existing array instead of allocating a second one.
normalizer = Normalizer(copy=False).fit(train_fv)
train_fv = normalizer.transform(train_fv)

In [None]:
# Hyper-parameter candidates for the XGBoost grid search.
boosters = ['gbtree', 'gblinear']
n_estimators = [50, 100, 150, 200]
max_depth = [3, 6, 12, 18]
# learning_rate is not tuned here; to tune it, add e.g. 'learning_rate': [0.01, 0.1, 0.3]
# to the grids below.

xgboost_model = XGBClassifier(objective='multi:softmax', use_label_encoder=False,
                              num_class=4, eval_metric='merror')

# max_depth is a tree-booster parameter that the linear booster does not use, so crossing it
# with 'gblinear' would only repeat identical fits. Build one grid per booster and include
# max_depth only where it has an effect.
param_grid = [
    {
        'booster': [booster],
        'n_estimators': n_estimators,
        **({} if booster == 'gblinear' else {'max_depth': max_depth}),
    }
    for booster in boosters
]

# refit=False: only the cross-validation results are kept; no final model is trained on the
# full training set.
grid_search = GridSearchCV(xgboost_model, param_grid,
                           scoring='balanced_accuracy', cv=5, verbose=4, refit=False)

In [None]:
# Run the cross-validated grid search on the Fisher-vector features and the encoded labels.
grid_search = grid_search.fit(X=train_fv, y=train_df['label'])

# References (Tài liệu tham khảo)
- https://towardsdatascience.com/using-nmf-to-classify-companies-a77e176f276f
- https://github.com/jonasrothfuss/fishervector
- https://hal.inria.fr/hal-00779493/file/RR-8209.pdf
- https://viblo.asia/p/sift-scale-invariant-feature-transform-huan-luyen-mo-hinh-cho-cac-bai-toan-phan-loai-924lJqJaZPM
- https://xgboost.readthedocs.io/en/stable/
- https://towardsdatascience.com/xgboost-fine-tune-and-optimize-your-model-23d996fab663
- https://medium.com/analytics-vidhya/what-makes-xgboost-so-extreme-e1544a4433bb