# Plant Disease Identification with XGBoost

## Import libraries

In [1]:
import numpy as np
import os
import cv2
from numba import jit, njit

from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

## Data preparation 

In [2]:
# TODO: handle data from scratch
# Target size handed to cv2.resize for every image.
IMG_SIZE = (256, 256)
# Dataset root, resolved relative to the current working directory.
data_dir = os.path.join(
    os.getcwd(),
    'data/New Plant Diseases Dataset(Augmented)',
)

In [3]:
@njit
def reshape_array(arr, shape):
    return np.reshape(arr, shape)

In [4]:
def read_data(path, get_labels=True):
    """Load every image below ``path`` into one array, optionally with labels.

    ``path`` is expected to contain one sub-directory per class, each holding
    image files. Every readable image is resized to ``IMG_SIZE`` and converted
    from OpenCV's BGR channel order to RGB. Entries of ``path`` that are not
    directories are ignored, and files OpenCV cannot decode are skipped with a
    warning, so one stray file does not abort the whole load.

    Args:
        path: Directory whose sub-directories hold the images.
        get_labels: When true, also return one label per loaded image. The
            label is the part of the sub-directory name after the first
            ``'___'`` separator, so those names must contain ``'___'``.

    Returns:
        ``X`` (a NumPy array stacking the loaded images) when ``get_labels``
        is false, otherwise the tuple ``(X, y)`` where ``y`` is a list of
        label strings with exactly one entry per image in ``X``.

    Raises:
        IndexError: If ``get_labels`` is true and a sub-directory name does
            not contain the ``'___'`` separator.
    """
    import warnings

    X = []
    y = []
    # os.listdir returns entries in arbitrary order; sort so that the sample
    # order is reproducible between runs and machines.
    for dirname in sorted(os.listdir(path)):
        disease_dir_path = os.path.join(path, dirname)
        if not os.path.isdir(disease_dir_path):
            # Stray files next to the class folders are not classes.
            continue

        label = dirname.split('___')[1] if get_labels else None

        for file in sorted(os.listdir(disease_dir_path)):
            file_path = os.path.join(disease_dir_path, file)
            img = cv2.imread(file_path, cv2.IMREAD_COLOR)
            if img is None:
                # cv2.imread signals an unreadable file by returning None
                # instead of raising; resizing None would crash.
                warnings.warn(f'Skipping unreadable image: {file_path}')
                continue
            img = cv2.resize(img, IMG_SIZE)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

            X.append(img)
            # Label per loaded image (not per directory entry) so that X and
            # y stay aligned even when some files were skipped.
            if get_labels:
                y.append(label)

    X = np.array(X)

    if get_labels:
        return X, y
    return X

In [5]:
# Load the training split: images plus one label per image.
X_train, y_train = read_data(
    os.path.join(data_dir, 'train'),
    get_labels=True,
)

In [6]:
# Flatten every image into a single feature row: (n_samples, n_features)
X_train = reshape_array(X_train, (X_train.shape[0], -1))

# Encode the string labels as integers. Keep the fitted encoder so that
# integer predictions can later be mapped back to class names with
# label_encoder.inverse_transform(...).
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)

## Train

In [None]:
# Gradient-boosted tree classifier with the library's default hyperparameters.
model = XGBClassifier()
# Fit on the flattened pixel rows and their integer-encoded labels.
model.fit(X=X_train, y=y_train)

In [None]:
# Accuracy on the training data itself (not a held-out set), so this measures
# how well the model fits what it has already seen.
accuracy_score(y_true=y_train, y_pred=model.predict(X_train))