**Paddy Doctor**

#Import data and libraries

In [1]:
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
# import librairies
import os
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras import Model, optimizers
from tensorflow.keras.applications import xception
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import plot_model
from tensorflow.keras import backend as K
from tensorflow.keras.models import load_model

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import AveragePooling2D
from tensorflow.keras.layers import GlobalAveragePooling2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.utils import to_categorical

!pip install tensorflow-addons
!pip install -q -U keras-tuner
import tensorflow_addons as tfa
import keras_tuner as kt

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

from lightgbm import LGBMClassifier

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow-addons
  Downloading tensorflow_addons-0.17.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 4.5 MB/s 
Installing collected packages: tensorflow-addons
Successfully installed tensorflow-addons-0.17.1
[K     |████████████████████████████████| 133 kB 5.1 MB/s 
[?25h

##data exploration

In [3]:
df_meta = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/paddy/train.csv")
df_meta.head()

Unnamed: 0,image_id,label,variety,age
0,100330.jpg,bacterial_leaf_blight,ADT45,45
1,100365.jpg,bacterial_leaf_blight,ADT45,45
2,100382.jpg,bacterial_leaf_blight,ADT45,45
3,100632.jpg,bacterial_leaf_blight,ADT45,45
4,101918.jpg,bacterial_leaf_blight,ADT45,45


In [4]:
train_path = "/content/drive/MyDrive/Colab Notebooks/paddy/train_images"
test_path = "/content/drive/MyDrive/Colab Notebooks/paddy/test_images"

In [5]:
disease_list = os.listdir(train_path)
n_classes = len(disease_list)
print(disease_list)
print("Total diseases number:", n_classes)

['brown_spot', 'downy_mildew', 'tungro', 'hispa', 'normal', 'bacterial_leaf_streak', 'blast', 'dead_heart', 'bacterial_panicle_blight', 'bacterial_leaf_blight']
Total diseases number: 10


In [None]:
# Counts the number of training and testing samples in the directories
training_samples = sum([len(files) for r, d, files in os.walk(train_path)])
testing_samples = sum([len(files) for r, d, files in os.walk(test_path)])

In [None]:
print(training_samples)
print(testing_samples)

6411
3469


## Lightgbm classifier on image features + metadata

### preprocessing

In [6]:
img_width = 200
img_height = 200
batch_size = 32

epochs = 100
epochs_ft = 200

nb_classes = 10

In [7]:
train_datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.3,
    height_shift_range=0.2,
    shear_range=0.3,
    zoom_range=0.3,
    horizontal_flip=True,
    vertical_flip=False,
    validation_split=0.2,
    preprocessing_function=tf.keras.applications.resnet_v2.preprocess_input,
)

test_datagen = ImageDataGenerator(
    preprocessing_function=tf.keras.applications.resnet_v2.preprocess_input
)

In [8]:
train_generator = train_datagen.flow_from_directory(
    train_path,
    subset="training",
    target_size=(img_width, img_height),
    batch_size=batch_size,
    class_mode="categorical",
)

valid_generator = train_datagen.flow_from_directory(
    train_path,
    subset="validation",
    target_size=(img_width, img_height),
    batch_size=batch_size,
    class_mode="categorical",
)

Found 5131 images belonging to 10 classes.
Found 1280 images belonging to 10 classes.


In [9]:
# images for prediction
test_generator = test_datagen.flow_from_directory(
    directory=test_path,
    target_size=(img_width, img_height),
    batch_size=batch_size,
    classes=["."],
    shuffle=False,
)

Found 3469 images belonging to 1 classes.


In [10]:
# for feature extraction
features_datagen = ImageDataGenerator(
    preprocessing_function=tf.keras.applications.resnet_v2.preprocess_input
)

features_generator = features_datagen.flow_from_directory(
    train_path,
    target_size=(img_width, img_height),
    batch_size=6411,
    class_mode="categorical",
)

Found 6411 images belonging to 10 classes.


###feature extraction for future classification

In [None]:
resnet = load_model(
    "/content/drive/MyDrive/Colab Notebooks/paddy/paddy_resnet_fine_tuned"
)
resnet.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 200, 200, 3  0           []                               
                                )]                                                                
                                                                                                  
 conv1_pad (ZeroPadding2D)      (None, 206, 206, 3)  0           ['input_3[0][0]']                
                                                                                                  
 conv1_conv (Conv2D)            (None, 100, 100, 64  9472        ['conv1_pad[0][0]']              
                                )                                                                 
                                                                                              

In [None]:
# features as vector

extractor = Model(
    inputs=resnet.inputs, outputs=resnet.get_layer("global_average_pooling2d").output
)

In [None]:
features = extractor.predict(features_generator)

In [None]:
np.save(
    "/content/drive/MyDrive/Colab Notebooks/paddy/resnet_train_features.npy", features
)

In [11]:
features = np.load(
    "/content/drive/MyDrive/Colab Notebooks/paddy/resnet_train_features.npy"
)

In [None]:
features

array([[1.3958767e-01, 0.0000000e+00, 2.1052444e-01, ..., 2.5995275e-01,
        2.3295243e-01, 1.3605195e-01],
       [7.0347562e-02, 3.5302755e-01, 0.0000000e+00, ..., 2.6250997e-01,
        4.7581993e-02, 3.7727752e-04],
       [8.9329667e-02, 3.5451594e-01, 3.3279669e-01, ..., 7.8598255e-01,
        1.6052764e-02, 4.5336917e-01],
       ...,
       [2.3775879e-02, 3.5819954e-01, 0.0000000e+00, ..., 1.7842248e-01,
        4.0694055e-01, 6.0170345e-02],
       [4.6558119e-03, 1.3682973e-01, 2.5103068e-01, ..., 5.4945755e-01,
        1.1953112e-02, 0.0000000e+00],
       [2.0308977e-02, 3.9501261e-02, 5.2355796e-01, ..., 1.2693298e-01,
        0.0000000e+00, 2.0509057e-01]], dtype=float32)

In [None]:
# one hot encoding : save
np.save("/content/drive/MyDrive/Colab Notebooks/paddy/target_train_paddy.npy", y)

In [None]:
# one hot encoding : load
y = np.load("/content/drive/MyDrive/Colab Notebooks/paddy/target_train_paddy.npy")

#### images features + metadata in gbm classifier

In [12]:
# concatenate features from images with metadata
df_feat = pd.DataFrame(features)

In [13]:
df_feat.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,0.139588,0.0,0.210524,0.0,0.666899,0.664828,0.010476,0.019738,0.0,0.08499,...,1.403242,0.0,0.105543,0.020411,0.155562,0.060386,1.471093,0.259953,0.232952,0.136052
1,0.070348,0.353028,0.0,0.32015,0.576641,0.154265,0.387742,0.018312,0.0,0.523028,...,0.569842,0.0,0.301794,0.0,0.546042,0.026524,0.594858,0.26251,0.047582,0.000377
2,0.08933,0.354516,0.332797,1.304994,0.193011,0.128608,0.0,0.0,0.0,0.003377,...,0.0,0.038477,0.018226,0.004933,0.710452,0.0,0.594483,0.785983,0.016053,0.453369
3,0.063697,0.26942,0.175823,0.608938,0.070597,0.623174,0.012941,0.0,0.0,0.0,...,0.168628,0.341892,0.013262,0.013886,0.18913,0.009164,1.071386,0.314274,0.359584,0.81632
4,0.024662,0.198447,0.0,0.077448,0.086731,0.003153,0.000355,0.02202,0.0,0.000821,...,0.321922,0.524812,0.126951,0.059856,0.052624,0.02623,0.191431,0.399338,0.07212,0.048504


In [14]:
df_meta = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/paddy/train.csv")

In [15]:
df_feat = df_feat.join(df_meta)

In [16]:
df_feat.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2042,2043,2044,2045,2046,2047,image_id,label,variety,age
0,0.139588,0.0,0.210524,0.0,0.666899,0.664828,0.010476,0.019738,0.0,0.08499,...,0.155562,0.060386,1.471093,0.259953,0.232952,0.136052,100330.jpg,bacterial_leaf_blight,ADT45,45
1,0.070348,0.353028,0.0,0.32015,0.576641,0.154265,0.387742,0.018312,0.0,0.523028,...,0.546042,0.026524,0.594858,0.26251,0.047582,0.000377,100365.jpg,bacterial_leaf_blight,ADT45,45
2,0.08933,0.354516,0.332797,1.304994,0.193011,0.128608,0.0,0.0,0.0,0.003377,...,0.710452,0.0,0.594483,0.785983,0.016053,0.453369,100382.jpg,bacterial_leaf_blight,ADT45,45
3,0.063697,0.26942,0.175823,0.608938,0.070597,0.623174,0.012941,0.0,0.0,0.0,...,0.18913,0.009164,1.071386,0.314274,0.359584,0.81632,100632.jpg,bacterial_leaf_blight,ADT45,45
4,0.024662,0.198447,0.0,0.077448,0.086731,0.003153,0.000355,0.02202,0.0,0.000821,...,0.052624,0.02623,0.191431,0.399338,0.07212,0.048504,101918.jpg,bacterial_leaf_blight,ADT45,45


In [18]:
# targets without encoding (lightgbm)
target = df_feat["label"]

In [19]:
# train test split sur features

x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.25, random_state=42
)

In [20]:
# reduce dimension with PCA


scaler = StandardScaler().fit(x_train)
x_std = scaler.transform(x_train)
x_std_test = scaler.transform(x_test)

In [22]:
pca_res = PCA(n_components=0.90)
pca_res_train = pca_res.fit_transform(x_std)
pca_res_test = pca_res.transform(x_std_test)

In [23]:
pca_res_train.shape

(4808, 557)

lightgbm classifier

In [24]:
lgbm = LGBMClassifier()


params = {
    "num_leaves": [7, 14, 21, 28, 31, None],
    "learning_rate": [0.1, 0.03, 0.003],
    "max_depth": [-1, 3, 5, None],
    "n_estimators": [100],
}

grid = GridSearchCV(
    lgbm, param_grid=params, scoring="accuracy", cv=5, n_jobs=-1, verbose=10
)
model_gbm = grid.fit(pca_res_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [25]:
print(grid.best_params_)

{'learning_rate': 0.003, 'max_depth': -1, 'n_estimators': 100, 'num_leaves': 14}


In [26]:
y_pred = model_gbm.predict(pca_res_test)

In [29]:
gbm_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", gbm_accuracy)

Accuracy: 0.2744853399875234
