In [8]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Import necessary libraries
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np
import cv2
import os


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/rice-leaf-diseases/rice_leaf_diseases/Brown spot/DSC_0305.JPG
/kaggle/input/rice-leaf-diseases/rice_leaf_diseases/Brown spot/DSC_0394.jpg
/kaggle/input/rice-leaf-diseases/rice_leaf_diseases/Brown spot/DSC_0105.jpg
/kaggle/input/rice-leaf-diseases/rice_leaf_diseases/Brown spot/DSC_0104.jpg
/kaggle/input/rice-leaf-diseases/rice_leaf_diseases/Brown spot/DSC_0325.JPG
/kaggle/input/rice-leaf-diseases/rice_leaf_diseases/Brown spot/DSC_0332.JPG
/kaggle/input/rice-leaf-diseases/rice_leaf_diseases/Brown spot/DSC_0296.jpg
/kaggle/input/rice-leaf-diseases/rice_leaf_diseases/Brown spot/DSC_0114.jpg
/kaggle/input/rice-leaf-diseases/rice_leaf_diseases/Brown spot/DSC_0329.jpg
/kaggle/input/rice-leaf-diseases/rice_leaf_diseases/Brown spot/DSC_0100.jpg
/kaggle/input/rice-leaf-diseases/rice_leaf_diseases/Brown spot/DSC_0299.JPG
/kaggle/input/rice-leaf-diseases/rice_leaf_diseases/Brown spot/DSC_0307.JPG
/kaggle/input/rice-leaf-diseases/rice_leaf_diseases/Brown spot/DSC_0301.JPG
/kaggle/inpu

In [10]:
# Define image size and data path
img_size = 128
base_dir = '/kaggle/input/rice-leaf-diseases/rice_leaf_diseases'
categories = ['Bacterial leaf blight', 'Brown spot', 'Leaf smut']

# Load images and labels
data = []
labels = []

for label, category in enumerate(categories):
    path = os.path.join(base_dir, category)
    for img in os.listdir(path):
        try:
            img_array = cv2.imread(os.path.join(path, img))
            resized_array = cv2.resize(img_array, (img_size, img_size))
            data.append(resized_array)
            labels.append(label)
        except Exception as e:
            print(f"Error loading image {img}: {e}")

data = np.array(data) / 255.0
labels = np.array(labels)


In [11]:
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)


In [12]:
# Set up data augmentation
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

datagen.fit(X_train)


In [29]:
cnn_model = Sequential([
    Conv2D(32, (3,3), activation='relu', input_shape=(img_size, img_size, 3)),
    BatchNormalization(),
    MaxPooling2D(2,2),
    Dropout(0.25),
    
    Conv2D(64, (3,3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(2,2),
    Dropout(0.3),
    
    Conv2D(128, (3,3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(2,2),
    Dropout(0.4),
    
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(3, activation='softmax')
])

cnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the CNN model with data augmentation
cnn_model.fit(datagen.flow(X_train, y_train, batch_size=25), 
              validation_data=(X_test, y_test), 
              epochs=30, 
              callbacks=[early_stopping])


Epoch 1/30
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 800ms/step - accuracy: 0.3650 - loss: 7.1052 - val_accuracy: 0.3333 - val_loss: 1.0695
Epoch 2/30
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 755ms/step - accuracy: 0.4901 - loss: 6.8354 - val_accuracy: 0.4167 - val_loss: 1.0496
Epoch 3/30
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 634ms/step - accuracy: 0.5247 - loss: 4.1219 - val_accuracy: 0.4167 - val_loss: 1.3616
Epoch 4/30
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 648ms/step - accuracy: 0.5565 - loss: 2.5889 - val_accuracy: 0.4167 - val_loss: 2.6390
Epoch 5/30
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 672ms/step - accuracy: 0.5427 - loss: 3.5753 - val_accuracy: 0.4167 - val_loss: 3.9293
Epoch 6/30
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 686ms/step - accuracy: 0.5434 - loss: 2.4410 - val_accuracy: 0.4167 - val_loss: 4.9340
Epoch 7/30
[1m4/4[0m [32m━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7da48c7e54b0>

In [31]:
cnn_test_loss, cnn_test_acc = cnn_model.evaluate(X_test, y_test)
print(f"Advanced CNN Test Accuracy: {cnn_test_acc}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 137ms/step - accuracy: 0.4167 - loss: 1.0496
Advanced CNN Test Accuracy: 0.4166666567325592


In [33]:
# Flatten images for Random Forest
X_train_flat = X_train.reshape(len(X_train), -1)
X_test_flat = X_test.reshape(len(X_test), -1)

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Use GridSearchCV for hyperparameter tuning
rf_model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train_flat, y_train)

# Best model after tuning
best_rf_model = grid_search.best_estimator_

# Predictions with the tuned Random Forest
rf_predictions = best_rf_model.predict(X_test_flat)
rf_accuracy = accuracy_score(y_test, rf_predictions)
print(f"Random Forest Test Accuracy after tuning: {rf_accuracy}")


Fitting 3 folds for each of 108 candidates, totalling 324 fits
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.7s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   2.1s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   1.3s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   2.0s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=   1.9s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   0.7s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   1.3s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time=   1.9s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=   1.9s
[CV] END max_depth=10, min_

In [35]:
# CNN Predictions and Evaluation
cnn_predictions = np.argmax(cnn_model.predict(X_test), axis=-1)

print("CNN Classification Report:")
print(classification_report(y_test, cnn_predictions))
print("Confusion Matrix for CNN:")
print(confusion_matrix(y_test, cnn_predictions))

# Random Forest Predictions and Evaluation
print("Random Forest Classification Report:")
print(classification_report(y_test, rf_predictions))
print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_test, rf_predictions))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step
CNN Classification Report:
              precision    recall  f1-score   support

           0       0.36      0.50      0.42         8
           1       0.46      0.60      0.52        10
           2       0.00      0.00      0.00         6

    accuracy                           0.42        24
   macro avg       0.28      0.37      0.31        24
weighted avg       0.31      0.42      0.36        24

Confusion Matrix for CNN:
[[4 4 0]
 [4 6 0]
 [3 3 0]]
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.80      1.00      0.89         8
           1       1.00      0.70      0.82        10
           2       0.71      0.83      0.77         6

    accuracy                           0.83        24
   macro avg       0.84      0.84      0.83        24
weighted avg       0.86      0.83      0.83        24

Confusion Matrix for Random Forest:
[[8 0 0]
 [

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Final result description
The Random Forest model outperforms the CNN model significantly in this case, achieving an accuracy of 83% compared to the CNN’s 42%. Here’s an analysis of why Random Forest is performing better and some next steps you could consider:

Imbalanced Performance: The CNN is struggling to generalize across all classes, shown by its low precision and recall, especially for class 2. This is often due to either insufficient data, ineffective model architecture for the dataset, or inadequate training epochs.

Random Forest's Superiority: Random Forest likely performed better due to its robustness with smaller datasets and tabular data, handling the limited data size well. The tuning of hyperparameters (e.g., max_depth, min_samples_split, n_estimators) helped optimize its performance.