Importing Libraries

In [3]:
import cv2
import matplotlib.pyplot as plt
import numpy as np
import os
import skimage
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
import pickle

Reading Dataset

In [4]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,image_id,healthy,multiple_diseases,rust,scab
0,Train_0,0,0,0,1
1,Train_1,0,1,0,0
2,Train_2,1,0,0,0
3,Train_3,0,0,1,0
4,Train_4,1,0,0,0


Labeling every image 

In [5]:
df['label'] = df[['healthy', 'multiple_diseases', 'rust', 'scab']].idxmax(axis=1)
df['label'] = df['label'].map({
    'healthy': 0,
    'multiple_diseases': 1,
    'rust': 2,
    'scab': 3
})
df.head()

Unnamed: 0,image_id,healthy,multiple_diseases,rust,scab,label
0,Train_0,0,0,0,1,3
1,Train_1,0,1,0,0,1
2,Train_2,1,0,0,0,0
3,Train_3,0,0,1,0,2
4,Train_4,1,0,0,0,0


Functions for feature extraction

In [6]:
def mean_brightness(img):
  hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
  h, s, v = cv2.split(hsv)
  return np.mean(v)

def std_brightness(img):
  hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
  h, s, v = cv2.split(hsv)
  return np.std(v)

def mean_saturation(img):
  hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
  h, s, v = cv2.split(hsv)
  return np.mean(s)

def std_saturation(img):
  hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
  h, s, v = cv2.split(hsv)
  return np.std(s)

def yellow_area_fraction(img):
  lower_yellow = np.array([20, 100, 100])
  upper_yellow = np.array([40, 255, 255])

  yellow_mask = cv2.inRange(img, lower_yellow, upper_yellow)
  yellow_pixels = np.sum(yellow_mask > 0)
  total_pixels = yellow_mask.size
  return yellow_pixels / total_pixels

def lbp_hist(img):
  gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
  lbp = skimage.feature.local_binary_pattern(gray, P=8, R=1, method='uniform')
  lbp_hist, _ = np.histogram(lbp.ravel(), bins=np.arange(0, 10), density=True)
  return lbp_hist

def number_of_blobs(img):
  lower_yellow = np.array([20, 100, 100])
  upper_yellow = np.array([40, 255, 255])

  yellow_mask = cv2.inRange(img, lower_yellow, upper_yellow)
  num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(yellow_mask, connectivity=8)
  blob_areas = stats[1:, cv2.CC_STAT_AREA]
  num_blobs = len(blob_areas)
  avg_blob_size = np.mean(blob_areas) if num_blobs > 0 else 0
  feature = []
  feature.append(num_blobs)
  feature.append(avg_blob_size)
  return feature

def dark_area_fraction(img, threshold=50):
    
    # Convert to HSV
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    v_channel = hsv[:, :, 2]

    # Make a mask of "dark" pixels
    dark_mask = v_channel < threshold

    # Compute fraction
    dark_pixels = np.sum(dark_mask)
    total_pixels = dark_mask.size

    return dark_pixels / total_pixels


def extract_features(img_path):
  img_path = os.path.join('images', img_path) + ".jpg"
  img = cv2.imread(img_path)
  img = cv2.resize(img, (500, 500))
  features = []
  features.append(mean_brightness(img))
  features.append(std_brightness(img))
  features.append(mean_saturation(img))
  features.append(std_saturation(img))
  features.append(yellow_area_fraction(img))
  features.extend(lbp_hist(img))
  features.extend(number_of_blobs(img))
  features.append(dark_area_fraction(img))
  features = np.array(features)
  return features

In [None]:
img = cv2.imread('/images/Test_17.jpg')
img = cv2.resize(img, (500,500))
img = cv2.

In [5]:
len(df)

1821

Creating feature array

In [7]:
x_features = []
for image_id in tqdm(df['image_id']):
  features = extract_features(image_id)
  x_features.append(features)

x_features = np.array(x_features)
x_features.shape


100%|██████████| 1821/1821 [05:33<00:00,  5.46it/s]


(1821, 17)

In [8]:
y = df['label'].to_numpy()
x = x_features

Train Test Split

In [9]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

Model Training

In [10]:
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(x_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


Predicting

In [11]:
predict = classifier.predict(x_test)
print(classification_report(y_test, predict))

              precision    recall  f1-score   support

           0       0.61      0.60      0.60       100
           1       0.00      0.00      0.00        18
           2       0.52      0.67      0.58       120
           3       0.54      0.47      0.50       127

    accuracy                           0.55       365
   macro avg       0.42      0.43      0.42       365
weighted avg       0.52      0.55      0.53       365



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [19]:
with open('classifier.pkl', 'wb') as f:
    pickle.dump(classifier,f)


On Test.csv

In [21]:
test_df = pd.read_csv('test.csv')
test_df.head()

Unnamed: 0,image_id
0,Test_0
1,Test_1
2,Test_2
3,Test_3
4,Test_4


In [23]:
x_features = []
for image_id in tqdm(test_df['image_id']):
  features = extract_features(image_id)
  x_features.append(features)

x_features = np.array(x_features)
x_features.shape

100%|██████████| 1821/1821 [05:53<00:00,  5.15it/s]


(1821, 16)

In [24]:
result = classifier.predict(x_features)

In [26]:
result_df = test_df.copy()
result_df['Healthy'] = [1 if x==0 else 0 for x in result]
result_df['multiple_diseases'] = [1 if x==1 else 0 for x in result]
result_df['rust'] = [1 if x==2 else 0 for x in result]
result_df['scab'] = [1 if x==3 else 0 for x in result]

In [27]:
result_df.head()

Unnamed: 0,image_id,Healthy,multiple_diseases,rust,scab
0,Test_0,0,0,1,0
1,Test_1,1,0,0,0
2,Test_2,0,0,0,1
3,Test_3,1,0,0,0
4,Test_4,0,0,0,1


In [30]:
result_df.to_csv('final_submission.csv',index=False)