In [3]:
%pip install seaborn

Note: you may need to restart the kernel to use updated packages.


In [4]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import cv2
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix ,accuracy_score


In [5]:
def load_image_data(base_path='../data/raw/soil-types'):
    data = []
    for label in os.listdir(base_path):
        label_path = os.path.join(base_path, label)
        if os.path.isdir(label_path):
            for file in os.listdir(label_path):
                if file.endswith((".jpg", ".jpeg", ".png")):
                    data.append({
                        "image_path": os.path.join(label_path, file),
                        "label": label
                    })
    df = pd.DataFrame(data)
    return df

In [6]:
df = load_image_data()

In [7]:
df.head()

Unnamed: 0,image_path,label
0,../data/raw/soil-types\Alluvial soil\alluvial ...,Alluvial soil
1,../data/raw/soil-types\Alluvial soil\alluvial ...,Alluvial soil
2,../data/raw/soil-types\Alluvial soil\alluvial ...,Alluvial soil
3,../data/raw/soil-types\Alluvial soil\alluvial ...,Alluvial soil
4,../data/raw/soil-types\Alluvial soil\alluvial ...,Alluvial soil


In [8]:
df.isnull().sum()

image_path    0
label         0
dtype: int64

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144 entries, 0 to 143
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   image_path  144 non-null    object
 1   label       144 non-null    object
dtypes: object(2)
memory usage: 2.4+ KB


In [10]:
df['label'].unique()

array(['Alluvial soil', 'Clayey soils', 'Laterite soil', 'Loamy soil',
       'Sandy loam', 'Sandy soil'], dtype=object)

In [11]:
# Image size to resize
IMG_SIZE = 128  

# Lists to hold image arrays and labels
image_data = []
labels = []

for idx, row in df.iterrows():
    img_path = row['image_path']
    label = row['label']
    
    # Read image
    image = cv2.imread(img_path)
    if image is None:
        continue  # skip broken or unreadable images

    # Resize image
    image = cv2.resize(image, (IMG_SIZE, IMG_SIZE))  # here IMG_SIZE shows width and height of image
    
    # Normalize image (convert to float and scale to [0,1])
    image = image.astype('float32') / 255.0

    # Append image and label
    image_data.append(image)
    labels.append(label)


In [12]:
def extract_color_histogram(image, bins=(16, 16, 16)):
    # Convert image to HSV
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    
    # Extract color histogram
    hist = cv2.calcHist([hsv], [0, 1, 2], None, bins,
                        [0, 180, 0, 256, 0, 256])
    
    # Normalize and flatten
    cv2.normalize(hist, hist)
    return hist.flatten()


In [13]:
extracted_features = [extract_color_histogram(img) for img in image_data]
X = np.array(extracted_features)

In [14]:
# Encode labels to integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)  # shape: (num_samples,)
print("y shape:", y.shape)
print("Classes:", label_encoder.classes_)


y shape: (144,)
Classes: ['Alluvial soil' 'Clayey soils' 'Laterite soil' 'Loamy soil' 'Sandy loam'
 'Sandy soil']


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [16]:
print("Train shape:", X_train.shape, y_train.shape)
print("Test shape:", X_test.shape, y_test.shape)

Train shape: (115, 4096) (115,)
Test shape: (29, 4096) (29,)


In [27]:
model = SVC(kernel='rbf',C=10)  # Try 'rbf' or 'poly' too
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

In [28]:
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Classification Report:
                precision    recall  f1-score   support

Alluvial soil       0.50      0.50      0.50         2
 Clayey soils       0.00      0.00      0.00         6
Laterite soil       0.57      0.67      0.62         6
   Loamy soil       0.00      0.00      0.00         3
   Sandy loam       0.50      0.60      0.55         5
   Sandy soil       0.27      0.43      0.33         7

     accuracy                           0.38        29
    macro avg       0.31      0.37      0.33        29
 weighted avg       0.30      0.38      0.34        29

Confusion Matrix:
 [[1 0 0 0 0 1]
 [0 0 0 0 1 5]
 [0 0 4 2 0 0]
 [0 0 3 0 0 0]
 [0 0 0 0 3 2]
 [1 0 0 1 2 3]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [29]:
print(accuracy_score(y_test,y_pred))

0.3793103448275862


In [25]:
from sklearn.metrics import r2_score

In [26]:
print(r2_score(y_test,y_pred))

-0.8967297762478488
