In [2]:
import sys
import os

# This notebook is run from the notebooks/ folder, so the project root is the
# parent of the current working directory. Put it at the front of sys.path
# (once) so that project packages can be imported by later cells.
notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, ".."))
if project_root not in sys.path:
    sys.path.insert(0, project_root)
print("Project root added to sys.path:", project_root)


Project root added to sys.path: c:\Users\rober\OneDrive\Desktop\516_fp\EuroSat_Segmentation_Project


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

# Project-local helpers: the color-feature extractor and the land-use classifier
# wrapper. Both are used by the cells below. The import resolves because an
# earlier cell inserted the project root into sys.path.
from src.classification.classifier import extract_color_features, LandUseClassifier

print("Libraries imported successfully.")


Libraries imported successfully.


In [4]:
# Smoke-test the feature extractor on a single Forest tile before processing
# the dataset. cv2.imread returns None when the file cannot be read, so that
# case is reported instead of being passed on to the extractor.
sample_img_path = os.path.join("..", "data", "raw", "EuroSAT", "Forest", "Forest_1.jpg")
sample_img = cv2.imread(sample_img_path)
if sample_img is None:
    print("Sample image not found. Check the file path.")
else:
    # OpenCV loads images as BGR; convert to RGB before extracting features.
    sample_img = cv2.cvtColor(sample_img, cv2.COLOR_BGR2RGB)
    features = extract_color_features(sample_img)
    print("Extracted features from sample image:", features)


Extracted features from sample image: [38.90722656 61.08911133 77.5871582   3.355148    3.71837164  2.44474507]


In [5]:
# Directory holding the split CSVs, relative to the notebooks folder.
csv_dir = os.path.join("..", "data", "raw", "EuroSAT")

# One CSV per split: train / validation / test.
train_csv_path, val_csv_path, test_csv_path = (
    os.path.join(csv_dir, csv_name)
    for csv_name in ("train.csv", "validation.csv", "test.csv")
)

# Read each split, using the first CSV column as the DataFrame index.
train_df, val_df, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (train_csv_path, val_csv_path, test_csv_path)
)

print("Train CSV columns:", train_df.columns)
print("Train CSV head:")
print(train_df.head())


Train CSV columns: Index(['Filename', 'Label', 'ClassName'], dtype='object')
Train CSV head:
                                                Filename  Label  \
16257                      AnnualCrop/AnnualCrop_142.jpg      0   
3297   HerbaceousVegetation/HerbaceousVegetation_2835...      2   
17881               PermanentCrop/PermanentCrop_1073.jpg      6   
2223                       Industrial/Industrial_453.jpg      4   
4887   HerbaceousVegetation/HerbaceousVegetation_1810...      2   

                  ClassName  
16257            AnnualCrop  
3297   HerbaceousVegetation  
17881         PermanentCrop  
2223             Industrial  
4887   HerbaceousVegetation  


In [6]:
def load_image_and_extract_features(img_path):
    """Read an image from disk and return its color features.

    Args:
        img_path: Path of the image file to read with ``cv2.imread``.

    Returns:
        Whatever ``extract_color_features`` returns for the RGB-converted
        image, or ``None`` (after printing an error line) when the image
        could not be read.
    """
    bgr_image = cv2.imread(img_path)
    if bgr_image is not None:
        # OpenCV decodes to BGR; the extractor is fed the RGB-ordered image.
        rgb_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
        return extract_color_features(rgb_image)

    print(f"Error loading image: {img_path}")
    return None

# Work on a reproducible random sample of 200 training rows (a wider, more
# varied subset than simply taking the first 50 rows).
sample_train = train_df.sample(n=200, random_state=42)

X = []
y = []
# "Filename" already holds a path relative to the EuroSAT folder,
# e.g. "AnnualCrop/AnnualCrop_142.jpg"; "ClassName" is used as the label.
for img_filename, class_name in zip(sample_train["Filename"], sample_train["ClassName"]):
    img_path = os.path.join("..", "data", "raw", "EuroSAT", img_filename)
    features = load_image_and_extract_features(img_path)
    if features is None:
        # Unreadable image: the loader has already reported it, so skip the row.
        continue
    X.append(features)
    y.append(class_name)

# Convert the accumulated lists into arrays for the downstream split/training.
X = np.array(X)
y = np.array(y)

print("Feature matrix shape:", X.shape)
print("Labels shape:", y.shape)


Feature matrix shape: (200, 6)
Labels shape: (200,)


In [7]:
# Hold out 30% of the sampled features for validation. The split is stratified
# on the labels and seeded so that it is repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Fit a fresh land-use classifier on the training partition.
classifier = LandUseClassifier()
classifier.train(X_train, y_train)

# Score the held-out partition; the report itself is emitted by evaluate().
print("Validation Set Evaluation:")
classifier.evaluate(X_val, y_val)


Validation Set Evaluation:
Classification Report:
                      precision    recall  f1-score   support

          AnnualCrop       0.20      0.29      0.24         7
              Forest       0.50      1.00      0.67        10
HerbaceousVegetation       0.43      0.50      0.46         6
             Highway       0.00      0.00      0.00         5
          Industrial       0.60      0.86      0.71         7
             Pasture       0.00      0.00      0.00         2
       PermanentCrop       0.00      0.00      0.00         5
         Residential       0.00      0.00      0.00         5
               River       0.33      0.67      0.44         6
             SeaLake       0.00      0.00      0.00         7

            accuracy                           0.42        60
           macro avg       0.21      0.33      0.25        60
        weighted avg       0.25      0.42      0.31        60

Confusion Matrix:
[[ 2  0  1  0  3  0  0  0  1  0]
 [ 0 10  0  0  0  0  0  0  0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
# Save the trained classifier into the models folder one level above the
# notebooks folder.
model_dir = os.path.join("..", "models")
# exist_ok=True creates the folder when missing and is a no-op when it already
# exists. This replaces the separate exists()/makedirs() check, which was
# redundant and could fail if the folder appeared between the two calls.
os.makedirs(model_dir, exist_ok=True)
model_path = os.path.join(model_dir, "landuse_classifier.pkl")
classifier.save(model_path)
print("Classifier saved to", model_path)


Classifier saved to ..\models\landuse_classifier.pkl


In [12]:
# Environment diagnostics: print which Python interpreter is running the kernel
# and which scikit-learn version is installed.
# NOTE(review): `sys` is already imported in the first cell, so the re-import
# here is redundant but harmless. This cell's execution count (12) is out of
# sequence with the cells above; re-run the notebook top to bottom before sharing.
import sys
import sklearn
print("Python executable:", sys.executable)
print("scikit-learn version:", sklearn.__version__)


Python executable: c:\ProgramData\anaconda3\python.exe
scikit-learn version: 1.4.2


## Observations

- The feature extraction function (mean and standard deviation of color channels) produced a feature vector of length 6.
- The classifier (SVM) was trained on a subset of the data: 200 randomly sampled images, split 70/30 into 140 training samples and 60 validation samples.
- The classification report and confusion matrix (0.42 validation accuracy, with five of the ten classes never predicted correctly) show substantial room for improvement, most likely by increasing the sample size and refining the feature extraction.