In [8]:
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import rasterio
from sklearn.model_selection import KFold, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score, recall_score, confusion_matrix
import joblib 
from pathlib import Path

# for data leveling
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# for aumentation
from scipy.ndimage import rotate

In [9]:
# define ouput dir
output_dir = Path("../output/XGBoost")
output_dir.mkdir(parents=True, exist_ok=True)

output_img_dir = output_dir / "IMG"
output_img_dir.mkdir(parents=True, exist_ok=True)

output_model_dir = output_dir / "model"
output_model_dir.mkdir(parents=True, exist_ok=True)

In [10]:
# Define functions to calculate NDVI and NDWI
def calculate_ndvi(nir, red):
    return (nir - red) / (nir + red + 1e-10)

def calculate_ndwi(nir, green):
    return (nir - green) / (nir + green + 1e-10)

# define data augmentation function
def augment_image(bands):
    """Perform augmentation on GeoTIFF bands (NIR, Red, Green)."""
    nir, red, green = bands[3], bands[0], bands[1]

    aug_bands = []
    # original
    aug_bands.append((nir, red, green))

    # Rotations (60,120,180, 240, 300) - keep shape same
    for angle in [60, 120, 180, 240, 300]:
        aug_bands.append((
            rotate(nir, angle, reshape=False),
            rotate(red, angle, reshape=False),
            rotate(green, angle, reshape=False)
        ))

    # Horizontal + vertical flips
    aug_bands.append((np.fliplr(nir), np.fliplr(red), np.fliplr(green)))
    aug_bands.append((np.flipud(nir), np.flipud(red), np.flipud(green)))

    # Small Gaussian noise (simulate sensor variation)
    noise = np.random.normal(0, 0.01, nir.shape).astype(nir.dtype)
    aug_bands.append((nir + noise, red + noise, green + noise))

    return aug_bands

'''
    Enhanced Feature Engineering:
        -  Current features (mean NDVI/NDWI) might be insufficient. Add:

'''
def extract_features(image_path, augment=False):
    with rasterio.open(image_path) as src:
        bands = src.read()

    feature_set = []
    band_sets = augment_image(bands) if augment else [(bands[3], bands[0], bands[1])]

    for nir, red, green in band_sets:
        # NDVI / NDWI
        ndvi = calculate_ndvi(nir, red)
        ndwi = calculate_ndwi(nir, green)

        # Feature vector
        features = [
            np.mean(ndvi), np.std(ndvi),
            np.mean(ndwi), np.std(ndwi),
            np.percentile(nir, 75),
            np.mean(green > np.quantile(green, 0.75))
        ]
        feature_set.append(features)

    return feature_set


In [11]:
# Load dataset efficiently
train_dataset_path = Path('../../../../data/odm_dataset/Patches/Train')
test_dataset_path = Path('../../../../data/odm_dataset/Patches/Test/')
growth_stages = ["germination", "tillering", "grand_growth", "ripening"]
X, X_test, y, y_test = [], [], [], []

for idx, stage in enumerate(growth_stages):
    stage_path = train_dataset_path / stage
    for img_file in os.listdir(stage_path):
        img_path = stage_path / img_file
        feats = extract_features(img_path, augment=True)  # enable augmentation
        for f in feats:
            X.append(f)
            y.append(idx)

X = np.array(X)
y = np.array(y)

# ADD SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

for idx, stage in enumerate(growth_stages):
    stage_path = test_dataset_path / stage
    for img_file in os.listdir(stage_path):
        img_path = stage_path / img_file
        feats = extract_features(img_path, augment=False)  # no augmentation
        for f in feats:   # usually only 1 feature vector
            X_test.append(f)
            y_test.append(idx)

X_test = np.array(X_test)
y_test = np.array(y_test)

In [12]:
XGBClassifier()

In [None]:
# Define parameter grid for XGBoost
param_grid = {
    "n_estimators": [100, 200, 400], 
    "max_depth": [2, 3, 5], 
    "learning_rate": [0.1, 0.05, 0.25],
    "subsample": [0.2, 0.5, 1.0], 
    "colsample_bytree": [0.2, 0.5, 1.0],
    "colsample_bylevel": [0.2, 0.5, 1.0], 
    "reg_lambda": [1, 5, 100], 
    "reg_alpha": [1, 5, 100],
    "min_child_weight": [5, 10, 100]
}

# Initialize XGBoost and GridSearchCV
xgb_model = XGBClassifier(
    objective='multi:softmax',
    num_class=4,
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42,
    scale_pos_weight=1.0,  # Adjust based on class imbalance ratio if known
    tree_method='hist',  # Faster training with histogram-based method
    enable_categorical=False  # Ensure compatibility with features
)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    cv=kf,
    n_jobs=-1,
    verbose=2,
    return_train_score=True,
    scoring='accuracy'  # Optimize for accuracy directly
)

# Train model with cross-validation
# grid_search.fit(X, y)
grid_search.fit(X_res, y_res)

# Get the best model and parameters
best_model = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")


# Train with early stopping on the full dataset
history = best_model.fit(
    X_res,
    y_res,
    eval_set=[(X_res, y_res), (X_test, y_test)],
    eval_metric='mlogloss',
    early_stopping_rounds=10,
    verbose=True
)

# Plot training and test accuracy
eval_results = best_model.evals_result()
epochs = range(len(eval_results['validation_0']['mlogloss']))
train_loss = eval_results['validation_0']['mlogloss']
test_loss = eval_results['validation_1']['mlogloss']

plt.figure(figsize=(8, 6))
plt.plot(epochs, train_loss, label='Train Log Loss')
plt.plot(epochs, test_loss, label='Test Log Loss')
plt.xlabel('Epochs')
plt.ylabel('Log Loss')
plt.title('Training vs Test Log Loss')
plt.legend()
plt.savefig(Path(os.path.join(output_img_dir, "ODM_XGB_V7_optimized_loss_plot.png")))
plt.show()

# Evaluate on test set
y_test_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")

Fitting 5 folds for each of 19683 candidates, totalling 98415 fits


KeyboardInterrupt: 

In [None]:
# Evaluate the model on the test set
y_test_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)

# Calculate additional metrics
test_precision = precision_score(y_test, y_test_pred, average='weighted')
test_recall = recall_score(y_test, y_test_pred, average='weighted')
test_f1 = f1_score(y_test, y_test_pred, average='weighted')
confusion = confusion_matrix(y_test, y_test_pred)

# Log confusion matrix as an image
plt.figure(figsize=(8, 6))
sns.heatmap(confusion, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.savefig(Path(os.path.join(output_img_dir, "ODM_XGB_V7_confusion_matrix.png"))) # Save confusion matrix

# Print results
print(f"Test Accuracy: {test_accuracy}")
print(f"Test Precision: {test_precision}")
print(f"Test Recall: {test_recall}")
print(f"Test F1 Score: {test_f1}")
print(f"Classification Report:\n{test_report}")

# Save the best model to disk
XGB_model_v5 = Path(os.path.join(output_model_dir, "XGB_model_v7.joblib"))
joblib.dump(best_model, XGB_model_v5)
print(f"Saved model to: {XGB_model_v5}")

# Evaluate using cross-validation accuracy
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_ * 100:.2f}%")