In [None]:
# 1. Import required libraries
import pandas as pd  # For handling CSV files and dataframes
import numpy as np  # For numerical operations
import os  # For file path operations
import matplotlib.pyplot as plt  # For plotting graphs
from tensorflow.keras.preprocessing.image import ImageDataGenerator  # For image augmentation and loading
from tensorflow.keras.applications import MobileNetV2  # Pre-trained MobileNetV2 model
from tensorflow.keras.models import Model  # Base class for defining the custom model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D  # Layers to modify base model
from tensorflow.keras.optimizers import Adam  # Optimizer
from tensorflow.keras.callbacks import EarlyStopping  # For stopping training early if no improvement
from tensorflow.keras.preprocessing import image  # For image loading and processing
from sklearn.metrics import f1_score  # For evaluating model performance using F1-score
import tensorflow as tf  # TensorFlow framework


In [None]:
# 2. Define important paths and constants
DATA_ROOT = "../data/soil_competition-2025"  # Common parent of every competition file

# Image folders
TRAIN_DIR = f"{DATA_ROOT}/train"  # Training images
TEST_DIR = f"{DATA_ROOT}/test"  # Test images

# CSV files listing the image IDs
TRAIN_CSV = f"{DATA_ROOT}/train_labels.csv"  # Training image IDs
TEST_CSV = f"{DATA_ROOT}/test_ids.csv"  # Test image IDs

# Loader settings
IMG_SIZE = (224, 224)  # (height, width) every image is resized to
BATCH_SIZE = 32  # Images per batch during training

In [None]:
# 3. Load training labels
# Read the training CSV and tag every row with label 1 in a single step
# (all training images are treated as the positive "soil" class).
df = pd.read_csv(TRAIN_CSV).assign(label=1)

In [None]:
# 4. Define augmentation pipeline to generate negative (non-soil) samples
def _random_contrast(img):
    """Apply a random contrast factor in [0.5, 1.5] and return a NumPy array.

    Keras documents that ``preprocessing_function`` must output a NumPy
    array, whereas ``tf.image.random_contrast`` returns a tensor, so the
    result is converted back with ``.numpy()``.
    """
    return tf.image.random_contrast(img, 0.5, 1.5).numpy()


# Deliberately aggressive transforms: the outputs are used as label-0 samples.
# Per the Keras docs, preprocessing_function runs after the random transforms
# and rescale is applied last.
augmenter = ImageDataGenerator(
    rescale=1./255,  # Multiply pixel values by 1/255
    rotation_range=30,  # Random rotation (degrees)
    brightness_range=[0.2, 0.8],  # Random brightness factor drawn from this range
    shear_range=20,  # Shear transformation
    zoom_range=0.5,  # Zoom in/out
    horizontal_flip=True,  # Random horizontal flip
    vertical_flip=True,  # Random vertical flip
    channel_shift_range=50.0,  # Random color shifts
    fill_mode="nearest",  # Filling strategy for pixels created by the transforms
    preprocessing_function=_random_contrast,  # Add random contrast
)

In [None]:
# 5. Function to generate synthetic negative images
def generate_negative_samples(df, n=1000):
    """Generate ``n`` augmented images to be used as label-0 samples.

    Source images are the files listed in ``df["image_id"]`` under
    ``TRAIN_DIR``; they are drawn in shuffled order (fixed seed 42) and
    passed through the module-level ``augmenter``.

    Args:
        df: DataFrame with an ``image_id`` column of image file names.
        n: Number of images to generate; must be >= 0.

    Returns:
        A tuple ``(images, labels)`` where ``images`` has shape
        ``(n, height, width, channels)`` (also when ``n == 0``) and
        ``labels`` is an all-zero vector of shape ``(n,)``.

    Raises:
        ValueError: If ``n`` is negative.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    # Use augmenter to generate new negative images based on existing soil images
    temp_gen = augmenter.flow_from_dataframe(
        df,
        directory=TRAIN_DIR,
        x_col="image_id",  # Column with image filenames
        y_col=None,  # No labels needed
        target_size=IMG_SIZE,  # Resize images
        class_mode=None,
        batch_size=1,  # Generate one image at a time
        shuffle=True,
        seed=42
    )

    # Pre-allocate the output: avoids holding a Python list plus its array
    # copy at the same time, and keeps the result 4-D even when n == 0 so it
    # can always be concatenated with real image batches.
    neg_images = np.empty((n, *temp_gen.image_shape), dtype=temp_gen.dtype)
    for i in range(n):
        neg_images[i] = next(temp_gen)[0]  # Each batch holds exactly one image

    return neg_images, np.zeros((n,))  # Return images with label 0 (non-soil)

In [None]:
# 6. Create train and validation generators with real soil images
# One generator multiplies pixels by 1/255 and reserves 20% of the rows for validation.
datagen = ImageDataGenerator(validation_split=0.2, rescale=1./255)

# Settings common to both subsets
shared_flow_args = dict(
    directory=TRAIN_DIR,
    x_col="image_id",  # Image file name
    y_col="label",  # Label column
    target_size=IMG_SIZE,
    class_mode="raw",  # Yield the label values as-is instead of one-hot vectors
    batch_size=BATCH_SIZE,
)

# Training subset (80% of the data), shuffled
train_gen = datagen.flow_from_dataframe(
    df, subset="training", shuffle=True, **shared_flow_args
)

# Validation subset (20% of the data), kept in a fixed order
val_gen = datagen.flow_from_dataframe(
    df, subset="validation", shuffle=False, **shared_flow_args
)

In [None]:
# 7. Create the base model using MobileNetV2
# The input shape is derived from IMG_SIZE so the network always matches the
# size the generators resize images to.
# NOTE(review): the generators feed pixels scaled by 1/255, while Keras'
# mobilenet_v2.preprocess_input scales inputs to [-1, 1] for the ImageNet
# weights — consider aligning the two.
base_model = MobileNetV2(
    weights="imagenet",
    include_top=False,  # Drop the ImageNet classification head
    input_shape=(*IMG_SIZE, 3),
)

x = base_model.output  # Get output from base model
x = GlobalAveragePooling2D()(x)  # Add a global average pooling layer
x = Dense(64, activation='relu')(x)  # Add dense layer with 64 neurons
predictions = Dense(1, activation='sigmoid')(x)  # Final output layer for binary classification

model = Model(inputs=base_model.input, outputs=predictions)  # Define the full model

# Freeze base model layers (only train top layers first)
for layer in base_model.layers:
    layer.trainable = False

# Compile the model
model.compile(optimizer=Adam(learning_rate=1e-4), loss="binary_crossentropy", metrics=["accuracy"])

In [None]:
# 8. Generate synthetic negative samples to balance dataset
# Only the training subset is used as source material: drawing from the whole
# dataframe would put augmented copies of validation images into the training
# set (labelled 0), and len(df) negatives would outnumber the positives that
# are collected from train_gen.
train_df = df[df["image_id"].isin(train_gen.filenames)]
neg_images, neg_labels = generate_negative_samples(train_df, n=train_gen.samples)  # One negative per training positive

In [None]:
# 9. Collect real soil training images
# Draw len(train_gen) batches from the generator and split them into
# image batches and label batches.
image_batches, label_batches = zip(
    *(next(train_gen) for _ in range(len(train_gen)))
)

X_real = np.concatenate(image_batches)  # All real images in one array
y_real = np.concatenate(label_batches)  # Their labels
del image_batches, label_batches  # Drop the per-batch copies

# Final training set: real soil images followed by the synthetic non-soil images
X_train = np.concatenate([X_real, neg_images])
y_train = np.concatenate([y_real, neg_labels])

In [None]:
# 10. Train model with early stopping to prevent overfitting
# Halt once val_loss has gone 3 epochs without improving and restore the best weights.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Initial training (only the layers left trainable are updated)
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=10,  # Upper bound; early stopping may end training sooner
    validation_data=val_gen,
    callbacks=[early_stop],
)

In [None]:
# 11. Fine-tune top layers of MobileNetV2
# Unfreeze the last 30 layers, but keep BatchNormalization layers frozen:
# once trainable they would update their moving statistics on this small
# dataset, which Keras' transfer-learning guidance warns can destroy what the
# pretrained network has learned.
for layer in base_model.layers[-30:]:
    if isinstance(layer, tf.keras.layers.BatchNormalization):
        continue
    layer.trainable = True

# Re-compile model with smaller learning rate for fine-tuning
# (trainable changes only take effect after compile)
model.compile(optimizer=Adam(learning_rate=1e-5), loss="binary_crossentropy", metrics=["accuracy"])

# Continue training with unfrozen layers
history_finetune = model.fit(
    X_train, y_train,
    validation_data=val_gen,
    epochs=10,
    batch_size=BATCH_SIZE,
    callbacks=[early_stop]
)

In [None]:
# 12. Evaluate model using F1-score on validation set
val_gen.reset()  # Start from the first validation batch
val_preds, val_labels = [], []  # Predicted classes and true labels

n_val_batches = len(val_gen)
for _ in range(n_val_batches):
    images, targets = next(val_gen)  # One validation batch
    probabilities = model.predict(images)  # Sigmoid outputs for the batch
    predicted_classes = (probabilities > 0.5).astype(int)  # Threshold at 0.5 -> 0/1
    val_preds.extend(predicted_classes.ravel())
    val_labels.extend(targets.ravel())

val_f1 = f1_score(val_labels, val_preds)  # F1 over the whole validation subset
print(f"Validation F1-score: {val_f1:.4f}")

In [None]:
# 13. Save the trained model
# model.save writes the full model (not only its weights) to a single file.
model_path = "soil_model_mobilenetv2.h5"
model.save(model_path)