In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    # Build each file's path from its containing folder, then print it.
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.applications.efficientnet import preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array, ImageDataGenerator

# CONFIGURATION
# Both data folders sit under the same competition root, so derive them from it.
DATA_ROOT = '/kaggle/input/soil-classification-part-2/soil_competition-2025'
TRAIN_DIR = f'{DATA_ROOT}/train'   # Folder with only soil images
TEST_DIR = f'{DATA_ROOT}/test'     # Folder with unknown images to classify

IMAGE_SIZE = (224, 224)            # Size every image is resized to before feature extraction
AUGMENT_TIMES = 3                  # Number of augmentations per soil image
CONTAMINATION_RATE = 0.08          # Tune between 0.05–0.15

# Preprocess a single image
def preprocess_image(path):
    """Read one image file and return it as a preprocessed array.

    The image is opened with PIL, converted to RGB, resized to ``IMAGE_SIZE``,
    turned into an array with ``img_to_array`` and finally passed through
    ``preprocess_input``. Both ``IMAGE_SIZE`` and ``preprocess_input`` are
    looked up from the notebook globals when the function is called.

    Args:
        path: Filesystem path of the image to load.

    Returns:
        The array produced by ``preprocess_input`` for the resized RGB image.

    Raises:
        Any exception raised by PIL while opening or decoding the file is
        propagated to the caller.
    """
    # Open inside a context manager so the underlying file handle is closed
    # as soon as the pixels have been copied; a bare Image.open() can keep
    # the file open (notably for multi-frame formats such as GIF).
    with Image.open(path) as source:
        # convert() and resize() each return a new in-memory image, so the
        # result stays valid after the source file is closed.
        img = source.convert("RGB").resize(IMAGE_SIZE)
    img = img_to_array(img)
    img = preprocess_input(img)
    return img

# Load all valid images from folder
def load_dataset(folder):
    """Load and preprocess every supported image found directly in ``folder``.

    Entries are processed in sorted name order. ``os.listdir`` returns names
    in arbitrary order, and without sorting the row order of the returned
    array (and of anything derived from it, such as the predictions CSV)
    could change from one run or machine to the next.

    Args:
        folder: Directory to scan (not recursive).

    Returns:
        A ``(features, filenames)`` tuple: ``features`` is ``np.array`` of the
        preprocessed images and ``filenames`` lists the matching file names in
        the same order. Files that fail to load are reported with ``print``
        and left out of both.
    """
    supported_exts = ('.jpg', '.jpeg', '.png', '.gif', '.webp')
    features, filenames = [], []
    for file in tqdm(sorted(os.listdir(folder))):
        # Case-insensitive extension filter; anything else is ignored silently.
        if not file.lower().endswith(supported_exts):
            continue
        try:
            img = preprocess_image(os.path.join(folder, file))
        except Exception as e:
            # Best effort: report the unreadable file and keep going.
            print(f"Error loading {file}: {e}")
        else:
            features.append(img)
            filenames.append(file)
    return np.array(features), filenames

# Load EfficientNetB0 model for feature extraction
model = EfficientNetB0(
    include_top=False,   # leave out the classification layers
    pooling='avg',       # average-pool the last feature map into one vector per image
    weights='imagenet',  # start from the ImageNet-pretrained weights
)

# --- STEP 1: Load and Augment Training Data ---
print("Loading soil images...")
X_train_raw, _ = load_dataset(TRAIN_DIR)

print("Augmenting soil images...")
datagen = ImageDataGenerator(
    rotation_range=20,
    zoom_range=0.2,
    horizontal_flip=True,
    brightness_range=[0.8, 1.2]
)

# Base seed for the random augmentations. Without a seed every run produces a
# different training set, which makes the downstream (seeded) IsolationForest
# results impossible to reproduce.
AUGMENT_SEED = 42

augmented_images = []
for idx, img in enumerate(X_train_raw):
    # flow() expects a batch, so add a leading batch axis of size 1.
    img = np.expand_dims(img, 0)
    # Give every source image its own seed range so that neighbouring images
    # do not reuse the same sequence of random transforms.
    it = datagen.flow(
        img,
        batch_size=1,
        shuffle=False,
        seed=AUGMENT_SEED + idx * AUGMENT_TIMES,
    )
    for _ in range(AUGMENT_TIMES):
        augmented = next(it)[0]  # each batch holds exactly one augmented image
        augmented_images.append(augmented)

# NOTE: X_train_aug holds (1 + AUGMENT_TIMES) full-size copies of the training
# set in memory at once.
if augmented_images:
    X_train_aug = np.concatenate([X_train_raw, np.array(augmented_images)])
else:
    # AUGMENT_TIMES == 0 (or no training images): np.array([]) would be 1-D and
    # np.concatenate would fail on the dimension mismatch, so use the raw set.
    X_train_aug = X_train_raw

# --- STEP 2: Feature Extraction & Scaling ---
print("Extracting features from training images...")
# One pooled feature vector per training image (originals plus augmentations).
X_train_features = model.predict(X_train_aug, verbose=1)
# Fit the scaler on the training features, then standardise them with it.
scaler = StandardScaler().fit(X_train_features)
X_train_scaled = scaler.transform(X_train_features)

# --- STEP 3: Train Isolation Forest Model ---
print("Training IsolationForest model...")
# fit() returns the estimator, so construction and training are chained.
clf = IsolationForest(
    random_state=42,
    contamination=CONTAMINATION_RATE,
).fit(X_train_scaled)

# --- STEP 4: Load & Predict on Test Images ---
print("Loading test images...")
X_test_raw, test_filenames = load_dataset(TEST_DIR)

print("Extracting features from test images...")
X_test_features = model.predict(X_test_raw, verbose=1)
# Reuse the scaler fitted on the training features; do not refit on test data.
X_test_scaled = scaler.transform(X_test_features)

print("Predicting with IsolationForest...")
# IsolationForest labels: -1 = not soil, 1 = soil
preds = clf.predict(X_test_scaled)
# Boolean mask of the "soil" rows, cast to 1 (soil) or 0 (not soil).
is_soil_mask = preds == 1
boolean_preds = is_soil_mask.astype(int)

# --- STEP 5: Save Results to CSV ---
# One row per test image: the file name followed by its 1/0 soil label.
df = pd.DataFrame({'image_id': test_filenames}).assign(label=boolean_preds)
df.to_csv('soil_predictionsfinal.csv', index=False)

print("✅ Done! Predictions saved to 'soil_predictionsfinal.csv'")


Loading soil images...


100%|██████████| 1222/1222 [00:15<00:00, 81.32it/s]


Augmenting soil images...


In [1]:
import os
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array

# CONFIG
# Train and test folders share one competition root, so derive both from it.
DATA_ROOT = '/kaggle/input/soil-classification-part-2/soil_competition-2025'
TRAIN_DIR = f'{DATA_ROOT}/train'
TEST_DIR = f'{DATA_ROOT}/test'
IMAGE_SIZE = (224, 224)  # size every image is resized to before feature extraction

# Load and preprocess image
def preprocess_image(path):
    """Read one image file and return it as a preprocessed array.

    The image is opened with PIL, converted to RGB, resized to ``IMAGE_SIZE``,
    turned into an array with ``img_to_array`` and passed through
    ``preprocess_input`` (the ResNet50 variant imported in this cell). Both
    ``IMAGE_SIZE`` and ``preprocess_input`` are looked up from the notebook
    globals when the function is called.

    Args:
        path: Filesystem path of the image to load.

    Returns:
        The array produced by ``preprocess_input`` for the resized RGB image.

    Raises:
        Any exception raised by PIL while opening or decoding the file is
        propagated to the caller.
    """
    # Open inside a context manager so the file handle is released right after
    # the pixels are copied; a bare Image.open() can leave the file open
    # (notably for multi-frame formats such as GIF).
    with Image.open(path) as source:
        # convert() and resize() return new in-memory images, so the result
        # remains usable once the source file has been closed.
        img = source.convert("RGB").resize(IMAGE_SIZE)
    img = img_to_array(img)
    img = preprocess_input(img)  # ResNet50 preprocessing
    return img

def load_dataset(folder):
    """Load and preprocess every supported image found directly in ``folder``.

    Entries are processed in sorted name order. ``os.listdir`` returns names
    in arbitrary order, and without sorting the row order of the returned
    array (and of the predictions written from it) could differ between runs
    or machines.

    Args:
        folder: Directory to scan (not recursive).

    Returns:
        A ``(features, filenames)`` tuple: ``features`` is ``np.array`` of the
        preprocessed images and ``filenames`` lists the matching file names in
        the same order. Files that fail to load are reported with ``print``
        and left out of both.
    """
    supported_exts = ('.jpg', '.jpeg', '.png', '.gif', '.webp')
    features, filenames = [], []
    for file in tqdm(sorted(os.listdir(folder))):
        # Case-insensitive extension filter; anything else is ignored silently.
        if not file.lower().endswith(supported_exts):
            continue
        try:
            img = preprocess_image(os.path.join(folder, file))
        except Exception as e:
            # Best effort: report the unreadable file and keep going.
            print(f"Error loading {file}: {e}")
        else:
            features.append(img)
            filenames.append(file)
    return np.array(features), filenames

# Load pretrained ResNet50 (without top layer)
resnet = ResNet50(
    include_top=False,   # leave out the classification layers
    pooling='avg',       # average-pool the last feature map into one vector per image
    weights='imagenet',  # start from the ImageNet-pretrained weights
)

# --- Load soil training images ---
X_train_raw, _ = load_dataset(TRAIN_DIR)
# One pooled ResNet50 feature vector per training image.
X_train_features = resnet.predict(X_train_raw)
# Fit the scaler on the training features, then standardise them with it.
scaler = StandardScaler().fit(X_train_features)
X_train_scaled = scaler.transform(X_train_features)

# Train Isolation Forest
# NOTE(review): contamination=0.5 is the largest value scikit-learn accepts and
# sets the threshold so that roughly half of the training images are themselves
# scored as outliers. The EfficientNet cell in this notebook uses 0.08 and
# suggests 0.05–0.15; confirm that 0.5 is intentional here.
model = IsolationForest(random_state=42, contamination=0.5).fit(X_train_scaled)

# --- Load test images ---
X_test_raw, test_filenames = load_dataset(TEST_DIR)
X_test_features = resnet.predict(X_test_raw)
# Reuse the scaler fitted on the training features; do not refit on test data.
X_test_scaled = scaler.transform(X_test_features)

# Predict: -1 = not soil, 1 = soil
predictions = model.predict(X_test_scaled)
# Boolean mask of the "soil" rows, cast to 1/0 integers.
is_soil_mask = predictions == 1
boolean_preds = is_soil_mask.astype(int)

# Save to CSV
# One row per test image: the file name followed by its 1/0 soil flag.
df = pd.DataFrame({'filename': test_filenames}).assign(is_soil=boolean_preds)
df.to_csv('soil_predictions.csv', index=False)
print("Predictions saved to soil_predictions.csv ✅")


2025-05-24 18:33:22.090303: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748111602.343831      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748111602.417840      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-24 18:33:37.207183: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step


100%|██████████| 1222/1222 [00:28<00:00, 43.61it/s]


[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 3s/step


100%|██████████| 967/967 [00:07<00:00, 126.44it/s]


[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 3s/step
Predictions saved to soil_predictions.csv ✅
