```
Author: Annam.ai IIT Ropar
Team Name: RAV
Team Members: VIGNESH J, ASHWATH VINODKUMAR, RAHUL BHARGAV TALLADA
Leaderboard Rank: 13
```

# This is the notebook used for the soil classification challenge (Task 2).

## Setup and Configuration

Installing required packages and importing necessary libraries for the soil classification task.

In [None]:
!pip install open-clip-torch pandas pillow scikit-learn --quiet

import sys
import os
sys.path.append('../src')

import open_clip
import torch
import pandas as pd
import numpy as np
from PIL import Image
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Import custom utilities (if available)
try:
    from preprocessing import load_and_preprocess_image, normalize_embeddings
    from postprocessing import save_predictions, analyze_predictions
    print("Successfully imported custom utilities")
except ImportError:
    print("Custom utilities not found, using internal functions")

# Configuration: CLIP backbone, pretrained-weights tag, batch size and compute device
MODEL_NAME = "ViT-H-14"
PRETRAINED = "laion2b_s32b_b79k"
BATCH_SIZE = 8
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Build the model together with its image transform; only the third returned
# value is kept as `preprocess`, the second one is discarded.
model, _, preprocess = open_clip.create_model_and_transforms(
    MODEL_NAME,
    pretrained=PRETRAINED,
)
# Move the weights to the chosen device, then switch to inference mode
model = model.to(DEVICE)
model.eval()

## Data Loading

Loading the training and test datasets for the soil classification task.

In [None]:
# Resolve the dataset root: prefer the local checkout, fall back to the Kaggle mount.
KAGGLE_DATA_DIR = Path("/kaggle/input/soil-classification-part-2/soil_competition-2025")
DATA_DIR = Path("../data")
if not DATA_DIR.exists():
    # For Kaggle environment
    DATA_DIR = KAGGLE_DATA_DIR

# Fail early with a message naming both locations instead of a bare read_csv error.
if not DATA_DIR.exists():
    raise FileNotFoundError(
        f"Dataset directory not found: tried '../data' and '{KAGGLE_DATA_DIR}'"
    )

# Load metadata (DATA_DIR is already resolved, so no per-file fallback is needed)
train_df = pd.read_csv(DATA_DIR / "train_labels.csv")
test_df = pd.read_csv(DATA_DIR / "test_ids.csv")

print(f"Training samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")

# Display sample data
display(train_df.head())
display(test_df.head())

## Text Prompts for Zero-Shot Classification

Defining text prompts for the soil and non-soil categories to use with the CLIP model.

In [None]:
# Binary classification prompts
# Text descriptions of the positive class ("soil"). Each string is passed
# through the CLIP text encoder and the results are averaged into a single
# class embedding further down in this cell.
soil_prompts = [
    "A photograph of soil, earth, dirt, or ground surface",
    "Agricultural soil in a field or garden",
    "Natural earth surface with visible soil texture",
    "Close-up view of soil, dirt, or earth material",
    "Ground surface showing soil composition",
    "Farmland soil or agricultural earth",
    "Natural soil formation with organic matter"
]

# Text descriptions of the negative class ("non-soil"), combined into one
# class embedding in the same way as the soil prompts.
non_soil_prompts = [
    "A photograph with no soil, dirt, or earth visible",
    "Indoor scene without any ground or soil",
    "Water, sky, buildings, or other non-soil objects",
    "Concrete, asphalt, or artificial surfaces",
    "Plants, rocks, or objects without visible soil",
    "Urban environment without natural earth",
    "Clean surfaces without dirt or soil material"
]

# Precompute text embeddings for binary classification
def _encode_prompt_ensemble(prompts):
    """Encode a list of prompts into one unit-norm class embedding.

    Each prompt embedding is L2-normalised before averaging so that every
    prompt contributes equally to the ensemble direction, regardless of the
    norm of its raw embedding. The mean is then re-normalised so that a dot
    product with a unit-norm image embedding is a cosine similarity.

    Args:
        prompts: Non-empty list of text prompts describing one class.

    Returns:
        Tensor with a single row (the averaged, unit-norm embedding) on DEVICE.

    Raises:
        ValueError: If `prompts` is empty.
    """
    if not prompts:
        raise ValueError("prompts must contain at least one text prompt")

    with torch.no_grad():
        # Tokenise and encode all prompts of the class in one batch
        tokens = open_clip.tokenize(list(prompts)).to(DEVICE)
        prompt_embeddings = model.encode_text(tokens)

        # Normalise per prompt, average, then normalise the ensemble
        prompt_embeddings = prompt_embeddings / prompt_embeddings.norm(dim=-1, keepdim=True)
        class_embedding = prompt_embeddings.mean(dim=0, keepdim=True)
        class_embedding = class_embedding / class_embedding.norm(dim=-1, keepdim=True)

    return class_embedding


soil_text_embedding = _encode_prompt_ensemble(soil_prompts)
non_soil_text_embedding = _encode_prompt_ensemble(non_soil_prompts)

print(f"Created text embeddings for {len(soil_prompts)} soil prompts and {len(non_soil_prompts)} non-soil prompts")

## Image Embedding Functions

Creating functions to extract image embeddings in a memory-efficient way.

In [None]:
# Memory-optimized embedding generator
def get_image_embeddings(image_paths):
    """Embed images with the CLIP image encoder, BATCH_SIZE images at a time.

    Processing in small batches and moving each result to host memory right
    away keeps peak device memory bounded.

    Args:
        image_paths: Iterable of image file paths that PIL can open.

    Returns:
        NumPy array with one row per input image, in input order. For empty
        input an empty array of shape (0, 0) is returned (the embedding width
        is not known without running the model).
    """
    # Materialise once so generators and other non-sliceable iterables work too
    image_paths = list(image_paths)
    if not image_paths:
        # np.concatenate([]) would raise ValueError; return an empty result instead
        return np.empty((0, 0), dtype=np.float32)

    embeddings = []
    for start in range(0, len(image_paths), BATCH_SIZE):
        batch_tensors = []
        for path in image_paths[start:start + BATCH_SIZE]:
            # Context manager guarantees the file handle is released
            with Image.open(path) as img:
                batch_tensors.append(preprocess(img.convert("RGB")))

        with torch.no_grad():
            batch = torch.stack(batch_tensors).to(DEVICE)
            batch_emb = model.encode_image(batch)
            embeddings.append(batch_emb.cpu().numpy())

        # Memory cleanup
        del batch, batch_emb
        torch.cuda.empty_cache()

    return np.concatenate(embeddings)

## Training Data Preparation

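Preparing the training data for the soil classification model: the training images are embedded and labeled, and a supervised classifier is trained only if both classes are present. Because every training image is labeled as soil, the notebook falls back to pure zero-shot classification.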

In [None]:
# Prepare training data. The image directory is resolved once and reused for every image id.
train_dir = (
    DATA_DIR / "train"
    if DATA_DIR.exists()
    else Path("/kaggle/input/soil-classification-part-2/soil_competition-2025/train")
)
train_images = [train_dir / img_id for img_id in train_df.image_id]
X_train = get_image_embeddings(train_images)

# Every training image contains soil, so the label vector is all ones (soil = 1).
# Negative (non-soil) examples would be needed for a more robust classifier.
y_train = np.ones(len(X_train), dtype=int)

print(f"Training samples: {len(X_train)} (all labeled as soil=1)")

# A supervised classifier needs both classes; with positives only, the notebook
# relies on zero-shot classification and leaves `clf` unset.
n_classes = len(np.unique(y_train))
if n_classes <= 1:
    print("Only one class in training data - using pure zero-shot classification")
    clf = None
else:
    X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
        X_train,
        y_train,
        test_size=0.2,
        random_state=42,
    )

    clf = LogisticRegression(max_iter=1000, class_weight="balanced")
    clf.fit(X_train_split, y_train_split)
    val_acc = clf.score(X_val_split, y_val_split)
    print(f"Validation Accuracy: {val_acc:.2%}")

## Prediction Functions

Implementing functions to predict whether an image contains soil or not.

In [None]:
# Binary prediction function
def predict_is_soil(image_path, threshold=0.0):
    """
    Predict if image contains soil (1) or not (0) with zero-shot CLIP.

    The image embedding is compared against the precomputed soil and non-soil
    text embeddings. The two cosine similarities are multiplied by the model's
    learned logit scale before the softmax; without that temperature the
    similarities (bounded in [-1, 1]) always give probabilities close to 0.5.

    Args:
        image_path: Path of the image file to classify.
        threshold: Extra margin the soil probability must exceed the non-soil
            probability by. With the default 0.0 the decision equals a direct
            comparison of the two similarities.

    Returns:
        Tuple `(is_soil, soil_probability)` where `is_soil` is 1 or 0 and
        `soil_probability` is the softmax probability of the soil class.
    """
    # Context manager guarantees the file handle is released
    with Image.open(image_path) as img:
        image = preprocess(img.convert("RGB")).unsqueeze(0).to(DEVICE)

    with torch.no_grad():
        # Get unit-norm image features
        image_features = model.encode_image(image)
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)

        # Get text features
        soil_features = soil_text_embedding.to(DEVICE)
        non_soil_features = non_soil_text_embedding.to(DEVICE)

        # Calculate similarities
        soil_similarity = (image_features @ soil_features.T).item()
        non_soil_similarity = (image_features @ non_soil_features.T).item()

        # CLIP temperature; fall back to the conventional 100 if the model
        # object does not expose a learned logit scale.
        if hasattr(model, "logit_scale"):
            logit_scale = model.logit_scale.exp().item()
        else:
            logit_scale = 100.0

        # Use softmax over temperature-scaled similarities to get probabilities
        logits = logit_scale * torch.tensor([non_soil_similarity, soil_similarity])
        probs = torch.softmax(logits, dim=0)

        # Return 1 if soil probability > non-soil probability + threshold
        is_soil = 1 if probs[1] > (probs[0] + threshold) else 0

    return is_soil, probs[1].item()  # Return prediction and soil probability

# Alternative simpler approach - just compare similarities
def predict_is_soil_simple(image_path):
    """Classify an image by directly comparing its two text similarities.

    The normalised image embedding is dotted with the precomputed soil and
    non-soil text embeddings. Returns 1 when the soil similarity is strictly
    greater, otherwise 0.
    """
    rgb_image = Image.open(image_path).convert("RGB")
    pixels = preprocess(rgb_image).unsqueeze(0).to(DEVICE)

    with torch.no_grad():
        feats = model.encode_image(pixels)
        feats /= feats.norm(dim=-1, keepdim=True)

        soil_sim = (feats @ soil_text_embedding.to(DEVICE).T).item()
        non_soil_sim = (feats @ non_soil_text_embedding.to(DEVICE).T).item()

    # Soil wins only on a strictly larger similarity; ties count as non-soil
    if soil_sim > non_soil_sim:
        return 1
    return 0

## Testing on Training Images

Verifying the prediction functions on a few training images.

In [None]:
# Sanity check: run both predictors on the first five training images.
# Every one of them contains soil, so both predictors are expected to return 1.
print("\nTesting on a few training images (should all be 1 since they contain soil):")
sample_images = train_images[:5]
for i, img_path in zip(range(len(sample_images)), sample_images):
    advanced_result = predict_is_soil(img_path)
    prediction, confidence = advanced_result
    simple_pred = predict_is_soil_simple(img_path)
    print(f"Image {i+1}: Advanced={prediction} (conf={confidence:.3f}), Simple={simple_pred}")

## Generating Predictions for Test Set

Applying the model to the test dataset and generating predictions.

In [None]:
# Generate predictions for test set
print(f"\nGenerating predictions for {len(test_df)} test images...")

# Resolve the test image directory once, then build one path per image id
test_dir = (
    DATA_DIR / "test"
    if DATA_DIR.exists()
    else Path("/kaggle/input/soil-classification-part-2/soil_competition-2025/test")
)
test_images = [test_dir / img_id for img_id in test_df.image_id]

# The simple similarity comparison produces the final labels
predictions = [predict_is_soil_simple(img_path) for img_path in test_images]

test_df["is_soil"] = predictions

## Saving Results and Analysis

Saving the predictions to a CSV file and analyzing the distribution of predictions.

In [None]:
# Save results
output_file = "binary_soil_submission.csv"
test_df[["image_id", "is_soil"]].to_csv(output_file, index=False)

# Analyze prediction distribution (counts computed once and reused below)
n_total = len(predictions)
n_soil = sum(predictions)
# Guard against an empty test set, which would otherwise raise ZeroDivisionError
soil_pct = n_soil / n_total * 100 if n_total else 0.0

print("\nPrediction distribution:")
print(f"Soil (1): {n_soil} images")
print(f"Non-soil (0): {n_total - n_soil} images")
print(f"Percentage classified as soil: {soil_pct:.1f}%")

print(f"\nBinary classification complete! Results saved to '{output_file}'")

# Display sample predictions
display(test_df.head(10))