In [None]:
# Soil Classification - PART 1
## TRAINING

## Imports and Setup
import os
import numpy as np
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.serialization import safe_globals
import numpy.core.multiarray
import timm
import albumentations as A
from albumentations.pytorch import ToTensorV2
from sklearn.metrics import f1_score

# Select the compute device: the GPU when CUDA is usable, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

## Dataset Classes
class SoilDataset(Dataset):
    """Labelled soil-image dataset driven by a CSV file.

    The CSV is read with pandas and must provide an ``image_id`` column
    (file name relative to ``img_dir``) and a ``soil_type`` column (label).

    Args:
        csv_file: Path of the CSV listing image ids and their labels.
        img_dir: Directory that contains the image files.
        transform: Optional callable invoked as ``transform(image=array)``
            whose result is indexed with ``['image']``.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, csv_file, img_dir, transform=None):
        self.data = pd.read_csv(csv_file)
        self.image_dir = img_dir
        self.transform = transform
        # Sorted unique labels, so the label -> index mapping does not depend
        # on the order of rows in the CSV.
        self.classes = sorted(self.data['soil_type'].unique())
        # Forward (label -> index) and reverse (index -> label) lookups.
        self.class_to_idx = {label: idx for idx, label in enumerate(self.classes)}
        self.idx_to_class = {idx: label for label, idx in self.class_to_idx.items()}

    def __len__(self):
        """Return the number of rows in the label CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load one sample.

        Args:
            idx: Positional row index into the CSV.

        Returns:
            ``(image, label)`` where ``image`` is the RGB image as a NumPy
            array (or whatever ``transform`` produces from it) and ``label``
            is the integer class index of the row's ``soil_type``.
        """
        row = self.data.iloc[idx]
        img_path = os.path.join(self.image_dir, row['image_id'])
        # The context manager releases the file handle as soon as the pixels
        # have been copied into the array, instead of waiting for GC.
        with Image.open(img_path) as img:
            image = np.array(img.convert("RGB"))
        label = self.class_to_idx[row['soil_type']]
        if self.transform:
            image = self.transform(image=image)['image']
        return image, label

class TestDataset(Dataset):
    """Unlabelled image dataset driven by a CSV file.

    The CSV is read with pandas and must provide an ``image_id`` column
    holding file names relative to ``img_dir``.

    Args:
        csv_file: Path of the CSV listing image ids.
        img_dir: Directory that contains the image files.
        transform: Optional callable invoked as ``transform(image=array)``
            whose result is indexed with ``['image']``.
    """

    def __init__(self, csv_file, img_dir, transform=None):
        self.data = pd.read_csv(csv_file)
        self.image_dir = img_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the id CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load one image.

        Args:
            idx: Positional row index into the CSV.

        Returns:
            ``(image, img_name)`` where ``image`` is the RGB image as a NumPy
            array (or whatever ``transform`` produces from it) and
            ``img_name`` is the row's ``image_id`` value.
        """
        img_name = self.data.iloc[idx]['image_id']
        img_path = os.path.join(self.image_dir, img_name)
        # The context manager releases the file handle as soon as the pixels
        # have been copied into the array, instead of waiting for GC.
        with Image.open(img_path) as img:
            image = np.array(img.convert("RGB"))
        if self.transform:
            image = self.transform(image=image)['image']
        return image, img_name

## Corrected Transforms
# Training pipeline: resize the short side to 256, take a random 224x224 crop,
# apply geometric and colour augmentation, then normalise with the standard
# ImageNet channel statistics and convert to a CHW tensor.
train_transform = A.Compose([
    A.SmallestMaxSize(max_size=256),
    A.RandomCrop(height=224, width=224),
    A.HorizontalFlip(p=0.5),
    A.VerticalFlip(p=0.5),
    A.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.1, rotate_limit=15, p=0.5),
    A.RandomBrightnessContrast(p=0.5),
    A.HueSaturationValue(hue_shift_limit=20, sat_shift_limit=30, val_shift_limit=20, p=0.5),
    # Range-style arguments replace the legacy max_holes / max_height /
    # max_width / fill_value keywords, which the recorded run warned about.
    # NOTE(review): (8, 8) and (32, 32) assume the legacy call meant exactly
    # 8 holes of 32x32 px (unset minimums defaulting to the maximums) --
    # confirm against the installed Albumentations version. The fill value is
    # left at the library default; presumably 0 as before -- verify.
    A.CoarseDropout(num_holes_range=(8, 8), hole_height_range=(32, 32), hole_width_range=(32, 32), p=0.5),
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ToTensorV2()
])

# Validation pipeline: deterministic resize + centre crop, same normalisation,
# no augmentation.
val_transform = A.Compose([
    A.SmallestMaxSize(max_size=256),
    A.CenterCrop(height=224, width=224),
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ToTensorV2()
])

##  Data Loading
train_dir = '/kaggle/input/soilclassification1/soil_classification-2025/train'
test_dir = '/kaggle/input/soilclassification1/soil_classification-2025/test'
train_csv = '/kaggle/input/soilclassification1/soil_classification-2025/train_labels.csv'
test_csv = '/kaggle/input/soilclassification1/soil_classification-2025/test_ids.csv'

# Two views of the same labelled CSV, one per transform. Subsets produced by
# random_split share a single underlying dataset object, so reassigning
# `.dataset.transform` on the validation subset would also strip augmentation
# from the training subset. Separate instances keep the pipelines independent.
# `full_dataset` keeps its name because later cells read its class mappings.
full_dataset = SoilDataset(train_csv, train_dir, transform=train_transform)
val_source = SoilDataset(train_csv, train_dir, transform=val_transform)

train_size = int(0.8 * len(full_dataset))
val_size = len(full_dataset) - train_size

# One seeded permutation gives a reproducible, disjoint 80/20 split that is
# applied to both views.
split_generator = torch.Generator().manual_seed(42)
shuffled_indices = torch.randperm(len(full_dataset), generator=split_generator).tolist()
train_dataset = torch.utils.data.Subset(full_dataset, shuffled_indices[:train_size])
val_dataset = torch.utils.data.Subset(val_source, shuffled_indices[train_size:])

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=4, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, num_workers=4, pin_memory=True)

## Model Setup
# Pretrained EfficientNet-B4 backbone with a classification head sized to the
# number of soil classes found in the training CSV.
model = timm.create_model('tf_efficientnet_b4', pretrained=True, num_classes=len(full_dataset.classes))
model = model.to(device)

# Loss function with label smoothing
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)

# Optimizer with weight decay
optimizer = optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-4)

# Cosine annealing learning rate scheduler.
# T_max must equal the number of scheduler steps (one per epoch); with a
# shorter T_max the cosine curve turns around and the learning rate climbs
# again during the final epochs. The training-loop section assigns the same
# value to num_epochs -- keep the two in sync.
num_epochs = 25
scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=1e-6)

## Training Loop
num_epochs = 25
best_f1 = 0

# One pass per epoch: optimise on train_loader, score on val_loader with the
# weighted F1, step the LR schedule, and checkpoint whenever F1 improves.
for epoch in range(num_epochs):
    model.train()
    # train_loss is accumulated for inspection only; it is not printed below.
    train_loss, correct, total = 0.0, 0, 0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

    train_acc = 100 * correct / total

    # Validation
    model.eval()
    val_preds, val_labels = [], []
    with torch.no_grad():
        for val_images, val_labels_batch in val_loader:
            val_images = val_images.to(device)
            val_outputs = model(val_images)
            _, val_pred = val_outputs.max(1)
            val_preds.extend(val_pred.cpu().numpy())
            # Labels are only compared on the host, so they never need to be
            # copied to the accelerator.
            val_labels.extend(val_labels_batch.numpy())

    # Calculate F1 score. Converted to a built-in float so the checkpoint
    # holds no NumPy scalar, which would need extra allow-listing to load
    # with torch.load(..., weights_only=True).
    val_f1 = float(f1_score(val_labels, val_preds, average='weighted'))
    scheduler.step()

    print(f"Epoch [{epoch+1}/{num_epochs}] | Train Acc: {train_acc:.2f}% | Val F1: {val_f1:.4f}")

    if val_f1 > best_f1:
        best_f1 = val_f1
        torch.save({
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'f1': val_f1,
            'epoch': epoch
        }, 'best_soil_model.pth')
        print(f"Model saved with F1: {val_f1:.4f}")



  original_init(self, **validated_kwargs)
  A.CoarseDropout(max_holes=8, max_height=32, max_width=32, fill_value=0, p=0.5),


Epoch [1/25] | Train Acc: 78.20% | Val F1: 0.8700
Model saved with F1: 0.8700
Epoch [2/25] | Train Acc: 91.91% | Val F1: 0.9346
Model saved with F1: 0.9346
Epoch [3/25] | Train Acc: 96.42% | Val F1: 0.9593
Model saved with F1: 0.9593
Epoch [4/25] | Train Acc: 98.67% | Val F1: 0.9472
Epoch [5/25] | Train Acc: 99.39% | Val F1: 0.9674
Model saved with F1: 0.9674
Epoch [6/25] | Train Acc: 99.59% | Val F1: 0.9594
Epoch [7/25] | Train Acc: 99.59% | Val F1: 0.9675
Model saved with F1: 0.9675
Epoch [8/25] | Train Acc: 99.80% | Val F1: 0.9675
Epoch [9/25] | Train Acc: 99.80% | Val F1: 0.9594
Epoch [10/25] | Train Acc: 99.39% | Val F1: 0.9715
Model saved with F1: 0.9715
Epoch [11/25] | Train Acc: 99.80% | Val F1: 0.9552
Epoch [12/25] | Train Acc: 99.59% | Val F1: 0.9796
Model saved with F1: 0.9796
Epoch [13/25] | Train Acc: 99.69% | Val F1: 0.9716
Epoch [14/25] | Train Acc: 99.80% | Val F1: 0.9633
Epoch [15/25] | Train Acc: 99.80% | Val F1: 0.9796
Epoch [16/25] | Train Acc: 99.90% | Val F1: 0.97

In [None]:
## INFERENCE
#  Prediction with Robust Model Loading
import pickle  # only needed to recognise the restricted-unpickler failure below

# Deterministic preprocessing for inference: resize, centre crop, normalise.
test_transform = A.Compose([
    A.SmallestMaxSize(max_size=256),
    A.CenterCrop(height=224, width=224),
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ToTensorV2()
])

# First try secure loading: weights_only=True restricts unpickling to tensors
# and plain containers. map_location lets a checkpoint saved on a GPU load on
# whichever device is active now.
try:
    checkpoint = torch.load('best_soil_model.pth', map_location=device, weights_only=True)
except pickle.UnpicklingError as e:
    print(f"Secure loading failed: {str(e)}")
    print("Falling back to full unpickling of the locally produced checkpoint")
    # SECURITY: weights_only=False can execute arbitrary code embedded in the
    # file. This is acceptable only because the checkpoint was written by the
    # training cell of this notebook; never use it on a file from elsewhere.
    checkpoint = torch.load('best_soil_model.pth', map_location=device, weights_only=False)

model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

# Predict a label for every image listed in the test CSV, in file order.
test_dataset = TestDataset(test_csv, test_dir, transform=test_transform)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=4)

predictions, filenames = [], []
with torch.no_grad():
    for images, names in test_loader:
        images = images.to(device)
        outputs = model(images)
        preds = outputs.argmax(dim=1)
        # Map class indices back to label strings using the training mapping.
        predictions.extend(full_dataset.idx_to_class[class_idx] for class_idx in preds.tolist())
        filenames.extend(names)

submission = pd.DataFrame({'image_id': filenames, 'soil_type': predictions})
submission.to_csv('submission.csv', index=False)
print(f"Final submission saved with {len(submission)} rows.")

Final submission saved with 341 rows.
