In [None]:
import copy
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

from src.model import get_model


In [None]:
# Dataset locations (Kaggle input directory layout).
TRAIN_IMAGES = '/kaggle/input/soil-classification/soil_classification-2025/train'
TEST_IMAGES = '/kaggle/input/soil-classification/soil_classification-2025/test'
TRAIN_CSV = '/kaggle/input/soil-classification/soil_classification-2025/train_labels.csv'
TEST_CSV = '/kaggle/input/soil-classification/soil_classification-2025/test_ids.csv'

# Load the training label table and the table of test image ids.
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

# Map each soil type to an integer class index. Indices follow the order in
# which the soil types first appear in the training CSV.
soil2idx = {
    soil_name: class_idx
    for class_idx, soil_name in enumerate(pd.unique(train_df['soil_type']))
}
train_df['label'] = train_df['soil_type'].map(soil2idx)

# Hold out 20% of the labelled rows for validation, keeping the class
# proportions equal in both parts; the fixed random_state makes the split repeatable.
train_data, val_data = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
    stratify=train_df['label'],
)


In [None]:
IMG_SIZE = 224
BATCH_SIZE = 32

# Per-channel mean / std applied by Normalize in both pipelines
# (these are the widely used ImageNet channel statistics).
NORM_MEAN = [0.485, 0.456, 0.406]
NORM_STD = [0.229, 0.224, 0.225]

# Training pipeline: fixed-size resize, random horizontal flip as augmentation,
# conversion to a tensor, then channel-wise normalization.
train_transform = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=NORM_MEAN, std=NORM_STD),
])

# Validation pipeline: identical to training except that no random flip is applied.
val_transform = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(mean=NORM_MEAN, std=NORM_STD),
])

class SoilDataset(Dataset):
    """Dataset of soil images listed in a DataFrame and stored in one directory.

    Every row of ``df`` must have an ``image_id`` column; its value is joined
    onto ``img_dir`` to locate the image file. Unless ``is_test`` is true, a
    ``label`` column is read as well.

    Args:
        df: Table describing the samples (one row per image).
        img_dir: Directory that contains the image files.
        transform: Optional callable applied to the loaded RGB PIL image.
        is_test: When true, items are ``(image, image_id)`` instead of
            ``(image, label)``.
    """

    def __init__(self, df, img_dir, transform=None, is_test=False):
        self.img_dir = img_dir
        self.transform = transform
        self.is_test = is_test
        # Renumber the rows 0..n-1 so the integer positions requested by a
        # DataLoader are valid ``.loc`` labels even for a shuffled/split frame.
        self.df = df.reset_index(drop=True)

    def __len__(self):
        """Return the number of samples (rows of the table)."""
        return self.df.shape[0]

    def _load_image(self, file_name):
        """Open ``file_name`` from ``img_dir`` as RGB and apply the transform, if any."""
        full_path = os.path.join(self.img_dir, file_name)
        picture = Image.open(full_path).convert('RGB')
        if self.transform:
            picture = self.transform(picture)
        return picture

    def __getitem__(self, idx):
        """Return ``(image, image_id)`` in test mode, otherwise ``(image, label)``."""
        file_name = self.df.loc[idx, 'image_id']
        picture = self._load_image(file_name)
        # The label column is only touched for labelled (non-test) data.
        companion = file_name if self.is_test else self.df.loc[idx, 'label']
        return picture, companion

# Datasets: both splits read their files from TRAIN_IMAGES; the training split
# uses the augmenting transform, the validation split the plain one.
train_ds = SoilDataset(df=train_data, img_dir=TRAIN_IMAGES, transform=train_transform)
val_ds = SoilDataset(df=val_data, img_dir=TRAIN_IMAGES, transform=val_transform)

# Dataloaders: only the training loader shuffles its samples.
train_loader = DataLoader(dataset=train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_ds, batch_size=BATCH_SIZE, shuffle=False)


In [None]:
def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over ``loader`` and return ``(mean batch loss, macro F1)``.

    When ``optimizer`` is given the model is put in training mode and its
    weights are updated after every batch; otherwise the model is put in
    evaluation mode and gradients are disabled.
    """
    training = optimizer is not None
    model.train(training)

    total_loss, preds, targets = 0.0, [], []
    with torch.set_grad_enabled(training):
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            if training:
                optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            if training:
                loss.backward()
                optimizer.step()
            total_loss += loss.item()
            preds.extend(torch.argmax(outputs, 1).cpu().numpy())
            targets.extend(labels.cpu().numpy())

    # Mean of the per-batch losses (not weighted by batch size).
    return total_loss / len(loader), f1_score(targets, preds, average='macro')


def train_model(model, train_loader, val_loader, criterion, optimizer, device, epochs=10):
    """Train ``model`` and return it carrying the weights of its best validation epoch.

    Each epoch runs one training pass over ``train_loader`` followed by one
    evaluation pass over ``val_loader``, printing the mean batch loss and the
    macro-averaged F1 score of both. The weights of the epoch with the highest
    validation F1 are restored before returning.

    Args:
        model: Network to optimise; modified in place.
        train_loader: Iterable of ``(images, labels)`` batches supporting ``len()``.
        val_loader: Iterable of ``(images, labels)`` batches supporting ``len()``.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer that updates the parameters of ``model``.
        device: Device every batch is moved to before the forward pass.
        epochs: Number of train + validation rounds.

    Returns:
        The same ``model`` object, loaded with the best-scoring weights.
    """
    # state_dict() returns references to the live tensors, so the snapshot has
    # to be deep-copied; otherwise it silently follows every later optimizer
    # step and "restoring the best weights" restores the last ones.
    best_model_wts = copy.deepcopy(model.state_dict())
    # Start below any reachable score so that the first epoch always replaces
    # the untrained baseline, even when its F1 is exactly 0.
    best_f1 = float("-inf")

    for epoch in range(epochs):
        print(f"Epoch {epoch+1}/{epochs}")

        train_loss, train_f1 = _run_epoch(model, train_loader, criterion, device, optimizer)
        print(f"Train Loss: {train_loss:.4f} | F1: {train_f1:.4f}")

        val_loss, val_f1 = _run_epoch(model, val_loader, criterion, device)
        print(f"Val Loss: {val_loss:.4f} | F1: {val_f1:.4f}")

        if val_f1 > best_f1:
            best_f1 = val_f1
            best_model_wts = copy.deepcopy(model.state_dict())
            print("✅ New best model")

    model.load_state_dict(best_model_wts)
    return model


In [None]:
# Training configuration
SEED = 42
LEARNING_RATE = 1e-4
EPOCHS = 10

# Seed NumPy and PyTorch so that shuffling, random augmentation and any random
# weight initialisation start from the same state on every Restart & Run All.
# NOTE(review): some GPU kernels can still be nondeterministic, and whether
# get_model draws from these generators is not visible here -- confirm.
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_classes = len(soil2idx)  # one output class per encoded soil type

model = get_model(num_classes=num_classes, pretrained=True).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

trained_model = train_model(model, train_loader, val_loader, criterion, optimizer, device, epochs=EPOCHS)
