In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from PIL import Image
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms


In [2]:
data_dir = "/kaggle/input/soil-classification-part-2/soil_competition-2025"
train_csv = os.path.join(data_dir, "train_labels.csv")
train_img_dir = os.path.join(data_dir, "train")
test_img_dir = os.path.join(data_dir, "test")


In [3]:
df = pd.read_csv(train_csv)

In [4]:
fake_neg = df.sample(300, random_state=42).copy()
fake_neg['label'] = 0
df_balanced = pd.concat([df, fake_neg], ignore_index=True)

In [5]:
train_df, val_df = train_test_split(df_balanced, test_size=0.2, stratify=df_balanced['label'], random_state=42)

In [6]:
class SoilDataset(Dataset):
    def __init__(self, df, img_dir, transform=None, test=False):
        self.df = df
        self.img_dir = img_dir
        self.transform = transform
        self.test = test

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        img_name = self.df.iloc[idx]['image_id']
        img_path = os.path.join(self.img_dir, img_name)
        image = Image.open(img_path).convert("RGB")
        if self.transform:
            image = self.transform(image)
        if self.test:
            return image, img_name
        else:
            label = self.df.iloc[idx]['label']
            return image, label

In [7]:
transform_train = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225])
])

transform_test = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225])
])

In [8]:
batch_size = 32
train_loader = DataLoader(SoilDataset(train_df, train_img_dir, transform=transform_train), batch_size=batch_size, shuffle=True)
val_loader = DataLoader(SoilDataset(val_df, train_img_dir, transform=transform_test), batch_size=batch_size)
test_loader = DataLoader(SoilDataset(pd.DataFrame({'image_id': os.listdir(test_img_dir)}), test_img_dir, transform=transform_test, test=True), batch_size=batch_size)

In [9]:
import torch.nn as nn
from torchvision import models

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = models.resnet18(weights=None)

model.fc = nn.Sequential(
    nn.Linear(model.fc.in_features, 1),
    nn.Sigmoid()
)

model = model.to(device)

In [10]:
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

In [11]:
def train_model(model, epochs=10):
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for images, labels in tqdm(train_loader):
            images = images.to(device)
            labels = labels.float().unsqueeze(1).to(device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch [{epoch+1}/{epochs}] - Train Loss: {total_loss/len(train_loader):.4f}")
        evaluate_model(model)
        scheduler.step()

In [12]:
def evaluate_model(model):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for images, labels in val_loader:
            images = images.to(device)
            outputs = model(images)
            preds = (outputs > 0.5).int().cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.numpy())
    f1 = f1_score(all_labels, all_preds)
    print(f"Validation F1 Score: {f1:.4f}")

In [13]:
train_model(model, epochs=10)

100%|██████████| 39/39 [02:50<00:00,  4.38s/it]


Epoch [1/10] - Train Loss: 0.5263
Validation F1 Score: 0.8909


100%|██████████| 39/39 [02:41<00:00,  4.14s/it]


Epoch [2/10] - Train Loss: 0.5067
Validation F1 Score: 0.8909


100%|██████████| 39/39 [02:44<00:00,  4.22s/it]


Epoch [3/10] - Train Loss: 0.5418
Validation F1 Score: 0.8909


100%|██████████| 39/39 [02:43<00:00,  4.19s/it]


Epoch [4/10] - Train Loss: 0.5013
Validation F1 Score: 0.8909


100%|██████████| 39/39 [02:41<00:00,  4.14s/it]


Epoch [5/10] - Train Loss: 0.5341
Validation F1 Score: 0.8909


100%|██████████| 39/39 [02:41<00:00,  4.15s/it]


Epoch [6/10] - Train Loss: 0.4968
Validation F1 Score: 0.8909


100%|██████████| 39/39 [02:39<00:00,  4.09s/it]


Epoch [7/10] - Train Loss: 0.4935
Validation F1 Score: 0.8909


100%|██████████| 39/39 [02:44<00:00,  4.21s/it]


Epoch [8/10] - Train Loss: 0.4915
Validation F1 Score: 0.8909


100%|██████████| 39/39 [02:40<00:00,  4.13s/it]


Epoch [9/10] - Train Loss: 0.4960
Validation F1 Score: 0.8909


100%|██████████| 39/39 [02:41<00:00,  4.14s/it]


Epoch [10/10] - Train Loss: 0.5258
Validation F1 Score: 0.8909


In [14]:
model.eval()
preds = []
file_names = []

with torch.no_grad():
    for images, names in tqdm(test_loader):
        images = images.to(device)
        outputs = model(images)
        predicted = (outputs > 0.5).int().cpu().numpy()
        preds.extend(predicted.flatten())
        file_names.extend(names)

100%|██████████| 31/31 [00:46<00:00,  1.51s/it]


In [15]:
submission = pd.DataFrame({
    'image_id': file_names,
    'label': preds
})

submission.to_csv("submission.csv", index=False)
submission.head()

Unnamed: 0,image_id,label
0,465084323936570da664f0ca8dc90326.jpg,1
1,1aa0b12029d35e778dba5bff1255c638.jpg,1
2,6df2c3dcd4fb59298c7a73467ea72eeb.jpg,1
3,107f25ebd87f581ea57c630a2dcdf50c.jpg,1
4,dc35d58782615e4f9582c6b32c8b956e.jpg,1
