In [3]:
# training.py

import os
import pandas as pd
import numpy as np
from PIL import Image
from tqdm import tqdm
import torch
from torchvision import transforms, models
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import f1_score
import joblib  # For saving sklearn models

# Dataset locations (paths under the Kaggle input directory)
TRAIN_LABELS_CSV = "/kaggle/input/soil-binary/soil_competition-2025/train_labels.csv"
TRAIN_DIR = "/kaggle/input/soil-binary/soil_competition-2025/train"

# Load the label table and keep the image identifiers as a plain Python list;
# they are later joined onto TRAIN_DIR to build full file paths.
train_df = pd.read_csv(TRAIN_LABELS_CSV)
image_id_column = train_df["image_id"]
train_ids = image_id_column.tolist()

# Transforms
def _tensor_tail():
    """Return fresh ToTensor + Normalize steps that close both pipelines."""
    return [
        transforms.ToTensor(),
        # Per-channel mean / std (the usual ImageNet statistics).
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]


# Deterministic pipeline: resize, convert to tensor, normalize.
normal_tf = transforms.Compose([
    transforms.Resize((300, 300)),
    *_tensor_tail(),
])

# Heavily randomized pipeline (affine warp, flip, color jitter, blur) applied
# between the same resize and tensor/normalize steps as normal_tf.
anomaly_tf = transforms.Compose([
    transforms.Resize((300, 300)),
    transforms.RandomAffine(degrees=45, scale=(0.4, 1.5), shear=30),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.9, contrast=0.9, saturation=0.9, hue=0.2),
    transforms.GaussianBlur(kernel_size=5),
    *_tensor_tail(),
])

# Feature extractor
# Run on the GPU when one is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# EfficientNet-B3 with its default pretrained weights, moved to the device.
base_model = models.efficientnet_b3(weights=models.EfficientNet_B3_Weights.DEFAULT)
base_model = base_model.to(device)

# Keep every top-level child module except the last one, chained in order,
# and switch the result to evaluation mode.
backbone_layers = list(base_model.children())[:-1]
feature_extractor = torch.nn.Sequential(*backbone_layers)
feature_extractor.eval()

def extract_feats(paths, tf):
    """Compute one flattened feature vector per image.

    Each image is opened, converted to RGB, passed through ``tf`` and then
    through the module-level ``feature_extractor`` (on ``device``) with
    gradients disabled.

    Args:
        paths: Iterable of image file paths.
        tf: Callable applied to the RGB PIL image; its result must be a
            tensor, since a batch dimension is added with ``unsqueeze(0)``.

    Returns:
        A NumPy array with one row per entry in ``paths``, in input order.
    """
    feats = []
    for p in tqdm(paths, desc="Extracting"):
        # Image.open is lazy and may keep the file handle open (notably for
        # multi-frame formats); the context manager releases it as soon as
        # convert() has produced an in-memory copy.
        with Image.open(p) as raw_img:
            img = raw_img.convert("RGB")
        img_t = tf(img).unsqueeze(0).to(device)
        with torch.no_grad():
            # Collapse everything but the batch dimension into one vector.
            feat = feature_extractor(img_t).view(1, -1).cpu().numpy()
        feats.append(feat[0])
    return np.array(feats)

def get_paths(ids, dir_path):
    """Return ``dir_path`` joined with every entry of ``ids``, in order."""
    def _in_dir(name):
        return os.path.join(dir_path, name)

    return list(map(_in_dir, ids))

# Prepare paths: build full file paths, then hold out 20% for validation
train_paths = get_paths(train_ids, TRAIN_DIR)
train_paths, val_paths = train_test_split(train_paths, test_size=0.2, random_state=42)

# Extract features
X_all = extract_feats(train_paths, normal_tf)
X_val_norm = extract_feats(val_paths, normal_tf)
# anomaly_tf is made of random transforms that draw from torch's global RNG.
# Seed it so the synthetic anomalies (and the threshold tuned on them) are
# reproducible, in line with the fixed random_state used for the split.
torch.manual_seed(42)
X_val_ano  = extract_feats(val_paths, anomaly_tf)
X_val = np.vstack([X_val_norm, X_val_ano])
# Labels: 1 for features of the untouched validation images, 0 for features
# of the same images after the random anomaly_tf augmentation.
y_val = np.concatenate([np.ones(len(X_val_norm)), np.zeros(len(X_val_ano))])

# Train ensemble: one IsolationForest and one scaled One-Class SVM per fold,
# each fitted on that fold's training portion of X_all.
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)
iso_models, ocsvm_models, scalers = [], [], []

print("Training K-Fold Ensemble...")
for fold, (keep_idx, _held_out_idx) in enumerate(kf.split(X_all)):
    fold_feats = X_all[keep_idx]

    # The SVM is trained on standardized features; the scaler is kept so the
    # same transformation can be applied at scoring time.
    fold_scaler = StandardScaler().fit(fold_feats)
    svm = OneClassSVM(kernel="rbf", gamma="scale", nu=0.01)
    svm.fit(fold_scaler.transform(fold_feats))
    scalers.append(fold_scaler)
    ocsvm_models.append(svm)

    # The forest works on the raw features and is seeded with the fold index.
    forest = IsolationForest(n_estimators=200, contamination=0.1, random_state=fold)
    forest.fit(fold_feats)
    iso_models.append(forest)

# Validation scoring: average each detector family's decision values over
# the folds, then average the two families with equal weight.
iso_fold_scores = [forest.decision_function(X_val) for forest in iso_models]
iso_scores = np.mean(iso_fold_scores, axis=0)

ocsvm_fold_scores = []
for svm, fold_scaler in zip(ocsvm_models, scalers):
    # Each SVM must see the data through the scaler it was trained with.
    ocsvm_fold_scores.append(svm.decision_function(fold_scaler.transform(X_val)))
ocsvm_scores = np.mean(ocsvm_fold_scores, axis=0)

# NOTE(review): the two detectors' decision values are presumably not on a
# common scale, so this plain average may be dominated by one of them --
# confirm before changing, since inference must combine scores the same way.
combined_scores = (iso_scores + ocsvm_scores) / 2

# Threshold tuning: scan 100 evenly spaced cut-offs across the observed score
# range and keep the first one that reaches the highest F1.
lowest_score, highest_score = min(combined_scores), max(combined_scores)
best_f1, best_thresh = 0, None
for cutoff in np.linspace(lowest_score, highest_score, 100):
    # Scores at or above the cut-off are predicted as class 1.
    cutoff_f1 = f1_score(y_val, (combined_scores >= cutoff).astype(int))
    if cutoff_f1 > best_f1:
        best_f1, best_thresh = cutoff_f1, cutoff

print(f"Best ensemble F1: {best_f1:.4f} at threshold {best_thresh:.4f}")

# Persist every fold's artifacts plus the tuned threshold under trained_models/
os.makedirs("trained_models", exist_ok=True)

for i, fold_members in enumerate(zip(iso_models, ocsvm_models, scalers)):
    forest, svm, fitted_scaler = fold_members
    # Destination path -> object; dict order keeps the iso/ocsvm/scaler order.
    outputs = {
        f"trained_models/iso_{i}.pkl": forest,
        f"trained_models/ocsvm_{i}.pkl": svm,
        f"trained_models/scaler_{i}.pkl": fitted_scaler,
    }
    for out_path, artifact in outputs.items():
        joblib.dump(artifact, out_path)

# The threshold is stored on its own as plain text
with open("trained_models/best_thresh.txt", "w") as thresh_file:
    thresh_file.write(str(best_thresh))

print("Training completed and models saved.")


Extracting: 100%|██████████| 977/977 [00:23<00:00, 42.22it/s]
Extracting: 100%|██████████| 245/245 [00:05<00:00, 42.67it/s]
Extracting: 100%|██████████| 245/245 [00:08<00:00, 28.25it/s]


Training K-Fold Ensemble...
Best ensemble F1: 0.8014 at threshold -0.0111
Training completed and models saved.
