Install + Imports

In [10]:
!pip install torch torchvision pandas scikit-learn --quiet

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
import pandas as pd
import numpy as np
from PIL import Image
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from datetime import datetime
from tqdm import tqdm
import pathlib


Load CSV + Preprocess Metadata

In [5]:
df = pd.read_csv("train.csv")

# Convert date to cyclic features
df["date"] = pd.to_datetime(df["Sampling_Date"])
df["dayofyear"] = df["date"].dt.dayofyear
df["date_sin"] = np.sin(2 * np.pi * df["dayofyear"] / 365.25)
df["date_cos"] = np.cos(2 * np.pi * df["dayofyear"] / 365.25)

# Categorical encoding
enc = OneHotEncoder(sparse_output=False)
cat_features = enc.fit_transform(df[["State", "Species"]])

# Numeric features
num_features = df[["Pre_GSHH_NDVI", "Height_Ave_cm"]].values
scaler = StandardScaler()
num_scaled = scaler.fit_transform(num_features)

# Final tabular input
tabular = np.hstack([df["date_sin"].values.reshape(-1,1),
                     df["date_cos"].values.reshape(-1,1),
                     num_scaled,
                     cat_features])

targets = df["target"].values

train_df, val_df, tab_train, tab_val, y_train, y_val = train_test_split(
    df, tabular, targets, test_size=0.2, random_state=42
)


Dataset Class

In [15]:
class PastureDataset(Dataset):
    def __init__(self, df, tabular_data, targets, transform=None):
        self.df = df.reset_index(drop=True)
        self.tabular = torch.tensor(tabular_data, dtype=torch.float32)
        self.targets = torch.tensor(targets, dtype=torch.float32)
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img_path = pathlib.Path(row["image_path"])
        img = Image.open(img_path).convert("RGB")
        
        if self.transform:
            img = self.transform(img)

        tab = self.tabular[idx]
        y = self.targets[idx]
        return img, tab, y


Transforms + Dataloaders

In [16]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])

train_dataset = PastureDataset(train_df, tab_train, y_train, transform)
val_dataset   = PastureDataset(val_df,   tab_val,   y_val,   transform)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader   = DataLoader(val_dataset,   batch_size=16, shuffle=False)


Fusion Model

In [None]:
class FusionModel(nn.Module):
    def __init__(self, tab_dim, hidden_dim=256):
        super().__init__()

        self.cnn = models.efficientnet_b0(pretrained=True)
        self.cnn.classifier = nn.Identity()
        image_feat_dim = 1280

        self.tab_mlp = nn.Sequential(
            nn.Linear(tab_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU()
        )

        self.fusion = nn.Sequential(
            nn.Linear(image_feat_dim + 64, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, img, tab):
        img_feat = self.cnn(img)
        tab_feat = self.tab_mlp(tab)
        fused = torch.cat([img_feat, tab_feat], dim=1)
        return self.fusion(fused)


Training Loop + Validation

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = FusionModel(tab_dim=tab_train.shape[1]).to(device)

criterion = nn.L1Loss()  # MAE
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

def evaluate(loader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for img, tab, y in loader:
            img, tab, y = img.to(device), tab.to(device), y.to(device).unsqueeze(1)
            preds = model(img, tab)
            loss = criterion(preds, y)
            total_loss += loss.item()
    return total_loss / len(loader)

for epoch in range(10):
    model.train()
    total_loss = 0
    for img, tab, y in tqdm(train_loader):
        img, tab, y = img.to(device), tab.to(device), y.to(device).unsqueeze(1)

        optimizer.zero_grad()
        preds = model(img, tab)
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    val_loss = evaluate(val_loader)
    print(f"Epoch {epoch+1} | Train: {total_loss/len(train_loader):.4f} | Val: {val_loss:.4f}")


100%|██████████| 90/90 [03:45<00:00,  2.51s/it]


Epoch 1 | Train: 21.8987 | Val: 21.0159


100%|██████████| 90/90 [03:33<00:00,  2.37s/it]


Epoch 2 | Train: 18.7369 | Val: 18.7507


100%|██████████| 90/90 [03:24<00:00,  2.27s/it]


Epoch 3 | Train: 17.4404 | Val: 18.9351


100%|██████████| 90/90 [03:40<00:00,  2.46s/it]


Epoch 4 | Train: 17.2124 | Val: 18.9372


100%|██████████| 90/90 [03:34<00:00,  2.39s/it]


Epoch 5 | Train: 16.6183 | Val: 19.3267


100%|██████████| 90/90 [03:30<00:00,  2.34s/it]


Save trained model weights (run after training)

In [None]:
# Save model weights and optionally the whole checkpoint (so you can reload later)
checkpoint_path = "fusion_model_final.pth"
torch.save({
    'model_state_dict': model.state_dict(),
    'enc_categories': enc.categories_,    # saves categories for later use
    'scaler_mean': scaler.mean_,
    'scaler_scale': scaler.scale_,
    'tabular_dim': tab_train.shape[1]
}, checkpoint_path)
print("Saved checkpoint to", checkpoint_path)


Prepare test.csv (encoding must match train)

In [None]:
# Load test.csv and compute same tabular representation
test_df = pd.read_csv("test.csv")

# If Sampling_Date might be missing or formatted differently, handle safely
test_df["date"] = pd.to_datetime(test_df["Sampling_Date"], errors='coerce')
test_df["dayofyear"] = test_df["date"].dt.dayofyear.fillna(1).astype(int)  # default day 1 if missing
test_df["date_sin"] = np.sin(2 * np.pi * test_df["dayofyear"] / 365.25)
test_df["date_cos"] = np.cos(2 * np.pi * test_df["dayofyear"] / 365.25)

# Ensure categorical columns exist (State, Species)
# If there are unseen categories in test, OneHotEncoder will fail — handle by mapping unseen to zero vector.
def safe_onehot(encoder, df, cols):
    # Build manual one-hot using encoder.categories_
    arrays = []
    for i, col in enumerate(cols):
        cats = encoder.categories_[i]
        vals = df[col].astype(str).values
        onehot = np.zeros((len(vals), len(cats)), dtype=float)
        for j, v in enumerate(vals):
            # find index if present
            try:
                idx = np.where(cats == v)[0][0]
                onehot[j, idx] = 1.0
            except IndexError:
                # unseen category -> all zeros (you might prefer to map to 'unknown' if present)
                pass
        arrays.append(onehot)
    return np.hstack(arrays)

cat_cols = ["State", "Species"]
if set(cat_cols).issubset(test_df.columns):
    cat_features_test = safe_onehot(enc, test_df, cat_cols)
else:
    # if missing, create empty
    cat_features_test = np.zeros((len(test_df), sum(len(c) for c in enc.categories_)), dtype=float)

# Numeric features (must match the order used in training)
num_feats_test = test_df[["Pre_GSHH_NDVI", "Height_Ave_cm"]].fillna(0).values.astype(float)
# Apply training scaler: use stored mean/scale (scaler object should exist)
num_scaled_test = (num_feats_test - scaler.mean_) / scaler.scale_

tabular_test = np.hstack([
    test_df["date_sin"].values.reshape(-1,1),
    test_df["date_cos"].values.reshape(-1,1),
    num_scaled_test,
    cat_features_test
])

print("Prepared tabular_test shape:", tabular_test.shape)


Test Dataset + DataLoader

In [None]:
class TestPastureDataset(Dataset):
    def __init__(self, df, tabular_data, img_folder="test", transform=None, img_col="image_path"):
        self.df = df.reset_index(drop=True)
        self.tabular = torch.tensor(tabular_data, dtype=torch.float32)
        self.transform = transform
        self.img_folder = img_folder
        self.img_col = img_col

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img_path = os.path.join(self.img_folder, row[self.img_col])
        # fallback: if path is absolute in CSV, use it directly
        if not os.path.exists(img_path) and os.path.exists(row[self.img_col]):
            img_path = row[self.img_col]
        img = Image.open(img_path).convert("RGB")
        if self.transform:
            img = self.transform(img)
        tab = self.tabular[idx]
        return img, tab, row["sample_id"]

test_dataset = TestPastureDataset(test_df, tabular_test, img_folder="test", transform=transform, img_col="image_path")
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
print("Test samples:", len(test_dataset))


Run Inference & Save Predictions

In [None]:
model.eval()
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

preds = []
sample_ids = []

with torch.no_grad():
    for imgs, tabs, ids in tqdm(test_loader, desc="Infer"):
        imgs = imgs.to(device)
        tabs = tabs.to(device)
        out = model(imgs, tabs)  # shape (B,1) or (B,)
        out = out.squeeze(1).cpu().numpy()
        preds.extend(out.tolist())
        sample_ids.extend(list(ids))

# Build DataFrame and save
submission = pd.DataFrame({
    "sample_id": sample_ids,
    "prediction": preds
})
submission.to_csv("submission.csv", index=False)
print("Saved submission.csv with", len(submission), "rows")
submission.head()
