In [16]:
import pandas as pd
import sqlite3

import os
import torch
from PIL import Image
from torchvision import transforms
from tqdm import tqdm

In [2]:
# Load the raw training table from disk.
train_df = pd.read_csv(filepath_or_buffer="./csiro-biomass/train.csv")

In [3]:
# Keep only the first run of digits of each sample_id and store it as an integer key.
# - astype(str) makes the cell safe to re-run: once the column is integer the
#   `.str` accessor would otherwise raise.
# - expand=False returns a Series (not a one-column DataFrame) for the single group.
# - "int64" is explicit so the IDs never depend on the platform's default int width.
train_df["sample_id"] = (
    train_df["sample_id"]
    .astype(str)
    .str.extract(r"(\d+)", expand=False)
    .astype("int64")
)

In [4]:
# Inspect the long-format training table: each image appears once per target_name.
train_df

Unnamed: 0,sample_id,image_path,Sampling_Date,State,Species,Pre_GSHH_NDVI,Height_Ave_cm,target_name,target
0,1011485656,train/ID1011485656.jpg,2015/9/4,Tas,Ryegrass_Clover,0.62,4.6667,Dry_Clover_g,0.0000
1,1011485656,train/ID1011485656.jpg,2015/9/4,Tas,Ryegrass_Clover,0.62,4.6667,Dry_Dead_g,31.9984
2,1011485656,train/ID1011485656.jpg,2015/9/4,Tas,Ryegrass_Clover,0.62,4.6667,Dry_Green_g,16.2751
3,1011485656,train/ID1011485656.jpg,2015/9/4,Tas,Ryegrass_Clover,0.62,4.6667,Dry_Total_g,48.2735
4,1011485656,train/ID1011485656.jpg,2015/9/4,Tas,Ryegrass_Clover,0.62,4.6667,GDM_g,16.2750
...,...,...,...,...,...,...,...,...,...
1780,983582017,train/ID983582017.jpg,2015/9/1,WA,Ryegrass,0.64,9.0000,Dry_Clover_g,0.0000
1781,983582017,train/ID983582017.jpg,2015/9/1,WA,Ryegrass,0.64,9.0000,Dry_Dead_g,0.0000
1782,983582017,train/ID983582017.jpg,2015/9/1,WA,Ryegrass,0.64,9.0000,Dry_Green_g,40.9400
1783,983582017,train/ID983582017.jpg,2015/9/1,WA,Ryegrass,0.64,9.0000,Dry_Total_g,40.9400


In [5]:
# ---------- Parse Sampling_Date into pandas datetimes (no component split happens here) ----------
train_df["Sampling_Date"] = pd.to_datetime(train_df["Sampling_Date"])

In [None]:
# Per-image metadata: these columns repeat on every target row of an image,
# so dropping exact duplicates leaves one row per distinct combination.
samples = train_df.loc[
    :,
    [
        "sample_id",
        "image_path",
        "Sampling_Date",
        "State",
        "Species",
        "Pre_GSHH_NDVI",
        "Height_Ave_cm",
    ],
].drop_duplicates()

In [7]:
# reset_index returns a new frame, so assign it back: otherwise `samples` keeps the
# row labels inherited from train_df and only the displayed copy has a 0..n-1 index.
# Re-running this cell is harmless.
samples = samples.reset_index(drop=True)
samples

Unnamed: 0,sample_id,image_path,Sampling_Date,State,Species,Pre_GSHH_NDVI,Height_Ave_cm
0,1011485656,train/ID1011485656.jpg,2015-09-04,Tas,Ryegrass_Clover,0.62,4.6667
1,1012260530,train/ID1012260530.jpg,2015-04-01,NSW,Lucerne,0.55,16.0000
2,1025234388,train/ID1025234388.jpg,2015-09-01,WA,SubcloverDalkeith,0.38,1.0000
3,1028611175,train/ID1028611175.jpg,2015-05-18,Tas,Ryegrass,0.66,5.0000
4,1035947949,train/ID1035947949.jpg,2015-09-11,Tas,Ryegrass,0.54,3.5000
...,...,...,...,...,...,...,...
352,975115267,train/ID975115267.jpg,2015-07-08,WA,Clover,0.73,3.0000
353,978026131,train/ID978026131.jpg,2015-09-04,Tas,Clover,0.83,3.1667
354,980538882,train/ID980538882.jpg,2015-02-24,NSW,Phalaris,0.69,29.0000
355,980878870,train/ID980878870.jpg,2015-07-08,WA,Clover,0.74,2.0000


In [8]:
# Long-format labels: one row per (sample_id, target_name) with its value.
# .copy() detaches the result from train_df.
targets = train_df.loc[:, ["sample_id", "target_name", "target"]].copy()

In [9]:
# Inspect the long-format (sample_id, target_name, target) table.
targets

Unnamed: 0,sample_id,target_name,target
0,1011485656,Dry_Clover_g,0.0000
1,1011485656,Dry_Dead_g,31.9984
2,1011485656,Dry_Green_g,16.2751
3,1011485656,Dry_Total_g,48.2735
4,1011485656,GDM_g,16.2750
...,...,...,...
1780,983582017,Dry_Clover_g,0.0000
1781,983582017,Dry_Dead_g,0.0000
1782,983582017,Dry_Green_g,40.9400
1783,983582017,Dry_Total_g,40.9400


In [10]:
# Re-attach the per-image metadata to the long-format targets.
# validate="one_to_many" makes pandas raise right here if `samples` holds more than
# one row for a sample_id; without it such duplicates would silently multiply the
# target rows and only show up later as a pivot error.
df_joined = samples.merge(targets, on="sample_id", validate="one_to_many")

# Long -> wide: one row per sample_id, one column per target_name.
# pivot() raises if a (sample_id, target_name) pair occurs more than once.
df_ml = df_joined.pivot(
    index="sample_id",
    columns="target_name",
    values="target",
).reset_index()

In [11]:
# Spot-check a single sample: all of its joined rows, one per target_name.
df_joined.loc[df_joined["sample_id"].eq(4464212)]

Unnamed: 0,sample_id,image_path,Sampling_Date,State,Species,Pre_GSHH_NDVI,Height_Ave_cm,target_name,target
1230,4464212,train/ID4464212.jpg,2015-05-18,Tas,Ryegrass,0.87,6.0,Dry_Clover_g,2.3192
1231,4464212,train/ID4464212.jpg,2015-05-18,Tas,Ryegrass,0.87,6.0,Dry_Dead_g,8.1172
1232,4464212,train/ID4464212.jpg,2015-05-18,Tas,Ryegrass,0.87,6.0,Dry_Green_g,28.4103
1233,4464212,train/ID4464212.jpg,2015-05-18,Tas,Ryegrass,0.87,6.0,Dry_Total_g,38.8467
1234,4464212,train/ID4464212.jpg,2015-05-18,Tas,Ryegrass,0.87,6.0,GDM_g,30.7295


In [12]:
# Inspect the wide table: one row per sample_id, one column per target_name.
df_ml

target_name,sample_id,Dry_Clover_g,Dry_Dead_g,Dry_Green_g,Dry_Total_g,GDM_g
0,4464212,2.3192,8.1172,28.4103,38.8467,30.7295
1,6269659,35.9533,16.3424,2.3346,54.6304,38.2879
2,7850481,19.5700,0.0000,0.0000,19.5700,19.5700
3,8209776,9.9586,14.1080,12.0333,36.1000,21.9920
4,12390962,0.0000,15.2000,14.0000,29.2000,14.0000
...,...,...,...,...,...,...
352,2099464826,0.0000,25.2800,69.5200,94.8000,69.5200
353,2099742797,4.1400,1.3800,15.1800,20.7000,19.3200
354,2125100696,14.8364,4.9455,34.6182,54.4000,49.4545
355,2131261930,6.7440,42.7122,31.4721,80.9283,38.2161


In [13]:
# Persist the two derived tables; index=False keeps the row index out of the files.
# NOTE(review): the displayed frames suggest the two files are in different row
# orders (samples follows train.csv order, the pivoted targets are ordered by
# sample_id) - consumers should join on sample_id rather than align rows by position.
samples.to_csv("./csiro-biomass/samples.csv", index=False)
df_ml.to_csv("./csiro-biomass/targets.csv", index=False)

In [14]:
# Height_Ave_cm: mean and standard deviation (pandas default, ddof=1).
# Only the statistics are computed here; presumably they are used later to
# standardize the column.
height_mean, height_std = (
    samples["Height_Ave_cm"].mean(),
    samples["Height_Ave_cm"].std(),
)
height_mean, height_std

(7.595985434173669, 10.285262364329933)

In [15]:
# Pre_GSHH_NDVI: mean and standard deviation (pandas default, ddof=1).
# These get their own names: storing them in height_mean / height_std would
# overwrite the height statistics computed in the previous cell.
ndvi_mean = samples["Pre_GSHH_NDVI"].mean()
ndvi_std = samples["Pre_GSHH_NDVI"].std()
ndvi_mean, ndvi_std

(0.6574229691876751, 0.1521422782849033)

In [18]:
def compute_image_stats_from_csv(
    csv_path,
    image_root,
    image_col="image_path",
    resize=(224, 224)
):
    """Compute per-channel min, max, mean and std over the images listed in a CSV.

    Every image is opened as RGB, resized to ``resize`` and converted with
    ``transforms.ToTensor()``, so the statistics are on the value scale that
    ``ToTensor`` produces. Images that fail to open are reported and skipped.

    Args:
        csv_path: CSV file with one row per image.
        image_root: Directory joined in front of each path found in ``image_col``.
        image_col: Name of the column holding the image paths.
        resize: Size passed to ``transforms.Resize``.

    Returns:
        dict with keys ``"min"``, ``"max"``, ``"mean"`` and ``"std"``; each value
        is a float32 tensor with one entry per RGB channel.

    Raises:
        ValueError: If no image could be loaded, since the statistics would
            otherwise be NaN / +-inf and silently poison later normalization.
    """
    df = pd.read_csv(csv_path)

    transform = transforms.Compose([
        transforms.Resize(resize),
        transforms.ToTensor()  # PIL image -> float tensor [C, H, W]
    ])

    # Accumulate in float64: float32 running sums lose precision as the pixel
    # count grows, and the E[x^2] - mean^2 formula below amplifies that error.
    channel_sum = torch.zeros(3, dtype=torch.float64)
    channel_sq_sum = torch.zeros(3, dtype=torch.float64)
    channel_min = torch.full((3,), float("inf"))
    channel_max = torch.full((3,), float("-inf"))
    num_pixels = 0

    # Iterate over the path column directly; iterrows() builds a Series per row.
    for rel_path in tqdm(df[image_col], total=len(df)):
        img_path = os.path.join(image_root, rel_path)

        try:
            # The context manager closes the underlying file; convert() returns
            # a fully loaded copy that stays valid afterwards.
            with Image.open(img_path) as raw:
                image = raw.convert("RGB")
        except Exception as e:
            # Deliberate best-effort: report the unreadable image and move on.
            print(f"Skipping {img_path}: {e}")
            continue

        image = transform(image)  # [C, H, W]
        _, h, w = image.shape

        image64 = image.to(torch.float64)
        channel_sum += image64.sum(dim=[1, 2])
        channel_sq_sum += (image64 ** 2).sum(dim=[1, 2])

        channel_min = torch.minimum(channel_min, image.amin(dim=[1, 2]))
        channel_max = torch.maximum(channel_max, image.amax(dim=[1, 2]))

        num_pixels += h * w

    if num_pixels == 0:
        raise ValueError(
            f"No images could be loaded from {csv_path!r} (column {image_col!r})"
        )

    mean = channel_sum / num_pixels
    # Rounding can push a near-zero variance slightly negative; clamp so that
    # sqrt never yields NaN.
    variance = torch.clamp(channel_sq_sum / num_pixels - mean ** 2, min=0.0)
    std = torch.sqrt(variance)

    return {
        "min": channel_min,
        "max": channel_max,
        "mean": mean.to(torch.float32),
        "std": std.to(torch.float32)
    }

In [19]:
# Per-channel statistics of the training images after resizing to 224x224.
compute_image_stats_from_csv(
    csv_path="./csiro-biomass/samples.csv",
    image_root="./csiro-biomass/",
    image_col="image_path",
    resize=(224, 224),
)

100%|██████████| 357/357 [00:20<00:00, 17.40it/s]


{'min': tensor([0.0039, 0.0157, 0.0000]),
 'max': tensor([1., 1., 1.]),
 'mean': tensor([0.4417, 0.5036, 0.3057]),
 'std': tensor([0.1771, 0.1744, 0.1681])}

In [20]:
def compute_mean_std_from_csv(csv_path, target_columns):
    """Return the per-column mean and standard deviation of selected CSV columns.

    Values that cannot be parsed as numbers, as well as missing values, are
    counted as 0.0 before the statistics are taken. The standard deviation is
    the pandas default (ddof=1).

    Args:
        csv_path: Path of the CSV file to read.
        target_columns: List of column names to summarize.

    Returns:
        Tuple ``(mean, std)`` of float32 tensors with one entry per column, in
        the order given by ``target_columns``.
    """
    table = pd.read_csv(csv_path)

    # Coerce to numeric: anything unparseable becomes NaN and is then zero-filled.
    coerced = table[target_columns].apply(pd.to_numeric, errors="coerce")
    table[target_columns] = coerced.fillna(0.0)

    selected = table[target_columns]
    mean = torch.tensor(selected.mean().values, dtype=torch.float32)
    std = torch.tensor(selected.std().values, dtype=torch.float32)
    return mean, std


# Target columns of targets.csv to summarize.
target_columns = ["Dry_Clover_g", "Dry_Dead_g", "Dry_Green_g", "Dry_Total_g", "GDM_g"]

# Per-target mean / std, returned in the same order as target_columns.
mean, std = compute_mean_std_from_csv("./csiro-biomass/targets.csv", target_columns)

print("Mean:", mean)
print("Std :", std)

Mean: tensor([ 6.6497, 12.0445, 26.6247, 45.3181, 33.2744])
Std : tensor([12.1178, 12.4020, 25.4012, 27.9840, 24.9358])
