In [10]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
    #for filename in filenames:
        #print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Imports

In [11]:
import os
from pathlib import Path
from PIL import Image
import pandas as pd
from tqdm import tqdm

print("Done")

Done


In [12]:
# ---- Where the data comes from ----
# Image paths stored in the CSV look like 'train/ID...jpg', so they resolve against IMG_ROOT.
IMG_ROOT = Path("/kaggle/input/csiro-biomass")
CSV_IN = Path("/kaggle/input/csiro-biomass/train.csv")

# ---- Which augmented variants to generate ----
CREATE_ROT180 = True
CREATE_FLIP_X = True
CREATE_FLIP_Y = True

# ---- Where results go ----
OUT_CSV = Path("/kaggle/working/train_augmented.csv")
OUT_IMG_DIR = Path("/kaggle/working/train_aug")

# Create the image output folder up front; exist_ok makes the cell safe to re-run.
OUT_IMG_DIR.mkdir(parents=True, exist_ok=True)

In [13]:
# helper function to create filenames
def make_augmented_filename(orig_filename: str, suffix: str):
    """Build the filename for an augmented copy of an image.

    The suffix is inserted between the stem and the extension of
    ``orig_filename``. Only the final path component is used, so any
    directory part of ``orig_filename`` is dropped from the result.

    Args:
        orig_filename: Original file name (or path) of the image.
        suffix: Text to append to the stem, e.g. ``"_rot180"``.

    Returns:
        The new file name as a string, e.g. ``"ID1_rot180.jpg"``.
    """
    source = Path(orig_filename)
    stem, extension = source.stem, source.suffix
    return stem + suffix + extension

In [16]:
# Load the original label table. The same image_path can appear on several rows,
# so images are augmented once per unique path and rows are duplicated afterwards.
df = pd.read_csv(CSV_IN)

# One entry per augmentation: (filename suffix, enabled flag, transform on a PIL image).
# "flip x" mirrors about the horizontal axis (top <-> bottom),
# "flip y" mirrors about the vertical axis (left <-> right).
augmentations = [
    ("_rot180", CREATE_ROT180, lambda im: im.rotate(180)),
    ("_flipx", CREATE_FLIP_X, lambda im: im.transpose(Image.FLIP_TOP_BOTTOM)),
    ("_flipy", CREATE_FLIP_Y, lambda im: im.transpose(Image.FLIP_LEFT_RIGHT)),
]
active_augmentations = [(sfx, fn) for sfx, enabled, fn in augmentations if enabled]
active_suffixes = [sfx for sfx, _ in active_augmentations]

unique_image_paths = df['image_path'].unique()

# All augmented files are written flat into OUT_IMG_DIR under their basename, so two
# source images sharing a basename (in different folders) would overwrite each other.
# Fail loudly instead of silently producing mislabeled images.
basename_owner = {}
for img_rel in unique_image_paths:
    name = Path(img_rel).name
    if name in basename_owner:
        raise ValueError(
            f"Image basename collision: {img_rel!r} and {basename_owner[name]!r} "
            f"would write to the same augmented files in {OUT_IMG_DIR}"
        )
    basename_owner[name] = img_rel

# Maps each original image_path to the augmented filenames written for it,
# in the same order as active_suffixes.
created_images = {}

for img_rel in tqdm(unique_image_paths, desc="Augmenting images"):
    basename = Path(img_rel).name

    # Image.open is lazy and keeps the file open; convert() forces a full load and
    # returns an independent image, so the file can be closed right afterwards.
    with Image.open(IMG_ROOT / img_rel) as src_img:
        img = src_img.convert("RGB")

    created_images[img_rel] = []
    for suffix, transform in active_augmentations:
        out_name = make_augmented_filename(basename, suffix)
        transform(img).save(OUT_IMG_DIR / out_name, quality=95)
        created_images[img_rel].append(out_name)

# Now duplicate CSV rows for each augmented image
aug_rows = []
for _, row in tqdm(df.iterrows(), total=len(df), desc="Building augmented CSV rows"):
    img_rel = row['image_path']
    # Pair each written file with the suffix that produced it. Using the known suffix
    # (instead of stripping the stem out of the filename) stays correct even when the
    # stem text happens to occur inside the suffix.
    for suffix, out_name in zip(active_suffixes, created_images[img_rel]):
        new_row = row.copy()
        # append suffix to sample_id to keep names unique
        new_row['sample_id'] = row['sample_id'] + suffix
        # Point image_path at the augmented file, relative to the working directory
        # (i.e. 'train_aug/<filename>'); adjust if the CSV consumer uses another root.
        new_row['image_path'] = str(Path("train_aug") / out_name)
        aug_rows.append(new_row)

# Create new dataframe and merge with original.
# Passing columns keeps the schema intact even when every augmentation is disabled.
df_aug = pd.DataFrame(aug_rows, columns=df.columns)
final_df = pd.concat([df, df_aug], ignore_index=True)

print(f"Original rows: {len(df):,}")
print(f"Augmented rows added: {len(df_aug):,}")
print(f"Final rows: {len(final_df):,}")

Augmenting images: 100%|██████████| 357/357 [00:43<00:00,  8.24it/s]
Building augmented CSV rows: 100%|██████████| 1785/1785 [00:00<00:00, 3962.77it/s]


Original rows: 1,785
Augmented rows added: 5,355
Final rows: 7,140


In [17]:
# Persist the combined (original + augmented) table, leaving out the index column.
final_df.to_csv(OUT_CSV, index=False)

# Report both output locations in one call; sep="\n" keeps one message per line.
print(
    f"Augmented images saved to: {OUT_IMG_DIR}",
    f"Augmented CSV saved to: {OUT_CSV}",
    sep="\n",
)

# Preview only the first few augmented rows.
display(df_aug.head())

Augmented images saved to: /kaggle/working/train_aug
Augmented CSV saved to: /kaggle/working/train_augmented.csv


Unnamed: 0,sample_id,image_path,Sampling_Date,State,Species,Pre_GSHH_NDVI,Height_Ave_cm,target_name,target
0,ID1011485656__Dry_Clover_g_rot180,train_aug/ID1011485656_rot180.jpg,2015/9/4,Tas,Ryegrass_Clover,0.62,4.6667,Dry_Clover_g,0.0
0,ID1011485656__Dry_Clover_g_flipx,train_aug/ID1011485656_flipx.jpg,2015/9/4,Tas,Ryegrass_Clover,0.62,4.6667,Dry_Clover_g,0.0
0,ID1011485656__Dry_Clover_g_flipy,train_aug/ID1011485656_flipy.jpg,2015/9/4,Tas,Ryegrass_Clover,0.62,4.6667,Dry_Clover_g,0.0
1,ID1011485656__Dry_Dead_g_rot180,train_aug/ID1011485656_rot180.jpg,2015/9/4,Tas,Ryegrass_Clover,0.62,4.6667,Dry_Dead_g,31.9984
1,ID1011485656__Dry_Dead_g_flipx,train_aug/ID1011485656_flipx.jpg,2015/9/4,Tas,Ryegrass_Clover,0.62,4.6667,Dry_Dead_g,31.9984
