In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from pathlib import Path
import pandas as pd

# Root of the competition data as mounted inside the Kaggle container.
BASE_DIR = Path('/kaggle/input/csiro-biomass')

# Per-split directories. Train_DIR must point at 'train', not at the same
# folder as Test_DIR. Neither constant is referenced elsewhere in the visible
# notebook; image paths are resolved against BASE_DIR directly.
Train_DIR = BASE_DIR / 'train'
Test_DIR = BASE_DIR / 'test'

train_csv_path = BASE_DIR / 'train.csv'
test_csv_path = BASE_DIR / 'test.csv'

# Tabular metadata for both splits; `df` is the training frame used throughout.
df = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

In [None]:
# Preview the first rows of the training table (rendered as the cell output).
df.head()

In [None]:
# Structural summary of the training frame: dimensions, total cell count,
# and the number of missing cells summed over every column.
null_total = df.isnull().sum().sum()
print('Shape:', df.shape)
print('Size:', df.size)
print('No. of Nulls:', null_total)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Silence library warnings for the rest of the session and use seaborn's dark grid.
warnings.filterwarnings('ignore')
sns.set_theme(style='darkgrid')


def _shorten_species(name):
    """Replace underscores with spaces and keep at most the first three words."""
    words = name.replace('_', ' ').split()
    return ' '.join(words[:3])


# Shortened labels keep the x-axis of the count plot readable.
df['Species_short'] = df['Species'].apply(_shorten_species)

fig, (ax_species, ax_height, ax_target) = plt.subplots(1, 3, figsize=(18, 8))

# Panel 1: row count per shortened species label, most frequent first.
species_order = df['Species_short'].value_counts().index
sns.countplot(data=df, x='Species_short', order=species_order, palette='Set3', ax=ax_species)
plt.setp(ax_species.get_xticklabels(), rotation=45, ha='right')
ax_species.set_title("Species Distribution", fontsize=14)

# Panel 2: average height against the target, coloured by state.
sns.scatterplot(data=df, x='Height_Ave_cm', y='target', hue='State', palette='Set2', ax=ax_height)
ax_height.set_title("Height vs Target by State", fontsize=14)

# Panel 3: target histogram with a KDE overlay, split by state.
sns.histplot(data=df, x='target', hue='State', kde=True, palette='Dark2', ax=ax_target)
ax_target.set_title("Target Distribution by State", fontsize=14)

# Drop the top/right spines on every panel in one pass.
sns.despine(fig=fig)
fig.tight_layout()
plt.show()

In [None]:
from sklearn.preprocessing import OneHotEncoder
from PIL import Image
import numpy as np

def img_to_array(paths, shape):
    """Load images from disk into one stacked NumPy array.

    Args:
        paths: Iterable of image paths, each resolved relative to ``BASE_DIR``.
        shape: Target size handed to ``PIL.Image.resize``; PIL interprets it
            as ``(width, height)``.

    Returns:
        ``np.ndarray`` containing one resized RGB image per entry in
        ``paths``, stacked along the first axis.
    """
    def _load(path):
        # The context manager guarantees the file is closed even when
        # decoding or resizing raises part-way through a large dataset.
        with Image.open(BASE_DIR / path) as img:
            # Force three channels: a single grayscale or RGBA file would
            # otherwise give a differently shaped array and break the stack.
            return np.array(img.convert('RGB').resize(shape))

    return np.array([_load(path) for path in paths])

def log_transform(target):
    """Map non-negative targets to log space; mark everything else with -1.

    Uses ``log1p`` (``log(1 + x)``) rather than ``log``: a target of 0 maps
    to 0 instead of ``-inf``, and every valid target lands in ``[0, inf)``,
    so the ``-1`` sentinel can never collide with a real transformed value.

    Args:
        target: Array-like of raw target values.

    Returns:
        Float ``np.ndarray`` of the same shape: ``log1p(x)`` where ``x >= 0``
        and ``-1`` elsewhere (negative or NaN entries).
    """
    target = np.asarray(target, dtype=float)
    valid = target >= 0
    transformed = np.full(target.shape, -1.0)
    # `where=` evaluates the log only on valid entries, so negative inputs
    # do not trigger invalid-value warnings.
    np.log1p(target, out=transformed, where=valid)
    return transformed


def inverse_transform(log_target):
    """Undo ``log_transform``: map log-space values back to the raw scale.

    Args:
        log_target: Array-like of values in ``log1p`` space.

    Returns:
        Float ``np.ndarray`` of the same shape: ``expm1(x)`` where ``x >= 0``
        and ``-1`` elsewhere, mirroring the sentinel used by
        ``log_transform``.
    """
    log_target = np.asarray(log_target, dtype=float)
    valid = log_target >= 0
    restored = np.full(log_target.shape, -1.0)
    # NOTE(review): negative inputs (e.g. a model prediction slightly below
    # zero) keep the -1 sentinel; clip predictions to >= 0 beforehand if a
    # zero output is preferred.
    np.expm1(log_target, out=restored, where=valid)
    return restored

# Dense one-hot encoder for the species column; the same fitted instance is
# reused on the test frame later in the notebook.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# The encoder expects a 2-D (n_samples, 1) column, hence the reshape.
species_reshaped = df['Species'].values.reshape(-1, 1)

# Three independent model inputs: log-space targets, species one-hots, and
# the training images resized to 384x384.
target = log_transform(df['target'])
one_hot_species = encoder.fit_transform(species_reshaped)
image_arrays = img_to_array(df['image_path'], (384, 384))

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.applications import EfficientNetV2S
from tensorflow.keras import layers, models, regularizers
from tensorflow.keras.applications.efficientnet_v2 import preprocess_input
import os
from tensorflow.keras import layers

# Standalone preprocessing pipeline: resize to 384x384, scale pixels by 1/255,
# then standardise each channel with the fixed statistics below.
# NOTE(review): nothing in the visible notebook calls `preprocess_layer`;
# confirm whether it is still needed.
_CHANNEL_MEAN = [0.485, 0.456, 0.406]
_CHANNEL_STD = (0.229, 0.224, 0.225)

preprocess_layer = tf.keras.Sequential([
    layers.Resizing(384, 384),
    layers.Rescaling(1./255),
    layers.Normalization(
        mean=_CHANNEL_MEAN,
        variance=[std ** 2 for std in _CHANNEL_STD],
    ),
])



def preprocess_image(img):
    """Cast an image to float32 and resize it to the 384x384 model input.

    Args:
        img: Image tensor with pixel values on the 0-255 scale.

    Returns:
        float32 tensor resized to 384x384, still on the 0-255 scale.
    """
    # A plain cast keeps the 0-255 scale for every input dtype.
    # tf.image.convert_image_dtype would rescale integer inputs to [0, 1],
    # which does not match the 0-255 range the EfficientNetV2 backbone's
    # built-in preprocessing expects. For float32 inputs (what make_dataset
    # passes in) both calls are no-ops.
    img = tf.cast(img, tf.float32)
    img = tf.image.resize(img, (384, 384))
    # Kept for API symmetry: for EfficientNetV2 this Keras helper is
    # documented as a pass-through (rescaling lives inside the model).
    img = tf.keras.applications.efficientnet_v2.preprocess_input(img)
    return img

def make_dataset(image_arrays, one_hot_species, targets, batch_size=32, shuffle=True, buffer_size=None):
    """Build a batched ``tf.data`` pipeline for the two-input regressor.

    Args:
        image_arrays: Array-like of images; converted to a float32 tensor.
        one_hot_species: Array-like of one-hot species rows, aligned with
            ``image_arrays`` along the first axis.
        targets: Array-like of regression targets, aligned the same way.
        batch_size: Number of examples per batch.
        shuffle: Whether to shuffle examples (reshuffled every epoch).
        buffer_size: Shuffle buffer size. ``None`` means "use the whole
            dataset", i.e. a full uniform shuffle.

    Returns:
        ``tf.data.Dataset`` yielding
        ``({"image_input": img, "species_input": sp}, y)`` batches.
    """
    image_arrays = tf.convert_to_tensor(image_arrays, dtype=tf.float32)
    one_hot_species = tf.convert_to_tensor(one_hot_species, dtype=tf.float32)
    targets = tf.convert_to_tensor(targets, dtype=tf.float32)

    ds = tf.data.Dataset.from_tensor_slices((image_arrays, one_hot_species, targets))

    def _process(img, sp, y):
        # Keys must match the Input layer names of the model.
        img = preprocess_image(img)
        return ({"image_input": img, "species_input": sp}, y)

    if shuffle:
        if buffer_size is None:
            # Use the static length as a plain Python int. tf.shape(...)
            # returns an int32 tensor, whereas Dataset.shuffle expects an
            # int64 scalar. max(..., 1) keeps the buffer size valid for an
            # empty input.
            buffer_size = max(int(image_arrays.shape[0]), 1)
        ds = ds.shuffle(buffer_size=buffer_size, reshuffle_each_iteration=True)

    # Shuffle before map so the buffer holds raw slices, not processed copies.
    ds = ds.map(_process, num_parallel_calls=tf.data.AUTOTUNE)
    ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return ds


# Fail fast if image loading produced nothing.
num_samples = len(image_arrays)
if not num_samples:
    raise ValueError("image_arrays is empty. please check your dataset loading.")

# Positional 80/20 split: the first 80% of rows train, the remainder validates.
split = int(num_samples * 0.8)
train_rows = slice(None, split)
val_rows = slice(split, None)

# Training pipeline shuffles with a buffer capped at 1000 examples.
train_ds = make_dataset(
    image_arrays[train_rows],
    one_hot_species[train_rows],
    target[train_rows],
    batch_size=32,
    shuffle=True,
    buffer_size=min(1000, split),
)

# Validation pipeline keeps the original row order.
val_ds = make_dataset(
    image_arrays[val_rows],
    one_hot_species[val_rows],
    target[val_rows],
    batch_size=32,
    shuffle=False,
)


def build_efficientnet_regressor(species_dim, fine_tune_at=400):
    """Assemble a two-input regressor on an EfficientNetV2-S backbone.

    Args:
        species_dim: Width of the one-hot species vector.
        fine_tune_at: Number of leading backbone layers to keep frozen;
            layers from this index onward stay trainable.

    Returns:
        Uncompiled ``keras.Model`` taking ``image_input`` (384x384x3) and
        ``species_input`` (``species_dim``,) and producing one scalar.
    """
    backbone = EfficientNetV2S(include_top=False, weights='imagenet', pooling='avg')

    # Mark the whole backbone trainable, then re-freeze its leading layers.
    backbone.trainable = True
    for frozen_layer in backbone.layers[:fine_tune_at]:
        frozen_layer.trainable = False

    img_input = layers.Input(shape=(384, 384, 3), name="image_input")
    sp_input = layers.Input(shape=(species_dim,), name="species_input")

    # Image branch: pooled backbone features.
    image_features = backbone(
        tf.keras.applications.efficientnet_v2.preprocess_input(img_input)
    )
    # Species branch: batch-normalised one-hot vector.
    species_features = layers.BatchNormalization()(sp_input)

    # Fused head: two Dense -> BatchNorm -> Dropout stages, then a linear output.
    head = layers.Concatenate()([image_features, species_features])
    for units, drop_rate in ((512, 0.4), (256, 0.3)):
        head = layers.Dense(
            units,
            activation='relu',
            kernel_regularizer=regularizers.l2(1e-4),
        )(head)
        head = layers.BatchNormalization()(head)
        head = layers.Dropout(drop_rate)(head)

    prediction = layers.Dense(1)(head)
    return models.Model(inputs=[img_input, sp_input], outputs=prediction)


# Best-so-far weights are written under ./checkpoints.
checkpoint_dir = "checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

# EarlyStopping must be more patient than ReduceLROnPlateau. With equal
# patience both fire on the same epoch, so training stops at the moment the
# learning rate is first lowered and the reduction never takes effect.
# A patience of 7 leaves room for two LR cuts (after 3 and 6 stagnant epochs)
# before giving up.
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=7, restore_best_weights=True),
    tf.keras.callbacks.ModelCheckpoint(filepath=os.path.join(checkpoint_dir, 'best_model.keras'),
                                       monitor='val_loss', save_best_only=True),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=1),
    tf.keras.callbacks.TensorBoard(log_dir='logs')
]

# Size the species input from the width of the fitted one-hot encoding.
species_dim = one_hot_species.shape[1]
model = build_efficientnet_regressor(species_dim, fine_tune_at=400)

# Targets are log-transformed upstream, so the MSE loss and the RMSE metric
# are both measured in log space.
adam = tf.keras.optimizers.Adam(learning_rate=1e-5)
rmse_metric = tf.keras.metrics.RootMeanSquaredError(name='rmse')
model.compile(optimizer=adam, loss='mse', metrics=[rmse_metric])

# Up to 100 epochs; the callbacks decide when to stop and what to checkpoint.
history = model.fit(train_ds, validation_data=val_ds, epochs=100, callbacks=callbacks)

# evaluate() returns the loss followed by the compiled metrics, in order.
val_loss, val_rmse = model.evaluate(val_ds)
print(f"Validation RMSE: {val_rmse:.4f}")


In [None]:
# Show the dimensions of the stacked training-image array built by img_to_array.
image_arrays.shape

In [None]:
# Encode the test species with the encoder fitted on the training data
# (categories unseen in training are ignored, per handle_unknown='ignore').
test_species = df_test['Species'].values.reshape(-1, 1)
test_one_hot_species = encoder.transform(test_species)
test_image_arrays = img_to_array(df_test['image_path'], (384, 384))

# Build the inference pipeline with the same helper as training so that test
# images get identical preprocessing; `test_ds` was otherwise undefined.
# make_dataset requires targets, so zeros are passed as placeholders.
# model.predict ignores the label element. shuffle=False keeps predictions
# aligned with df_test's row order.
test_ds = make_dataset(
    test_image_arrays,
    test_one_hot_species,
    np.zeros(len(test_image_arrays), dtype=np.float32),
    batch_size=32,
    shuffle=False,
)

# The model predicts in log space: flatten (n, 1) -> (n,) and map back.
preds_log = model.predict(test_ds).flatten()

preds = inverse_transform(preds_log)

In [None]:
# Two-column submission: df_test's index as the id, plus the raw-scale prediction.
# NOTE(review): confirm the competition expects the frame index as `id`
# rather than an identifier column from test.csv.
submission = pd.DataFrame({"id": df_test.index}).assign(target=preds)
submission.to_csv("submission.csv", index=False)

print("Submission file saved as submission.csv")