In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.efficientnet import preprocess_input
import os
from tqdm import tqdm

In [None]:
# Read the competition's train and test tables (train first, then test).
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/csiro-biomass/train.csv",
        "/kaggle/input/csiro-biomass/test.csv",
    )
)

In [None]:
# Reshape the labels so that each image is a single row and every distinct
# target_name becomes its own column holding the corresponding target value.
targets_by_image = df_train.pivot_table(
    values='target',
    columns='target_name',
    index='image_path',
).reset_index()

# Per-image metadata: restrict to the candidate columns train.csv actually
# contains, then keep one entry per image via GroupBy.first().
available_feature_cols = [
    col
    for col in ['Sampling_Date', 'State', 'Species', 'Pre_GSHH_NDVI', 'Height_Ave_cm']
    if col in df_train.columns
]
df_train_features = (
    df_train
    .groupby('image_path')[available_feature_cols]
    .first()
    .reset_index()
)

# Join the metadata onto the wide target table, keyed by image_path.
df_train_wide = targets_by_image.merge(df_train_features, on='image_path')

In [None]:
# Collapse test.csv to one row per image, carrying along whichever of the
# candidate metadata columns are present in the file.
# (The previous up-front drop_duplicates().copy() was always overwritten by one
# of the two branches below, so it has been removed.)
available_test_features = [
    col
    for col in ['Sampling_Date', 'State', 'Species', 'Pre_GSHH_NDVI', 'Height_Ave_cm']
    if col in df_test.columns
]

if available_test_features:
    # One row per image_path with the GroupBy.first() entry of each metadata column.
    df_test_features = (
        df_test.groupby('image_path')[available_test_features].first().reset_index()
    )
    df_test_unique = df_test_features
else:
    # If no features in test.csv, keep only image_path
    df_test_unique = df_test[['image_path']].drop_duplicates().reset_index(drop=True)

In [None]:
# EfficientNetB0 backbone without its classification head and with average
# pooling; constructed with no weights and then populated from a weights file
# on disk.
base_model = EfficientNetB0(weights=None, pooling='avg', include_top=False)
base_model.load_weights(
    '/kaggle/input/effnet-b0-keras/keras/default/1/model_weights/efficientnetb0_notop.h5'
)

def extract_features(img_path, base_path='/kaggle/input/csiro-biomass/'):
    """Compute a flat CNN feature vector for a single image.

    The image at ``base_path`` joined with ``img_path`` is loaded at 224x224,
    turned into a batch of one, run through ``preprocess_input`` and then
    through the module-level ``base_model``.

    Args:
        img_path: Image location relative to ``base_path``.
        base_path: Directory prefix that ``img_path`` is joined onto.

    Returns:
        The flattened model output as a 1-D array. If anything in the pipeline
        raises, the error is printed and a length-1280 vector of zeros is
        returned instead, so callers always receive an array.
    """
    try:
        pixels = image.img_to_array(
            image.load_img(os.path.join(base_path, img_path), target_size=(224, 224))
        )
        # Add a leading batch axis: the model consumes batches, not single images.
        batch = preprocess_input(pixels[np.newaxis, ...])
        return base_model.predict(batch, verbose=0).flatten()
    except Exception as e:
        # Best-effort fallback: report the failure and keep the pipeline going.
        print(f"Error processing {img_path}: {e}")
        return np.zeros(1280)  # EfficientNetB0 outputs 1280 features

# Extract CNN features for TRAIN: one feature vector per row of df_train_wide,
# stacked in the same row order.
print("\nExtracting CNN features from TRAIN images...")
train_cnn_features = np.array(
    [extract_features(path) for path in tqdm(df_train_wide['image_path'])]
)

In [None]:
# Sanity check: one row per image in df_train_wide; each row is either the
# flattened model output or the length-1280 zero fallback from extract_features.
print(f"Train CNN features shape: {train_cnn_features.shape}")

In [None]:
# Extract CNN features for TEST: one feature vector per row of df_test_unique,
# stacked in the same row order.
print("\nExtracting CNN features from TEST images...")
test_cnn_features = np.array(
    [extract_features(path) for path in tqdm(df_test_unique['image_path'])]
)

In [None]:
# Names of the engineered tabular columns, in the order they are later stacked
# in front of the CNN features.
# NOTE(review): whenever a column exists in train but not in test, every test
# row receives the same constant (-1 or a train median) for it -- confirm that
# training on such columns is intended.
tabular_features = []

# Categorical: State -> integer codes learned on train only.
if 'State' in df_train_wide.columns:
    le_state = LabelEncoder()
    df_train_wide['State_encoded'] = le_state.fit_transform(df_train_wide['State'])
    if 'State' not in df_test_unique.columns:
        df_test_unique['State_encoded'] = -1
    else:
        # Labels never seen during fitting are mapped to -1.
        df_test_unique['State_encoded'] = df_test_unique['State'].apply(
            lambda state: le_state.transform([state])[0] if state in le_state.classes_ else -1
        )
    tabular_features.append('State_encoded')

# Categorical: Species -> integer codes learned on train only.
if 'Species' in df_train_wide.columns:
    le_species = LabelEncoder()
    df_train_wide['Species_encoded'] = le_species.fit_transform(df_train_wide['Species'])
    if 'Species' not in df_test_unique.columns:
        df_test_unique['Species_encoded'] = -1
    else:
        # Labels never seen during fitting are mapped to -1.
        df_test_unique['Species_encoded'] = df_test_unique['Species'].apply(
            lambda species: le_species.transform([species])[0] if species in le_species.classes_ else -1
        )
    tabular_features.append('Species_encoded')

# Calendar parts derived from Sampling_Date.
if 'Sampling_Date' in df_train_wide.columns:
    # Output column name -> attribute on the pandas .dt accessor.
    date_parts = {'Year': 'year', 'Month': 'month', 'Day': 'day'}

    df_train_wide['Sampling_Date'] = pd.to_datetime(df_train_wide['Sampling_Date'])
    for part_col, dt_attr in date_parts.items():
        df_train_wide[part_col] = getattr(df_train_wide['Sampling_Date'].dt, dt_attr)

    if 'Sampling_Date' in df_test_unique.columns:
        df_test_unique['Sampling_Date'] = pd.to_datetime(df_test_unique['Sampling_Date'])
        for part_col, dt_attr in date_parts.items():
            df_test_unique[part_col] = getattr(df_test_unique['Sampling_Date'].dt, dt_attr)
    else:
        # No dates in test: fall back to the train median of each part.
        for part_col in date_parts:
            df_test_unique[part_col] = df_train_wide[part_col].median()

    tabular_features.extend(date_parts.keys())

# Numeric columns: impute gaps with the train median (also used for test).
for num_col in ['Pre_GSHH_NDVI', 'Height_Ave_cm']:
    if num_col not in df_train_wide.columns:
        continue

    train_median = df_train_wide[num_col].median()
    df_train_wide[num_col] = df_train_wide[num_col].fillna(train_median)

    df_test_unique[num_col] = (
        df_test_unique[num_col].fillna(train_median)
        if num_col in df_test_unique.columns
        else train_median
    )

    tabular_features.append(num_col)

print(f"Tabular features: {tabular_features}")

In [None]:
# Build the final design matrices. When tabular columns exist they are placed
# to the left of the CNN features; otherwise the CNN features are used alone.
if tabular_features:
    X_train_tabular = df_train_wide[tabular_features].values
    X_train_combined = np.concatenate((X_train_tabular, train_cnn_features), axis=1)

    X_test_tabular = df_test_unique[tabular_features].values
    X_test_combined = np.concatenate((X_test_tabular, test_cnn_features), axis=1)
else:
    X_train_combined = train_cnn_features
    X_test_combined = test_cnn_features

print(f"Train combined features shape: {X_train_combined.shape}")
print(f"Test combined features shape: {X_test_combined.shape}")

# Target columns are found by elimination: every column of df_train_wide that
# is not the image key, raw metadata, or an engineered feature.
exclude_cols = [
    'image_path',
    'Sampling_Date', 'State', 'Species', 'Pre_GSHH_NDVI', 'Height_Ave_cm',
    'State_encoded', 'Species_encoded',
    'Year', 'Month', 'Day',
]
target_cols = [name for name in df_train_wide.columns if name not in exclude_cols]
print(f"Target columns: {target_cols}")

In [None]:
print("\nTraining LightGBM models...")

# Hold out 20% of the rows for validation (fixed seed for repeatability).
X_tr, X_val, y_tr_df, y_val_df = train_test_split(
    X_train_combined,
    df_train_wide[target_cols],
    random_state=42,
    test_size=0.2,
)

# Keyed by target column: fitted model, validation predictions, validation scores.
models, predictions, metrics = {}, {}, {}

lgb_params = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=-1,
    n_estimators=500,
    random_state=42,
)

# One independent regressor per target column.
# NOTE(review): the same validation split drives early stopping and the
# reported scores, so the scores may be optimistic -- confirm this is acceptable.
for target_col in tqdm(target_cols, desc="Training models"):
    y_tr, y_val = y_tr_df[target_col].values, y_val_df[target_col].values

    model = lgb.LGBMRegressor(**lgb_params)
    model.fit(
        X_tr,
        y_tr,
        eval_set=[(X_val, y_val)],
        callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)],
    )
    models[target_col] = model

    # Validation predictions, kept for later inspection.
    y_pred = model.predict(X_val)
    predictions[target_col] = y_pred

    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mae = mean_absolute_error(y_val, y_pred)
    metrics[target_col] = dict(RMSE=rmse, MAE=mae)


In [None]:
# Validation scores as a table: one row per target, best (lowest) RMSE first.
metrics_df = pd.DataFrame(metrics).T.sort_values('RMSE')
print(metrics_df.to_string())

# Unweighted mean of each score across targets.
print(f"\nAverage RMSE: {metrics_df['RMSE'].mean():.4f}")
print(f"Average MAE: {metrics_df['MAE'].mean():.4f}")

In [None]:
# Wide prediction table: the image key plus one predicted column per target,
# row-aligned with df_test_unique / X_test_combined.
test_predictions = pd.DataFrame({'image_path': df_test_unique['image_path']})

for target_col in tqdm(target_cols, desc="Predicting"):
    test_predictions[target_col] = models[target_col].predict(X_test_combined)

print("\nTest predictions generated!")
print(test_predictions.head())


In [None]:
# Back to long format: one row per (image, target) pair.
submission = test_predictions.melt(
    value_name='target',
    var_name='target_name',
    value_vars=target_cols,
    id_vars=['image_path'],
)

# sample_id is "<image id>__<target name>", where the image id is the "ID<digits>"
# file stem that follows a "/" in image_path.
# NOTE(review): a path that does not match the pattern yields a missing
# image_id and therefore a missing sample_id -- verify against the real paths.
image_ids = submission['image_path'].str.extract(r'/(ID\d+)\.')[0]
submission['image_id'] = image_ids
submission['sample_id'] = submission['image_id'] + '__' + submission['target_name']

# Final two-column frame, with predictions floored at zero
# (biomass can't be negative).
submission_final = submission[['sample_id', 'target']].assign(
    target=lambda frame: frame['target'].clip(lower=0)
)

print("\nSubmission format:")
print(submission_final.head(10))
print(f"\nTotal predictions: {len(submission_final)}")

In [None]:
# Write the final submission frame to a CSV in the current working directory,
# without the DataFrame index.
output_path = 'submission.csv'
submission_final.to_csv(output_path, index=False)
# The check mark was previously stored as mojibake ("âœ“": UTF-8 bytes of "✓"
# decoded as cp1252); restored to the intended character.
print(f"\n✓ Submission saved to: {output_path}")