In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error
import torch
import torch.nn as nn
from torchvision import models, transforms
from PIL import Image
import os
from tqdm import tqdm
from sklearn.model_selection import KFold

# Pick the compute device: use CUDA when PyTorch can see a GPU, else the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

In [None]:
# Load data.
# The pivot below relies on train.csv being in long format with the columns
# image_path, target_name and target.
df_train = pd.read_csv("/kaggle/input/csiro-biomass/train.csv")
df_test = pd.read_csv("/kaggle/input/csiro-biomass/test.csv")

# Per-image metadata columns that may or may not be present in each CSV.
# Defined once so the train and test handling cannot drift apart.
METADATA_COLS = ['Sampling_Date', 'State', 'Species',
                 'Pre_GSHH_NDVI', 'Height_Ave_cm']

# Pivot train data to wide format: one row per image, one column per target.
# pivot_table aggregates duplicates with its default aggfunc (mean).
df_train_wide = df_train.pivot_table(
    index='image_path',
    columns='target_name',
    values='target'
).reset_index()

# Merge back with other features (first non-null value per image and column).
available_feature_cols = [col for col in METADATA_COLS if col in df_train.columns]
df_train_features = df_train.groupby('image_path')[available_feature_cols].first().reset_index()
df_train_wide = df_train_wide.merge(df_train_features, on='image_path')

# Process test data: reduce it to one row per image as well.
available_test_features = [col for col in METADATA_COLS if col in df_test.columns]

if available_test_features:
    # Keep whatever metadata test.csv provides, one row per image.
    df_test_features = df_test.groupby('image_path')[available_test_features].first().reset_index()
    df_test_unique = df_test_features
else:
    # If no features in test.csv, keep only image_path
    df_test_unique = df_test[['image_path']].drop_duplicates().reset_index(drop=True)

In [None]:
# Load EfficientNet-B0 model (PyTorch version).
# The architecture is created without weights and the parameters are read from
# a local checkpoint file instead of being downloaded.
base_model = models.efficientnet_b0(weights=None)

# map_location='cpu' lets the checkpoint load on CPU-only sessions even if it
# was saved from a GPU; the model is moved to `device` further down.
state_dict = torch.load(
    "/kaggle/input/efficientnet-b0-imagenet-pytorch/pytorch/default/2/efficientnet_b0_imagenet_pytorch.pth",
    map_location='cpu',
    weights_only=True,
)
base_model.load_state_dict(state_dict)

# Remove the classification head to get features: keep every child module
# except the last one.
base_model = nn.Sequential(*list(base_model.children())[:-1])  # Remove classifier
base_model.eval()  # inference mode: fixes dropout / batch-norm behaviour
base_model.to(device)

# Define image preprocessing transforms: resize to 224x224, convert to a
# tensor, then normalise each channel with the given mean / std.
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

In [None]:
def extract_features(img_path, base_path='/kaggle/input/csiro-biomass/', feature_dim=1280):
    """Extract a flat CNN feature vector for one image.

    Args:
        img_path: Image path relative to ``base_path``.
        base_path: Directory that ``img_path`` is joined onto.
        feature_dim: Length of the all-zero vector returned when the image
            cannot be processed. Defaults to 1280, the EfficientNetB0
            feature size.

    Returns:
        A 1-D NumPy array holding the flattened output of ``base_model``, or
        ``np.zeros(feature_dim)`` if anything fails (the error is printed,
        not raised, so one bad image does not abort the whole extraction).
    """
    try:
        full_path = os.path.join(base_path, img_path)
        # The context manager closes the underlying file handle right away;
        # convert() returns an independent, fully loaded RGB copy.
        with Image.open(full_path) as img_file:
            img = img_file.convert('RGB')
        img_tensor = preprocess(img).unsqueeze(0).to(device)  # add batch dim

        with torch.no_grad():
            features = base_model(img_tensor)
            features = features.squeeze()  # Remove batch and spatial dimensions

        return features.cpu().numpy().flatten()
    except Exception as e:
        # Deliberate best-effort: report the failure and fall back to zeros.
        print(f"Error processing {img_path}: {e}")
        return np.zeros(feature_dim)

In [None]:
# CNN features for the TRAIN images, one row per entry of df_train_wide
print("\nExtracting CNN features from TRAIN images...")
train_cnn_features = np.array(
    [extract_features(path) for path in tqdm(df_train_wide['image_path'])]
)
print(f"Train CNN features shape: {train_cnn_features.shape}")

# CNN features for the TEST images, one row per entry of df_test_unique
print("\nExtracting CNN features from TEST images...")
test_cnn_features = np.array(
    [extract_features(path) for path in tqdm(df_test_unique['image_path'])]
)
print(f"Test CNN features shape: {test_cnn_features.shape}")

In [None]:
# Process tabular features.
# Categorical columns are label-encoded on TRAIN; numeric columns are used
# as-is on TRAIN. For TEST, real values are used when the column exists in
# df_test_unique; otherwise the original placeholders apply (-1 for
# categoricals, the train median for numerics).
tabular_features = []

le_state = LabelEncoder()
le_species = LabelEncoder()

for cat_col, encoder in (('State', le_state), ('Species', le_species)):
    encoded_col = f'{cat_col}_encoded'
    df_train_wide[encoded_col] = encoder.fit_transform(df_train_wide[cat_col])

    if cat_col in df_test_unique.columns:
        # Reuse the train encoding; labels never seen in train become -1.
        code_by_label = {label: code for code, label in enumerate(encoder.classes_)}
        df_test_unique[encoded_col] = (
            df_test_unique[cat_col].map(code_by_label).fillna(-1).astype(int)
        )
    else:
        # Column not available at test time: constant placeholder.
        df_test_unique[encoded_col] = -1

    tabular_features.append(encoded_col)


for num_col in ['Pre_GSHH_NDVI', 'Height_Ave_cm']:
    train_median = df_train_wide[num_col].median()

    if num_col in df_test_unique.columns:
        # Keep real test values; only gaps are filled with the train median.
        df_test_unique[num_col] = df_test_unique[num_col].fillna(train_median)
    else:
        # Column not available at test time: use the train median everywhere.
        df_test_unique[num_col] = train_median

    tabular_features.append(num_col)

print(f"Tabular features: {tabular_features}")

In [None]:
# Build the model inputs: tabular columns first, CNN features after them.
X_train_tabular = df_train_wide[tabular_features].values
X_test_tabular = df_test_unique[tabular_features].values

X_train_combined = np.concatenate((X_train_tabular, train_cnn_features), axis=1)
X_test_combined = np.concatenate((X_test_tabular, test_cnn_features), axis=1)

print(f"Train combined features shape: {X_train_combined.shape}")
print(f"Test combined features shape: {X_test_combined.shape}")

# Targets are every column of df_train_wide that is not a known
# identifier / metadata / engineered-feature column.
exclude_cols = [
    'image_path', 'Sampling_Date', 'State', 'Species',
    'Pre_GSHH_NDVI', 'Height_Ave_cm',
    'State_encoded', 'Species_encoded',
    'Year', 'Month', 'Day',
]
target_cols = []
for column in df_train_wide.columns:
    if column in exclude_cols:
        continue
    target_cols.append(column)
print(f"Target columns: {target_cols}")

In [None]:
print("\nTraining LightGBM models with K-Fold Cross Validation...")

# K-Fold setup
N_FOLDS = 5
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

# Store models and predictions.
# The model dict is named `lgb_models` (not `models`) so that it does not
# shadow the `torchvision.models` module imported at the top of the notebook;
# shadowing it would break any re-run of the EfficientNet loading cell.
lgb_models = {}  # target name -> list of fitted models, one per fold
oof_predictions = {}  # Out-of-fold predictions for validation
test_predictions_folds = {}  # Fold-averaged test predictions per target
metrics = {}

lgb_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'n_estimators': 500,
    'random_state': 42
}

# One independent set of N_FOLDS regressors is trained per target column.
for target_col in tqdm(target_cols, desc="Training models"):
    print(f"\nTraining {target_col}...")

    # Initialize storage for this target
    lgb_models[target_col] = []
    oof_preds = np.zeros(len(X_train_combined))
    test_preds = np.zeros(len(X_test_combined))

    # Get target values
    y_train = df_train_wide[target_col].values

    # K-Fold cross validation
    for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_combined), 1):
        X_tr, X_val = X_train_combined[train_idx], X_train_combined[val_idx]
        y_tr, y_val = y_train[train_idx], y_train[val_idx]

        # Train LightGBM model.
        # NOTE: the validation fold also drives early stopping, so the fold
        # and overall CV metrics reported below are optimistic estimates.
        model = lgb.LGBMRegressor(**lgb_params)
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_val, y_val)],
            callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
        )

        # Store model
        lgb_models[target_col].append(model)

        # Out-of-fold predictions
        oof_preds[val_idx] = model.predict(X_val)

        # Test predictions: each fold contributes 1/N_FOLDS, so the running
        # sum ends up as the mean over folds.
        test_preds += model.predict(X_test_combined) / N_FOLDS

        # Fold metrics
        fold_rmse = np.sqrt(mean_squared_error(y_val, oof_preds[val_idx]))
        fold_mae = mean_absolute_error(y_val, oof_preds[val_idx])
        print(f"  Fold {fold} - RMSE: {fold_rmse:.4f}, MAE: {fold_mae:.4f}")

    # Store predictions
    oof_predictions[target_col] = oof_preds
    test_predictions_folds[target_col] = test_preds

    # Calculate overall CV metrics over all out-of-fold predictions
    cv_rmse = np.sqrt(mean_squared_error(y_train, oof_preds))
    cv_mae = mean_absolute_error(y_train, oof_preds)
    metrics[target_col] = {'CV_RMSE': cv_rmse, 'CV_MAE': cv_mae}
    print(f"  Overall CV - RMSE: {cv_rmse:.4f}, MAE: {cv_mae:.4f}")

# Summary table: one row per target, best (lowest) RMSE first.
metrics_df = pd.DataFrame(metrics).T
metrics_df = metrics_df.sort_values('CV_RMSE')
print("\n" + "="*60)
print("Cross-Validation Results:")
print("="*60)
print(metrics_df.to_string())

print(f"\nAverage CV RMSE: {metrics_df['CV_RMSE'].mean():.4f}")
print(f"Average CV MAE: {metrics_df['CV_MAE'].mean():.4f}")

In [None]:
# Assemble the fold-averaged test predictions (computed during k-fold) into a
# wide frame: one row per test image, one column per target.
test_predictions = pd.DataFrame({'image_path': df_test_unique['image_path']}).assign(
    **{name: test_predictions_folds[name] for name in target_cols}
)

print("\nTest predictions generated!")
print(test_predictions.head())

# Reshape to long format: one row per (image, target) pair.
submission = test_predictions.melt(
    id_vars=['image_path'],
    value_vars=target_cols,
    var_name='target_name',
    value_name='target'
)

# sample_id = "<image id>__<target name>", where the image id is the
# "ID<digits>" file stem found after a '/' in image_path.
image_ids = submission['image_path'].str.extract(r'/(ID\d+)\.', expand=False)
submission['image_id'] = image_ids
submission['sample_id'] = image_ids + '__' + submission['target_name']

# Keep only the two submission columns, in the required order.
submission_final = submission.loc[:, ['sample_id', 'target']].copy()

# Biomass can't be negative, so floor the predictions at zero.
submission_final['target'] = submission_final['target'].clip(lower=0)


In [None]:
# Preview the final submission frame and report how many rows it holds.
print("\nSubmission format:")
print(submission_final.head(10))
print(f"\nTotal predictions: {len(submission_final)}")

# Write the submission CSV into the current working directory, without the
# DataFrame index.
output_path = 'submission.csv'
submission_final.to_csv(output_path, index=False)
# The check mark was stored as mis-decoded bytes ("âœ“"); restored to "✓".
print(f"\n✓ Submission saved to: {output_path}")