In [None]:
import warnings

# Silence the two "Field() attribute has no effect" warnings that Pydantic's
# schema generation emits. Each (attribute, value) pair below yields one
# message pattern; registration order is 'repr' first, then 'frozen'.
for _field_attr, _field_value in (("repr", False), ("frozen", True)):
    warnings.filterwarnings(
        action="ignore",
        message=(
            f"The '{_field_attr}' attribute with value {_field_value} was provided "
            "to the `Field\\(\\)` function, which has no effect in the context it was used.*"
        ),
        # The concrete class is likely UserWarning or
        # UnsupportedFieldAttributeWarning; UserWarning is the safer one to catch.
        category=UserWarning,
        module="pydantic._internal._generate_schema",
    )

In [None]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import timm
import cv2
from tqdm.auto import tqdm
import gc
from sklearn.preprocessing import StandardScaler
import warnings
# Blanket-ignore every warning for cleaner notebook output, then register the
# two Pydantic-specific filters on top of it.
warnings.filterwarnings(action="ignore")

# Shared regex template for Pydantic's "Field() attribute has no effect" message.
_PYDANTIC_FIELD_WARNING = (
    "The '{attr}' attribute with value {value} was provided to the "
    "`Field\\(\\)` function, which has no effect in the context it was used.*"
)
_PYDANTIC_SCHEMA_MODULE = "pydantic._internal._generate_schema"

warnings.filterwarnings(
    "ignore",
    message=_PYDANTIC_FIELD_WARNING.format(attr="repr", value=False),
    category=UserWarning,
    module=_PYDANTIC_SCHEMA_MODULE,
)
warnings.filterwarnings(
    "ignore",
    message=_PYDANTIC_FIELD_WARNING.format(attr="frozen", value=True),
    category=UserWarning,
    module=_PYDANTIC_SCHEMA_MODULE,
)

# --- CONFIGURATION (UPDATED for Kaggle Paths) ---
class CONFIG:
    """Static settings for the inference pipeline: paths, model, and constants."""

    # --- Competition data (CSV files and images) ---
    BASE_PATH = "/kaggle/input/csiro-biomass/"
    TEST_CSV = os.path.join(BASE_PATH, "test.csv")
    TEST_IMAGE_DIR = os.path.join(BASE_PATH, "test")

    # --- Trained checkpoints ---
    # Directory searched for the per-fold weight files.
    # NOTE(review): the dataset name says "forgery", not "biomass" — confirm
    # this is the intended checkpoint dataset.
    MODEL_CHECKPOINT_DIR = "/kaggle/input/forgery-models/"

    # --- Model ---
    # Backbone architecture name; it must match the one used for training.
    MODEL_NAME = "efficientnet_b0"
    IMG_SIZE = 512
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # --- Inference / ensemble ---
    BATCH_SIZE = 8
    N_FOLDS = 5

    # --- Target range and numerical-stability constants ---
    MIN_BIOMASS = 1.0
    MAX_BIOMASS = 1500.0
    EPS = 1e-6
    DROPOUT_RATE = 0.5

    # --- Metadata feature columns (their count sets the model's metadata width) ---
    METADATA_COLS = ["Pre_GSHH_NDVI", "Height_Ave_cm"]


# --- Simple Transforms (Keep the data preparation consistent) ---
def simple_transform(image, img_size):
    """Resize an image to a square and return it as a normalized CHW tensor.

    Args:
        image: H x W x 3 image array accepted by ``cv2.resize``. Values are
            divided by 255, so 8-bit pixel data is presumably expected
            (confirm with callers).
        img_size: Side length in pixels of the square output.

    Returns:
        A ``torch.float`` tensor of shape (3, img_size, img_size) whose values
        are ``(pixel / 255 - 0.5) / 0.5``.
    """
    resized = cv2.resize(image, (img_size, img_size))
    scaled = resized.astype(np.float32) / 255.0

    # Fixed per-channel statistics, kept in float32 so the result stays float32.
    channel_mean = np.full(3, 0.5, dtype=np.float32)
    channel_std = np.full(3, 0.5, dtype=np.float32)
    normalized = (scaled - channel_mean) / channel_std

    # Channels-last (HWC) -> channels-first (CHW).
    chw = np.transpose(normalized, (2, 0, 1))
    return torch.tensor(chw, dtype=torch.float)

def get_tta_transforms():
    """Build the test-time-augmentation views applied to each image.

    Returns:
        A two-element list of callables ``[original_view, hflip_view]``. Each
        takes an image array and returns the output of ``simple_transform`` at
        ``CONFIG.IMG_SIZE``; the second one flips the image with
        ``cv2.flip(img, 1)`` first.
    """
    def original_view(img):
        # Unaugmented view.
        return simple_transform(img, CONFIG.IMG_SIZE)

    def hflip_view(img):
        # Flip code 1 = horizontal (left-right) mirror.
        mirrored = cv2.flip(img, 1)
        return simple_transform(mirrored, CONFIG.IMG_SIZE)

    return [original_view, hflip_view]


# ===============================================================
# 1. DATA PREPARATION (Includes Dummy Metadata for Test Set)
# ===============================================================
def load_and_prep_test_data():
    """Load the long-format test CSV and build a one-row-per-image frame.

    Reads ``CONFIG.TEST_CSV``, adds an ``image_id`` column (the part of
    ``sample_id`` before the first ``'__'``), and collapses the table to one
    row per distinct ``image_path``. Each image row gets a ``meta_scaled``
    column holding an all-zero metadata vector, because the test set does not
    provide the real metadata columns.

    The zero vector has one entry per name in ``CONFIG.METADATA_COLS`` so that
    its width always matches the ``n_meta_features`` the model is built with.

    Returns:
        tuple: ``(df_long, df_unique)`` — the full long-format frame and the
        de-duplicated per-image frame (index reset to 0..n-1).
    """
    df_long = pd.read_csv(CONFIG.TEST_CSV)
    df_long['image_id'] = df_long['sample_id'].apply(lambda x: x.split('__')[0])

    # Drop the per-target columns so that the remaining ones describe the image.
    per_target_cols = {'target_name', 'target', 'sample_id'}
    image_level_cols = [col for col in df_long.columns if col not in per_target_cols]
    df_unique = (
        df_long[image_level_cols]
        .drop_duplicates(subset=['image_path'])
        .reset_index(drop=True)
    )

    # Width is derived from the configured metadata columns rather than
    # hard-coded, so a change to METADATA_COLS cannot desynchronize the model
    # input size from the data.
    n_meta_features = len(CONFIG.METADATA_COLS)
    df_unique['meta_scaled'] = [[0.0] * n_meta_features for _ in range(len(df_unique))]

    print(f"Test images prepared: {len(df_unique)}. Using zeroed metadata for inference.")

    return df_long, df_unique


# ===============================================================
# 2. DATASET CLASS
# ===============================================================
class BiomassDataset(Dataset):
    """Dataset yielding the left and right halves of an image plus its metadata."""

    def __init__(self, df, image_dir, transform=None):
        """Store the frame and cache the columns read in ``__getitem__``.

        Args:
            df: Frame with an ``image_path`` column and a ``meta_scaled``
                column (one metadata vector per row).
            image_dir: Directory in which the image files are looked up.
            transform: Optional callable applied to each image half.
        """
        self.df = df
        self.image_dir = image_dir
        self.transform = transform
        self.image_paths = df['image_path'].values
        self.metadata = df['meta_scaled'].values

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(left_half, right_half, metadata_tensor)`` for row ``idx``."""
        # Only the file name of the stored path is used; it is resolved
        # against image_dir.
        file_name = os.path.basename(self.image_paths[idx])
        image = cv2.imread(os.path.join(self.image_dir, file_name))
        if image is None:
            # Unreadable or missing file: substitute a solid-color placeholder.
            image = np.full((1000, 2000, 3), [100, 150, 100], dtype=np.uint8)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Split down the vertical centre line; an odd width gives the extra
        # column to the right half.
        split_at = image.shape[1] // 2
        left_half, right_half = image[:, :split_at], image[:, split_at:]

        if self.transform:
            left_half = self.transform(left_half)
            right_half = self.transform(right_half)

        meta_vector = torch.tensor(self.metadata[idx], dtype=torch.float)
        return left_half, right_half, meta_vector


# ===============================================================
# 3. MODEL ARCHITECTURE
# ===============================================================
class BiomassModel(nn.Module):
    """Shared-backbone two-view model with three single-output regression heads."""

    def __init__(self, model_name, n_meta_features, pretrained=True):
        """Create the backbone and the ``head_total``/``head_gdm``/``head_green`` heads.

        Args:
            model_name: Architecture name passed to ``timm.create_model``.
            n_meta_features: Length of the metadata vector concatenated to the
                image features.
            pretrained: Forwarded to ``timm.create_model``.
        """
        super().__init__()

        self.backbone = timm.create_model(
            model_name,
            pretrained=pretrained,
            num_classes=0,
            global_pool='avg',
        )

        # Both image halves go through the same backbone, hence the factor 2.
        self.n_features = self.backbone.num_features
        self.n_combined_features = 2 * self.n_features + n_meta_features

        def build_head():
            return nn.Sequential(
                nn.Linear(self.n_combined_features, 256),
                nn.ReLU(),
                nn.Dropout(CONFIG.DROPOUT_RATE),
                nn.Linear(256, 1),
            )

        # Attribute names (and creation order) define the state-dict keys.
        self.head_total = build_head()
        self.head_gdm = build_head()
        self.head_green = build_head()

    def forward(self, img_left, img_right, metadata):
        """Return ``(out_total, out_gdm, out_green)`` for a batch.

        The two image batches are encoded by the shared backbone, concatenated
        with ``metadata`` along dim 1, and fed to each head.
        """
        left_feats = self.backbone(img_left)
        right_feats = self.backbone(img_right)
        fused = torch.cat((left_feats, right_feats, metadata), dim=1)
        return self.head_total(fused), self.head_gdm(fused), self.head_green(fused)


# ===============================================================
# 4. BIOLOGICAL CONSTRAINT ENFORCEMENT (FIXED)
# ===============================================================
def enforce_biological_constraints(total, gdm, green, *, min_biomass=None,
                                   max_biomass=None, eps=None):
    """Turn log-space predictions into five mutually consistent biomass values.

    The three inputs are mapped back with ``exp(x) - eps`` and floored at
    ``min_biomass``. Clover and dead mass are then derived as differences,
    each floored at ``min_biomass``, and the parent quantities are rebuilt
    from their parts so that

        gdm   == green + clover
        total == gdm + dead

    hold before the final range clip. (The upper clip at ``max_biomass`` can
    still break these identities for values above that bound.)

    Args:
        total: Predicted total dry mass in log space (scalar or array).
        gdm: Predicted GDM in log space, same shape as ``total``.
        green: Predicted green dry mass in log space, same shape as ``total``.
        min_biomass: Lower bound for every output; defaults to
            ``CONFIG.MIN_BIOMASS``.
        max_biomass: Upper bound for every output; defaults to
            ``CONFIG.MAX_BIOMASS``.
        eps: Offset subtracted after ``exp``; defaults to ``CONFIG.EPS``.

    Returns:
        tuple: ``(total, gdm, green, clover, dead)``, each clipped to
        ``[min_biomass, max_biomass]``.
    """
    if min_biomass is None:
        min_biomass = CONFIG.MIN_BIOMASS
    if max_biomass is None:
        max_biomass = CONFIG.MAX_BIOMASS
    if eps is None:
        eps = CONFIG.EPS

    # 1. Undo the log transform and apply the lower bound.
    total = np.maximum(np.exp(total) - eps, min_biomass)
    gdm = np.maximum(np.exp(gdm) - eps, min_biomass)
    green = np.maximum(np.exp(green) - eps, min_biomass)

    # 2. Hierarchy: GDM >= Green.
    gdm = np.maximum(gdm, green)

    # 3. Derive clover (floored) and rebuild GDM from its parts.
    clover = np.maximum(gdm - green, min_biomass)
    gdm = green + clover

    # 4. Hierarchy: Total >= GDM.
    total = np.maximum(total, gdm)

    # 5. Derive dead mass (floored) and rebuild Total from its parts. Without
    #    this step, flooring dead mass leaves total < gdm + dead whenever
    #    total - gdm is below the floor.
    dead = np.maximum(total - gdm, min_biomass)
    total = gdm + dead

    # 6. Final range clip.
    final_green = np.clip(green, min_biomass, max_biomass)
    final_clover = np.clip(clover, min_biomass, max_biomass)
    final_dead = np.clip(dead, min_biomass, max_biomass)
    final_gdm = np.clip(gdm, min_biomass, max_biomass)
    final_total = np.clip(total, min_biomass, max_biomass)

    return final_total, final_gdm, final_green, final_clover, final_dead


# ===============================================================
# 5. INFERENCE LOGIC
# ===============================================================
def predict_single_view(models_list, loader):
    """Predict one TTA view with every fold model and take the per-sample median.

    Args:
        models_list: Models called as ``model(img_left, img_right, metadata)``
            and returning ``(total, gdm, green)`` tensors.
        loader: Iterable of ``(img_left, img_right, metadata)`` batches.

    Returns:
        dict: keys ``'total'``, ``'gdm'``, ``'green'``, each a flat NumPy array
        with one fold-median prediction per sample, in loader order.
    """
    # Order matches the tuple returned by the model's forward().
    target_keys = ('total', 'gdm', 'green')
    view_preds = {key: [] for key in target_keys}

    with torch.no_grad():
        for img_left, img_right, metadata in loader:
            img_left = img_left.to(CONFIG.DEVICE)
            img_right = img_right.to(CONFIG.DEVICE)
            metadata = metadata.to(CONFIG.DEVICE)

            fold_preds = {key: [] for key in target_keys}
            for model in models_list:
                outputs = model(img_left, img_right, metadata)
                for key, out in zip(target_keys, outputs):
                    # quantile() needs a floating dtype; float() is a no-op
                    # for float32 outputs.
                    fold_preds[key].append(out.float().cpu())

            for key in target_keys:
                stacked = torch.stack(fold_preds[key])
                # torch.median returns the LOWER of the two middle values when
                # the number of models is even (possible when some folds fail
                # to load). quantile(0.5) interpolates between them, which
                # matches np.median and is identical for odd counts.
                batch_median = torch.quantile(stacked, 0.5, dim=0)
                view_preds[key].append(batch_median.numpy())

    return {key: np.concatenate(view_preds[key]).flatten() for key in target_keys}


def run_full_inference():
    """Load the test data and fold checkpoints, then predict with TTA.

    Checkpoints are read from
    ``CONFIG.MODEL_CHECKPOINT_DIR/best_model_fold{k}.pth`` for
    ``k in range(CONFIG.N_FOLDS)``. A fold whose file is missing or fails to
    load is reported and skipped, so the ensemble may contain fewer than
    ``N_FOLDS`` models.

    Returns:
        tuple: ``(final_preds, df_long, df_unique)``. ``final_preds`` is a
        dict with keys ``'total'``, ``'gdm'``, ``'green'`` (one value per
        unique image: the median over TTA views of the per-view fold
        ensemble), or ``None`` when no model could be loaded.
    """
    print("üîÆ STEP 1: Preparing Data and Loading Models for Kaggle Inference...")

    df_long, df_unique = load_and_prep_test_data()

    models_list = []
    n_meta_features = len(CONFIG.METADATA_COLS)

    for fold in range(CONFIG.N_FOLDS):
        model_path = os.path.join(CONFIG.MODEL_CHECKPOINT_DIR, f'best_model_fold{fold}.pth')

        if not os.path.exists(model_path):
            # Report the gap explicitly so that a partial ensemble is visible.
            print(f"WARNING: checkpoint for fold {fold} not found at {model_path}; skipping this fold.")
            continue

        try:
            # pretrained=False: avoid relying on a weight download at
            # inference time; the checkpoint supplies all weights.
            model = BiomassModel(CONFIG.MODEL_NAME, n_meta_features, pretrained=False)
            # NOTE(review): torch.load unpickles the file — only point
            # MODEL_CHECKPOINT_DIR at trusted checkpoints.
            model.load_state_dict(torch.load(model_path, map_location=CONFIG.DEVICE))
            model.eval()
            model.to(CONFIG.DEVICE)
        except Exception as e:
            # Deliberate best-effort: one bad checkpoint must not abort the run.
            print(f"‚ùå Failed to load model fold {fold}: {e}")
        else:
            models_list.append(model)
            print(f"‚úÖ Loaded model fold {fold} from {model_path}")

    if not models_list:
        print("‚ùå ERROR: No trained models found. Cannot run prediction. Check MODEL_CHECKPOINT_DIR.")
        return None, df_long, df_unique

    print("\nüîÆ STEP 2: Running Inference with TTA...")
    all_predictions = []

    for idx, transform in enumerate(get_tta_transforms()):
        dataset = BiomassDataset(df_unique, CONFIG.TEST_IMAGE_DIR, transform)
        # shuffle=False keeps predictions aligned with the rows of df_unique.
        loader = DataLoader(dataset, batch_size=CONFIG.BATCH_SIZE, shuffle=False, num_workers=0)

        all_predictions.append(predict_single_view(models_list, loader))
        print(f"‚úÖ TTA view {idx} completed.")

    # Median ensemble across TTA views.
    final_preds = {
        key: np.median([view[key] for view in all_predictions], axis=0)
        for key in ('total', 'gdm', 'green')
    }

    return final_preds, df_long, df_unique


# ===============================================================
# 6. SUBMISSION CREATION AND EXECUTION
# ===============================================================
def create_submission(preds_np, test_df_long, test_df_unique):
    """Write ``submission.csv`` from per-image predictions.

    Args:
        preds_np: Dict with keys ``'total'``, ``'gdm'``, ``'green'`` holding
            one log-space prediction per row of ``test_df_unique``, or
            ``None`` to use constant baseline values instead.
        test_df_long: Long-format test frame with ``sample_id``,
            ``image_path`` and ``target_name`` columns.
        test_df_unique: One-row-per-image frame with an ``image_path`` column.

    Returns:
        pandas.DataFrame: The written frame with columns ``sample_id`` and
        ``target``.
    """
    print("\nüìÑ STEP 3: Creating submission file...")

    if preds_np is None:
        print("Using fallback baseline predictions.")
        n_images = len(test_df_unique)
        # Baseline constants are in the same log space as model outputs.
        preds_np = {'total': np.full(n_images, 5.6), 'gdm': np.full(n_images, 5.2), 'green': np.full(n_images, 4.7)}

    total, gdm, green, clover, dead = enforce_biological_constraints(
        preds_np['total'], preds_np['gdm'], preds_np['green']
    )

    target_columns = ['Dry_Green_g', 'Dry_Dead_g', 'Dry_Clover_g', 'GDM_g', 'Dry_Total_g']
    preds_wide_df = pd.DataFrame({
        'image_path': test_df_unique['image_path'],
        'Dry_Green_g': green, 'Dry_Dead_g': dead, 'Dry_Clover_g': clover, 'GDM_g': gdm, 'Dry_Total_g': total
    })

    # Wide (one row per image) -> long (one row per image/target pair).
    preds_long_df = preds_wide_df.melt(
        id_vars=['image_path'],
        value_vars=target_columns,
        var_name='target_name',
        value_name='target'
    )

    # Left join keeps every requested sample_id, in the order of the test CSV.
    merged_df = pd.merge(
        test_df_long[['sample_id', 'image_path', 'target_name']],
        preds_long_df,
        on=['image_path', 'target_name'],
        how='left'
    )

    # A left join leaves NaN where no prediction matched; surface that instead
    # of writing it out silently.
    n_missing = int(merged_df['target'].isna().sum())
    if n_missing:
        print(f"WARNING: {n_missing} submission rows have no matching prediction (NaN target).")

    # Take an explicit copy before assigning a column, so that the write never
    # targets a slice of merged_df.
    submission_df = merged_df[['sample_id', 'target']].copy()
    submission_df['target'] = submission_df['target'].clip(lower=CONFIG.MIN_BIOMASS)
    submission_df.to_csv('submission.csv', index=False)

    print(f"\nüéâ SUCCESS! Submission created: submission.csv")
    print("üìã First 5 consistent predictions:")
    print(submission_df.head(5))

    return submission_df


# --- MAIN EXECUTION ---
if __name__ == "__main__":
    # run_full_inference() returns None for the predictions when no checkpoint
    # could be loaded. create_submission() handles that case itself by
    # substituting baseline predictions, so it is called unconditionally;
    # guarding the call on `all_preds is not None` would make that fallback
    # unreachable and leave the run without a submission.csv.
    all_preds, df_long, df_unique = run_full_inference()
    create_submission(all_preds, df_long, df_unique)

In [None]:
import pandas as pd
import numpy as np
import os

# --- A. CONFIGURATION ---
# Competition data root and the paths derived from it.
DATA_PATH = "/kaggle/input/csiro-biomass"
TRAIN_CSV, TEST_CSV = (
    os.path.join(DATA_PATH, csv_name) for csv_name in ('train.csv', 'test.csv')
)
TRAIN_IMG_DIR = DATA_PATH
IMG_SIZE = (128, 128)
EPS = 1e-6

# The three targets the model predicts directly.
PREDICTED_TARGETS = ['Dry_Total_g', 'GDM_g', 'Dry_Green_g']

# All five targets that appear in the final submission.
TARGET_NAMES = ['Dry_Green_g', 'Dry_Dead_g', 'Dry_Clover_g', 'GDM_g', 'Dry_Total_g']

# Column names. The *_VAR aliases carry the same values as the names used by
# the verification code below.
IMAGE_PATH_COL = 'image_path'
TARGET_COL = TARGET_COL_VAR = 'target'
SUBMISSION_ID_COL = SUBMISSION_ID_COL_VAR = 'sample_id'

# File checked by the verification step.
SUBMISSION_FILE = 'submission.csv'


# --- FILE VERIFICATION ---
# Reads the written submission, checks its column layout, prints a preview,
# and tests the additive constraints on the first five rows.

print("\n--- Final Submission File Verification and Content Analysis ---")

if os.path.exists(SUBMISSION_FILE):
    df_submission = pd.read_csv(SUBMISSION_FILE)

    # 1. Column names and order
    expected_cols = [SUBMISSION_ID_COL, TARGET_COL]
    if df_submission.columns.tolist() == expected_cols:
        print("‚úÖ PASS: Submission file has the correct columns and order.")
    else:
        print(f"‚ùå FAIL: Expected columns {expected_cols}, found {df_submission.columns.tolist()}.")

    # 2. Shape and preview
    print("-" * 50)
    print(f"Shape: {df_submission.shape}")

    print("\nSubmission Head (First 10 rows, showing constrained predictions):")
    print(df_submission.head(10).to_markdown(index=False))

    # 3. Constraint check on the first five rows, which are assumed to be the
    #    five targets of a single image.
    if len(df_submission) >= 5:
        df_check = df_submission.head(5).sort_values(by=SUBMISSION_ID_COL)

        # Pick each component by the target-name fragment inside sample_id.
        T, M, G, D, C = (
            df_check.loc[df_check[SUBMISSION_ID_COL].str.contains(fragment), 'target'].iloc[0]
            for fragment in ('Total_g', 'GDM_g', 'Green_g', 'Dead_g', 'Clover_g')
        )

        # Expected identities: T = M + D and M = G + C.
        total_derived_check = M + D
        gdm_derived_check = G + C

        print("\n--- Biological Constraint Check (First Sample) ---")
        print(f"Dry_Total_g (T): {T:.4f} | GDM_g (M): {M:.4f} | Dry_Green_g (G): {G:.4f}")

        if not np.isclose(T, total_derived_check, atol=EPS * 10):
            print(f"‚ùå FAIL: Dry_Total_g ({T:.4f}) should equal GDM + Dry_Dead ({total_derived_check:.4f})")
        else:
            print(f"‚úÖ PASS: Dry_Total_g (T={T:.4f}) matches GDM + Dry_Dead ({total_derived_check:.4f})")

        if not np.isclose(M, gdm_derived_check, atol=EPS * 10):
            print(f"‚ùå FAIL: GDM_g ({M:.4f}) should equal Dry_Green + Dry_Clover ({gdm_derived_check:.4f})")
        else:
            print(f"‚úÖ PASS: GDM_g (M={M:.4f}) matches Dry_Green + Dry_Clover ({gdm_derived_check:.4f})")

    print("-" * 50)
else:
    print(f"FATAL ERROR: Submission file '{SUBMISSION_FILE}' not found.")