In [None]:
!nvidia-smi

In [None]:
import warnings

# Suppress the 'repr' warning from Pydantic
warnings.filterwarnings(
    "ignore",
    message="The 'repr' attribute with value False was provided to the `Field\\(\\)` function, which has no effect in the context it was used.*",
    category=UserWarning, # It's likely a UserWarning or UnsupportedFieldAttributeWarning, but UserWarning is safer to catch
    module="pydantic._internal._generate_schema"
)

# Suppress the 'frozen' warning from Pydantic
warnings.filterwarnings(
    "ignore",
    message="The 'frozen' attribute with value True was provided to the `Field\\(\\)` function, which has no effect in the context it was used.*",
    category=UserWarning,
    module="pydantic._internal._generate_schema"
)

In [None]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import timm
import cv2
from tqdm.auto import tqdm
import gc
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
import warnings
# Blanket filter: ignore every warning for the rest of the session.
warnings.filterwarnings('ignore')

# NOTE(review): `warnings` is already imported a few lines above; this second
# import is a harmless no-op left over from copy-pasting the previous cell.
import warnings

# NOTE(review): the two targeted filters below repeat the ones registered in the
# previous cell. With the blanket 'ignore' above in place they do not change
# which warnings are shown; they only matter if the blanket filter is removed.

# Suppress the 'repr' warning from Pydantic
warnings.filterwarnings(
    "ignore",
    message="The 'repr' attribute with value False was provided to the `Field\\(\\)` function, which has no effect in the context it was used.*",
    category=UserWarning, # It's likely a UserWarning or UnsupportedFieldAttributeWarning, but UserWarning is safer to catch
    module="pydantic._internal._generate_schema"
)

# Suppress the 'frozen' warning from Pydantic
warnings.filterwarnings(
    "ignore",
    message="The 'frozen' attribute with value True was provided to the `Field\\(\\)` function, which has no effect in the context it was used.*",
    category=UserWarning,
    module="pydantic._internal._generate_schema"
)

# Root directory of the competition data; every CONFIG path is derived from it.
BASE_PATH='/kaggle/input/csiro-biomass'


# --- CONFIGURATION ---
class CONFIG:
    """Namespace of constants shared by data preparation, training and inference.

    The class is never instantiated; attributes are read as ``CONFIG.NAME``.
    """

    # Paths (Set BASE_PATH to your actual data directory if not running in a standard Kaggle notebook)
    TRAIN_CSV = os.path.join(BASE_PATH, 'train.csv')
    TEST_CSV = os.path.join(BASE_PATH, 'test.csv')
    TRAIN_IMAGE_DIR = os.path.join(BASE_PATH, 'train')
    TEST_IMAGE_DIR = os.path.join(BASE_PATH, 'test')

    # Model settings
    MODEL_NAME = 'efficientnet_b0'  # timm model identifier passed to timm.create_model
    IMG_SIZE = 512                  # each image half is resized to IMG_SIZE x IMG_SIZE
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Training Hyperparameters (As in your successful training run)
    BATCH_SIZE = 8
    # NOTE(review): NUM_WORKERS is not read by the code visible in this notebook;
    # train_model() builds its DataLoaders with num_workers=0.
    NUM_WORKERS = 2
    N_FOLDS = 5
    EPOCHS = 30  # author's note: 30 epochs gave Kaggle score 0.39
    LR = 5e-5
    WEIGHT_DECAY = 1e-6
    GRAD_CLIP = 5.0  # max gradient norm passed to clip_grad_norm_
    # NOTE(review): SCHEDULER is informational only; train_model() constructs
    # CosineAnnealingLR directly and never reads this value.
    SCHEDULER = 'CosineAnnealingLR'


    # Target and Numerical Stability Constants
    MIN_BIOMASS = 1.0     # lower clip applied to targets before log() and to predictions
    MAX_BIOMASS = 1500.0  # upper clip applied to final predictions
    EPS = 1e-6            # added inside log() and subtracted again after exp()
    DROPOUT_RATE = 0.5    # dropout used inside each regression head

    # Metadata Features
    # Columns standardised into `meta_scaled` and concatenated with image features.
    METADATA_COLS = ['Pre_GSHH_NDVI', 'Height_Ave_cm']


# --- Simple Transforms ---
def simple_transform(image, img_size):
    """Resize an image to a square and convert it to a normalised CHW tensor.

    Args:
        image: Array accepted by ``cv2.resize``; the later ``transpose(2, 0, 1)``
            requires three axes (presumably H x W x C uint8 - confirm with caller).
        img_size: Side length, in pixels, of the square output.

    Returns:
        ``torch.float`` tensor with the channel axis first. Values are divided
        by 255, then shifted and scaled with mean 0.5 and std 0.5 per channel.
    """
    resized = cv2.resize(image, (img_size, img_size))
    unit_scaled = resized.astype(np.float32) / 255.0

    # Same statistics for all three channels: (x - 0.5) / 0.5.
    channel_mean = np.array([0.5, 0.5, 0.5], dtype=np.float32)
    channel_std = np.array([0.5, 0.5, 0.5], dtype=np.float32)
    normalised = (unit_scaled - channel_mean) / channel_std

    # Channel-last -> channel-first layout for the tensor.
    channels_first = normalised.transpose(2, 0, 1)
    return torch.tensor(channels_first, dtype=torch.float)

def get_train_transforms():
    """Return the training transform: resize + normalise via ``simple_transform``."""
    def _train_transform(img):
        # CONFIG.IMG_SIZE is looked up on each call, exactly like the lambda form.
        return simple_transform(img, CONFIG.IMG_SIZE)

    return _train_transform

def get_valid_transforms():
    """Return the validation transform: resize + normalise via ``simple_transform``."""
    def _valid_transform(img):
        # CONFIG.IMG_SIZE is looked up on each call, exactly like the lambda form.
        return simple_transform(img, CONFIG.IMG_SIZE)

    return _valid_transform

def get_tta_transforms():
    """Return the list of test-time-augmentation views (currently a single plain view)."""
    def _identity_view(img):
        # CONFIG.IMG_SIZE is looked up on each call, exactly like the lambda form.
        return simple_transform(img, CONFIG.IMG_SIZE)

    views = [_identity_view]
    return views


# ===============================================================
# 1. DATA PREPARATION (FINAL FIX APPLIED)
# ===============================================================
def prepare_data(data_path, is_train=True, scaler=None):
    """Load a long-format CSV and return one row per image plus the metadata scaler.

    Args:
        data_path: Path of the CSV to read. It must contain ``sample_id`` and
            ``image_path``; training additionally needs ``target_name``,
            ``target``, ``Sampling_Date``, ``State``, ``Species`` and the
            columns listed in ``CONFIG.METADATA_COLS``.
        is_train: When True, pivot targets to wide format, log-transform them
            and fit a fresh ``StandardScaler`` on the metadata columns.
        scaler: Fitted scaler used to transform metadata when ``is_train`` is
            False. Ignored (and replaced) when ``is_train`` is True.

    Returns:
        ``(df, scaler)``. ``df`` has a ``meta_scaled`` column holding one list
        of scaled metadata values per row. For training it also has
        ``target_total``, ``target_gdm`` and ``target_green`` in log space.

    Raises:
        ValueError: ``is_train`` is False and no ``scaler`` was supplied.
        KeyError: The metadata columns are missing from the inference CSV.
    """
    # Fail before any I/O: without a fitted scaler the inference branch would
    # otherwise die later with an opaque AttributeError on `None.transform`.
    if not is_train and scaler is None:
        raise ValueError(
            "prepare_data(is_train=False) requires the scaler fitted on the training data."
        )

    df_long = pd.read_csv(data_path)

    # The image identifier is the part of sample_id before the first '__'.
    df_long['image_id'] = df_long['sample_id'].apply(lambda x: x.split('__')[0])

    if is_train:
        index_cols = ['image_id', 'image_path', 'Sampling_Date', 'State', 'Species'] + CONFIG.METADATA_COLS

        # Long -> wide: one row per image, one column per target_name.
        df = df_long.pivot_table(
            index=index_cols,
            columns='target_name',
            values='target'
        ).reset_index()
        df.columns.name = None

        # Only the three modelled targets must be present for a row to be kept.
        df = df.dropna(subset=['Dry_Total_g', 'GDM_g', 'Dry_Green_g']).reset_index(drop=True)

        # log(max(x, MIN_BIOMASS) + EPS), applied in place to all five targets.
        expected_targets = ['Dry_Green_g', 'Dry_Dead_g', 'Dry_Clover_g', 'GDM_g', 'Dry_Total_g']
        for col in expected_targets:
            df[col] = np.log(df[col].clip(lower=CONFIG.MIN_BIOMASS) + CONFIG.EPS)

        df['target_total'] = df['Dry_Total_g']
        df['target_gdm'] = df['GDM_g']
        df['target_green'] = df['Dry_Green_g']

        # A new scaler is always fitted here; any scaler passed in is discarded.
        scaler = StandardScaler()
        df['meta_scaled'] = scaler.fit_transform(df[CONFIG.METADATA_COLS]).tolist()
        return df, scaler

    # Inference: keep every image-level column and collapse to one row per image.
    cols_for_unique_image = [col for col in df_long.columns if col not in ['target_name', 'target', 'sample_id']]
    df = df_long[cols_for_unique_image].drop_duplicates(subset=['image_path']).reset_index(drop=True)

    try:
        df['meta_scaled'] = scaler.transform(df[CONFIG.METADATA_COLS]).tolist()
    except KeyError as e:
        # Chain the original error so the missing column names stay in the traceback.
        raise KeyError(
            f"Metadata columns not found in test data: {e}. Check that 'test.csv' contains {CONFIG.METADATA_COLS}."
        ) from e

    return df, scaler


# ===============================================================
# 2. DATASET CLASS
# ===============================================================
class BiomassDataset(Dataset):
    """Dataset yielding the left and right halves of each image plus its metadata.

    Each item is ``(img_left, img_right, metadata)`` and, when ``is_train`` is
    True, a fourth element holding the three targets
    (``target_total``, ``target_gdm``, ``target_green``).
    """

    TARGET_COLUMNS = ['target_total', 'target_gdm', 'target_green']

    def __init__(self, df, image_dir, transform=None, is_train=True):
        """Cache the columns needed per item.

        Args:
            df: Frame with ``image_path`` and ``meta_scaled`` columns, plus the
                three target columns when ``is_train`` is True.
            image_dir: Directory joined with each image's base file name.
            transform: Optional callable applied to each image half separately.
            is_train: Whether targets are available and should be returned.
        """
        self.df = df
        self.image_dir = image_dir
        self.transform = transform
        self.is_train = is_train

        self.image_paths = df['image_path'].values
        self.metadata = df['meta_scaled'].values
        if is_train:
            self.targets = df[self.TARGET_COLUMNS].values

    def __len__(self):
        return len(self.df)

    def _read_rgb(self, relative_path):
        """Read one image as RGB, substituting a flat placeholder if it cannot be read."""
        # Only the file name is kept; any directory part of the CSV path is dropped.
        on_disk = os.path.join(self.image_dir, os.path.basename(relative_path))
        bgr = cv2.imread(on_disk)
        if bgr is None:
            # cv2.imread returned None: fall back to a constant-colour
            # 1000 x 2000 placeholder so the batch can still be assembled.
            bgr = np.full((1000, 2000, 3), [100, 150, 100], dtype=np.uint8)
        return cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

    def __getitem__(self, idx):
        rgb = self._read_rgb(self.image_paths[idx])

        # Split down the vertical centre line into left and right halves.
        _, full_width, _ = rgb.shape
        centre = full_width // 2
        halves = [rgb[:, :centre], rgb[:, centre:]]

        if self.transform:
            halves = [self.transform(half) for half in halves]
        left_half, right_half = halves

        meta_tensor = torch.tensor(self.metadata[idx], dtype=torch.float)

        if not self.is_train:
            return left_half, right_half, meta_tensor
        return left_half, right_half, meta_tensor, torch.FloatTensor(self.targets[idx])


# ===============================================================
# 3. MODEL ARCHITECTURE
# ===============================================================
class BiomassModel(nn.Module):
    """Shared-backbone regressor with three scalar heads (total, GDM, green).

    Both image halves pass through the same timm backbone. Their pooled
    features are concatenated with the metadata vector and fed to three
    independent MLP heads.
    """

    def __init__(self, model_name, n_meta_features, pretrained=False):
        """Build the backbone and heads.

        Args:
            model_name: timm model identifier.
            n_meta_features: Length of the metadata vector appended to the image features.
            pretrained: Forwarded to ``timm.create_model``.
        """
        super().__init__()

        # num_classes=0 with global_pool='avg' is requested so the backbone
        # returns pooled features rather than class logits.
        self.backbone = timm.create_model(
            model_name, pretrained=pretrained, num_classes=0, global_pool='avg'
        )

        self.n_features = self.backbone.num_features
        # Two image halves plus the metadata vector.
        self.n_combined_features = (self.n_features * 2) + n_meta_features

        # Heads are created in this fixed order (total, gdm, green).
        self.head_total = self._build_head()
        self.head_gdm = self._build_head()
        self.head_green = self._build_head()

        self._init_weights()

    def _build_head(self):
        """Return one regression head: Linear -> ReLU -> Dropout -> Linear(1)."""
        return nn.Sequential(
            nn.Linear(self.n_combined_features, 256),
            nn.ReLU(),
            nn.Dropout(CONFIG.DROPOUT_RATE),
            nn.Linear(256, 1)
        )

    def _init_weights(self):
        """Xavier-initialise each head's output layer and set its bias to a fixed value.

        NOTE(review): the bias constants (5.6 / 5.2 / 4.7) are presumably typical
        log-space target values - confirm against the training data.
        """
        output_biases = (
            (self.head_total, 5.6),
            (self.head_gdm, 5.2),
            (self.head_green, 4.7),
        )
        for head, bias_value in output_biases:
            output_layer = head[-1]
            nn.init.xavier_uniform_(output_layer.weight)
            output_layer.bias.data.fill_(bias_value)

    def forward(self, img_left, img_right, metadata):
        """Return ``(out_total, out_gdm, out_green)``, one tensor per head."""
        left_features = self.backbone(img_left)
        right_features = self.backbone(img_right)

        fused = torch.cat([left_features, right_features, metadata], dim=1)

        return self.head_total(fused), self.head_gdm(fused), self.head_green(fused)


# ===============================================================
# 4. LOSS FUNCTION
# ===============================================================
class WeightedSmoothL1Loss(nn.Module):
    """Weighted sum of SmoothL1 losses over the total, GDM and green heads.

    Args:
        weights: Three multipliers applied to the total, GDM and green losses,
            in that order. Defaults to ``(0.5, 0.2, 0.1)``.
    """

    def __init__(self, weights=(0.5, 0.2, 0.1)):
        super().__init__()
        # Immutable default plus a per-instance copy: instances never share
        # (or accidentally mutate) one default list.
        self.weights = list(weights)
        self.loss_fn = nn.SmoothL1Loss()

    def forward(self, preds, targets):
        """Compute the weighted loss.

        Args:
            preds: Sequence of three prediction tensors (total, GDM, green),
                each holding one value per sample, e.g. shape ``(B, 1)``.
            targets: Tensor of shape ``(B, 3)`` with columns in the same order.

        Returns:
            Scalar tensor: ``sum(weight_i * SmoothL1(pred_i, target_i))``.
        """
        weighted_terms = []
        for head_idx, weight in enumerate(self.weights[:3]):
            # reshape(-1) always yields shape (B,). A bare .squeeze() would turn
            # a (1, 1) prediction into a 0-d tensor, which then mismatches the
            # (1,) target on a final batch of size one.
            prediction = preds[head_idx].reshape(-1)
            weighted_terms.append(self.loss_fn(prediction, targets[:, head_idx]) * weight)
        return weighted_terms[0] + weighted_terms[1] + weighted_terms[2]


# ===============================================================
# 5. TRAINING FUNCTION
# ===============================================================
def train_model():
    """Train one model per K-fold split and save each fold's best checkpoint.

    For every fold a fresh ``BiomassModel`` is trained for ``CONFIG.EPOCHS``
    epochs with AdamW, gradient clipping and cosine LR annealing. The weights
    with the lowest validation loss are written to ``best_model_fold{fold}.pth``.
    Folds whose checkpoint file already exists are skipped.

    Returns:
        The ``StandardScaler`` fitted on the training metadata by ``prepare_data``.
    """
    print("üöÄ STARTING ENHANCED TRAINING")

    train_df, scaler = prepare_data(CONFIG.TRAIN_CSV, is_train=True)

    if train_df is None or train_df[['target_total', 'target_gdm', 'target_green']].isnull().any().any():
        print("‚ùå Training stopped: Data not ready.")
        return scaler

    kf = KFold(n_splits=CONFIG.N_FOLDS, shuffle=True, random_state=42)

    for fold, (train_index, valid_index) in enumerate(kf.split(train_df)):
        print(f"\n=== TRAINING FOLD {fold} ===")
        model_path = f'best_model_fold{fold}.pth'
        if os.path.exists(model_path):
            print(f"Skipping Fold {fold}. Checkpoint already exists.")
            continue

        train_fold = train_df.iloc[train_index].reset_index(drop=True)
        valid_fold = train_df.iloc[valid_index].reset_index(drop=True)
        print(f"Train: {len(train_fold)}, Valid: {len(valid_fold)}")

        train_dataset = BiomassDataset(train_fold, CONFIG.TRAIN_IMAGE_DIR, get_train_transforms())
        valid_dataset = BiomassDataset(valid_fold, CONFIG.TRAIN_IMAGE_DIR, get_valid_transforms())

        train_loader = DataLoader(train_dataset, batch_size=CONFIG.BATCH_SIZE, shuffle=True, num_workers=0, pin_memory=True)
        valid_loader = DataLoader(valid_dataset, batch_size=CONFIG.BATCH_SIZE, shuffle=False, num_workers=0, pin_memory=True)

        n_meta_features = len(CONFIG.METADATA_COLS)
        model = BiomassModel(CONFIG.MODEL_NAME, n_meta_features, pretrained=False).to(CONFIG.DEVICE)
        print(f"‚ö†Ô∏è Warning: Model is training from scratch for Fold {fold}.")

        optimizer = torch.optim.AdamW(model.parameters(), lr=CONFIG.LR, weight_decay=CONFIG.WEIGHT_DECAY)
        criterion = WeightedSmoothL1Loss()

        # T_max is expressed in epochs, so the scheduler is stepped once per
        # epoch (below), not once per batch.
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=CONFIG.EPOCHS, eta_min=1e-6)

        best_val_loss = float('inf')

        for epoch in range(CONFIG.EPOCHS):
            # ---- training pass ----
            model.train()
            train_loss = 0.0
            train_batches = 0  # batches that actually contributed an update
            for img_left, img_right, metadata, targets in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
                img_left = img_left.to(CONFIG.DEVICE)
                img_right = img_right.to(CONFIG.DEVICE)
                metadata = metadata.to(CONFIG.DEVICE)
                targets = targets.to(CONFIG.DEVICE)

                optimizer.zero_grad()
                pred_total, pred_gdm, pred_green = model(img_left, img_right, metadata)
                loss = criterion([pred_total, pred_gdm, pred_green], targets)

                if torch.isnan(loss).any():
                    print("\n‚ö†Ô∏è NaN loss detected! Skipping batch.")
                    continue

                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), CONFIG.GRAD_CLIP)
                optimizer.step()

                train_loss += loss.item()
                train_batches += 1

            # ---- validation pass ----
            model.eval()
            val_loss = 0.0
            val_batches = 0
            with torch.no_grad():
                for img_left, img_right, metadata, targets in valid_loader:
                    img_left = img_left.to(CONFIG.DEVICE)
                    img_right = img_right.to(CONFIG.DEVICE)
                    metadata = metadata.to(CONFIG.DEVICE)
                    targets = targets.to(CONFIG.DEVICE)
                    pred_total, pred_gdm, pred_green = model(img_left, img_right, metadata)
                    loss = criterion([pred_total, pred_gdm, pred_green], targets)
                    val_loss += loss.item()
                    val_batches += 1

            # Average only over batches that were really used. Skipped NaN
            # batches must not dilute the mean, and an empty loader must not
            # divide by zero or register as a "best" epoch.
            avg_train_loss = train_loss / train_batches if train_batches else float('nan')
            avg_val_loss = val_loss / val_batches if val_batches else float('inf')

            # Report the LR that was used during this epoch, then advance the schedule.
            print(f"Epoch {epoch+1}: LR: {optimizer.param_groups[0]['lr']:.2e}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")
            scheduler.step()

            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                torch.save(model.state_dict(), model_path)
                print(f"‚úÖ Saved best model for fold {fold}")

        print(f"Fold {fold} completed. Best val loss: {best_val_loss:.4f}")
        # Release this fold's memory before building the next model.
        gc.collect()
        torch.cuda.empty_cache()

    print("üéâ TRAINING COMPLETED")
    return scaler


# ===============================================================
# 6. BIOLOGICAL CONSTRAINT ENFORCEMENT
# ===============================================================
def enforce_biological_constraints(total, gdm, green):
    """Convert log-space predictions to grams and make the components consistent.

    Steps:
      1. Invert the log transform (``exp(x) - EPS``) and floor at ``MIN_BIOMASS``.
      2. Enforce ``green <= gdm <= total`` by raising the larger quantities.
      3. Derive ``clover = gdm - green`` and ``dead = total - gdm``, each
         floored at ``MIN_BIOMASS``.
      4. Clip every output to ``[MIN_BIOMASS, MAX_BIOMASS]``.

    Args:
        total, gdm, green: Log-space predictions (arrays or scalars accepted by NumPy).

    Returns:
        Tuple ``(total, gdm, green, clover, dead)`` in grams.
    """
    floor = CONFIG.MIN_BIOMASS
    ceiling = CONFIG.MAX_BIOMASS

    # Back to grams, never below the floor.
    total, gdm, green = [
        np.maximum(np.exp(log_value) - CONFIG.EPS, floor)
        for log_value in (total, gdm, green)
    ]

    # Monotonic ordering: green is part of GDM, GDM is part of total.
    gdm = np.maximum(gdm, green)
    total = np.maximum(total, gdm)

    # Components obtained by difference; the floor keeps them at least MIN_BIOMASS.
    clover = np.maximum(gdm - green, floor)
    dead = np.maximum(total - gdm, floor)

    final_total, final_gdm, final_green, final_clover, final_dead = [
        np.clip(component, floor, ceiling)
        for component in (total, gdm, green, clover, dead)
    ]
    return final_total, final_gdm, final_green, final_clover, final_dead


# ===============================================================
# 7. INFERENCE FUNCTION (MODIFIED TO HANDLE MISSING METADATA)
# ===============================================================
def run_inference(scaler):
    """Predict log-space targets for every unique test image with the fold ensemble.

    The test CSV is reduced to unique ``image_path`` rows. Because the metadata
    columns are treated as unavailable at test time, every image receives an
    all-zero ``meta_scaled`` vector.

    Args:
        scaler: Accepted for interface compatibility. It is not used, because
            the metadata is zero-filled instead of being transformed.

    Returns:
        ``(final_preds, test_df_long, test_df_unique)``. ``final_preds`` is a
        dict with keys ``'total'``, ``'gdm'`` and ``'green'`` (one log-space
        value per unique image), or ``None`` when no checkpoint could be loaded.
    """
    print("üöÄ STARTING INFERENCE")

    test_df_long = pd.read_csv(CONFIG.TEST_CSV)

    # One row per image; image_path is the only column relied upon.
    test_df_unique = test_df_long[['image_path']].drop_duplicates(subset=['image_path']).reset_index(drop=True)

    # The model input width is fixed by the number of metadata features used in
    # training, so the placeholder vector is sized from the same config value
    # instead of a hard-coded 2.
    n_meta_features = len(CONFIG.METADATA_COLS)
    test_df_unique['meta_scaled'] = [[0.0] * n_meta_features for _ in range(len(test_df_unique))]

    print(f"Test images: {len(test_df_unique)}. WARNING: Running inference with zeroed metadata.")

    models_list = []
    for fold in range(CONFIG.N_FOLDS):
        model_path = f'best_model_fold{fold}.pth'
        if not os.path.exists(model_path):
            continue
        try:
            # Architecture must match the checkpoint (same metadata width).
            model = BiomassModel(CONFIG.MODEL_NAME, n_meta_features, pretrained=False)
            model.load_state_dict(torch.load(model_path, map_location=CONFIG.DEVICE))
            model.eval()
            model.to(CONFIG.DEVICE)
            models_list.append(model)
            print(f"‚úÖ Loaded model fold {fold}")
        except Exception as e:
            # Best effort: a broken checkpoint is reported and the remaining folds are still used.
            print(f"‚ùå Failed to load model fold {fold}: {e}")

    if len(models_list) == 0:
        print("‚ùå No trained models found. Using baseline predictions.")
        return None, test_df_long, test_df_unique

    all_predictions = []
    for transform in get_tta_transforms():
        dataset = BiomassDataset(test_df_unique, CONFIG.TEST_IMAGE_DIR, transform, is_train=False)
        loader = DataLoader(dataset, batch_size=CONFIG.BATCH_SIZE, shuffle=False)

        all_predictions.append(predict_single_view(models_list, loader))
        print(f"‚úÖ TTA view completed")

    # Element-wise median across the TTA views.
    final_preds = {
        key: np.median([view[key] for view in all_predictions], axis=0)
        for key in ('total', 'gdm', 'green')
    }

    return final_preds, test_df_long, test_df_unique

# --- Note: inference depends on CONFIG, BiomassModel, BiomassDataset and the transform
# helpers defined above; they must match the training run that produced the checkpoints. ---

def predict_single_view(models_list, loader):
    """Run every model over ``loader`` and combine them with an element-wise median.

    Args:
        models_list: Models called as ``model(img_left, img_right, metadata)``
            and returning three tensors (total, GDM, green).
        loader: Iterable yielding ``(img_left, img_right, metadata)`` batches.

    Returns:
        Dict with keys ``'total'``, ``'gdm'`` and ``'green'``, each a flat NumPy
        array containing one value per sample in loader order.
    """
    heads = ('total', 'gdm', 'green')
    collected = {head: [] for head in heads}

    with torch.no_grad():
        for img_left, img_right, metadata in loader:
            img_left = img_left.to(CONFIG.DEVICE)
            img_right = img_right.to(CONFIG.DEVICE)
            metadata = metadata.to(CONFIG.DEVICE)

            # Gather this batch's outputs from every model, per head.
            per_model = {head: [] for head in heads}
            for model in models_list:
                pred_total, pred_gdm, pred_green = model(img_left, img_right, metadata)
                for head, output in zip(heads, (pred_total, pred_gdm, pred_green)):
                    per_model[head].append(output.cpu())

            # Median over the model axis (dim 0 of the stacked tensor).
            for head in heads:
                stacked = torch.stack(per_model[head])
                ensemble = torch.median(stacked, dim=0).values
                collected[head].append(ensemble.numpy())

    return {
        'total': np.concatenate(collected['total']).flatten(),
        'gdm': np.concatenate(collected['gdm']).flatten(),
        'green': np.concatenate(collected['green']).flatten()
    }


# ===============================================================
# 8. SUBMISSION CREATION
# ===============================================================
def create_submission(preds_np, test_df_long, test_df_unique):
    """Turn per-image log-space predictions into ``submission.csv``.

    Args:
        preds_np: Dict with ``'total'``, ``'gdm'`` and ``'green'`` arrays
            (log space, one value per row of ``test_df_unique``), or ``None``
            to fall back to constant baseline values.
        test_df_long: Long-format test frame with ``sample_id``,
            ``image_path`` and ``target_name`` columns.
        test_df_unique: Frame with one ``image_path`` per unique test image.

    Returns:
        The submission frame with columns ``sample_id`` and ``target``; the
        same frame is written to ``submission.csv``.
    """
    print("üìÑ Creating submission file...")

    if preds_np is None:
        print("Using biologically reasonable baseline predictions")
        n_images = len(test_df_unique)
        preds_np = {
            'total': np.full(n_images, 5.6),
            'gdm': np.full(n_images, 5.2),
            'green': np.full(n_images, 4.7),
        }

    total, gdm, green, clover, dead = enforce_biological_constraints(
        preds_np['total'], preds_np['gdm'], preds_np['green']
    )

    # Wide frame: one row per image, one column per target.
    target_columns = ['Dry_Green_g', 'Dry_Dead_g', 'Dry_Clover_g', 'GDM_g', 'Dry_Total_g']
    wide = pd.DataFrame({
        'image_path': test_df_unique['image_path'],
        'Dry_Green_g': green,
        'Dry_Dead_g': dead,
        'Dry_Clover_g': clover,
        'GDM_g': gdm,
        'Dry_Total_g': total,
    })

    # Wide -> long so that predictions can be joined onto the sample ids.
    long_preds = wide.melt(
        id_vars=['image_path'],
        value_vars=target_columns,
        var_name='target_name',
        value_name='target'
    )

    merged = pd.merge(
        test_df_long[['sample_id', 'image_path', 'target_name']],
        long_preds,
        on=['image_path', 'target_name'],
        how='left'
    )

    # Keep the two submission columns and apply the final lower bound.
    submission_df = merged[['sample_id', 'target']].assign(
        target=lambda frame: frame['target'].clip(lower=CONFIG.MIN_BIOMASS)
    )
    submission_df.to_csv('submission.csv', index=False)

    print(f"‚úÖ Submission created: submission.csv")
    print(f"üìä Statistics: Samples: {len(submission_df)}, Mean value: {submission_df['target'].mean():.1f}")

    return submission_df


# ===============================================================
# 9. MAIN EXECUTION
# ===============================================================
# Entry point: train -> infer -> write submission. Any exception in those three
# steps is reported and replaced by a constant-value fallback submission, so a
# submission.csv is still attempted even when the pipeline fails.
if __name__ == "__main__":
    print("üå± CSIRO Biomass Prediction - ENHANCED SOLUTION")
    print("=" * 60)

    try:
        # STEP 1: TRAINING
        # Returns the metadata scaler; checkpoints are written as best_model_fold{N}.pth.
        print("üìö STEP 1: Training models...")
        fitted_scaler = train_model()

        # STEP 2: INFERENCE
        # all_preds is None when no checkpoint could be loaded.
        print("\nüîÆ STEP 2: Running inference...")
        all_preds, df_long, df_unique = run_inference(fitted_scaler)

        # STEP 3: SUBMISSION
        print("\nüìÑ STEP 3: Creating submission...")
        submission_df = create_submission(all_preds, df_long, df_unique)

        print("\nüéâ SUCCESS! COMPLETED ALL STEPS")
        print("üìã First 10 predictions:")
        print(submission_df.head(10))

    except Exception as e:
        # NOTE(review): only the message is printed, not the traceback, so the
        # root cause of a training or inference failure is easy to miss.
        print(f"‚ùå Error: {e}")
        print("Creating fallback submission...")

        # Fallback submission logic
        # Every sample_id gets the same constant target (150.0).
        try:
            test_df_long = pd.read_csv(CONFIG.TEST_CSV)
            fallback_df = test_df_long[['sample_id']].copy()
            fallback_df['target'] = 150.0
            fallback_df.to_csv('submission.csv', index=False)
            print("‚úÖ Created fallback submission.csv")
        except Exception as file_error:
            print(f"‚ùå Could not create fallback submission: {file_error}")

In [None]:
import pandas as pd
import numpy as np
import os


# --- A. CONFIGURATION ---
# Dataset location and the file paths derived from it.
DATA_PATH = '/kaggle/input/csiro-biomass/'
TRAIN_CSV, TEST_CSV = (
    os.path.join(DATA_PATH, csv_name) for csv_name in ('train.csv', 'test.csv')
)
TRAIN_IMG_DIR = DATA_PATH
IMG_SIZE = (128, 128)
EPS = 1e-6

# The three targets the model predicts directly.
PREDICTED_TARGETS = ['Dry_Total_g', 'GDM_g', 'Dry_Green_g']

# All five targets, in the order used for the submission columns.
TARGET_NAMES = ['Dry_Green_g', 'Dry_Dead_g', 'Dry_Clover_g', 'GDM_g', 'Dry_Total_g']

# Column names. The *_VAR aliases are kept because they are module-level names.
IMAGE_PATH_COL = 'image_path'
TARGET_COL = TARGET_COL_VAR = 'target'
SUBMISSION_ID_COL = SUBMISSION_ID_COL_VAR = 'sample_id'

# File checked by the verification code below.
SUBMISSION_FILE = 'submission.csv'





# --- FILE VERIFICATION ---
# Reads the written submission back from disk, checks its column layout, prints
# a preview, and compares the first image's five values against the additive
# relations Total = GDM + Dead and GDM = Green + Clover.

print("\n--- Final Submission File Verification and Content Analysis ---")

if not os.path.exists(SUBMISSION_FILE):
    print(f"FATAL ERROR: Submission file '{SUBMISSION_FILE}' not found.")
else:
    df_submission = pd.read_csv(SUBMISSION_FILE)

    # 1. Validation Checks
    # Column names AND order must match exactly.
    expected_cols = [SUBMISSION_ID_COL, TARGET_COL]
    if df_submission.columns.tolist() != expected_cols:
        print(f"‚ùå FAIL: Expected columns {expected_cols}, found {df_submission.columns.tolist()}.")
    else:
        print("‚úÖ PASS: Submission file has the correct columns and order.")

    # 2. Print Structure
    print("-" * 50)
    print(f"Shape: {df_submission.shape}")

    print("\nSubmission Head (First 10 rows, showing constrained predictions):")
    # NOTE(review): DataFrame.to_markdown presumably needs the optional
    # `tabulate` package - confirm it is installed in the target environment.
    print(df_submission.head(10).to_markdown(index=False))

    # 3. Post-Processing Constraint Check (Validation based on the first sample)

    if len(df_submission) >= 5:
        # Sort the first 5 rows to ensure correct mapping for constraint check
        # NOTE(review): assumes the first five rows are the five targets of ONE
        # image; if they are not, an `.iloc[0]` lookup below raises IndexError
        # or the comparison mixes values from different images.
        df_check = df_submission.head(5).sort_values(by=SUBMISSION_ID_COL)

        # Mapping values based on the component name in sample_id
        T = df_check[df_check[SUBMISSION_ID_COL].str.contains('Total_g')]['target'].iloc[0]
        M = df_check[df_check[SUBMISSION_ID_COL].str.contains('GDM_g')]['target'].iloc[0]
        G = df_check[df_check[SUBMISSION_ID_COL].str.contains('Green_g')]['target'].iloc[0]
        D = df_check[df_check[SUBMISSION_ID_COL].str.contains('Dead_g')]['target'].iloc[0]
        C = df_check[df_check[SUBMISSION_ID_COL].str.contains('Clover_g')]['target'].iloc[0]

        # Check Total Derivation: T = M + D
        total_derived_check = M + D

        # Check GDM Derivation: M = G + C
        gdm_derived_check = G + C

        print("\n--- Biological Constraint Check (First Sample) ---")
        print(f"Dry_Total_g (T): {T:.4f} | GDM_g (M): {M:.4f} | Dry_Green_g (G): {G:.4f}")

        # Check if derived components match the total/GDM:
        # NOTE(review): enforce_biological_constraints() floors Dry_Dead and
        # Dry_Clover at MIN_BIOMASS and clips every value to MAX_BIOMASS, so
        # these exact-sum checks can print FAIL for a submission the pipeline
        # produced as designed.
        if np.isclose(T, total_derived_check, atol=EPS * 10):
            print(f"‚úÖ PASS: Dry_Total_g (T={T:.4f}) matches GDM + Dry_Dead ({total_derived_check:.4f})")
        else:
            print(f"‚ùå FAIL: Dry_Total_g ({T:.4f}) should equal GDM + Dry_Dead ({total_derived_check:.4f})")

        if np.isclose(M, gdm_derived_check, atol=EPS * 10):
            print(f"‚úÖ PASS: GDM_g (M={M:.4f}) matches Dry_Green + Dry_Clover ({gdm_derived_check:.4f})")
        else:
            print(f"‚ùå FAIL: GDM_g ({M:.4f}) should equal Dry_Green + Dry_Clover ({gdm_derived_check:.4f})")

    print("-" * 50)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')

# --- Replicating the CONFIG and Data Preparation for EDA ---

# NOTE: The BASE_PATH variable must be defined by earlier cells for this to run.
# The CONFIG is needed to locate train.csv correctly.

class CONFIG(CONFIG):
    """EDA settings layered on top of the training ``CONFIG``.

    The base in the class statement is the ``CONFIG`` already defined by the
    training cell (the same cell that defines ``BASE_PATH``, which this cell
    requires anyway). Deriving from it, rather than redefining ``CONFIG`` from
    scratch, keeps ``TEST_CSV``, ``DEVICE``, ``METADATA_COLS`` and the other
    training attributes available. Re-running the training or inference
    functions after this cell therefore still works.
    """

    BASE_PATH = BASE_PATH
    TRAIN_CSV = os.path.join(BASE_PATH, 'train.csv')
    MIN_BIOMASS = 1.0  # lower clip applied before the log transform
    EPS = 1e-6         # added inside log()
    # Targets summarised and plotted by the EDA code
    MODEL_TARGETS = ['Dry_Total_g', 'GDM_g', 'Dry_Green_g', 'Dry_Dead_g', 'Dry_Clover_g']


def prepare_data_for_eda():
    """Build the wide, per-image training frame used by the EDA plots.

    Reads ``CONFIG.TRAIN_CSV``, pivots the long-format targets to one column
    per ``target_name``, drops rows missing any of the three primary targets,
    and adds a ``Log_<target>`` column for every name in ``CONFIG.MODEL_TARGETS``.

    Returns:
        The prepared DataFrame, or ``None`` when the CSV file is not found.
    """
    print("Preparing data for EDA (Correct Pivot Applied)...")
    try:
        long_df = pd.read_csv(CONFIG.TRAIN_CSV)
    except FileNotFoundError:
        print(f"‚ùå ERROR: train.csv not found at {CONFIG.TRAIN_CSV}.")
        return None

    # The image identifier is the part of sample_id before the first '__'.
    long_df['image_id'] = [sample_id.split('__')[0] for sample_id in long_df['sample_id']]

    image_level_cols = [
        'image_id', 'image_path', 'Sampling_Date', 'State', 'Species',
        'Pre_GSHH_NDVI', 'Height_Ave_cm',
    ]
    wide_df = long_df.pivot_table(
        values='target',
        index=image_level_cols,
        columns='target_name'
    ).reset_index()
    wide_df.columns.name = None

    # Rows without all three primary targets are not usable.
    primary_targets = ['Dry_Total_g', 'GDM_g', 'Dry_Green_g']
    wide_df = wide_df.dropna(subset=primary_targets).reset_index(drop=True)

    # Log transform: log(max(x, MIN_BIOMASS) + EPS), stored in new Log_* columns.
    for target in CONFIG.MODEL_TARGETS:
        floored = wide_df[target].clip(lower=CONFIG.MIN_BIOMASS)
        wide_df[f'Log_{target}'] = np.log(floored + CONFIG.EPS)

    train_df = wide_df
    print(f"‚úÖ EDA DataFrame ready with {len(train_df)} unique samples.")
    return train_df


# ===============================================================
# --- EDA EXECUTION CELL ---
# ===============================================================

# Build the EDA frame; None means train.csv could not be found, in which case
# every section below is skipped.
eda_df = prepare_data_for_eda()

if eda_df is not None:
    print("\n## üìä 1. Descriptive Statistics (Real-World Targets)")
    # Show statistics for the UN-transformed targets (e.g., in grams)
    # `display` is provided by the IPython/Jupyter runtime, not imported here.
    display(eda_df[CONFIG.MODEL_TARGETS].describe().T)

    print("\n## üìâ 2. Target Distribution (Log-Transformed)")
    # Visualize the distribution of the log-transformed targets
    log_targets = [f'Log_{col}' for col in CONFIG.MODEL_TARGETS]

    # Plotting target distributions
    # One histogram panel per log-target, side by side in a single row.
    fig, axes = plt.subplots(ncols=len(log_targets), figsize=(18, 4))
    fig.suptitle('Distribution of Log-Transformed Biomass Targets', fontsize=16)

    for i, col in enumerate(log_targets):
        sns.histplot(eda_df[col], kde=True, ax=axes[i], bins=20, color='g')
        axes[i].set_title(col)

    # Leave the top 5% of the figure free for the suptitle.
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.show()

    print("\n## üìà 3. Correlation Matrix")
    # Plotting correlation between the three primary targets (log-transformed)
    primary_log_targets = ['Log_Dry_Total_g', 'Log_GDM_g', 'Log_Dry_Green_g']
    plt.figure(figsize=(6, 5))
    sns.heatmap(eda_df[primary_log_targets].corr(),
                annot=True,
                cmap='viridis',
                fmt=".3f",
                linewidths=.5)
    plt.title('Correlation of Primary Log-Targets')
    plt.show()

    print("\n## üåæ 4. Metadata Feature Analysis: Height vs. Total Biomass")
    # Visualize the relationship between a key metadata feature (Height) and the target
    # The y values are the raw gram values; only the axis is log-scaled.
    plt.figure(figsize=(8, 6))
    sns.scatterplot(x='Height_Ave_cm', y='Dry_Total_g', data=eda_df, alpha=0.6, color='darkorange')
    plt.title('Average Height vs. Dry Total Biomass (g)')
    plt.xlabel('Average Height (cm)')
    plt.ylabel('Dry Total Biomass (g)')
    plt.yscale('log') # Use log scale for total biomass for visibility
    plt.grid(True, alpha=0.3)
    plt.show()