In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input mount.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# First, let's see what's available in the input directory
import os
import pandas as pd

# Show every dataset folder under /kaggle/input with a preview of its contents
print("Contents of /kaggle/input:")
input_dir = '/kaggle/input'
if not os.path.exists(input_dir):
    print("‚ùå /kaggle/input directory not found")
else:
    for item in os.listdir(input_dir):
        item_path = os.path.join(input_dir, item)
        if not os.path.isdir(item_path):
            # Only folders are reported; loose files at the top level are ignored.
            continue
        print(f"üìÅ {item}")
        try:
            sub_items = os.listdir(item_path)
            # Preview at most ten entries, then summarise whatever is left.
            for sub_item in sub_items[:10]:
                print(f"   üìÑ {sub_item}")
            if len(sub_items) > 10:
                print(f"   ... and {len(sub_items) - 10} more files")
        except PermissionError:
            print("   üîí Permission denied")

print("\n" + "="*50)

# Report where the notebook is running and what is already in that folder
print(f"Current working directory: {os.getcwd()}")
print("Contents of current directory:")
for item in os.listdir('.'):
    print(f"üìÑ {item}")

In [None]:
# Check if data is now available
import os
import pandas as pd

print("Checking for competition data...")

# List all datasets in /kaggle/input.
# The directory is absent when the notebook runs outside Kaggle (or before any data is
# attached); treat that as "no datasets" instead of letting os.listdir raise.
# Sorting makes the choice of biomass_datasets[0] below stable between runs, since
# os.listdir returns entries in arbitrary order.
input_dir = '/kaggle/input'
if os.path.isdir(input_dir):
    datasets = sorted(
        d for d in os.listdir(input_dir) if os.path.isdir(os.path.join(input_dir, d))
    )
else:
    datasets = []
print("Available datasets:", datasets)

# Look for biomass-related datasets (matched on the folder name only)
biomass_datasets = [d for d in datasets if 'biomass' in d.lower() or 'image' in d.lower()]
print("Biomass-related datasets:", biomass_datasets)

if biomass_datasets:
    data_path = f'/kaggle/input/{biomass_datasets[0]}'
    print(f"üìÅ Using dataset: {data_path}")
    print("Files in this dataset:")
    for file in os.listdir(data_path):
        print(f"   üìÑ {file}")
else:
    # data_path is intentionally left undefined here: there is nothing to point it at.
    print("‚ùå No biomass datasets found - please add the competition data")

In [None]:
# Sometimes the competition has a different exact name
# Let's search more broadly
import os

def find_competition_data(input_dir='/kaggle/input'):
    """Print a short inventory of everything directly under ``input_dir``.

    For each sub-directory the number of entries it contains and the first five
    entry names are printed. Entries that are not directories are only named,
    because calling ``os.listdir`` on them would raise ``NotADirectoryError``.

    Args:
        input_dir: Directory to scan. Defaults to the Kaggle input mount.

    Returns:
        The sorted list of entry names found in ``input_dir`` (directories and
        plain files alike), or ``None`` when ``input_dir`` does not exist.
    """
    if not os.path.exists(input_dir):
        print(f"‚ùå {input_dir} directory doesn't exist")
        return None

    # Sorted so the printed inventory and the return value are stable between runs.
    all_datasets = sorted(os.listdir(input_dir))
    print("All available datasets:")
    for dataset in all_datasets:
        dataset_path = os.path.join(input_dir, dataset)
        if not os.path.isdir(dataset_path):
            print(f"{dataset}: not a directory, skipped")
            print()
            continue

        files = os.listdir(dataset_path)
        print(f"üìÅ {dataset}: {len(files)} files")
        for file in files[:5]:  # Show first 5 files
            print(f"   üìÑ {file}")
        if len(files) > 5:
            print(f"   ... and {len(files) - 5} more")
        print()

    return all_datasets

find_competition_data()

In [None]:
# Check for specific competition files we need
required_files = ['train.csv', 'train_images', 'test_images', 'sample_submission.csv']

for dataset in os.listdir('/kaggle/input'):
    dataset_path = f'/kaggle/input/{dataset}'
    found_files = [
        file for file in required_files
        if os.path.exists(os.path.join(dataset_path, file))
    ]

    if not found_files:
        continue

    print(f"‚úÖ Found {len(found_files)} required files in '{dataset}':")
    for file in found_files:
        print(f"   ‚úì {file}")

    # Only attempt the load when train.csv is actually one of the files found;
    # otherwise move on to the next dataset.
    if 'train.csv' not in found_files:
        continue

    train_path = os.path.join(dataset_path, 'train.csv')
    try:
        train_df = pd.read_csv(train_path)
    except (OSError, ValueError) as exc:
        # OSError covers unreadable files; pandas parser/empty-data/decoding errors
        # are ValueError subclasses. Report the reason and try the next dataset.
        print(f"Could not read {train_path}: {exc}")
        continue

    print(f"‚úÖ Successfully loaded training data: {len(train_df)} samples")
    break

In [None]:
# Create a complete working environment with sample data
import pandas as pd
import numpy as np
import os
from PIL import Image

def create_working_environment(base_dir='/kaggle/working', n_train=200, n_test=100, seed=None):
    """Write a synthetic stand-in for the competition data under ``base_dir``.

    Creates ``train.csv`` and ``sample_submission.csv`` plus one random,
    green-tinted 256x256 JPEG per row in ``train_images/`` and ``test_images/``.

    Args:
        base_dir: Directory that receives the CSV files and image folders.
        n_train: Number of synthetic training rows/images.
        n_test: Number of synthetic test rows/images.
        seed: Optional integer seed. When given, a private ``RandomState`` is
            used so the generated data is reproducible. When ``None`` the
            global NumPy generator is used, as before.

    Returns:
        ``(train_df, sub_df)``: the training frame (``image_id``, ``biomass``)
        and the sample-submission frame (``image_id``, ``biomass`` all 0.0).
    """
    print("Setting up working environment with sample data...")

    # np.random (the module) and RandomState expose the same uniform/randint API.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Create directories
    train_dir = os.path.join(base_dir, 'train_images')
    test_dir = os.path.join(base_dir, 'test_images')
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)

    # Create sample training data
    train_data = {
        'image_id': [f'train_{i:04d}' for i in range(n_train)],
        'biomass': rng.uniform(0.1, 25.0, n_train)
    }
    train_df = pd.DataFrame(train_data)
    train_df.to_csv(os.path.join(base_dir, 'train.csv'), index=False)

    # Create sample submission
    sub_data = {
        'image_id': [f'test_{i:04d}' for i in range(n_test)],
        'biomass': [0.0] * n_test
    }
    sub_df = pd.DataFrame(sub_data)
    sub_df.to_csv(os.path.join(base_dir, 'sample_submission.csv'), index=False)

    # Create sample images (greenish images to simulate plants).
    # Each id list is paired with its destination folder explicitly rather than
    # inferring the folder from the text of the id.
    for image_ids, out_dir in ((train_data['image_id'], train_dir),
                               (sub_data['image_id'], test_dir)):
        for img_id in image_ids:
            img_array = rng.randint(50, 150, (256, 256, 3), dtype=np.uint8)
            # Make it more green (boost green channel)
            img_array[:, :, 1] = rng.randint(100, 200, (256, 256))
            Image.fromarray(img_array).save(os.path.join(out_dir, f'{img_id}.jpg'))

    print("‚úÖ Sample environment created!")
    print(f"Training samples: {len(train_df)}")
    print(f"Test samples: {len(sub_df)}")

    return train_df, sub_df

# Use sample data if real data isn't available
try:
    # Try one more time to find real data
    for dataset in os.listdir('/kaggle/input'):
        dataset_path = f'/kaggle/input/{dataset}'
        # Loose files next to the dataset folders cannot contain train.csv, and
        # os.listdir on them would raise NotADirectoryError.
        if not os.path.isdir(dataset_path):
            continue
        if 'train.csv' in os.listdir(dataset_path):
            train_df = pd.read_csv(f'{dataset_path}/train.csv')
            print(f"‚úÖ Found real competition data in '{dataset}'!")
            print(f"Training samples: {len(train_df)}")
            break
    else:
        raise FileNotFoundError("no train.csv found under /kaggle/input")
except Exception as exc:
    # Still a best-effort fallback, but KeyboardInterrupt/SystemExit now propagate
    # and the reason for falling back is shown instead of being silently discarded.
    print(f"Real data unavailable: {exc}")
    print("‚ö†Ô∏è Using sample data - you can replace with real data later")
    train_df, sample_sub = create_working_environment()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from PIL import Image

# Locations of the CSV files and image folders inside the attached dataset
data_path = '/kaggle/input/csiro-biomass'
train_csv_path = data_path + '/train.csv'
test_csv_path = data_path + '/test.csv'
sample_sub_path = data_path + '/sample_submission.csv'
train_images_path = data_path + '/train'
test_images_path = data_path + '/test'

print("üìÅ Competition Data Structure:")
print("Train CSV: " + train_csv_path)
print("Test CSV: " + test_csv_path)
print("Sample submission: " + sample_sub_path)
print("Train images: " + train_images_path)
print("Test images: " + test_images_path)

# Read the three tables into DataFrames
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)
sample_sub = pd.read_csv(sample_sub_path)

print("\n‚úÖ Data loaded successfully!")
print(f"Training samples: {train_df.shape[0]}")
print(f"Test samples: {test_df.shape[0]}")
print(f"Sample submission: {sample_sub.shape[0]}")

In [None]:
# First rows, dimensions and column names of each table
print("üìä TRAINING DATA:")
print(train_df.head())
print(f"\nTraining data shape: {train_df.shape}")
print(f"Columns: {list(train_df.columns)}")

print("\nüìä TEST DATA:")
print(test_df.head())
print(f"\nTest data shape: {test_df.shape}")

print("\nüìä SAMPLE SUBMISSION:")
print(sample_sub.head())

# Per-column count of missing entries
print("\nüîç MISSING VALUES:")
print("Training data:")
print(train_df.isna().sum())
print("\nTest data:")
print(test_df.isna().sum())

In [None]:
# Now analyze with the correct column name.
# The membership test keeps a stale or misspelled `target_column` from raising a
# KeyError on train_df[...]; in that case the cell reports the problem instead.
if 'target_column' in locals() and target_column and target_column in train_df.columns:
    plt.figure(figsize=(15, 5))

    plt.subplot(1, 3, 1)
    plt.hist(train_df[target_column], bins=50, alpha=0.7, color='green', edgecolor='black')
    plt.title(f'Distribution of {target_column}')
    plt.xlabel(target_column)
    plt.ylabel('Frequency')

    plt.subplot(1, 3, 2)
    plt.boxplot(train_df[target_column])
    plt.title(f'Boxplot of {target_column}')
    plt.ylabel(target_column)

    plt.subplot(1, 3, 3)
    # Log transform if needed
    if train_df[target_column].min() > 0:
        plt.hist(np.log1p(train_df[target_column]), bins=50, alpha=0.7, color='orange', edgecolor='black')
        plt.title(f'Log-Transformed {target_column}')
        plt.xlabel(f'Log({target_column} + 1)')
        plt.ylabel('Frequency')
    else:
        # Non-positive values present: repeat the raw histogram so the third
        # panel is not left empty.
        plt.hist(train_df[target_column], bins=50, alpha=0.7, color='blue', edgecolor='black')
        plt.title(f'{target_column} Distribution (Original)')
        plt.xlabel(target_column)
        plt.ylabel('Frequency')

    plt.tight_layout()
    plt.show()

    # Basic statistics
    print(f"üìà {target_column.upper()} STATISTICS:")
    print(train_df[target_column].describe())
else:
    print("‚ùå Could not identify target column")

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Updated Dataset class with correct target column
class BiomassDataset(Dataset):
    """Image-regression dataset that locates image files by an ID column.

    The ID column is the first of ``id``, ``image``, ``image_id``, ``filename``,
    ``name`` present in ``dataframe``; if none is present the first column is
    used. For each row the image is looked up in ``images_dir`` by trying the ID
    with a list of extensions, then by scanning the directory for a file name
    containing the ID. If nothing is found a solid green 256x256 placeholder is
    returned and a warning is printed.

    Args:
        dataframe: Table with one row per sample.
        images_dir: Directory that holds the image files.
        target_column: Name of the regression target column (training mode).
        transform: Optional callable applied to the loaded PIL image.
        is_test: When true, items are ``(image, image_id)``; otherwise
            ``(image, target)`` with ``target`` a float32 scalar tensor.
    """

    # Extensions tried, in order, when resolving an image ID to a file name.
    _EXTENSIONS = ('.jpg', '.jpeg', '.png', '.JPG', '.JPEG', '.PNG', '')

    def __init__(self, dataframe, images_dir, target_column=None, transform=None, is_test=False):
        self.dataframe = dataframe
        self.images_dir = images_dir
        self.transform = transform
        self.is_test = is_test
        self.target_column = target_column
        # Directory listing for the substring fallback, filled on first use.
        self._dir_listing = None

        # Try to find the image ID column; use the first column as fallback
        potential_id_cols = ['id', 'image', 'image_id', 'filename', 'name']
        self.image_id_col = next(
            (col for col in potential_id_cols if col in dataframe.columns),
            None,
        )
        if self.image_id_col is None:
            self.image_id_col = dataframe.columns[0]

        print(f"Using '{self.image_id_col}' for image IDs")
        if not is_test and self.target_column:
            print(f"Using '{self.target_column}' as target variable")
            if self.target_column not in dataframe.columns:
                # __getitem__ falls back to a 0.0 target in this case; say so up
                # front so a wrong column name does not go unnoticed.
                print(f"Warning: column '{self.target_column}' not found in dataframe; "
                      "all targets will be 0.0")

    def __len__(self):
        return len(self.dataframe)

    def _find_image_path(self, img_id):
        """Return the path of the image file for ``img_id``, or ``None``."""
        for ext in self._EXTENSIONS:
            candidate = os.path.join(self.images_dir, img_id + ext)
            if os.path.exists(candidate):
                return candidate

        # Fallback: any file whose name contains the ID. The listing is cached so
        # repeated misses do not re-read the directory; files added after the
        # first miss are therefore not seen by this fallback.
        if self._dir_listing is None:
            self._dir_listing = os.listdir(self.images_dir)
        for img_file in self._dir_listing:
            if img_id in img_file:
                return os.path.join(self.images_dir, img_file)
        return None

    def __getitem__(self, idx):
        # Select the column first, then the row: building a whole-row Series with
        # .iloc[idx] can upcast values (e.g. an integer ID in an all-numeric frame
        # becomes a float and would stringify as '5.0').
        img_id = str(self.dataframe[self.image_id_col].iloc[idx])

        img_path = self._find_image_path(img_id)

        if img_path is None:
            # Create a dummy image if file not found (for testing)
            print(f"Warning: Could not find image for ID: {img_id}")
            img = Image.new('RGB', (256, 256), color='green')
        else:
            # convert() returns a new, fully loaded image, so the file handle can
            # be released as soon as the conversion is done.
            with Image.open(img_path) as opened:
                img = opened.convert('RGB')

        if self.transform:
            img = self.transform(img)

        if self.is_test:
            return img, img_id

        if self.target_column and self.target_column in self.dataframe.columns:
            biomass = self.dataframe[self.target_column].iloc[idx]
            return img, torch.tensor(biomass, dtype=torch.float32)

        # Return dummy value if no target column
        return img, torch.tensor(0.0, dtype=torch.float32)

# Training pipeline: resize, light random augmentation, then tensor conversion and
# per-channel normalisation.
train_transform = transforms.Compose(
    [
        transforms.Resize(size=(256, 256)),
        transforms.RandomHorizontalFlip(p=0.3),
        transforms.RandomRotation(degrees=10),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

# Evaluation pipeline: same resize and normalisation, no random augmentation.
val_transform = transforms.Compose(
    [
        transforms.Resize(size=(256, 256)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

# Create datasets (we'll determine target_column from our analysis)
# NOTE(review): later cells in this notebook read the label from train_df['target'];
# confirm that 'Dry_biomass' is really a column of train.csv. If it is not,
# BiomassDataset yields a constant 0.0 target for every sample.
target_column = 'Dry_biomass'  # This will be updated based on what we find

# target_column is assigned just above, so the former `'target_column' in locals()`
# test was always true and its else-branch could never run. A falsy value passed
# here is treated by BiomassDataset the same way as omitting the argument.
train_dataset = BiomassDataset(train_df, f'{data_path}/train',
                               target_column=target_column, transform=train_transform)

test_dataset = BiomassDataset(test_df, f'{data_path}/test',
                              transform=val_transform, is_test=True)

# Create data loaders (only the training loader shuffles)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, num_workers=2)

print("‚úÖ Data loaders created successfully!")
print(f"Train batches: {len(train_loader)}")
print(f"Test batches: {len(test_loader)}")

In [None]:
# Fetch a single training batch and report its shapes; any failure is printed
try:
    first_batch = next(iter(train_loader), None)
    if first_batch is not None:
        images, targets = first_batch
        print("‚úÖ Pipeline working!")
        print(f"Batch images shape: {images.shape}")
        print(f"Batch targets shape: {targets.shape}")
        print(f"Sample target values: {targets[:5]}")
except Exception as err:
    print(f"‚ùå Error in pipeline: {err}")
    print("Let's debug further...")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from PIL import Image

data_path = '/kaggle/input/csiro-biomass'

# Read the train and test tables
train_df = pd.read_csv(data_path + '/train.csv')
test_df = pd.read_csv(data_path + '/test.csv')

# Dimensions, column names and the first rows of each table
print("üéØ DATA STRUCTURE ANALYSIS:")
print(f"Training data: {train_df.shape}")
print(f"Test data: {test_df.shape}")

print("\nüìä TRAINING DATA COLUMNS:")
print(list(train_df.columns))

print("\nüìä TRAINING DATA SAMPLE:")
print(train_df.head())

print("\nüìä TEST DATA SAMPLE:")
print(test_df.head())

In [None]:
# Summary statistics and three views of the 'target' column
print("üìà TARGET VARIABLE ANALYSIS:")
print("Target column: 'target'")
print("Target statistics:")
print(train_df['target'].describe())

plt.figure(figsize=(15, 5))

# Panel 1: histogram of the raw values
plt.subplot(1, 3, 1)
plt.hist(train_df['target'], bins=50, alpha=0.7, color='green', edgecolor='black')
plt.title('Distribution of Target Biomass')
plt.xlabel('Target Biomass')
plt.ylabel('Frequency')

# Panel 2: boxplot of the raw values
plt.subplot(1, 3, 2)
plt.boxplot(train_df['target'])
plt.title('Boxplot of Target Biomass')
plt.ylabel('Target Biomass')

# Panel 3: log1p histogram when every value is strictly positive, otherwise the
# raw histogram again
plt.subplot(1, 3, 3)
target_is_positive = train_df['target'].min() > 0
third_panel_values = np.log1p(train_df['target']) if target_is_positive else train_df['target']
plt.hist(
    third_panel_values,
    bins=50,
    alpha=0.7,
    color='orange' if target_is_positive else 'blue',
    edgecolor='black',
)
plt.title('Log-Transformed Target Biomass' if target_is_positive else 'Target Distribution (Original)')
plt.xlabel('Log(Target + 1)' if target_is_positive else 'Target Biomass')
plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Check the image paths and files
print("üñºÔ∏è IMAGE PATHS ANALYSIS:")

# Check a few image paths from the training data
print("Sample image paths from training data:")
for i, path in enumerate(train_df['image_path'].head()):
    print(f"  {i+1}. {path}")

# Check if images exist
print("\nüîç CHECKING IF IMAGES EXIST:")
data_dir = '/kaggle/input/csiro-biomass'

# Check the first ten rows; paths in the table are joined onto data_dir
found_count = 0
not_found_count = 0

for img_path in train_df['image_path'].head(10):
    full_path = os.path.join(data_dir, img_path)
    if os.path.exists(full_path):
        print(f"‚úÖ {img_path} - EXISTS")
        found_count += 1
        # Report size/mode of the first image found. The context manager closes
        # the file handle once the metadata has been read.
        if found_count == 1:
            try:
                with Image.open(full_path) as img:
                    print(f"    Image size: {img.size}, Mode: {img.mode}")
            except Exception as e:
                print(f"    Error opening image: {e}")
    else:
        print(f"‚ùå {img_path} - NOT FOUND")
        not_found_count += 1

print(f"\nSummary: {found_count} found, {not_found_count} not found in first 10 samples")

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms

class CorrectBiomassDataset(Dataset):
    """Image-regression dataset driven by an ``image_path`` column.

    Each row's ``image_path`` is joined onto ``base_dir`` to locate the image.
    If the image cannot be loaded, the error is printed and a solid gray
    256x256 placeholder is used instead.

    Args:
        dataframe: Table with an ``image_path`` column, plus ``target_column``
            in training mode or ``sample_id`` in test mode.
        base_dir: Directory that the relative image paths are resolved against.
        target_column: Name of the regression target column.
        transform: Optional callable applied to the loaded PIL image.
        is_test: When true, items are ``(image, sample_id)``; otherwise
            ``(image, target)`` with ``target`` a float32 scalar tensor.
    """

    def __init__(self, dataframe, base_dir, target_column='target', transform=None, is_test=False):
        self.dataframe = dataframe
        self.base_dir = base_dir
        self.target_column = target_column
        self.transform = transform
        self.is_test = is_test

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        # Select the column first, then the row, so only the needed value is
        # fetched instead of materialising a Series for the whole row.
        img_relative_path = self.dataframe['image_path'].iloc[idx]
        img_full_path = os.path.join(self.base_dir, img_relative_path)

        # Load image
        try:
            # convert() returns a new, fully loaded image, so the file handle can
            # be released as soon as the conversion is done.
            with Image.open(img_full_path) as opened:
                img = opened.convert('RGB')
        except Exception as e:
            print(f"Error loading image {img_full_path}: {e}")
            # Create a dummy image as fallback
            img = Image.new('RGB', (256, 256), color='gray')

        # Apply transforms
        if self.transform:
            img = self.transform(img)

        if self.is_test:
            sample_id = self.dataframe['sample_id'].iloc[idx]
            return img, sample_id

        target_value = float(self.dataframe[self.target_column].iloc[idx])
        return img, torch.tensor(target_value, dtype=torch.float32)

# Augmenting pipeline for training images
train_transform = transforms.Compose(
    [
        transforms.Resize(size=(256, 256)),
        transforms.RandomHorizontalFlip(p=0.3),
        transforms.RandomRotation(degrees=10),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

# Deterministic pipeline for validation/test images (resize + normalise only)
val_transform = transforms.Compose(
    [
        transforms.Resize(size=(256, 256)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

# Build the training dataset (with targets) and the test dataset (with sample ids)
train_dataset = CorrectBiomassDataset(
    dataframe=train_df,
    base_dir=data_path,
    target_column='target',
    transform=train_transform,
)
test_dataset = CorrectBiomassDataset(
    dataframe=test_df,
    base_dir=data_path,
    transform=val_transform,
    is_test=True,
)

# Wrap both in loaders; only the training loader shuffles
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True, num_workers=2)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False, num_workers=2)

print("‚úÖ Datasets and data loaders created successfully!")
for label, size in (
    ("Training samples", len(train_dataset)),
    ("Test samples", len(test_dataset)),
    ("Train batches", len(train_loader)),
    ("Test batches", len(test_loader)),
):
    print(f"{label}: {size}")

In [None]:
# Test the pipeline
print("üß™ Testing the data pipeline...")

# Pull one training batch, report its shapes/statistics and plot up to four images
first_train_batch = next(iter(train_loader), None)
if first_train_batch is not None:
    images, targets = first_train_batch
    print("‚úÖ Training pipeline working!")
    print(f"Batch images shape: {images.shape}")
    print(f"Batch targets shape: {targets.shape}")
    print(f"Sample target values: {targets[:5]}")
    print(f"Target stats - Min: {targets.min():.2f}, Max: {targets.max():.2f}, Mean: {targets.mean():.2f}")

    # Constants that undo the Normalize step of the transforms for display
    display_std = np.array([0.229, 0.224, 0.225])
    display_mean = np.array([0.485, 0.456, 0.406])

    plt.figure(figsize=(10, 8))
    for i in range(min(4, len(images))):
        plt.subplot(2, 2, i+1)
        # Channels-last layout for imshow, then de-normalise and clamp to [0, 1]
        img_display = images[i].permute(1, 2, 0).numpy()
        img_display = np.clip(img_display * display_std + display_mean, 0, 1)
        plt.imshow(img_display)
        plt.title(f'Target: {targets[i]:.2f}')
        plt.axis('off')

    plt.tight_layout()
    plt.show()

# Pull one test batch and show its shape and sample ids
print("\nüß™ Testing test data pipeline...")
first_test_batch = next(iter(test_loader), None)
if first_test_batch is not None:
    images, sample_ids = first_test_batch
    print("‚úÖ Test pipeline working!")
    print(f"Batch images shape: {images.shape}")
    print(f"Sample IDs: {sample_ids}")

In [None]:
# Display some sample images with their targets
def display_sample_images_with_targets(dataset, num_samples=8, n_cols=4):
    """Plot randomly chosen samples from ``dataset`` with their target values.

    Args:
        dataset: Indexable collection whose items are ``(image, target)`` pairs.
            Tensor images are permuted from channels-first to channels-last and
            de-normalised with the mean/std used by this notebook's ``Normalize``
            transforms; anything else is passed through ``np.array``.
        num_samples: How many samples to draw (without replacement). Capped at
            ``len(dataset)`` so small datasets do not raise.
        n_cols: Number of columns in the plot grid; rows are added as needed.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    n_shown = min(num_samples, len(dataset))
    if n_shown <= 0:
        print("No samples to display")
        return

    # Ceiling division: enough rows to hold every sample. With the defaults this
    # is the original 2x4 grid at figsize (20, 10).
    n_rows = -(-n_shown // n_cols)
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows), squeeze=False)
    axes = axes.ravel()

    indices = np.random.choice(len(dataset), n_shown, replace=False)

    for ax, idx in zip(axes, indices):
        image, target = dataset[idx]

        # Convert tensor back to numpy for display
        if isinstance(image, torch.Tensor):
            img_display = image.permute(1, 2, 0).numpy()
            img_display = img_display * np.array([0.229, 0.224, 0.225]) + np.array([0.485, 0.456, 0.406])
            img_display = np.clip(img_display, 0, 1)
        else:
            img_display = np.array(image)

        ax.imshow(img_display)
        ax.set_title(f'Target: {float(target):.2f}', fontsize=12, weight='bold')
        ax.axis('off')

    # Hide grid cells that received no image so they do not show empty frames.
    for ax in axes[n_shown:]:
        ax.axis('off')

    plt.tight_layout()
    plt.suptitle('Sample Training Images with Biomass Targets', fontsize=16, y=1.02)
    plt.show()

# Preview a random selection of training images (eight by default) with their targets.
display_sample_images_with_targets(train_dataset)

In [None]:
import torch
import torch.nn as nn
import torchvision.models as models
import timm

class BiomassPredictor(nn.Module):
    """CNN backbone followed by a small MLP head that outputs one value per image.

    Args:
        backbone: ``'resnet50'`` (torchvision) or ``'efficientnet_b0'`` (timm).
            Any other value raises ``ValueError``.
        pretrained: Passed through to the backbone constructor.
    """

    def __init__(self, backbone='resnet50', pretrained=True):
        super().__init__()

        # Use pre-trained CNN backbone
        if backbone == 'resnet50':
            self.backbone = models.resnet50(pretrained=pretrained)
            # Remember the feature width, then remove the original classification head
            in_features = self.backbone.fc.in_features
            self.backbone.fc = nn.Identity()
        elif backbone == 'efficientnet_b0':
            # num_classes=0 asks timm for a model without a classifier layer
            self.backbone = timm.create_model('efficientnet_b0', pretrained=pretrained, num_classes=0)
            in_features = self.backbone.num_features
        else:
            raise ValueError(f"Unsupported backbone: {backbone}")

        # Regression head for biomass prediction
        self.regressor = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(in_features, 512),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(512),
            nn.Dropout(0.3),
            nn.Linear(512, 128),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(128),
            nn.Dropout(0.2),
            nn.Linear(128, 1)  # Single output for biomass value
        )

    def forward(self, x):
        """Return one prediction per input image, shape ``(batch,)``."""
        features = self.backbone(x)
        biomass = self.regressor(features)
        # Drop only the trailing size-1 dimension. A bare squeeze() would also
        # drop the batch dimension for a batch of one and return a 0-d tensor,
        # which no longer lines up with targets of shape (1,).
        return biomass.squeeze(-1)

# Instantiate the ResNet-50 based regressor
model = BiomassPredictor(backbone='resnet50', pretrained=True)

# Place it on the GPU when CUDA is usable, otherwise on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model = model.to(device)

print("‚úÖ Model created successfully!")
print(f"Using device: {device}")
print(f"Model architecture:\n{model}")

# Tally parameter counts: all parameters, and those that receive gradients
param_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(size for size, _needs_grad in param_sizes)
trainable_params = sum(size for size, needs_grad in param_sizes if needs_grad)
print(f"\nüìä Total parameters: {total_params:,}")
print(f"üìä Trainable parameters: {trainable_params:,}")

In [None]:
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Loss function - Mean Squared Error for regression
criterion = nn.MSELoss()

# Optimizer
optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)

# Learning rate scheduler: multiplies the LR by 0.5 when the monitored value
# (mode='min') has stopped improving for longer than `patience` epochs.
# `verbose=True` is no longer passed: the argument is deprecated since PyTorch 2.2
# (it emits a warning, and newer releases may reject it). To log the current LR,
# read optimizer.param_groups[0]['lr'] after scheduler.step(...).
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

print("‚úÖ Loss function, optimizer, and scheduler defined!")
print(f"Loss: {criterion}")
print(f"Optimizer: {optimizer}")
print(f"Scheduler: ReduceLROnPlateau")

In [None]:
def train_epoch(model, dataloader, criterion, optimizer, device):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: Module called as ``model(images)``.
        dataloader: Sized iterable of ``(images, targets)`` tensor batches.
        criterion: Loss called as ``criterion(outputs, targets)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batches are moved to.

    Returns:
        ``(epoch_loss, all_predictions, all_targets)``: the mean of the per-batch
        losses, and flat lists with one prediction / target per sample.
    """
    model.train()
    running_loss = 0.0
    all_predictions = []
    all_targets = []

    for batch_idx, (images, targets) in enumerate(dataloader):
        images, targets = images.to(device), targets.to(device)

        # Zero gradients
        optimizer.zero_grad()

        # Forward pass. The output is reshaped to the targets' shape so that a
        # batch of one (which a squeezing model returns as a 0-d tensor) or a
        # (batch, 1) output cannot silently broadcast inside the loss, and so the
        # extend() below always receives a 1-d array. A genuine size mismatch
        # raises here.
        outputs = model(images).reshape(targets.shape)
        loss = criterion(outputs, targets)

        # Backward pass
        loss.backward()
        optimizer.step()

        # Statistics
        running_loss += loss.item()
        all_predictions.extend(outputs.detach().cpu().numpy())
        all_targets.extend(targets.cpu().numpy())

        # Progress update
        if (batch_idx + 1) % 50 == 0:
            print(f'    Batch {batch_idx + 1}/{len(dataloader)}, Loss: {loss.item():.4f}')

    epoch_loss = running_loss / len(dataloader)
    return epoch_loss, all_predictions, all_targets

def validate_epoch(model, dataloader, criterion, device):
    """Evaluate ``model`` on ``dataloader`` without updating any weights.

    Args:
        model: Module called as ``model(images)``; switched to eval mode.
        dataloader: Sized iterable of ``(images, targets)`` tensor batches.
        criterion: Loss called as ``criterion(outputs, targets)``.
        device: Device the batches are moved to.

    Returns:
        ``(epoch_loss, all_predictions, all_targets)``: the mean of the per-batch
        losses, and flat lists with one prediction / target per sample.
    """
    model.eval()
    running_loss = 0.0
    all_predictions = []
    all_targets = []

    with torch.no_grad():
        for images, targets in dataloader:
            images, targets = images.to(device), targets.to(device)

            # Reshape to the targets' shape so a 0-d output (batch of one from a
            # squeezing model) or a (batch, 1) output cannot broadcast silently in
            # the loss, and extend() below always receives a 1-d array.
            outputs = model(images).reshape(targets.shape)
            loss = criterion(outputs, targets)

            running_loss += loss.item()
            all_predictions.extend(outputs.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())

    epoch_loss = running_loss / len(dataloader)
    return epoch_loss, all_predictions, all_targets

print("‚úÖ Training and validation functions defined!")



In [None]:
from sklearn.model_selection import train_test_split

# Split training data into train and validation
# NOTE(review): the split is over rows. If several rows of train.csv share the same
# image_path, the same image can land in both splits - confirm, and split by image
# instead if that is the case.
train_indices, val_indices = train_test_split(
    range(len(train_df)),
    test_size=0.2,
    random_state=42,
    shuffle=True
)

print(f"Training samples: {len(train_indices)}")
print(f"Validation samples: {len(val_indices)}")

# Create subset datasets
from torch.utils.data import Subset

# The validation rows get their own dataset built with val_transform. Taking a Subset
# of train_dataset would run validation images through the random flip / rotation /
# colour jitter of train_transform, making validation scores noisy.
val_dataset = CorrectBiomassDataset(
    train_df,
    data_path,
    target_column='target',
    transform=val_transform
)

train_subset = Subset(train_dataset, train_indices)
val_subset = Subset(val_dataset, val_indices)

# Create data loaders for train and validation
train_loader = DataLoader(train_subset, batch_size=32, shuffle=True, num_workers=2)
val_loader = DataLoader(val_subset, batch_size=32, shuffle=False, num_workers=2)

print("‚úÖ Train/validation split created!")
print(f"Train batches: {len(train_loader)}")
print(f"Val batches: {len(val_loader)}")

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Pick the compute device: CUDA when usable, CPU otherwise
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"‚úÖ Device set to: {device}")

# Define training functions
def train_epoch(model, dataloader, criterion, optimizer, device):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: Module called as ``model(images)``.
        dataloader: Sized iterable of ``(images, targets)`` tensor batches.
        criterion: Loss called as ``criterion(outputs, targets)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batches are moved to.

    Returns:
        ``(epoch_loss, all_predictions, all_targets)``: the mean of the per-batch
        losses, and flat lists with one prediction / target per sample.
    """
    model.train()
    running_loss = 0.0
    all_predictions = []
    all_targets = []

    for batch_idx, (images, targets) in enumerate(dataloader):
        images, targets = images.to(device), targets.to(device)

        optimizer.zero_grad()
        # Reshape to the targets' shape so a 0-d output (batch of one from a
        # squeezing model) or a (batch, 1) output cannot broadcast silently in the
        # loss, and extend() below always receives a 1-d array. A genuine size
        # mismatch raises here.
        outputs = model(images).reshape(targets.shape)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        all_predictions.extend(outputs.detach().cpu().numpy())
        all_targets.extend(targets.cpu().numpy())

    epoch_loss = running_loss / len(dataloader)
    return epoch_loss, all_predictions, all_targets

def validate_epoch(model, dataloader, criterion, device):
    """Evaluate ``model`` on ``dataloader`` without updating any weights.

    Args:
        model: Module called as ``model(images)``; switched to eval mode.
        dataloader: Sized iterable of ``(images, targets)`` tensor batches.
        criterion: Loss called as ``criterion(outputs, targets)``.
        device: Device the batches are moved to.

    Returns:
        ``(epoch_loss, all_predictions, all_targets)``: the mean of the per-batch
        losses, and flat lists with one prediction / target per sample.
    """
    model.eval()
    running_loss = 0.0
    all_predictions = []
    all_targets = []

    with torch.no_grad():
        for images, targets in dataloader:
            images, targets = images.to(device), targets.to(device)
            # Reshape to the targets' shape so a 0-d output (batch of one from a
            # squeezing model) or a (batch, 1) output cannot broadcast silently in
            # the loss, and extend() below always receives a 1-d array.
            outputs = model(images).reshape(targets.shape)
            loss = criterion(outputs, targets)
            running_loss += loss.item()
            all_predictions.extend(outputs.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())

    epoch_loss = running_loss / len(dataloader)
    return epoch_loss, all_predictions, all_targets

def calculate_metrics(predictions, targets):
    """Return ``(mae, mse, rmse, r2)`` for ``predictions`` against ``targets``.

    RMSE is the square root of the MSE; the other three values come straight
    from the scikit-learn metric functions, called with ``targets`` as the
    reference values.
    """
    squared_error = mean_squared_error(targets, predictions)
    return (
        mean_absolute_error(targets, predictions),
        squared_error,
        np.sqrt(squared_error),
        r2_score(targets, predictions),
    )

print("‚úÖ All training functions defined!")

In [None]:
import torch
import torch.nn as nn
import torchvision.models as models
import timm

class BiomassPredictor(nn.Module):
    """CNN backbone followed by a small MLP head that outputs one value per image.

    Args:
        backbone: ``'resnet50'`` (torchvision) or ``'efficientnet_b0'`` (timm).
            Any other value raises ``ValueError``.
        pretrained: Passed through to the backbone constructor.
    """

    def __init__(self, backbone='resnet50', pretrained=True):
        super().__init__()

        if backbone == 'resnet50':
            self.backbone = models.resnet50(pretrained=pretrained)
            # Remember the feature width, then replace the classification layer
            # with a pass-through so the backbone emits features.
            in_features = self.backbone.fc.in_features
            self.backbone.fc = nn.Identity()
        elif backbone == 'efficientnet_b0':
            # num_classes=0 asks timm for a model without a classifier layer
            self.backbone = timm.create_model('efficientnet_b0', pretrained=pretrained, num_classes=0)
            in_features = self.backbone.num_features
        else:
            raise ValueError(f"Unsupported backbone: {backbone}")

        # Regression head: in_features -> 512 -> 128 -> 1
        self.regressor = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(in_features, 512),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(512),
            nn.Dropout(0.3),
            nn.Linear(512, 128),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(128),
            nn.Dropout(0.2),
            nn.Linear(128, 1)
        )

    def forward(self, x):
        """Return one prediction per input image, shape ``(batch,)``."""
        features = self.backbone(x)
        biomass = self.regressor(features)
        # Drop only the trailing size-1 dimension. A bare squeeze() would also
        # drop the batch dimension for a batch of one and return a 0-d tensor,
        # which no longer lines up with targets of shape (1,).
        return biomass.squeeze(-1)

# Choose the device, then build the ResNet-50 regressor and move it there
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model = BiomassPredictor(backbone='resnet50', pretrained=True)
model = model.to(device)

print("‚úÖ Model created successfully!")
print(f"Model is on: {device}")

In [None]:
import pandas as pd
import os

# Read the train and test CSV files from the competition input folder
data_path = '/kaggle/input/csiro-biomass'
train_df, test_df = (
    pd.read_csv(f'{data_path}/{split_name}.csv') for split_name in ('train', 'test')
)

print("‚úÖ Data loaded!")
print(f"Training samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")

# Build train_dataset here only when no earlier cell has already created it
if 'train_dataset' not in locals():
    from PIL import Image
    from torch.utils.data import Dataset
    import torchvision.transforms as transforms

    class CorrectBiomassDataset(Dataset):
        """Dataset yielding one image per dataframe row.

        Image files are resolved as ``base_dir`` joined with the row's
        ``image_path`` value. Training mode returns ``(image, target)``;
        test mode returns ``(image, sample_id)``.
        """

        def __init__(self, dataframe, base_dir, target_column='target', transform=None, is_test=False):
            self.dataframe, self.base_dir = dataframe, base_dir
            self.target_column, self.transform = target_column, transform
            self.is_test = is_test

        def __len__(self):
            return len(self.dataframe)

        def __getitem__(self, idx):
            row = self.dataframe.iloc[idx]
            img_full_path = os.path.join(self.base_dir, row['image_path'])

            # An unreadable image is reported and replaced by a gray
            # placeholder so iteration can continue.
            try:
                picture = Image.open(img_full_path).convert('RGB')
            except Exception as e:
                print(f"Error loading image {img_full_path}: {e}")
                picture = Image.new('RGB', (256, 256), color='gray')

            if self.transform:
                picture = self.transform(picture)

            if not self.is_test:
                label = float(row[self.target_column])
                return picture, torch.tensor(label, dtype=torch.float32)
            return picture, row['sample_id']

    # Training-time preprocessing: resize, random augmentation, tensor
    # conversion, then per-channel normalisation
    train_transform = transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.RandomHorizontalFlip(p=0.3),
        transforms.RandomRotation(10),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # Wrap the training dataframe
    train_dataset = CorrectBiomassDataset(
        dataframe=train_df,
        base_dir=data_path,
        target_column='target',
        transform=train_transform,
    )

    print("‚úÖ train_dataset created!")

print("‚úÖ All data variables are ready!")

In [None]:
from sklearn.model_selection import train_test_split
from torch.utils.data import Subset, DataLoader

import copy
import torchvision.transforms as transforms

# Create train/validation split (fixed seed so the partition is reproducible)
train_indices, val_indices = train_test_split(
    range(len(train_df)),
    test_size=0.2,
    random_state=42,
    shuffle=True
)

print(f"Training samples: {len(train_indices)}")
print(f"Validation samples: {len(val_indices)}")

# Validation must not go through the random augmentations attached to
# train_dataset, otherwise validation loss (which drives the LR scheduler,
# checkpointing and early stopping) changes from epoch to epoch for reasons
# unrelated to the model. These steps mirror the deterministic part of the
# training pipeline built in the data-loading cell.
# NOTE(review): if train_dataset was created by an earlier cell with different
# resize/normalisation settings, adjust this transform to match.
val_transform_deterministic = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# A shallow copy shares the underlying dataframe but carries its own transform,
# so the training dataset keeps its augmentation untouched.
val_base_dataset = copy.copy(train_dataset)
val_base_dataset.transform = val_transform_deterministic

# Create subset datasets
train_subset = Subset(train_dataset, train_indices)
val_subset = Subset(val_base_dataset, val_indices)

# Create data loaders for train and validation
train_loader = DataLoader(train_subset, batch_size=32, shuffle=True, num_workers=2)
val_loader = DataLoader(val_subset, batch_size=32, shuffle=False, num_workers=2)

print("‚úÖ Data loaders created!")
print(f"Train batches: {len(train_loader)}")
print(f"Val batches: {len(val_loader)}")

In [None]:
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Loss function and optimizer
# Mean squared error between predicted and target values.
criterion = nn.MSELoss()
# AdamW over every model parameter (backbone and regression head alike).
optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)
# Halve the learning rate when the monitored value (stepped with the
# validation loss in the training loop) stops decreasing for `patience` epochs.
# `verbose` is deliberately not passed: it is deprecated in PyTorch and newer
# releases reject it. Read optimizer.param_groups[0]['lr'] to inspect the
# current learning rate instead.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

print("‚úÖ Loss function and optimizer defined!")
print(f"Criterion: {criterion}")
print(f"Optimizer: {optimizer}")

In [None]:
import time
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def calculate_metrics(predictions, targets):
    """Compute MAE, MSE, RMSE and R^2 for a set of predictions.

    NOTE(review): a function with this same name and body is already defined
    in an earlier cell; this definition shadows it.

    Args:
        predictions: Predicted values, one per sample.
        targets: True values, aligned element-wise with ``predictions``.

    Returns:
        Tuple ``(mae, mse, rmse, r2)``.
    """
    # Truth comes first in every metric call: (targets, predictions).
    abs_error = mean_absolute_error(targets, predictions)
    squared_error = mean_squared_error(targets, predictions)
    root_squared_error = np.sqrt(squared_error)
    return abs_error, squared_error, root_squared_error, r2_score(targets, predictions)

# Training configuration
# num_epochs is the upper bound on epochs; `patience` is how many consecutive
# epochs without a new best validation loss are tolerated before stopping.
num_epochs = 20
best_val_loss = float('inf')
patience = 5
patience_counter = 0

# Lists to store metrics
# One entry is appended per completed epoch. The *_metrics lists hold
# (mae, rmse, r2) tuples; mse is computed but not stored.
train_losses = []
val_losses = []
train_metrics = []
val_metrics = []

print("üöÄ Starting training...")
print(f"Epochs: {num_epochs}")
print(f"Device: {device}")
print("-" * 60)

for epoch in range(num_epochs):
    start_time = time.time()
    
    # Training phase
    # train_epoch / validate_epoch come from an earlier cell; each is unpacked
    # here as (loss, predictions, targets).
    train_loss, train_preds, train_targets = train_epoch(model, train_loader, criterion, optimizer, device)
    
    # Validation phase
    val_loss, val_preds, val_targets = validate_epoch(model, val_loader, criterion, device)
    
    # Calculate metrics
    train_mae, train_mse, train_rmse, train_r2 = calculate_metrics(train_preds, train_targets)
    val_mae, val_mse, val_rmse, val_r2 = calculate_metrics(val_preds, val_targets)
    
    # Update learning rate
    # The scheduler is driven by this epoch's validation loss.
    scheduler.step(val_loss)
    
    # Store metrics
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    train_metrics.append((train_mae, train_rmse, train_r2))
    val_metrics.append((val_mae, val_rmse, val_r2))
    
    # Print progress
    epoch_time = time.time() - start_time
    print(f'Epoch {epoch+1:02d}/{num_epochs} | Time: {epoch_time:.1f}s')
    print(f'  Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}')
    print(f'  Train MAE: {train_mae:.4f} | Val MAE: {val_mae:.4f}')
    print(f'  Train RMSE: {train_rmse:.4f} | Val RMSE: {val_rmse:.4f}')
    print(f'  Train R¬≤: {train_r2:.4f} | Val R¬≤: {val_r2:.4f}')
    print('-' * 60)
    
    # Save best model
    # A strictly lower validation loss resets the patience counter and
    # overwrites the checkpoint file in the current working directory.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        torch.save(model.state_dict(), 'best_model.pth')
        print(f'üíæ Best model saved! Val Loss: {val_loss:.4f}')
    else:
        # No improvement: count the epoch and stop once the budget is used up.
        patience_counter += 1
        if patience_counter >= patience:
            print(f'üõë Early stopping after {epoch+1} epochs')
            break

# NOTE(review): this cell only saves the best weights to 'best_model.pth' and
# never loads them back, so `model` still holds the weights from the last
# epoch that ran. Load the checkpoint before inference if the best weights
# are intended.
print("‚úÖ Training completed!")