# Data Exploration

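This notebook explores the CSIRO Pasture Biomass Estimation dataset: it loads the training table, prints summary statistics, plots the target distribution, and lays out a grid for previewing sample images.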


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import cv2
from PIL import Image

# Notebook-wide plotting defaults: seaborn grid theme and a larger figure size
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8)})


In [None]:
# Read the training table from the raw-data directory (relative to this notebook)
data_dir = Path('../data/raw')
train_csv_path = data_dir / 'train.csv'
train_df = pd.read_csv(train_csv_path)

# Report the table's size and schema, then preview the first rows
print(f"Training samples: {len(train_df)}")
print(f"Columns: {train_df.columns.tolist()}")
train_df.head()


In [None]:
# Basic statistics
# Display pandas' summary table for train_df as the cell output. With default
# arguments, describe() summarizes only the numeric columns of a mixed-dtype
# frame (count, mean, std, min, quartiles, max).
train_df.describe()


In [None]:
# Visualize target distribution
# NOTE(review): 'biomass' is assumed to be the target column name — confirm
# against the column list printed when the data was loaded.
if 'biomass' in train_df.columns:
    # Drop missing values explicitly instead of relying on the plotting
    # library's NaN handling.
    biomass_values = train_df['biomass'].dropna()
    plt.figure(figsize=(10, 6))
    plt.hist(biomass_values, bins=50, edgecolor='black')
    plt.xlabel('Biomass')
    plt.ylabel('Frequency')
    plt.title('Distribution of Pasture Biomass')
    plt.show()
else:
    # Say why nothing was plotted rather than leaving the cell silently empty.
    print(
        "Column 'biomass' not found; skipping target distribution plot. "
        f"Available columns: {train_df.columns.tolist()}"
    )


In [None]:
# Visualize sample images
# Lay out a 3x3 grid with one panel per row of the first (up to) nine training
# samples. Image loading itself is still a TODO because it depends on how the
# dataset references its image files.
image_dir = data_dir / 'images'
if image_dir.exists():
    sample_images = train_df.head(9)
    fig, axes = plt.subplots(3, 3, figsize=(15, 15))
    for idx, (_, row) in enumerate(sample_images.iterrows()):
        ax = axes[idx // 3, idx % 3]
        # TODO: Load and display image based on dataset structure
        ax.axis('off')
    # When fewer than nine samples are available, blank the leftover panels
    # too so they do not render as empty framed axes.
    for unused_ax in axes.ravel()[len(sample_images):]:
        unused_ax.axis('off')
    plt.tight_layout()
    plt.show()
else:
    # Say why nothing was drawn rather than leaving the cell silently empty.
    print(f"Image directory not found: {image_dir}")
