In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from PIL import Image
import plotly.express as px

In [None]:
# Plot theme and pandas display limits used throughout the notebook.
sns.set_style('whitegrid')
for option_name, option_value in (
    ('display.max_columns', 50),
    ('display.max_rows', 100),
):
    pd.set_option(option_name, option_value)

print("Libraries imported successfully!")

In [None]:
# Locations of the competition CSVs and image folders.
BASE_PATH = '/kaggle/input/csiro-biomass/'
TRAIN_CSV_PATH = os.path.join(BASE_PATH, 'train.csv')
TEST_CSV_PATH = os.path.join(BASE_PATH, 'test.csv')
SUBMISSION_CSV_PATH = os.path.join(BASE_PATH, 'sample_submission.csv')
TRAIN_IMG_DIR = os.path.join(BASE_PATH, 'train/')
TEST_IMG_DIR = os.path.join(BASE_PATH, 'test/')

try:
    train_df = pd.read_csv(TRAIN_CSV_PATH)
    test_df = pd.read_csv(TEST_CSV_PATH)
    submission_df = pd.read_csv(SUBMISSION_CSV_PATH)
except FileNotFoundError as e:
    print(f"Error loading files: {e}")
    print("Please ensure you are running this in a Kaggle environment with the competition data attached.")
    # Every later cell needs these DataFrames. Re-raise so a "Run All" stops
    # here with the real cause instead of failing below with a NameError.
    raise
else:
    print("CSV files loaded successfully!")
    print(f"Train data shape: {train_df.shape}")
    print(f"Test data shape: {test_df.shape}")
    print(f"Submission data shape: {submission_df.shape}")

In [None]:
# Preview the first rows of each loaded table under a banner heading.
for banner, frame in (
    ("--- Train DataFrame Head ---", train_df),
    ("--- Test DataFrame Head ---", test_df),
    ("--- Sample Submission DataFrame Head ---", submission_df),
):
    print("\n" + "="*50)
    print(banner)
    print("="*50)
    display(frame.head())

# Deeper Dive into the Training Data

In [None]:
# --- Column dtypes and non-null counts ---
rule = "=" * 50
print(rule)
print("--- DataFrame Info ---")
print(rule)
train_df.info()

# --- Summary statistics of the long-format table ---
print("\n" + rule)
print("--- Descriptive Statistics for Numerical Columns (Long)---")
print(rule)
display(train_df.describe())

# Reshaping the Data for Analysis

In [None]:
# --- Pivot the DataFrame ---
# Reshape long -> wide: one row per image, one column per target_name.
# The metadata columns are used as the index so they are carried through.
train_wide_df = (
    train_df.pivot_table(
        index=['image_path', 'Sampling_Date', 'State', 'Species', 'Pre_GSHH_NDVI', 'Height_Ave_cm'],
        columns='target_name',
        values='target'
    )
    # Drop the leftover 'target_name' label on the columns axis.
    .rename_axis(columns=None)
    .reset_index()
)

# pivot_table groups on the index columns, so rows with missing metadata can be
# dropped silently and an image with inconsistent metadata would be split into
# several rows. Verify the one-row-per-image invariant explicitly.
n_unique_images = train_df['image_path'].nunique()
assert len(train_wide_df) == n_unique_images, (
    f"Pivot produced {len(train_wide_df)} rows for {n_unique_images} unique images; "
    "check for missing or inconsistent metadata."
)

# Extract the unique image ID (file name without extension) for easier access
train_wide_df['image_id'] = train_wide_df['image_path'].map(
    lambda path: os.path.splitext(os.path.basename(path))[0]
)

print("\n" + "="*50)
print("--- Reshaped (Wide) DataFrame Head ---")
print("="*50)
display(train_wide_df.head())

In [None]:
# Shape of the reshaped table next to the distinct-image count of the long table.
wide_shape = train_wide_df.shape
unique_image_count = train_df['image_path'].nunique()
print(f"\nShape of the wide DataFrame: {wide_shape}")
print(f"Number of unique images: {unique_image_count}")

In [None]:
# --- Summary statistics of the wide-format table ---
rule = "=" * 50
print("\n" + rule)
print("--- Descriptive Statistics for Wide DataFrame ---")
print(rule)
wide_summary = train_wide_df.describe()
display(wide_summary)

# Correlation Analysis

In [None]:
# Two metadata measurements followed by the five biomass targets.
numeric_cols = [
    'Pre_GSHH_NDVI',
    'Height_Ave_cm',
    'Dry_Clover_g',
    'Dry_Dead_g',
    'Dry_Green_g',
    'GDM_g',
    'Dry_Total_g',
]

# --- Pairwise correlations between the selected columns ---
corr_matrix = train_wide_df.loc[:, numeric_cols].corr()

# --- Annotated heatmap of the correlation matrix ---
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    linewidths=.5,
    ax=ax,
)
ax.set_title('Correlation Matrix of Features and Biomass Targets', fontsize=16)
plt.show()

In [None]:
# Show the same correlation matrix in tabular form
rule = "=" * 50
print("\n" + rule)
print("--- Correlation Matrix ---")
print(rule)
display(corr_matrix)

# Categorical Feature Distribution

In [None]:
# --- State Distribution ---
# Name both columns explicitly: a bare value_counts().reset_index() only yields
# 'State'/'count' on pandas >= 2.0 (older versions give 'index'/'State').
state_counts = (
    train_wide_df['State']
    .value_counts()
    .rename_axis('State')
    .reset_index(name='count')
)

fig_state = px.bar(
    state_counts,
    x='State',
    y='count',
    title='Number of Samples per State',
    labels={'State': 'State', 'count': 'Number of Images'},
    color='State',
    text_auto=True
)
fig_state.show(renderer='iframe')

In [None]:
# --- Species Distribution ---
# Let's look at the top 15 most common species combinations.
# Name both columns explicitly: a bare value_counts().reset_index() only yields
# 'Species'/'count' on pandas >= 2.0 (older versions give 'index'/'Species').
species_counts = (
    train_wide_df['Species']
    .value_counts()
    .nlargest(15)
    .rename_axis('Species')
    .reset_index(name='count')
)
fig_species = px.bar(
    species_counts,
    x='Species',
    y='count',
    title='Top 15 Most Common Pasture Species',
    labels={'Species': 'Species Combination', 'count': 'Number of Images'},
    color='count',
    color_continuous_scale=px.colors.sequential.Viridis,
    text_auto=True
)
# Species labels are long; tilt them so they do not overlap.
fig_species.update_xaxes(tickangle=45)
fig_species.show(renderer='iframe')

# Temporal Analysis: Exploring Sampling Dates

In [None]:
# --- Parse Sampling_Date once, then derive the calendar parts from it ---
sampling_dates = pd.to_datetime(train_wide_df['Sampling_Date'])
train_wide_df['Sampling_Date'] = sampling_dates
train_wide_df['Year'] = sampling_dates.dt.year
train_wide_df['Month'] = sampling_dates.dt.month

# --- Histogram of sample counts per year ---
year_hist_options = dict(
    x='Year',
    title='Distribution of Samples by Year',
    labels={'Year': 'Year'},
    text_auto=True,
)
fig_year = px.histogram(train_wide_df, **year_hist_options)
fig_year.show(renderer='iframe')

In [None]:
# --- Histogram of sample counts per calendar month ---
month_hist_options = dict(
    x='Month',
    title='Distribution of Samples by Month',
    labels={'Month': 'Month'},
    text_auto=True,
)
fig_month = px.histogram(train_wide_df, **month_hist_options)
# One tick per month number on the x-axis.
fig_month.update_xaxes(dtick=1)
fig_month.show(renderer='iframe')

# Visual Exploration of Image Data

In [None]:
# --- Function to Display Images with Metadata ---
def display_sample_images(df, n_samples=5, random_seed=42):
    """
    Displays a random selection of images along with their metadata and biomass targets.

    Args:
        df: Wide-format DataFrame. Each row must provide 'image_path',
            'image_id', 'State', 'Species', 'Pre_GSHH_NDVI', 'Height_Ave_cm'
            and the five biomass target columns used in the title.
        n_samples: Number of rows to sample. Capped at ``len(df)``.
        random_seed: Seed passed to ``DataFrame.sample`` for reproducibility.

    Returns:
        None. The figure is rendered with ``plt.show()``. Images missing from
        TRAIN_IMG_DIR are reported and skipped.
    """
    # DataFrame.sample raises when asked for more rows than exist, and a
    # zero-height figure is invalid, so cap the request and bail out early.
    n_samples = min(n_samples, len(df))
    if n_samples <= 0:
        print("No samples to display.")
        return

    samples = df.sample(n_samples, random_state=random_seed)

    fig = plt.figure(figsize=(20, 5 * n_samples))

    for i, (_, row) in enumerate(samples.iterrows()):
        # Only the file name is used; images are looked up in TRAIN_IMG_DIR.
        img_path = os.path.join(TRAIN_IMG_DIR, os.path.basename(row['image_path']))

        try:
            img = Image.open(img_path)
        except FileNotFoundError:
            print(f"Image not found at {img_path}. Skipping.")
            continue

        ax = fig.add_subplot(n_samples, 1, i + 1)
        # imshow reads the pixel data while the file is still open; the
        # context manager then releases the file handle.
        with img:
            ax.imshow(img)
        ax.axis('off')

        # --- Create Title with Metadata and Targets ---
        title = (
            f"Image ID: {row['image_id']} | State: {row['State']} | Species: {row['Species']}\n"
            f"NDVI: {row['Pre_GSHH_NDVI']:.2f} | Height: {row['Height_Ave_cm']:.2f} cm\n"
            f"--------------------------------------------------------------------------------\n"
            f"Dry_Total_g: {row['Dry_Total_g']:.2f} | GDM_g: {row['GDM_g']:.2f} | Dry_Green_g: {row['Dry_Green_g']:.2f} | "
            f"Dry_Dead_g: {row['Dry_Dead_g']:.2f} | Dry_Clover_g: {row['Dry_Clover_g']:.2f}"
        )
        ax.set_title(title, loc='left', fontsize=12, family='monospace')

    fig.tight_layout()
    plt.show()

In [None]:
# Render five sampled rows of the wide table (default random_seed) with their images.
display_sample_images(train_wide_df, n_samples=5)

# Analyzing Image Dimensions

In [None]:
# --- Get Image Dimensions ---
# Only the image header is needed for .size, so each file is opened and closed
# immediately. Non-file entries and unreadable files are skipped rather than
# aborting the whole scan.
image_dims = []
for image_file in sorted(os.listdir(TRAIN_IMG_DIR)):
    image_path = os.path.join(TRAIN_IMG_DIR, image_file)
    if not os.path.isfile(image_path):
        continue
    try:
        with Image.open(image_path) as img:
            image_dims.append(img.size)
    except OSError as e:
        # PIL raises an OSError subclass for files it cannot identify as images.
        print(f"Skipping unreadable file {image_file}: {e}")

# --- Create a DataFrame of Dimensions ---
dims_df = pd.DataFrame(image_dims, columns=['Width', 'Height'])
# One row per distinct (Width, Height) pair with its frequency.
dims_counts = dims_df.value_counts().reset_index(name='count')

print("="*50)
print("--- Image Dimension Analysis ---")
print("="*50)
print(f"Total images analyzed: {len(dims_df)}")
print(f"Number of unique dimensions found: {len(dims_counts)}")

if dims_counts.empty:
    # Without this branch an empty folder would be reported as "multiple dimensions".
    print(f"\nNo readable images found in {TRAIN_IMG_DIR}")
elif len(dims_counts) == 1:
    only_dims = dims_counts.iloc[0]
    print(f"\nAll images have the same dimensions: {only_dims['Width']}x{only_dims['Height']}")
else:
    print("\nMultiple image dimensions found:")
    display(dims_counts)


This exploratory data analysis has provided a deep understanding of the CSIRO pasture biomass dataset, laying a strong foundation for the modeling phases.

### 1. Data Structure and Integrity
*   **Complete Data**: The training dataset contains 357 unique images, each with 5 corresponding biomass target values. Crucially, there are **no missing values**, which simplifies preprocessing.
*   **Wide vs. Long Format**: We successfully reshaped the data from its initial 'long' format to a 'wide' format, which proved essential for effective correlation and feature analysis.

### 2. Key Feature Insights
*   **`Height_Ave_cm` is a Powerhouse**: Pasture height shows a strong positive correlation with green biomass (`Dry_Green_g`: 0.65) and total green matter (`GDM_g`: 0.58). This is likely one of the most predictive features available.
*   **`Pre_GSHH_NDVI` is a Useful Indicator**: As expected, NDVI (a measure of greenness) is moderately correlated with `GDM_g` (0.47) and more weakly with `Dry_Green_g` (0.35). It is a valuable input for any model, though a weaker signal than pasture height.
*   **Biomass Inter-correlation**: The target variables themselves are highly correlated. `Dry_Total_g` is strongly correlated with `GDM_g` (0.90) and `Dry_Green_g` (0.83). This suggests that models that accurately predict green matter will likely also do well at predicting total biomass, which has the highest weight (0.5) in the evaluation metric.

### 3. Data Distribution and Bias
*   **Geographic Skew**: The data is predominantly from Tasmania (138 samples) and Victoria (112 samples). Any model will inherently be biased towards the pasture types and conditions found in these states.
*   **Species Imbalance**: The dataset is dominated by `Ryegrass_Clover` and `Ryegrass`. There is a long tail of rare species, which may be challenging to model accurately.
*   **Temporal Concentration**: All samples were collected in **2015**, primarily in May, June, and September (late autumn, early winter, and early spring in Australia). The model will not have seen data from other years, and other months are sparsely represented.

### 4. Image Characteristics
*   **Consistent Dimensions**: All training images share the same dimensions: **2000x1000 pixels**. This is a significant advantage, as it standardizes the input for computer vision models and simplifies the preprocessing pipeline.
*   **Visual-Data Link**: The sample images clearly show a visual correspondence between the pasture's appearance (color, density, texture) and its biomass composition. Lush, green fields correspond to high `Dry_Green_g`, while patchy, brown fields have high `Dry_Dead_g`. This confirms that a computer vision approach is highly viable.

---