# *CSIRO - Image2Biomass Prediction*

In [None]:

# Location of the training CSV that the notebook reads.
DATA_PATH = "/kaggle/input/csiro-biomass/train.csv"
# Folder that receives the figures written by the save_plot helper.
SAVE_DIR = "/kaggle/working/eda_results"  
import os
from pathlib import Path
# Create the output folder (and any parents) up front; an existing folder is fine.
Path(SAVE_DIR).mkdir(parents=True, exist_ok=True)


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import isfinite

%matplotlib inline
# Raise pandas' display limits so wide tables are shown with up to 120 columns
# and a 180-character line width.
pd.set_option("display.max_columns", 120)
pd.set_option("display.width", 180)


## Load Data & Basic Info

In [None]:
# Load the CSV at DATA_PATH into a DataFrame, then report its
# (rows, columns) shape and preview the first rows.
data = pd.read_csv(DATA_PATH)
print(f"Data shape: {data.shape}")
display(data.head())

In [None]:
# Report the deep memory footprint of the table (in MiB) and list each column's dtype
total_bytes = data.memory_usage(deep=True).sum()
mem_usage_mb = total_bytes / (1024**2)
print(f"Memory usage: {mem_usage_mb:.2f} MB")
dtype_table = data.dtypes.to_frame(name="dtype")
display(dtype_table)


## Missing Values Check

In [None]:
# Per-column count of missing cells and the same figure as a percentage of all rows
n_rows = len(data)
missing_count = data.isna().sum()
missing_pct = missing_count.div(n_rows).mul(100).round(2)
missing_table = pd.concat(
    [missing_count, missing_pct],
    axis=1,
    keys=["missing_count", "missing_pct"],
)
display(missing_table)


## Column Categorization

In [None]:
# Split columns into "strict numeric" and "categorical-like".
# A numeric column with at most 10 distinct values is treated as categorical-like.
numeric_cols = list(data.select_dtypes(include=[np.number]).columns)

low_cardinality_numeric, strict_numeric_cols = [], []
for name in numeric_cols:
    bucket = low_cardinality_numeric if data[name].nunique() <= 10 else strict_numeric_cols
    bucket.append(name)

categorical_cols = [name for name in data.columns if name not in numeric_cols]
categorical_like_cols = sorted({*categorical_cols, *low_cardinality_numeric})

print("Strict numeric columns:", len(strict_numeric_cols))
print("Categorical-like columns:", len(categorical_like_cols))


## Descriptive Statistics

In [None]:
# Numeric summary: describe() statistics for the strict numeric columns
# (numeric columns with more than 10 distinct values), transposed so that
# each column becomes one row. Skipped when there are no such columns.
if strict_numeric_cols:
    display(data[strict_numeric_cols].describe().T)



In [None]:

# Categorical summary: distinct-value count, most frequent value and its frequency.
# Columns made up entirely of missing values get NaN for the last two.
if categorical_like_cols:
    unique_counts, top_values, top_counts = [], [], []
    for name in categorical_like_cols:
        column = data[name]
        unique_counts.append(column.nunique())
        if column.notna().any():
            top_values.append(column.mode()[0])
            top_counts.append(column.value_counts().iloc[0])
        else:
            top_values.append(np.nan)
            top_counts.append(np.nan)

    cat_summary = pd.DataFrame(
        {
            "unique_values": unique_counts,
            "top_value": top_values,
            "top_count": top_counts,
        },
        index=categorical_like_cols,
    )
    display(cat_summary)

## Helper Functions

In [None]:
def save_plot(fig_title):
    """Write the current matplotlib figure to SAVE_DIR as a PNG, show it, then close it.

    The file name is derived from ``fig_title``: every character that is not
    alphanumeric and not one of ``-``, ``_`` or ``.`` is replaced by ``_``.
    """
    allowed_punctuation = "-_."
    safe_chars = []
    for ch in fig_title:
        keep = ch.isalnum() or ch in allowed_punctuation
        safe_chars.append(ch if keep else "_")
    out_path = Path(SAVE_DIR) / ("".join(safe_chars) + ".png")

    plt.savefig(out_path, bbox_inches="tight", dpi=150)
    plt.show()
    plt.close()

def fd_bins(series):
    """Suggest a histogram bin count using the Freedman-Diaconis rule.

    Missing values are ignored. Fallbacks:
      * fewer than two observations -> 10 bins
      * zero (or negative) IQR      -> number of distinct values, kept within [10, 50]
      * non-positive bin width      -> 30 bins
    Otherwise the result is (max - min) / width, clipped to [10, 100] and
    truncated to an int.
    """
    values = series.dropna().astype(float)
    n_obs = values.size
    if n_obs < 2:
        return 10

    lower_q, upper_q = np.percentile(values, [25, 75])
    spread = upper_q - lower_q
    if spread <= 0:
        distinct = values.nunique()
        return min(max(distinct, 10), 50)

    bin_width = 2 * spread * (n_obs ** (-1/3))
    if bin_width <= 0:
        return 30

    raw_count = (values.max() - values.min()) / bin_width
    return int(np.clip(raw_count, 10, 100))

def iqr_outlier_percentage(series):
    """Return the percentage of values lying outside the 1.5 * IQR fences.

    Missing values are dropped first. An empty input yields NaN; a
    non-positive IQR yields 0.0.
    """
    clean = series.dropna().astype(float)
    if clean.empty:
        return np.nan

    q_low, q_high = np.percentile(clean, [25, 75])
    spread = q_high - q_low
    if spread <= 0:
        return 0.0

    fence = 1.5 * spread
    outside = (clean < q_low - fence) | (clean > q_high + fence)
    return outside.mean() * 100


## Univariate Analysis (Numeric)

In [None]:
# For every strict numeric column: print extended descriptive statistics,
# report the IQR outlier share, and save a histogram and a boxplot.
numeric_summary = []

for col in strict_numeric_cols:
    # Work on the non-missing values only, as floats
    series = data[col].dropna().astype(float)
    print(f"\n--- Column: {col} ---")
    display(series.describe(percentiles=[.01,.05,.25,.5,.75,.95,.99]).to_frame().T)

    # Outliers (share of values beyond 1.5 * IQR from the quartiles)
    pct_out = iqr_outlier_percentage(series)
    print(f"IQR outlier %: {pct_out:.2f}%")

    # Histogram, bin count chosen by the Freedman-Diaconis helper
    plt.figure(figsize=(8,4))
    if not series.empty:
        plt.hist(series, bins=fd_bins(series))
    else:
        plt.text(0.5, 0.5, "No data", ha="center")
    plt.title(f"Histogram: {col}")
    plt.xlabel(col)
    plt.ylabel("Count")
    save_plot(f"hist_{col}")

    # Boxplot
    plt.figure(figsize=(6,4))
    if not series.empty:
        # The single box sits at x=1; label it through xticks because the
        # `labels=` keyword of boxplot is deprecated in recent Matplotlib
        # releases (renamed to `tick_labels`). Vertical orientation is the default.
        plt.boxplot(series, showfliers=True)
        plt.xticks([1], [col])
    plt.title(f"Boxplot: {col}")
    save_plot(f"box_{col}")

    numeric_summary.append({"column": col, "iqr_outlier_pct": pct_out})

# Summary table. With no strict numeric columns the frame would have no
# "iqr_outlier_pct" column and sort_values would raise KeyError, so guard it.
if numeric_summary:
    display(pd.DataFrame(numeric_summary).sort_values("iqr_outlier_pct", ascending=False))
else:
    print("No strict numeric columns to summarise.")


## Categorical Analysis

In [None]:
# One bar chart per categorical-like column showing its 20 most frequent values
for col in categorical_like_cols:
    top_counts = data[col].astype("category").value_counts().head(20)

    plt.figure(figsize=(8,4))
    top_counts.plot(kind="bar")
    plt.title(f"Top 20 categories: {col}")
    plt.xlabel(col)
    plt.ylabel("Count")
    save_plot(f"bar_{col}")


## Correlation (Numeric)

In [None]:
# Correlation heatmap of the strict numeric columns (needs at least two of them)
if len(strict_numeric_cols) >= 2:
    corr_matrix = data[strict_numeric_cols].corr(numeric_only=True)
    tick_positions = range(len(strict_numeric_cols))

    plt.figure(figsize=(6,5))
    heat = plt.imshow(corr_matrix, interpolation="nearest")
    plt.colorbar(heat, fraction=0.046, pad=0.04)
    plt.title("Correlation Heatmap")
    plt.xticks(tick_positions, strict_numeric_cols, rotation=90)
    plt.yticks(tick_positions, strict_numeric_cols)
    save_plot("correlation_heatmap")


## Target Column Inference

In [None]:
def find_target_column(df):
    """Guess the numeric target column of ``df`` from its column names.

    A column qualifies when it is numeric and its lower-cased name contains
    one of the keywords below. Among the qualifying columns the one with the
    fewest missing values wins (ties go to the earliest column). Returns
    ``None`` when nothing qualifies.
    """
    keywords = ("target", "label", "biomass", "agb", "yield", "gdm")
    numeric_names = set(df.select_dtypes(include=[np.number]).columns)

    matches = []
    for name in df.columns:
        lowered = name.lower()
        name_hits = any(word in lowered for word in keywords)
        if name_hits and name in numeric_names:
            matches.append(name)

    if not matches:
        return None
    # min() returns the first minimal element, matching a stable sort + [0]
    return min(matches, key=lambda name: df[name].isna().sum())

# Run the name-based heuristic on the loaded table; TARGET is None when no
# numeric column matches any keyword.
TARGET = find_target_column(data)
print("Inferred target column:", TARGET)


## Target Distribution

In [None]:
# Plot the distribution of the inferred target: a 30-bin histogram for
# numeric targets, a bar chart of value counts otherwise.
if TARGET and TARGET in data.columns:
    target_values = data[TARGET]
    target_is_numeric = target_values.dtype.kind in "ifu"

    plt.figure(figsize=(6,4))
    if not target_is_numeric:
        level_counts = target_values.astype("category").value_counts()
        level_counts.plot(kind="bar")
    else:
        observed = target_values.dropna().astype(float)
        if observed.empty:
            plt.text(0.5,0.5,"No target data", ha="center")
        else:
            plt.hist(observed, bins=30)

    plt.title(f"Target distribution: {TARGET}")
    plt.xlabel(TARGET)
    plt.ylabel("Count")
    save_plot(f"target_dist_{TARGET}")


## Numeric Predictors vs Target

In [None]:
# Scatter every strict numeric predictor against a numeric target,
# using only the rows where both values are present.
if TARGET and TARGET in data.columns and data[TARGET].dtype.kind in "ifu":
    target_vals = data[TARGET].astype(float)
    target_present = target_vals.notna()
    predictors = [name for name in strict_numeric_cols if name != TARGET]

    for col in predictors:
        feature_vals = data[col].astype(float)
        both_present = feature_vals.notna() & target_present

        plt.figure(figsize=(5,4))
        if both_present.any():
            plt.scatter(feature_vals[both_present], target_vals[both_present], s=10, alpha=0.6)
        else:
            plt.text(0.5,0.5,"No overlapping data", ha="center")
        plt.title(f"{col} vs {TARGET}")
        plt.xlabel(col)
        plt.ylabel(TARGET)
        save_plot(f"scatter_{col}_vs_{TARGET}")


## Categorical Predictors vs Target

In [None]:
# Boxplots of a numeric target split by the 12 most frequent levels of each
# categorical-like column (outlier markers hidden).
if TARGET and TARGET in data.columns and len(categorical_like_cols) > 0 and data[TARGET].dtype.kind in "ifu":
    for col in categorical_like_cols:
        if col == TARGET:
            # Selecting [col, TARGET] would yield two identically named columns
            # and break the per-level filtering below; there is nothing to compare.
            continue
        tmp = data[[col,TARGET]].dropna()
        if tmp.empty: continue
        top_levels = tmp[col].value_counts().head(12).index
        data_list = [tmp.loc[tmp[col]==lvl, TARGET].values for lvl in top_levels]
        plt.figure(figsize=(8,4))
        # Boxes are drawn at x = 1..N; label them through xticks because the
        # `labels=` keyword of boxplot is deprecated in recent Matplotlib
        # releases (renamed to `tick_labels`).
        plt.boxplot(data_list, showfliers=False)
        plt.xticks(range(1, len(top_levels) + 1), [str(l) for l in top_levels])
        plt.title(f"{TARGET} by {col} (Top 12)")
        plt.xlabel(col)
        plt.ylabel(TARGET)
        save_plot(f"box_{TARGET}_by_{col}")


## Outlier Summary Table

In [None]:
# Build a per-column table of Q1, Q3 and the IQR outlier percentage.
# The percentage comes from iqr_outlier_percentage so this table and the
# univariate section share one definition of "outlier", and Q1/Q3 are
# rounded the same way whether or not the IQR is zero.
outlier_rows = []
for col in strict_numeric_cols:
    s = data[col].dropna().astype(float)
    if s.empty:
        # No observations: every statistic is undefined
        outlier_rows.append((col, np.nan, np.nan, np.nan))
        continue
    q1, q3 = np.percentile(s, [25, 75])
    pct = iqr_outlier_percentage(s)  # 0.0 when the IQR is not positive
    outlier_rows.append((col, round(q1, 4), round(q3, 4), round(pct, 3)))

outlier_summary = pd.DataFrame(outlier_rows, columns=["column","Q1","Q3","%_outliers_IQR"])
display(outlier_summary.sort_values("%_outliers_IQR", ascending=False))


## CSIRO-specific Pivot / Wide-table Creation

In [None]:
EXPECTED_TARGETS = ["Dry_Green_g","Dry_Dead_g","Dry_Clover_g","GDM_g","Dry_Total_g"]

# Long -> wide: one row per image_path, one column per target_name
# (duplicates averaged), with columns forced into the EXPECTED_TARGETS order.
pivot_df = (
    data.pivot_table(
        index="image_path",
        columns="target_name",
        values="target",
        aggfunc="mean",
    )
    .reindex(columns=EXPECTED_TARGETS)
)

# Attach per-image metadata, taken from the first row seen for each image.
wanted_meta = ["Sampling_Date","State","Pre_GSHH_NDVI","Height_Ave_cm","Species"]
meta_cols = [name for name in wanted_meta if name in data.columns]
first_rows = data.drop_duplicates(subset=["image_path"])
meta = first_rows.set_index("image_path")[meta_cols]
wide_df = pivot_df.join(meta, how="left").reset_index()

print("Wide table shape:", wide_df.shape)
display(wide_df.head())


## Target Relationships Check

In [None]:
# Residual of Dry_Total_g against the sum of its three components
# (a missing component counts as 0)
green_part = wide_df["Dry_Green_g"].fillna(0)
dead_part = wide_df["Dry_Dead_g"].fillna(0)
clover_part = wide_df["Dry_Clover_g"].fillna(0)
residual_total = wide_df["Dry_Total_g"] - (green_part + dead_part + clover_part)

plt.figure(figsize=(6,4))
plt.hist(residual_total.dropna(), bins=fd_bins(residual_total))
plt.title("Residual: Dry_Total_g - sum(Green+Dead+Clover)")
plt.xlabel("Residual")
plt.ylabel("Count")
plt.show()

# Dry_Green_g (x) against GDM_g (y), rows with both values only
both_known = wide_df["GDM_g"].notna() & wide_df["Dry_Green_g"].notna()
paired = wide_df.loc[both_known]

plt.figure(figsize=(5,4))
plt.scatter(paired["Dry_Green_g"], paired["GDM_g"], s=10, alpha=0.6)
plt.title("Dry_Green_g vs GDM_g")
plt.xlabel("Dry_Green_g")
plt.ylabel("GDM_g")
plt.show()


## Optional Aggregated Plots

In [None]:
# Daily trends
if "Sampling_Date" in wide_df.columns:
    # The CSV is read without date parsing, so Sampling_Date arrives as text and
    # a dtype check for datetime64 would never pass. Convert a local copy here
    # (wide_df itself is left untouched); unparseable entries become NaT.
    sampling_dates = pd.to_datetime(wide_df["Sampling_Date"], errors="coerce")
    has_date = sampling_dates.notna()
    if has_date.any():
        daily_mean = (
            wide_df.loc[has_date]
            .groupby(sampling_dates[has_date].dt.date)[EXPECTED_TARGETS]
            .mean()
        )
        daily_mean.plot(figsize=(10,4))
        plt.title("Daily mean of targets")
        plt.xticks(rotation=45)
        plt.show()

# State-level: mean of every target per state, ranked by mean Dry_Total_g
if "State" in wide_df.columns:
    state_grp = wide_df.groupby("State")[EXPECTED_TARGETS].mean().sort_values("Dry_Total_g", ascending=False).head(10)
    state_grp["Dry_Total_g"].plot(kind="bar", figsize=(8,4))
    plt.title("Top 10 States by mean Dry_Total_g")
    plt.ylabel("Dry_Total_g")
    plt.show()


## ML Algorithm

In [None]:
# Basic ML imports
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

# Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


In [None]:
# Modelling frame: a copy of the wide table restricted to rows
# where the target value is present
TARGET = "Dry_Total_g"
df = wide_df.dropna(subset=[TARGET]).copy()

## Identify features

In [None]:
# Columns that must not be used as predictors:
#  - every column in EXPECTED_TARGETS: they are the measured target values
#    produced by the pivot, so feeding the sibling targets to a model that
#    predicts TARGET leaks the answer;
#  - image_path: the pivot index, one distinct value per row of the wide
#    table, so one-hot encoding it only memorises training rows.
leakage_cols = set(EXPECTED_TARGETS) | {TARGET}
identifier_cols = {"image_path"}

# Numeric features
numeric_features = [
    c for c in df.select_dtypes(include=[np.number]).columns
    if c not in leakage_cols
]

# Categorical features
categorical_features = [
    c for c in df.select_dtypes(exclude=[np.number]).columns
    if c not in identifier_cols
]


## Train-Test Split

In [None]:
# Assemble the design matrix (numeric columns first, then categorical) and
# hold out 20% of the rows with a fixed seed
feature_columns = [*numeric_features, *categorical_features]
X = df[feature_columns]
y = df[TARGET]

splits = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)


## Preprocessing

In [None]:
# Standardise the numeric columns; one-hot encode the categorical ones
# (categories unseen during fit are ignored at transform time)
numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])


### Model Pipelines

In [None]:
def _with_preprocessing(estimator):
    """Return a two-step pipeline: the shared preprocessor followed by ``estimator``."""
    return Pipeline([
        ('preprocess', preprocessor),
        ('model', estimator),
    ])


# One pipeline per candidate model, all built on the same preprocessing step
lr_pipeline = _with_preprocessing(LinearRegression())

rf_pipeline = _with_preprocessing(
    RandomForestRegressor(n_estimators=200, random_state=42)
)

xgb_pipeline = _with_preprocessing(
    xgb.XGBRegressor(n_estimators=200, learning_rate=0.1, random_state=42)
)


## Train & Evaluate

In [None]:
# Fit every pipeline on the training split and score it on the held-out split.
pipelines = {
    'Linear Regression': lr_pipeline,
    'Random Forest': rf_pipeline,
    'XGBoost': xgb_pipeline
}

results = []

for name, pipe in pipelines.items():
    pipe.fit(X_train, y_train)                # Train the model
    y_pred = pipe.predict(X_test)             # Predict on test set

    # Root Mean Squared Error. Take the square root of the MSE explicitly:
    # the `squared=False` keyword of mean_squared_error was deprecated and
    # later removed from scikit-learn, so relying on it fails on newer versions.
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
    r2 = r2_score(y_test, y_pred)             # R-squared

    results.append({'Model': name, 'RMSE': rmse, 'R2': r2})


In [None]:
# Tabulate the scores rounded to three decimals, best (lowest) RMSE first
results_df = (
    pd.DataFrame(results)
    .round({'RMSE': 3, 'R2': 3})
    .sort_values('RMSE')
)
display(results_df)


In [None]:
# Pick the pipeline with the lowest test RMSE from the results table instead
# of assuming that Random Forest is the best model.
best_name = results_df.loc[results_df['RMSE'].idxmin(), 'Model']
best_model = pipelines[best_name]
y_pred = best_model.predict(X_test)

plt.figure(figsize=(6,6))
plt.scatter(y_test, y_pred, alpha=0.6)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')  # perfect prediction line
plt.xlabel("Actual Dry_Total_g")
plt.ylabel("Predicted Dry_Total_g")
# Name the model that was actually plotted
plt.title(f"Actual vs Predicted ({best_name})")
plt.show()


## Feature Importance (Tree-based models)

In [None]:
# Feature importances of the fitted Random Forest (applies to tree-based models only)
rf_preprocess = rf_pipeline.named_steps['preprocess']
model = rf_pipeline.named_steps['model']

# Column names after preprocessing: numeric columns keep their names and come
# first, followed by the expanded one-hot columns
ohe_cols = rf_preprocess.named_transformers_['cat'].get_feature_names_out(categorical_features)
all_features = [*numeric_features, *ohe_cols]

importances = model.feature_importances_
feat_imp = pd.DataFrame({'feature': all_features, 'importance': importances})
feat_imp = feat_imp.sort_values('importance', ascending=False)
top_features = feat_imp.head(20)

plt.figure(figsize=(10,6))
sns.barplot(x='importance', y='feature', data=top_features)
plt.title("Top 20 Feature Importances (Random Forest)")
plt.show()
