In [None]:
# ===================== CSIRO Biomass（纵表5标签）EDA =====================
CSV_PATH = "/kaggle/input/csiro-biomass/train.csv"

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

%matplotlib inline
# Widen pandas' console/notebook rendering so wide frames are not truncated.
pd.set_option("display.max_columns", 120)
pd.set_option("display.width", 180)

# The five target names each image is expected to carry in the long table;
# the list order is also used as the column order of the wide table.
EXPECTED_TARGETS = ["Dry_Green_g", "Dry_Dead_g", "Dry_Clover_g", "GDM_g", "Dry_Total_g"]

# ----------------- Load & basic checks -----------------
df_long = pd.read_csv(CSV_PATH)
print("Shape:", df_long.shape)
display(df_long.head())

print("\n===== 列信息 =====")
print(df_long.dtypes)

print("\n===== 缺失值情况 =====")
# Per-column NaN counts (largest first) and the share of rows they represent.
missing = df_long.isna().sum().sort_values(ascending=False)
display(pd.DataFrame({"missing_count": missing, "missing_pct": (missing/len(df_long)*100).round(2)}))

# Required columns must exist; the rest of the cell indexes them directly.
required_cols = ["image_path", "target_name", "target"]
for c in required_cols:
    assert c in df_long.columns, f"缺少必须列：{c}"

# Parse dates; if parsing raises, the column is left unchanged and the error is printed.
if "Sampling_Date" in df_long.columns:
    try:
        df_long["Sampling_Date"] = pd.to_datetime(df_long["Sampling_Date"])
    except Exception as e:
        print("Sampling_Date 解析为 datetime 失败，按字符串保留。", e)

# ----------------- Group consistency: each image should own exactly 5 rows -----------------
print("\n===== 以 image_path 分组的行数分布（应为全 5）=====")
# count() tallies the non-null target_name values per image.
cnt_per_img = df_long.groupby("image_path")["target_name"].count()
display(cnt_per_img.describe())
# Images whose row count differs from 5.
bad_count = cnt_per_img[cnt_per_img != 5]
print(f"非5行的图片数：{len(bad_count)}")
if len(bad_count) > 0:
    display(bad_count.head())
print("\n===== 每张图片是否包含5个预期 target_name（应为全 True）=====")
def has_all_targets(g):
    """Return True when group ``g`` holds exactly the expected target names.

    Parameters
    ----------
    g : pandas.DataFrame
        Rows of one image; must expose a ``target_name`` column.

    Returns
    -------
    bool
        True when the set of distinct ``target_name`` values equals
        ``set(EXPECTED_TARGETS)`` (no name missing, no extra name).
    """
    return set(g["target_name"].unique()) == set(EXPECTED_TARGETS)
# Select the column explicitly so apply() never operates on the grouping
# column itself: pandas 2.2 deprecates that implicit behaviour, and an explicit
# selection behaves the same across pandas versions.
ok_mask = df_long.groupby("image_path")[["target_name"]].apply(has_all_targets)
print("缺失某些 target_name 的图片数：", int((~ok_mask).sum()))
if (~ok_mask).any():
    display(ok_mask[~ok_mask].head())

# Per-image metadata consistency (date / state / NDVI / height / species):
# only columns that actually exist in df_long are checked.
meta_cols = [c for c in ["Sampling_Date","State","Pre_GSHH_NDVI","Height_Ave_cm","Species"] if c in df_long.columns]
print("\n===== 同一 image_path 的元数据一致性检查 =====")
inconsist = {}
for c in meta_cols:
    # dropna=False counts NaN as a distinct value, so a NaN/real-value mix is flagged as well.
    nunq = df_long.groupby("image_path")[c].nunique(dropna=False)
    bad = nunq[nunq > 1]
    inconsist[c] = len(bad)
    if len(bad) > 0:
        print(f"[不一致] {c}：{len(bad)} 张图片的该字段在5行内不一致（仅展示前5）")
        display(df_long[df_long["image_path"].isin(bad.index)][["image_path","target_name",c]].head(15))
# The dict is non-empty whenever at least one metadata column was checked,
# so this also prints when every count is zero.
if inconsist:
    print("不一致计数：", inconsist)

# ----------------- Long -> wide (one row per image, the 5 targets as columns) -----------------
print("\n===== 纵转横（pivot）=====")
pivot = df_long.pivot_table(index="image_path",
                            columns="target_name",
                            values="target",
                            aggfunc="mean")  # duplicate (image, target) rows are resolved by their mean
# Keep a fixed column order; reindex adds an all-NaN column for any absent target.
pivot = pivot.reindex(columns=EXPECTED_TARGETS)
# Attach metadata, taking the first row seen for each image.
meta = df_long.drop_duplicates(subset=["image_path"]).set_index("image_path")
meta = meta[[c for c in meta_cols if c in meta.columns]]
df = pivot.join(meta, how="left").reset_index()

print("横表形状：", df.shape)
display(df.head())

# Sanity check on the number of images.
n_images = df["image_path"].nunique()
print(f"唯一图片数量（横表行数）：{n_images}")

# ----------------- Overall statistics & missingness of the five targets -----------------
print("\n===== 五目标 describe() =====")
display(df[EXPECTED_TARGETS].describe().T)

print("\n===== 五目标缺失情况 =====")
miss_t = df[EXPECTED_TARGETS].isna().sum().sort_values(ascending=False)
display(pd.DataFrame({"missing_count": miss_t, "missing_pct": (miss_t/len(df)*100).round(2)}))

# ----------------- 工具函数 -----------------
def fd_bins(x):
    """Pick a histogram bin count with the Freedman-Diaconis rule.

    Parameters
    ----------
    x : array-like
        Numeric values; NaNs are dropped before anything is computed.

    Returns
    -------
    int
        * 10 when fewer than two values remain;
        * the number of distinct values bounded to [10, 50] when the IQR is
          not positive;
        * 30 when the computed bin width is not positive;
        * otherwise ``range / bin_width`` clipped to [10, 100] and truncated.
    """
    values = pd.Series(x).dropna().astype(float)
    if values.size < 2:
        return 10

    lower_q, upper_q = np.percentile(values, [25, 75])
    spread = upper_q - lower_q
    if spread <= 0:
        # Degenerate IQR: fall back to the distinct-value count, kept within [10, 50].
        distinct = values.nunique()
        return min(max(distinct, 10), 50)

    # Freedman-Diaconis bin width: 2 * IQR * n^(-1/3).
    width = 2*spread*(len(values)**(-1/3))
    if width <= 0:
        return 30

    raw_count = (values.max()-values.min())/width
    return int(np.clip(raw_count, 10, 100))

def outlier_iqr_pct(x):
    """Percentage of values lying outside the 1.5*IQR fences.

    Parameters
    ----------
    x : array-like
        Numeric values; NaNs are dropped first.

    Returns
    -------
    float
        NaN when no values remain, 0.0 when the IQR is not positive,
        otherwise the share (0-100) of values strictly below
        ``Q1 - 1.5*IQR`` or strictly above ``Q3 + 1.5*IQR``.
    """
    values = pd.Series(x).dropna().astype(float)
    if values.empty:
        return np.nan

    first_q, third_q = np.percentile(values, [25, 75])
    spread = third_q - first_q
    if spread <= 0:
        return 0.0

    low_fence = first_q - 1.5*spread
    high_fence = third_q + 1.5*spread
    too_low = values < low_fence
    too_high = values > high_fence
    return float((too_low | too_high).mean()*100)

# ----------------- Per target: distribution / boxplot / outliers / transform hint -----------------
from math import isfinite

per_target_summary = []
for col in EXPECTED_TARGETS:
    print(f"\n================ Target: {col} ================")
    s = df[col]
    display(s.describe(percentiles=[.01,.05,.25,.5,.75,.95,.99]).to_frame().T)

    # Share of points outside the 1.5*IQR fences.
    iqr_pct = outlier_iqr_pct(s)
    print(f"IQR异常值比例: {iqr_pct:.3f}%")

    # Skewness / kurtosis; reported as NaN when fewer than two values are present.
    s_valid = s.dropna().astype(float)
    if len(s_valid) > 1:
        skew = float(s_valid.skew())
        kurt = float(s_valid.kurtosis())
    else:
        skew, kurt = np.nan, np.nan
    print(f"Skewness: {skew:.3f} | Kurtosis: {kurt:.3f}")

    # Histogram (left) + boxplot (right).
    fig = plt.figure(figsize=(12,4))
    plt.subplot(1,2,1)
    if s_valid.empty:
        plt.text(0.5,0.5,"No data",ha="center",va="center")
    else:
        plt.hist(s_valid, bins=fd_bins(s_valid))
    plt.title(f"Histogram: {col}"); plt.xlabel(col); plt.ylabel("Count")

    plt.subplot(1,2,2)
    if s_valid.empty:
        plt.text(0.5,0.5,"No data",ha="center",va="center")
    else:
        # NOTE(review): newer Matplotlib releases deprecate `labels=` in favour of
        # `tick_labels=` — confirm against the installed version before switching.
        plt.boxplot(s_valid, vert=True, labels=[col], showfliers=True)
    plt.title(f"Boxplot: {col}")
    plt.tight_layout(); plt.show()

    # Transform hint.
    # log1p only compresses a long RIGHT tail, and it is defined at 0, so it is
    # suggested for right-skewed (skew > 1) non-negative data — including
    # zero-inflated targets. Left-skewed data, or data with negative values,
    # gets the Yeo-Johnson / robust-scaling hint instead.
    strongly_skewed = isfinite(skew) and abs(skew) > 1
    if strongly_skewed and skew > 1 and s_valid.min() >= 0:
        print("提示：分布明显右偏且非负，尝试 log1p 变换有助于稳态。")
        t = np.log1p(s_valid)
        plt.figure(figsize=(6,4))
        plt.hist(t, bins=fd_bins(t))
        plt.title(f"log1p({col}) histogram"); plt.xlabel(f"log1p({col})"); plt.ylabel("Count")
        plt.show()
    elif strongly_skewed:
        print("提示：分布偏斜（左偏或包含负值），log1p 不适用，可考虑 Yeo-Johnson/稳健缩放。")

    per_target_summary.append({"target": col, "iqr_outlier_pct": iqr_pct, "skew": skew})

print("\n===== 单目标概览汇总 =====")
display(pd.DataFrame(per_target_summary).sort_values("iqr_outlier_pct", ascending=False))

# ----------------- Correlation between the five targets & scatter matrix -----------------
corr_targets = df[EXPECTED_TARGETS].corr(numeric_only=True)
plt.figure(figsize=(5,4))
# No vmin/vmax is passed, so the colour scale follows the data range of the matrix.
im = plt.imshow(corr_targets, interpolation="nearest")
plt.title("Correlation heatmap (5 targets)")
plt.xticks(range(len(EXPECTED_TARGETS)), EXPECTED_TARGETS, rotation=45)
plt.yticks(range(len(EXPECTED_TARGETS)), EXPECTED_TARGETS)
plt.colorbar(im, fraction=0.046, pad=0.04)
plt.tight_layout(); plt.show()

from pandas.plotting import scatter_matrix
# Only images with all five targets present are drawn (rows with any NaN are dropped).
_ = scatter_matrix(df[EXPECTED_TARGETS].dropna(), figsize=(10,10), diagonal="hist")
plt.suptitle("Scatter Matrix of 5 Targets", y=1.02)
plt.show()

# ----------------- Relationship with key covariates (NDVI / height) -----------------
for feat in ["Pre_GSHH_NDVI", "Height_Ave_cm"]:
    if feat in df.columns:
        print(f"\n===== {feat} 分布 =====")
        f = df[feat].dropna().astype(float)
        plt.figure(figsize=(6,4))
        if f.empty:
            plt.text(0.5,0.5,"No data",ha="center",va="center")
        else:
            plt.hist(f, bins=fd_bins(f))
        plt.title(f"Histogram: {feat}"); plt.xlabel(feat); plt.ylabel("Count")
        plt.show()

        # feat vs. each target; pairs where either side is NaN are masked out,
        # and a target with no overlapping data is skipped without a figure.
        for col in EXPECTED_TARGETS:
            xs = df[feat].astype(float); ys = df[col].astype(float)
            mask = xs.notna() & ys.notna()
            if mask.sum() == 0: 
                continue
            plt.figure(figsize=(5,4))
            plt.scatter(xs[mask], ys[mask], s=10, alpha=0.6)
            plt.title(f"{feat} vs {col}")
            plt.xlabel(feat); plt.ylabel(col)
            plt.show()

# ----------------- Time / state aggregation plots (optional) -----------------
# Use pandas' own dtype predicate: np.issubdtype() raises TypeError when handed a
# pandas extension dtype (tz-aware datetime, string, category), which is what
# the column can be when date parsing did not produce a plain datetime64 column.
if "Sampling_Date" in df.columns and pd.api.types.is_datetime64_any_dtype(df["Sampling_Date"]):
    # Mean of every target per calendar day.
    daily = df.groupby(df["Sampling_Date"].dt.date)[EXPECTED_TARGETS].mean()
    plt.figure(figsize=(10,4))
    for col in EXPECTED_TARGETS:
        plt.plot(daily.index, daily[col], label=col)
    plt.xticks(rotation=45)
    plt.title("Daily mean of targets"); plt.legend(); plt.tight_layout(); plt.show()

if "State" in df.columns:
    # Per-state mean and non-null count of every target.
    grp = df.groupby("State")[EXPECTED_TARGETS].agg(["mean","count"])
    # Keep a state only when its smallest per-target count is at least 5.
    grp = grp[grp.xs("count", level=1, axis=1).min(axis=1) >= 5]
    print("\n===== 按州的目标均值（展示样本数>=5的州）=====")
    display(grp)
    # Simple bar chart, using Dry_Total_g as the example target.
    if ("Dry_Total_g" in EXPECTED_TARGETS) and (not grp.empty):
        means = grp[("Dry_Total_g","mean")].sort_values(ascending=False).head(10)
        plt.figure(figsize=(8,4))
        means.plot(kind="bar")
        plt.title("Top-10 States by mean Dry_Total_g")
        plt.ylabel("mean Dry_Total_g"); plt.xlabel("State")
        plt.tight_layout(); plt.show()

# ----------------- Relationship checks: Total vs. sum of components; GDM vs. Green -----------------
if all(c in df.columns for c in ["Dry_Total_g","Dry_Green_g","Dry_Dead_g","Dry_Clover_g"]):
    # skipna=False keeps the sum NaN whenever any component is missing. Filling a
    # missing component with 0 would report a residual equal to the missing
    # amount, i.e. a spurious violation of Total = Green + Dead + Clover.
    comp_sum = df[["Dry_Green_g", "Dry_Dead_g", "Dry_Clover_g"]].sum(axis=1, skipna=False)
    resid_total = df["Dry_Total_g"] - comp_sum
    print("\n===== Dry_Total_g 与 (Green+Dead+Clover) 关系 =====")
    display(resid_total.describe(percentiles=[.01,.05,.25,.5,.75,.95,.99]).to_frame("residual"))
    plt.figure(figsize=(6,4))
    # Rows with an undefined residual (missing total or component) are left out of the histogram.
    plt.hist(resid_total.dropna(), bins=fd_bins(resid_total))
    plt.title("Residual: Dry_Total_g - (Green+Dead+Clover)")
    plt.xlabel("residual"); plt.ylabel("Count"); plt.show()

# Scatter of GDM_g against Dry_Green_g, restricted to rows where both are present.
if all(c in df.columns for c in ["GDM_g","Dry_Green_g"]):
    plt.figure(figsize=(5,4))
    mask = df["GDM_g"].notna() & df["Dry_Green_g"].notna()
    plt.scatter(df.loc[mask,"Dry_Green_g"], df.loc[mask,"GDM_g"], s=10, alpha=0.6)
    plt.title("Dry_Green_g vs GDM_g")
    plt.xlabel("Dry_Green_g"); plt.ylabel("GDM_g"); plt.show()

# Closing hint: the printed text tells the reader how to export the wide table `df`.
print("\n==== 结束：若你需要把横表 df 导出用于后续建模，可： df.to_csv('train_wide.csv', index=False) ====")


In [None]:

# ==== Config ====
CSV_PATH = "/kaggle/input/csiro-biomass/train.csv"  # when running locally, change this to your own file path, e.g. "/mnt/data/train.csv"
OUT_DIR = "/kaggle/working/eda_plots"  # directory where figures are saved; set to None to disable saving

# Kaggle/本地 Matplotlib 后端设置（通常无需更改）
import matplotlib
%matplotlib inline


In [None]:

# ==== Imports ====
import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Create the figure output directory (and any missing parents) up front;
# nothing is created when OUT_DIR is falsy (e.g. None).
if OUT_DIR:
    Path(OUT_DIR).mkdir(parents=True, exist_ok=True)


In [None]:

# ==== Load ====
# Reads the raw CSV as-is into `df`.
# NOTE(review): this rebinds the name `df`, which the first cell of this file
# uses for the pivoted one-row-per-image table; the cells below work on the raw CSV rows.
df = pd.read_csv(CSV_PATH)
print("Shape:", df.shape)
df.head(10)


In [None]:

# ==== Basic info ====
# deep=True makes memory_usage account for the contents of object columns; bytes -> MiB.
mem_mb = df.memory_usage(deep=True).sum() / (1024**2)
print(f"Rows: {len(df)}  |  Cols: {df.shape[1]}  |  Memory: {mem_mb:.3f} MB")

# Column dtypes, listed alphabetically by column name.
dtypes = df.dtypes.sort_index()
display(pd.DataFrame({"dtype": dtypes}))


In [None]:

# ==== Missingness ====
# NaN count per column (largest first) and the same figure as a percentage of all rows.
missing = df.isna().sum().sort_values(ascending=False)
missing_pct = (missing / len(df) * 100).round(2)
missing_df = pd.DataFrame({"missing_count": missing, "missing_pct": missing_pct})
display(missing_df)


In [None]:

# ==== Column type splits ====
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
# Numeric columns with at most 10 distinct non-null values are treated as categorical.
low_card_num_as_cat = [c for c in numeric_cols if df[c].nunique(dropna=True) <= 10]
categorical_cols = [c for c in df.columns if c not in numeric_cols]
# categorical_like = non-numeric columns + low-cardinality numeric columns (sorted, de-duplicated).
categorical_like = sorted(set(categorical_cols + low_card_num_as_cat))
# numeric_strict = numeric columns that were not reclassified as categorical.
numeric_strict = [c for c in numeric_cols if c not in low_card_num_as_cat]

print("numeric_strict:", len(numeric_strict))
print("categorical_like:", len(categorical_like))


In [None]:

# ==== Descriptive statistics ====
# Numeric summary, one row per column.
if numeric_cols:
    display(df[numeric_cols].describe().T)

# Categorical summary: distinct count, most frequent value and its frequency.
# Columns holding only NaN get NaN for "top" and "freq_of_top".
if categorical_like:
    desc_cat = pd.DataFrame({
        "unique": [df[c].nunique(dropna=True) for c in categorical_like],
        "top": [df[c].mode(dropna=True).iloc[0] if df[c].dropna().size else np.nan for c in categorical_like],
        "freq_of_top": [df[c].value_counts(dropna=True).iloc[0] if df[c].dropna().size else np.nan for c in categorical_like],
    }, index=categorical_like).sort_index()
    display(desc_cat)


In [None]:

# ==== Helper functions ====
def maybe_save(fig_title):
    """Optionally write the current figure to ``OUT_DIR``, then show and close it.

    Parameters
    ----------
    fig_title : str
        Used to build the file name: every character that is neither
        alphanumeric nor one of ``-``, ``_``, ``.`` is replaced by ``_`` and
        ``.png`` is appended.

    Nothing is written when ``OUT_DIR`` is falsy; the figure is shown and
    closed in either case.
    """
    if OUT_DIR:
        keep_as_is = "-_."
        safe_chars = []
        for ch in fig_title:
            if ch.isalnum() or ch in keep_as_is:
                safe_chars.append(ch)
            else:
                safe_chars.append("_")
        target = Path(OUT_DIR) / f"{''.join(safe_chars)}.png"
        plt.savefig(target, bbox_inches="tight", dpi=150)
    plt.show()
    plt.close()

def freedman_diaconis_bins(series):
    """Pick a histogram bin count with the Freedman-Diaconis rule.

    Parameters
    ----------
    series : pandas.Series or array-like
        Numeric values. NaNs are dropped here, so callers no longer have to
        pre-clean the input (a NaN used to propagate to ``int(nan)`` and raise
        ``ValueError``).

    Returns
    -------
    int
        * 10 when fewer than two values remain;
        * the number of distinct values, capped at 50, when the IQR is not
          positive;
        * 30 when the computed bin width is not positive;
        * otherwise ``range / bin_width`` clipped to [10, 100] and truncated.
    """
    # Coerce to a Series first so lists / ndarrays are accepted as well.
    s = pd.Series(series).dropna().astype(float)
    if s.size < 2:
        return 10
    q1, q3 = np.percentile(s, [25, 75])
    iqr = q3 - q1
    if iqr <= 0:
        # Degenerate IQR (e.g. mostly constant data): one bin per distinct value.
        return min(s.nunique(), 50)
    # Freedman-Diaconis bin width: 2 * IQR * n^(-1/3).
    bin_width = 2 * iqr * (len(s) ** (-1/3))
    if bin_width <= 0:
        return 30
    return int(np.clip((s.max() - s.min()) / bin_width, 10, 100))


In [None]:

# ==== Histograms for numeric_strict ====
# One figure per column; NaNs are dropped before binning, and a column with no
# data gets a placeholder text instead of a histogram.
for col in numeric_strict:
    plt.figure()
    s = df[col].dropna().astype(float)
    if s.empty:
        plt.text(0.5, 0.5, f"No data for {col}", ha="center")
        plt.title(f"Histogram: {col}")
    else:
        bins = freedman_diaconis_bins(s)
        plt.hist(s, bins=bins)
        plt.title(f"Histogram: {col}")
        plt.xlabel(col)
        plt.ylabel("Count")
    maybe_save(f"hist_{col}")


In [None]:

# ==== Boxplots for numeric_strict (univariate) ====
# One figure per column, outliers (fliers) shown.
for col in numeric_strict:
    plt.figure()
    s = df[col].dropna().astype(float)
    if s.empty:
        plt.text(0.5, 0.5, f"No data for {col}", ha="center")
        plt.title(f"Boxplot: {col}")
    else:
        # NOTE(review): newer Matplotlib releases deprecate `labels=` in favour of
        # `tick_labels=` — confirm against the installed version before switching.
        plt.boxplot(s, vert=True, labels=[col], showfliers=True)
        plt.title(f"Boxplot: {col}")
        plt.ylabel(col)
    maybe_save(f"box_{col}")


In [None]:

# ==== Categorical-like bar charts (Top 20 levels) ====
# value_counts sorts by frequency, so head(20) keeps the 20 most common levels;
# dropna=False means NaN is counted as a level of its own.
for col in categorical_like:
    plt.figure()
    vc = df[col].astype("category").value_counts(dropna=False).head(20)
    vc.plot(kind="bar")  # uses matplotlib backend
    plt.title(f"Top 20 categories: {col}")
    plt.xlabel(col)
    plt.ylabel("Count")
    maybe_save(f"bar_{col}")


In [None]:

# ==== Correlation heatmap for numeric_strict ====
# A correlation matrix needs at least two columns; otherwise only a message is printed.
if len(numeric_strict) >= 2:
    plt.figure()
    corr = df[numeric_strict].corr(numeric_only=True)
    # No vmin/vmax is passed, so the colour scale follows the data range of the matrix.
    im = plt.imshow(corr, interpolation="nearest")
    plt.title("Correlation heatmap (numeric features)")
    plt.xticks(range(len(numeric_strict)), numeric_strict, rotation=90)
    plt.yticks(range(len(numeric_strict)), numeric_strict)
    plt.colorbar(im, fraction=0.046, pad=0.04)
    maybe_save("correlation_heatmap")
else:
    print("Not enough numeric columns for correlation heatmap.")


In [None]:

# ==== Infer target column (heuristic) ====
def infer_target_column(df):
    """Guess which numeric column of ``df`` is the prediction target.

    A column is a candidate when it is numeric and its lower-cased name either

    * contains one of the keyword fragments ``target``, ``label``,
      ``biomass``, ``agb``, ``aboveground`` or ``yield``, or
    * has ``y`` as a stand-alone token (``y``, ``y_true``, ``target_y``).

    ``y`` is deliberately NOT matched as a substring: doing so made every
    column containing the letter "y" (``year``, ``city_id``, ``density``) a
    candidate.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. Column labels need not be strings.

    Returns
    -------
    column label or None
        The candidate with the fewest missing values (the first one in
        column order on ties), or None when no column qualifies.
    """
    import re  # local import: only this helper needs it

    # "agb" and "biomass" already cover longer spellings such as "agb_t_ha" or "biomasstpa".
    fragments = ("target", "label", "biomass", "agb", "aboveground", "yield")
    numeric_cols = set(df.select_dtypes(include=[np.number]).columns)

    candidates = []
    for col in df.columns:
        if col not in numeric_cols:
            continue
        name = str(col).lower()  # str() tolerates non-string labels (e.g. ints)
        tokens = re.split(r"[^a-z0-9]+", name)
        if "y" in tokens or any(frag in name for frag in fragments):
            candidates.append(col)

    if not candidates:
        return None
    # min() returns the first minimal element, so ties keep column order.
    return min(candidates, key=lambda c: df[c].isna().sum())

# Heuristic guess; TARGET is None when no numeric column matched. The cells
# below skip their target-dependent plots in that case.
TARGET = infer_target_column(df)
print("Inferred target:", TARGET)


In [None]:

# ==== Target distribution ====
if TARGET is not None and TARGET in df.columns:
    plt.figure()
    y = df[TARGET]
    # dtype kinds i / f / u = signed int / float / unsigned int -> histogram;
    # anything else is drawn as a bar chart of level counts.
    if y.dtype.kind in "ifu":
        s = y.dropna().astype(float)
        if s.size:
            # Fixed 30 bins here (freedman_diaconis_bins is not used for this plot).
            plt.hist(s, bins=30)
        else:
            plt.text(0.5, 0.5, "No target data", ha="center")
        plt.xlabel(TARGET)
        plt.ylabel("Count")
    else:
        vc = y.astype("category").value_counts(dropna=False)
        vc.plot(kind="bar")
        plt.xlabel(TARGET)
        plt.ylabel("Count")
    plt.title(f"Target distribution: {TARGET}")
    maybe_save(f"target_dist_{TARGET}")
else:
    print("No obvious target inferred. You can set TARGET = 'your_target_column' and rerun related cells.")


In [None]:

# ==== Numeric predictors vs numeric target: scatter ====
# Runs only for a numeric target; one figure per numeric_strict column other than the target.
if TARGET is not None and TARGET in df.columns and df[TARGET].dtype.kind in "ifu":
    for col in [c for c in numeric_strict if c != TARGET]:
        plt.figure()
        xs = df[col].astype(float)
        ys = df[TARGET].astype(float)
        # Keep only rows where both the predictor and the target are present.
        mask = xs.notna() & ys.notna()
        if mask.sum() == 0:
            plt.text(0.5, 0.5, "No overlapping data", ha="center")
        else:
            plt.scatter(xs[mask], ys[mask], s=10, alpha=0.6)
        plt.title(f"{col} vs {TARGET}")
        plt.xlabel(col)
        plt.ylabel(TARGET)
        maybe_save(f"scatter_{col}_vs_{TARGET}")
else:
    print("Skip scatter: no numeric target detected.")


In [None]:

# ==== Categorical-like predictors vs numeric target: boxplots ====
# Runs only for a numeric target and when at least one categorical-like column exists.
if TARGET is not None and TARGET in df.columns and df[TARGET].dtype.kind in "ifu" and len(categorical_like) > 0:
    for col in categorical_like:
        plt.figure()
        # Rows missing either the level or the target are dropped.
        tmp = df[[col, TARGET]].dropna()
        if tmp.empty:
            plt.text(0.5, 0.5, "No data", ha="center")
        else:
            # The 12 most frequent levels, one box per level, fliers hidden.
            top_levels = tmp[col].value_counts().head(12).index
            tmp2 = tmp[tmp[col].isin(top_levels)]
            data = [tmp2.loc[tmp2[col] == lvl, TARGET].values for lvl in top_levels]
            plt.boxplot(data, labels=[str(l) for l in top_levels], showfliers=False)
        plt.title(f"{TARGET} by {col} (top 12 levels)")
        plt.xlabel(col)
        plt.ylabel(TARGET)
        maybe_save(f"box_{TARGET}_by_{col}")
else:
    print("Skip categorical vs target boxplots.")


In [None]:

# ==== Outlier overview via IQR (numeric_strict) ====
# NOTE(review): `isfinite` is imported here but not referenced anywhere in this cell.
from math import isfinite

# One (column, Q1, Q3, % outliers) tuple per numeric_strict column.
rows = []
for col in numeric_strict:
    s = df[col].astype(float).dropna()
    if s.empty:
        # No data: every statistic is NaN.
        rows.append((col, np.nan, np.nan, np.nan))
        continue
    q1, q3 = np.percentile(s, [25, 75])
    iqr = q3 - q1
    if iqr <= 0:
        # Degenerate IQR: report 0% outliers (Q1/Q3 are stored unrounded in this branch).
        rows.append((col, q1, q3, 0.0))
        continue
    # Standard fences at 1.5 * IQR beyond the quartiles.
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    pct_out = ((s < lower) | (s > upper)).mean() * 100.0
    rows.append((col, round(q1, 4), round(q3, 4), round(pct_out, 3)))

outlier_df = pd.DataFrame(rows, columns=["column", "Q1", "Q3", "%_outliers_IQR"])
display(outlier_df.sort_values("%_outliers_IQR", ascending=False))
