In [None]:
# =====================================================================
# CSIRO PASTURE BIOMASS - COMPREHENSIVE EDA WITH DEEP INSIGHTS
# =====================================================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
from scipy import stats
from scipy.stats import pearsonr, spearmanr, normaltest, shapiro, anderson
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
import itertools

warnings.filterwarnings('ignore')

# Enhanced plotting style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 10
%matplotlib inline

# Configuration
CSV_PATH = "/kaggle/input/csiro-biomass/train.csv"
EXPECTED_TARGETS = ["Dry_Green_g", "Dry_Dead_g", "Dry_Clover_g", "GDM_g", "Dry_Total_g"]
TARGET_WEIGHTS = {"Dry_Green_g": 0.1, "Dry_Dead_g": 0.1, "Dry_Clover_g": 0.1, "GDM_g": 0.2, "Dry_Total_g": 0.5}

pd.set_option("display.max_columns", 150)
pd.set_option("display.width", 200)
pd.set_option("display.float_format", '{:.4f}'.format)

print("="*80)
print("CSIRO PASTURE BIOMASS PREDICTION - EXTENSIVE EDA")
print("="*80)

In [None]:
# Read the long-format training table from CSV_PATH.
df_long = pd.read_csv(CSV_PATH)

# In-memory footprint in MiB, counting object (string) columns at real size.
memory_mb = df_long.memory_usage(deep=True).sum() / 1024**2

print(f"\n{'='*80}")
print("1. DATA STRUCTURE OVERVIEW")
print(f"{'='*80}")
print(f"Shape: {df_long.shape}")
print(f"Memory usage: {memory_mb:.2f} MB")
print(f"\nFirst few rows:")
display(df_long.head(10))


def _first_non_null(series):
    """Return the first non-missing value of ``series``, or None if it has none."""
    non_null = series.dropna()
    return non_null.iloc[0] if non_null.size > 0 else None


print(f"\n{'='*80}")
print("2. COLUMN TYPES & MISSING VALUES")
print(f"{'='*80}")

# One profile row per column: dtype, completeness, cardinality and an example value.
null_counts = df_long.isna().sum()
info_df = pd.DataFrame({
    'dtype': df_long.dtypes,
    'non_null': df_long.count(),
    'null_count': null_counts,
    'null_pct': (null_counts / len(df_long) * 100).round(2),
    'unique_values': df_long.nunique(),
    'sample_value': [_first_non_null(df_long[col]) for col in df_long.columns],
})
display(info_df)

# Parse dates and derive calendar features.
if "Sampling_Date" in df_long.columns:
    raw_dates = df_long["Sampling_Date"]
    parsed_dates = pd.to_datetime(raw_dates, errors='coerce')

    # errors='coerce' turns unparseable entries into NaT without any message;
    # report how many values were lost so the coercion is not silent.
    n_unparsed = int(parsed_dates.isna().sum() - raw_dates.isna().sum())
    if n_unparsed > 0:
        print(f"\nWARNING: {n_unparsed} Sampling_Date values could not be parsed and were set to NaT")

    df_long["Sampling_Date"] = parsed_dates
    df_long['Year'] = df_long['Sampling_Date'].dt.year
    df_long['Month'] = df_long['Sampling_Date'].dt.month
    df_long['Quarter'] = df_long['Sampling_Date'].dt.quarter
    # Month -> season with December-February as Summer (Southern-Hemisphere convention).
    df_long['Season'] = df_long['Month'].map({12: 'Summer', 1: 'Summer', 2: 'Summer',
                                               3: 'Autumn', 4: 'Autumn', 5: 'Autumn',
                                               6: 'Winter', 7: 'Winter', 8: 'Winter',
                                               9: 'Spring', 10: 'Spring', 11: 'Spring'})
    print("\n‚úì Date features extracted: Year, Month, Quarter, Season")

In [None]:
print(f"\n{'='*80}")
print("3. IMAGE-LEVEL DATA QUALITY CHECKS")
print(f"{'='*80}")

# Count how many long-format rows each image contributes.
rows_per_image = df_long.groupby('image_path').size()
n_unique_images = df_long['image_path'].nunique()

print(f"\nUnique images: {n_unique_images}")
print(f"Expected total rows (5 per image): {n_unique_images * 5}")
print(f"Actual total rows: {len(df_long)}")
print(f"\nRows per image distribution:")
print(rows_per_image.value_counts().sort_index())

# Any image whose row count differs from 5 is listed in full for inspection.
problematic_images = rows_per_image[rows_per_image != 5]
if problematic_images.empty:
    print("\n‚úì All images have exactly 5 rows")
else:
    print(f"\n‚ö†Ô∏è  WARNING: {len(problematic_images)} images don't have exactly 5 rows!")
    is_problematic_row = df_long['image_path'].isin(problematic_images.index)
    display(df_long[is_problematic_row].sort_values('image_path'))

# Check target completeness
print(f"\n{'='*80}")
print("4. TARGET COMPLETENESS CHECK")
print(f"{'='*80}")

def check_targets_complete(group):
    """Return True when the group's ``target_name`` values are exactly EXPECTED_TARGETS.

    ``group`` is the sub-frame of ``df_long`` for one image. The comparison is
    set-based, so row order and repeated names do not matter.
    """
    return set(group['target_name'].values) == set(EXPECTED_TARGETS)

complete_mask = df_long.groupby('image_path').apply(check_targets_complete)
incomplete_images = complete_mask[~complete_mask]

if len(incomplete_images) > 0:
    print(f"‚ö†Ô∏è  {len(incomplete_images)} images missing some targets!")
    expected_names = set(EXPECTED_TARGETS)
    # Only the first five offenders are listed to keep the output short.
    for img in incomplete_images.index[:5]:
        img_data = df_long[df_long['image_path'] == img]
        present_names = set(img_data['target_name'].values)
        missing = expected_names - present_names
        # An image can also fail because it carries a name outside EXPECTED_TARGETS;
        # report that too, otherwise the line would read "missing set()".
        unexpected = present_names - expected_names
        detail = f"missing {missing}"
        if unexpected:
            detail += f", unexpected {unexpected}"
        print(f"  {img}: {detail}")
else:
    print("‚úì All images have all 5 target types")

# Metadata consistency check: every row of the same image should carry the same metadata.
print(f"\n{'='*80}")
print("5. METADATA CONSISTENCY ACROSS SAME IMAGE")
print(f"{'='*80}")

meta_cols = ['Sampling_Date', 'State', 'Pre_GSHH_NDVI', 'Height_Ave_cm', 'Species']
meta_cols = [c for c in meta_cols if c in df_long.columns]

# Column name -> number of images whose rows disagree on that column.
inconsistencies = {}
for col in meta_cols:
    # dropna=False counts NaN as a distinct value, so an image that has a value in
    # some rows and a missing value in others is flagged instead of passing silently.
    unique_per_image = df_long.groupby('image_path')[col].nunique(dropna=False)
    inconsistent = unique_per_image[unique_per_image > 1]
    inconsistencies[col] = len(inconsistent)

    if len(inconsistent) > 0:
        print(f"\n‚ö†Ô∏è  {col}: {len(inconsistent)} images have inconsistent values")
        # Show one offending image as an example.
        sample_img = inconsistent.index[0]
        display(df_long[df_long['image_path'] == sample_img][['image_path', 'target_name', col]])
    else:
        print(f"‚úì {col}: All images have consistent values")

# Flag every row whose (image, target) key occurs more than once.
print(f"\n{'='*80}")
print("6. DUPLICATE DETECTION")
print(f"{'='*80}")

duplicate_key = ['image_path', 'target_name']
# keep=False marks all members of a duplicated group, not only the repeats.
duplicates = df_long.duplicated(subset=duplicate_key, keep=False)
n_duplicates = duplicates.sum()

if n_duplicates == 0:
    print("‚úì No duplicate (image, target) pairs found")
else:
    print(f"‚ö†Ô∏è  Found {n_duplicates} duplicate (image, target) pairs!")
    display(df_long[duplicates].sort_values(duplicate_key))

In [None]:
print(f"\n{'='*80}")
print("7. CONVERTING TO WIDE FORMAT & FEATURE ENGINEERING")
print(f"{'='*80}")


def _inf_to_nan(values):
    """Return ``values`` with +inf/-inf replaced by NaN."""
    return values.replace([np.inf, -np.inf], np.nan)


# One row per image, one column per target; repeated (image, target) rows are averaged.
targets_wide = df_long.pivot_table(
    index='image_path',
    columns='target_name',
    values='target',
    aggfunc='mean',
)
targets_wide = targets_wide.reindex(columns=EXPECTED_TARGETS)

# Image-level metadata is taken from the first row seen for each image.
meta = df_long.drop_duplicates('image_path').set_index('image_path')
candidate_meta_cols = ['Sampling_Date', 'State', 'Species', 'Pre_GSHH_NDVI',
                       'Height_Ave_cm', 'Year', 'Month', 'Quarter', 'Season']
meta_cols_to_add = [c for c in candidate_meta_cols if c in meta.columns]
df = targets_wide.join(meta[meta_cols_to_add]).reset_index()

print(f"Wide format shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

# FEATURE ENGINEERING: derived features
print("\nüîß Creating derived features...")

# Each component as a percentage of total dry biomass.
for pct_col, grams_col in (
    ('Green_Pct', 'Dry_Green_g'),
    ('Dead_Pct', 'Dry_Dead_g'),
    ('Clover_Pct', 'Dry_Clover_g'),
    ('GDM_Pct', 'GDM_g'),
):
    df[pct_col] = _inf_to_nan(df[grams_col] / df['Dry_Total_g'] * 100)

# Green relative to the other components; +1 in the denominator avoids division by zero.
for ratio_col, denominator_col in (
    ('Green_Dead_Ratio', 'Dry_Dead_g'),
    ('Green_Clover_Ratio', 'Dry_Clover_g'),
):
    df[ratio_col] = _inf_to_nan(df['Dry_Green_g'] / (df[denominator_col] + 1))

# Green + Dead + Clover compared against the reported total.
df['Component_Sum'] = df[['Dry_Green_g', 'Dry_Dead_g', 'Dry_Clover_g']].sum(axis=1, skipna=False)
df['Sum_Error'] = df['Component_Sum'] - df['Dry_Total_g']
df['Sum_Error_Pct'] = _inf_to_nan(df['Sum_Error'] / df['Dry_Total_g'] * 100)

# Height-normalised features, created only when the source columns exist.
has_height = 'Height_Ave_cm' in df.columns
if has_height and 'Pre_GSHH_NDVI' in df.columns:
    df['NDVI_per_Height'] = _inf_to_nan(df['Pre_GSHH_NDVI'] / (df['Height_Ave_cm'] + 1))

if has_height:
    df['Biomass_per_Height'] = _inf_to_nan(df['Dry_Total_g'] / (df['Height_Ave_cm'] + 1))

# Everything that is neither image_path, a target, nor joined metadata counts as derived.
n_derived = df.shape[1] - len(EXPECTED_TARGETS) - len(meta_cols_to_add) - 1
print(f"‚úì Created {n_derived} derived features")
print(f"\nNew shape: {df.shape}")
display(df.head())

In [None]:
print(f"\n{'='*80}")
print("8. PHYSICAL CONSTRAINT VALIDATION")
print(f"{'='*80}")

# Summary of (Green + Dead + Clover) - Total, in grams.
sum_error = df['Sum_Error']
print("\nüìê Component Sum Validation:")
print(f"  Mean sum error: {sum_error.mean():.4f} g")
print(f"  Median sum error: {sum_error.median():.4f} g")
print(f"  Std sum error: {sum_error.std():.4f} g")
print(f"  Max absolute error: {sum_error.abs().max():.4f} g")

# Histograms of the absolute and the relative error, each with a zero reference line.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
error_panels = (
    (axes[0], 'Sum_Error', 'Component Sum - Dry_Total_g (grams)', 'Distribution of Sum Error'),
    (axes[1], 'Sum_Error_Pct', 'Sum Error (%)', 'Distribution of Sum Error (Percentage)'),
)
for panel_ax, error_col, x_label, panel_title in error_panels:
    panel_ax.hist(df[error_col].dropna(), bins=50, edgecolor='black')
    panel_ax.axvline(0, color='red', linestyle='--', label='Perfect Match')
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('Frequency')
    panel_ax.set_title(panel_title)
    panel_ax.legend()

plt.tight_layout()
plt.show()

# Samples whose components disagree with the total by more than 10 g.
large_errors = df.loc[sum_error.abs() > 10].copy()
if not large_errors.empty:
    print(f"\n‚ö†Ô∏è  {len(large_errors)} samples have |sum_error| > 10g:")
    columns_to_show = ['image_path', 'Dry_Green_g', 'Dry_Dead_g', 'Dry_Clover_g',
                       'Component_Sum', 'Dry_Total_g', 'Sum_Error']
    display(large_errors[columns_to_show].head(10))

# Check each target column for negative values.
print(f"\nüîç Negative Value Check:")
for col in EXPECTED_TARGETS:
    n_negative = (df[col] < 0).sum()
    if n_negative == 0:
        print(f"  ‚úì {col}: No negative values")
    else:
        print(f"  ‚ö†Ô∏è  {col}: {n_negative} negative values")

# Check GDM vs Green
# Differences smaller than this many grams are treated as "equal". The same
# tolerance is used for all three counts so the categories do not overlap
# (previously a +0.05 g difference was counted as both ">" and "=").
GDM_GREEN_TOLERANCE_G = 0.1

df['GDM_Green_Diff'] = df['GDM_g'] - df['Dry_Green_g']
gdm_green_diff = df['GDM_Green_Diff']
n_gdm_greater = (gdm_green_diff >= GDM_GREEN_TOLERANCE_G).sum()
n_gdm_equal = (gdm_green_diff.abs() < GDM_GREEN_TOLERANCE_G).sum()
n_gdm_less = (gdm_green_diff <= -GDM_GREEN_TOLERANCE_G).sum()

print(f"\nüìä GDM vs Dry_Green relationship:")
print(f"  GDM > Green in {n_gdm_greater} samples")
print(f"  GDM = Green in {n_gdm_equal} samples")
print(f"  GDM < Green in {n_gdm_less} samples")
print(f"  Mean difference: {gdm_green_diff.mean():.4f} g")

In [None]:
print(f"\n{'='*80}")
print("9. COMPREHENSIVE STATISTICAL ANALYSIS OF TARGETS")
print(f"{'='*80}")

# One summary record per target, built from its non-missing values.
extended_stats = []
for target in EXPECTED_TARGETS:
    s = df[target].dropna()
    if s.empty:
        continue

    # Quartiles and the 1.5*IQR fences used to count outliers.
    Q1, Q3 = s.quantile(0.25), s.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = ((s < lower_bound) | (s > upper_bound)).sum()

    n_values = len(s)
    n_zeros = (s == 0).sum()
    mean_value = s.mean()

    # Key order fixes the column order of the resulting table.
    stats_dict = {
        'Target': target,
        'Weight': TARGET_WEIGHTS[target],
        'Count': n_values,
        'Missing': df[target].isna().sum(),
        'Mean': mean_value,
        'Median': s.median(),
        'Std': s.std(),
        'Min': s.min(),
        'Max': s.max(),
        'Q1': Q1,
        'Q3': Q3,
        'IQR': IQR,
        'Range': s.max() - s.min(),
        # Coefficient of variation in percent; undefined when the mean is zero.
        'CV': (s.std() / mean_value * 100) if mean_value != 0 else np.nan,
        'Skewness': s.skew(),
        'Kurtosis': s.kurtosis(),
        'Outliers_IQR': outliers,
        'Outliers_Pct': (outliers / n_values * 100),
        'Zeros': n_zeros,
        'Zeros_Pct': n_zeros / n_values * 100,
    }
    extended_stats.append(stats_dict)

extended_stats_df = pd.DataFrame(extended_stats)
display(extended_stats_df.round(4))

# Normality tests
print(f"\n{'='*80}")
print("10. NORMALITY TESTS FOR TARGETS")
print(f"{'='*80}")

# Shapiro-Wilk is only run below this sample size (same cut-off as before).
SHAPIRO_MAX_N = 5000
# D'Agostino-Pearson relies on the skewness test, which needs at least 8 observations.
NORMALTEST_MIN_N = 8


def _p_value_verdict(p_value, alpha=0.05):
    """Map a p-value to 'Yes'/'No', or 'N/A' when the test was not run (NaN).

    Without the NaN case a skipped test would be reported as 'No', because
    ``nan > alpha`` is False.
    """
    if np.isnan(p_value):
        return 'N/A'
    return 'Yes' if p_value > alpha else 'No'


normality_results = []
for target in EXPECTED_TARGETS:
    s = df[target].dropna()
    if len(s) > 3:
        # Shapiro-Wilk test (good for n < 5000)
        if len(s) < SHAPIRO_MAX_N:
            shapiro_stat, shapiro_p = shapiro(s)
        else:
            shapiro_stat, shapiro_p = np.nan, np.nan

        # D'Agostino-Pearson test; skipped for samples too small for the skewness test.
        if len(s) >= NORMALTEST_MIN_N:
            k2_stat, k2_p = normaltest(s)
        else:
            k2_stat, k2_p = np.nan, np.nan

        # Anderson-Darling test; critical_values[2] is the 5% level for the normal case.
        anderson_result = anderson(s)
        anderson_critical_5 = anderson_result.critical_values[2]

        normality_results.append({
            'Target': target,
            'Shapiro_Stat': shapiro_stat,
            'Shapiro_p': shapiro_p,
            'Shapiro_Normal': _p_value_verdict(shapiro_p),
            'K2_Stat': k2_stat,
            'K2_p': k2_p,
            'K2_Normal': _p_value_verdict(k2_p),
            'Anderson_Stat': anderson_result.statistic,
            'Anderson_Critical_5%': anderson_critical_5,
            'Anderson_Normal': 'Yes' if anderson_result.statistic < anderson_critical_5 else 'No'
        })

normality_df = pd.DataFrame(normality_results)
display(normality_df.round(4))

print("\nüí° Interpretation:")
print("  ‚Ä¢ p-value > 0.05: Data is likely normally distributed")
print("  ‚Ä¢ p-value < 0.05: Data is NOT normally distributed (reject null hypothesis)")
print("  ‚Ä¢ Most biomass data is right-skewed ‚Üí Consider log transformation")

In [None]:
print(f"\n{'='*80}")
print("11. ADVANCED DISTRIBUTION VISUALIZATIONS")
print(f"{'='*80}")

# One row of four panels per target: histogram, boxplot, violin plot, normal Q-Q plot.
fig, axes = plt.subplots(5, 4, figsize=(20, 20))

for idx, target in enumerate(EXPECTED_TARGETS):
    s = df[target].dropna()
    hist_ax, box_ax, violin_ax, qq_ax = axes[idx]
    mean_value, median_value = s.mean(), s.median()

    # Histogram with mean and median markers
    hist_ax.hist(s, bins=50, edgecolor='black', alpha=0.7)
    hist_ax.axvline(mean_value, color='red', linestyle='--', label=f'Mean: {mean_value:.1f}')
    hist_ax.axvline(median_value, color='green', linestyle='--', label=f'Median: {median_value:.1f}')
    hist_ax.set_title(f'{target} - Histogram')
    hist_ax.set_xlabel('Value (g)')
    hist_ax.legend()

    # Box plot (outliers shown)
    bp = box_ax.boxplot(s, vert=True, patch_artist=True, showfliers=True)
    bp['boxes'][0].set_facecolor('lightblue')
    box_ax.set_title(f'{target} - Boxplot')
    box_ax.set_ylabel('Value (g)')

    # Violin plot with mean and median lines
    parts = violin_ax.violinplot([s], vert=True, showmeans=True, showmedians=True)
    violin_ax.set_title(f'{target} - Violin Plot')
    violin_ax.set_ylabel('Value (g)')

    # Q-Q plot against the normal distribution
    stats.probplot(s, dist="norm", plot=qq_ax)
    qq_ax.set_title(f'{target} - Q-Q Plot')
    qq_ax.grid(True)

plt.tight_layout()
plt.show()

# Log-transformed distributions (for skewed data)
print("\nüìä Log-Transformed Distributions (for skewed targets):")
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()

for idx, target in enumerate(EXPECTED_TARGETS):
    s = df[target].dropna()
    # log1p is defined for every value >= 0, so targets containing zeros are
    # eligible; only negative values rule the transform out.
    if s.min() >= 0 and abs(s.skew()) > 1:
        s_log = np.log1p(s)

        axes[idx].hist(s_log, bins=50, edgecolor='black', alpha=0.7, color='green')
        axes[idx].axvline(s_log.mean(), color='red', linestyle='--', label=f'Mean: {s_log.mean():.2f}')
        axes[idx].axvline(s_log.median(), color='blue', linestyle='--', label=f'Median: {s_log.median():.2f}')
        axes[idx].set_title(f'log1p({target}) - Skew: {s_log.skew():.2f}')
        axes[idx].set_xlabel('log1p(Value)')
        axes[idx].legend()
    else:
        axes[idx].text(0.5, 0.5, 'Not heavily skewed\nor contains negative values',
                       ha='center', va='center', transform=axes[idx].transAxes)
        axes[idx].set_title(f'{target}')

# The 2x3 grid has more panels than targets; hide the ones that were not used.
for unused_ax in axes[len(EXPECTED_TARGETS):]:
    unused_ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
print(f"\n{'='*80}")
print("12. CORRELATION ANALYSIS WITH STATISTICAL SIGNIFICANCE")
print(f"{'='*80}")

# corr_pearson / corr_spearman are produced together with their p-values by
# correlation_with_pvalue() further down in this cell, so no separate
# DataFrame.corr() pass is made here (its result would be overwritten unread).

# Calculate p-values for correlations
def correlation_with_pvalue(df, method='pearson'):
    """Calculate a correlation matrix together with its matrix of p-values.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric columns to correlate with each other.
    method : str, default 'pearson'
        'pearson' uses ``scipy.stats.pearsonr``; any other value uses
        ``scipy.stats.spearmanr``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Correlation coefficients and p-values, both indexed and labelled by
        ``df.columns``. The diagonal is fixed at r=1.0 / p=0.0. A pair with
        three or fewer rows where both columns are non-missing yields NaN in
        both matrices.
    """
    cols = df.columns
    n = len(cols)
    # Start from NaN so pairs with too few observations need no special branch.
    corr_matrix = np.full((n, n), np.nan)
    p_matrix = np.full((n, n), np.nan)
    corr_func = pearsonr if method == 'pearson' else spearmanr

    for i in range(n):
        corr_matrix[i, i] = 1.0
        p_matrix[i, i] = 0.0
        col_i = df.iloc[:, i]

        # Both statistics are symmetric: compute the upper triangle and mirror it.
        for j in range(i + 1, n):
            col_j = df.iloc[:, j]
            # Pairwise deletion: keep rows where both columns are present.
            mask = col_i.notna() & col_j.notna()
            if mask.sum() > 3:
                corr, pval = corr_func(col_i[mask], col_j[mask])
                corr_matrix[i, j] = corr_matrix[j, i] = corr
                p_matrix[i, j] = p_matrix[j, i] = pval

    return pd.DataFrame(corr_matrix, index=cols, columns=cols), \
           pd.DataFrame(p_matrix, index=cols, columns=cols)

# Coefficient and p-value matrices for the five targets, one pair per method.
target_values = df[EXPECTED_TARGETS]
corr_pearson, p_pearson = correlation_with_pvalue(target_values, method='pearson')
corr_spearman, p_spearman = correlation_with_pvalue(target_values, method='spearman')

# Side-by-side heatmaps sharing the same colour scale (-1 .. 1, centred on 0).
fig, axes = plt.subplots(1, 2, figsize=(16, 7))
heatmap_panels = (
    (axes[0], corr_pearson, 'Pearson Correlation Matrix\n(Linear Relationships)'),
    (axes[1], corr_spearman, 'Spearman Correlation Matrix\n(Monotonic Relationships)'),
)
for heatmap_ax, matrix, heatmap_title in heatmap_panels:
    sns.heatmap(matrix, annot=True, fmt='.3f', cmap='RdBu_r', center=0,
                vmin=-1, vmax=1, square=True, ax=heatmap_ax,
                cbar_kws={'label': 'Correlation'})
    heatmap_ax.set_title(heatmap_title, fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()


def _print_significant_pairs(corr_df, p_df, min_abs_r=0.3, alpha=0.05):
    """Print each target pair (upper triangle) with |r| > min_abs_r and p < alpha."""
    for i, j in itertools.combinations(range(len(EXPECTED_TARGETS)), 2):
        r = corr_df.iloc[i, j]
        p = p_df.iloc[i, j]
        if abs(r) > min_abs_r and p < alpha:
            print(f"  {EXPECTED_TARGETS[i]} vs {EXPECTED_TARGETS[j]}: r={r:.3f}, p={p:.4f}")


# Print significant correlations
print("\nüîç Significant Correlations (p < 0.05, |r| > 0.3):")
print("\nPearson:")
_print_significant_pairs(corr_pearson, p_pearson)

print("\nSpearman:")
_print_significant_pairs(corr_spearman, p_spearman)

In [None]:
# This cell was a verbatim copy of the preceding "12. CORRELATION ANALYSIS WITH
# STATISTICAL SIGNIFICANCE" cell: it redefined correlation_with_pvalue(),
# recomputed corr_pearson / p_pearson / corr_spearman / p_spearman with identical
# inputs and redrew the same heatmaps and printout. The duplicate is dropped; all
# of those names are already defined by the preceding cell.

In [None]:
print(f"\n{'='*80}")
print("13. PAIRWISE RELATIONSHIPS ANALYSIS")
print(f"{'='*80}")

# Enhanced scatter matrix with regression lines
from pandas.plotting import scatter_matrix

# Rows with all five targets present; used for both the points and the fitted
# lines so every line describes exactly the data shown in its panel.
targets_complete = df[EXPECTED_TARGETS].dropna()

# scatter_matrix creates its own figure when no `ax` is passed, so no separate
# plt.figure() call is made (it would only leave an empty figure in the output).
axes = scatter_matrix(targets_complete, figsize=(18, 18),
                     alpha=0.3, diagonal='kde', density_kwds={'color': 'blue'})
fig = axes[0, 0].figure

# Add a least-squares line to every off-diagonal panel (row = y, column = x).
for i, y_col in enumerate(EXPECTED_TARGETS):
    for j, x_col in enumerate(EXPECTED_TARGETS):
        if i == j:
            continue
        x = targets_complete[x_col]
        y = targets_complete[y_col]
        if len(x) > 1:
            fit_line = np.poly1d(np.polyfit(x, y, 1))
            x_sorted = x.sort_values()
            axes[i, j].plot(x_sorted, fit_line(x_sorted), "r--", alpha=0.8, linewidth=2)

fig.suptitle('Scatter Matrix of 5 Targets with Regression Lines',
             y=1.0, fontsize=16, fontweight='bold')
fig.tight_layout()
plt.show()

# Joint plots for key relationships
print("\nüìä Detailed Joint Plots for Key Relationships:")

key_pairs = [
    ('Dry_Total_g', 'Dry_Green_g'),
    ('Dry_Total_g', 'GDM_g'),
    ('Dry_Green_g', 'GDM_g'),
]

for x_col, y_col in key_pairs:
    g = sns.jointplot(data=df, x=x_col, y=y_col, kind='reg', height=8,
                      marginal_kws=dict(bins=30, fill=True))

    # Add correlation info. pearsonr/spearmanr need at least two paired values.
    mask = df[x_col].notna() & df[y_col].notna()
    if mask.sum() > 1:
        pair_x = df.loc[mask, x_col]
        pair_y = df.loc[mask, y_col]
        # The p-values get their own names so the p_pearson / p_spearman
        # matrices computed in section 12 are not overwritten with scalars.
        r_pearson, pval_pearson = pearsonr(pair_x, pair_y)
        r_spearman, pval_spearman = spearmanr(pair_x, pair_y)

        g.fig.suptitle(f'{x_col} vs {y_col}\n' +
                       f'Pearson r={r_pearson:.3f} (p={pval_pearson:.4f}), ' +
                       f'Spearman œÅ={r_spearman:.3f} (p={pval_spearman:.4f})',
                       y=1.02, fontsize=12, fontweight='bold')
    plt.show()

In [None]:
print(f"\n{'='*80}")
print("14. FEATURE RELATIONSHIPS - NDVI & HEIGHT ANALYSIS")
print(f"{'='*80}")

# np.polyfit (degree 1) and pearsonr both need at least two paired observations.
MIN_POINTS_FOR_FIT = 2
DESCRIBE_PERCENTILES = [.01, .05, .25, .5, .75, .95, .99]


def _corr_with_targets(feature_col, method):
    """Return the correlation of ``feature_col`` with each target, in EXPECTED_TARGETS order."""
    return [df[[feature_col, t]].corr(method=method).iloc[0, 1] for t in EXPECTED_TARGETS]


def _plot_feature_vs_targets(feature_col, title_label, **scatter_kwargs):
    """Draw a 2x3 grid of scatter plots of ``feature_col`` against every target.

    Each panel shows the paired non-missing points, a dashed least-squares line
    and the Pearson r / p-value in its title. ``title_label`` is the short name
    used in the titles; ``scatter_kwargs`` are forwarded to ``Axes.scatter``.
    Panels with fewer than MIN_POINTS_FOR_FIT paired points are left empty.
    """
    fig, axes = plt.subplots(2, 3, figsize=(18, 10))
    axes = axes.flatten()

    for idx, target in enumerate(EXPECTED_TARGETS):
        mask = df[feature_col].notna() & df[target].notna()
        if mask.sum() < MIN_POINTS_FOR_FIT:
            continue

        x = df.loc[mask, feature_col]
        y = df.loc[mask, target]
        axes[idx].scatter(x, y, alpha=0.4, s=20, **scatter_kwargs)

        # Add regression line
        fit_line = np.poly1d(np.polyfit(x, y, 1))
        x_sorted = x.sort_values()
        axes[idx].plot(x_sorted, fit_line(x_sorted), "r--", linewidth=2)

        # Add correlation
        r, p_val = pearsonr(x, y)
        axes[idx].set_title(f'{title_label} vs {target}\nr={r:.3f}, p={p_val:.4f}')
        axes[idx].set_xlabel(feature_col)
        axes[idx].set_ylabel(f'{target} (g)')
        axes[idx].grid(True, alpha=0.3)

    # Hide grid cells that have no target assigned to them.
    for unused_ax in axes[len(EXPECTED_TARGETS):]:
        unused_ax.axis('off')
    plt.tight_layout()
    plt.show()


if 'Pre_GSHH_NDVI' in df.columns and 'Height_Ave_cm' in df.columns:

    # NDVI distribution and stats
    print("\nüìä NDVI Statistics:")
    ndvi_stats = df['Pre_GSHH_NDVI'].describe(percentiles=DESCRIBE_PERCENTILES)
    display(ndvi_stats.to_frame().T)

    # Height distribution and stats
    print("\nüìä Height Statistics:")
    height_stats = df['Height_Ave_cm'].describe(percentiles=DESCRIBE_PERCENTILES)
    display(height_stats.to_frame().T)

    # Correlations with all targets
    print("\nüîó Correlations with Targets:")
    feature_corr = pd.DataFrame({
        'NDVI_Pearson': _corr_with_targets('Pre_GSHH_NDVI', 'pearson'),
        'NDVI_Spearman': _corr_with_targets('Pre_GSHH_NDVI', 'spearman'),
        'Height_Pearson': _corr_with_targets('Height_Ave_cm', 'pearson'),
        'Height_Spearman': _corr_with_targets('Height_Ave_cm', 'spearman'),
    }, index=EXPECTED_TARGETS)
    display(feature_corr.round(3))

    # Scatter plots: NDVI vs all targets (default colour), then Height (green)
    _plot_feature_vs_targets('Pre_GSHH_NDVI', 'NDVI')
    _plot_feature_vs_targets('Height_Ave_cm', 'Height', color='green')

    # NDVI vs Height relationship, coloured by total dry biomass
    print("\nüìä NDVI vs Height Relationship:")
    fig, ax = plt.subplots(figsize=(10, 6))
    mask = df['Pre_GSHH_NDVI'].notna() & df['Height_Ave_cm'].notna()

    if mask.sum() >= MIN_POINTS_FOR_FIT:
        x = df.loc[mask, 'Height_Ave_cm']
        y = df.loc[mask, 'Pre_GSHH_NDVI']
        scatter = ax.scatter(x, y,
                             c=df.loc[mask, 'Dry_Total_g'],
                             cmap='viridis', alpha=0.6, s=50)

        fit_line = np.poly1d(np.polyfit(x, y, 1))
        x_sorted = x.sort_values()
        ax.plot(x_sorted, fit_line(x_sorted), "r--", linewidth=2)

        r, p_val = pearsonr(x, y)
        ax.set_title(f'NDVI vs Height (colored by Dry_Total_g)\nr={r:.3f}, p={p_val:.4f}',
                     fontsize=14, fontweight='bold')
        ax.set_xlabel('Height_Ave_cm', fontsize=12)
        ax.set_ylabel('Pre_GSHH_NDVI', fontsize=12)
        plt.colorbar(scatter, label='Dry_Total_g', ax=ax)
        ax.grid(True, alpha=0.3)
    # Shown outside the guard so the figure is always flushed, even when empty.
    plt.show()

else:
    print("‚ö†Ô∏è  NDVI and/or Height columns not available")