In [4]:
import pandas as pd
import numpy as np

# --- Config ---
csv_path = "era5_all_window_features.csv"
SAMPLE_SIZE = 5  # number of random rows to inspect

# --- Load small sample efficiently ---
# Read only the header to get column names
cols = pd.read_csv(csv_path, nrows=0).columns.tolist()

# Randomly select N lines using skiprows (efficient for big CSVs)
n_lines = sum(1 for _ in open(csv_path)) - 1  # total data lines (excluding header)
skip = sorted(np.random.choice(np.arange(1, n_lines + 1), n_lines - SAMPLE_SIZE, replace=False))
sample = pd.read_csv(csv_path, skiprows=skip, names=cols)

print("✅ Random sample:")
print(sample.head(SAMPLE_SIZE))

# --- Inspect columns ---
print("\n📊 Columns overview:")
print([c for c in sample.columns if c.endswith('_next')][:10], "...")

# --- Convert relevant columns to numeric ---
# We include '_next', '_mean', '_trend' columns since we use them in checks
num_cols = [c for c in sample.columns if c.endswith(('_next', '_mean', '_trend'))]
for c in num_cols:
    sample[c] = pd.to_numeric(sample[c], errors='coerce')

# --- Sanity check example ---
# We'll inspect a few (latitude, longitude) groups randomly
grouped = sample.groupby(['latitude', 'longitude'])
print("\n🔍 Detailed checks:")

for (lat, lon), g in grouped:
    g = g.sort_values('date')
    print(f"\n📍 Location ({lat}, {lon}) sample check:")
    for _, row in g.iterrows():
        tval = row.get('t2m_min_next', np.nan)
        mean_est = row.get('t2m_min_mean', np.nan)
        trend = row.get('t2m_min_trend', np.nan)

        remarks = []

        # --- Case 1: missing target ---
        if pd.isna(tval):
            remarks.append("⚠️ Missing target value (t2m_min_next).")

        # --- Case 2: numeric sanity check ---
        elif not (-80 <= tval <= 60):
            remarks.append(f"🚨 Unrealistic temperature value: {tval}°C.")

        # --- Case 3: consistency with mean ---
        elif not pd.isna(mean_est):
            if abs(tval - mean_est) < 5:
                remarks.append("✅ OK: target consistent with past 7d mean.")
            elif not pd.isna(trend) and abs(tval - (mean_est + trend)) < 5:
                remarks.append("⚠️ Acceptable if trend applied.")
            else:
                remarks.append(f"❌ Inconsistent: diff={abs(tval - mean_est):.2f}°C from mean.")

        # --- Case 4: default ---
        else:
            remarks.append("ℹ️ Not enough info to evaluate consistency.")

        # Display line summary
        print(f"Date: {row['date']}, t2m_min_next={tval}, mean={mean_est}, trend={trend}")
        print("   → " + " ".join(remarks))


✅ Random sample:
         date  latitude  longitude  t2m_min_mean  t2m_min_std  t2m_min_min  \
0        date  latitude  longitude  t2m_min_mean  t2m_min_std  t2m_min_min   
1  2025-02-04      24.0      -2.05      10.08738    3.1085782    5.1964417   
2  2025-08-07     25.75      -5.55     32.012848    1.1927177    30.088043   
3  2024-03-15     27.25       -9.3     11.502629    1.5027915     9.320709   
4  2025-07-10      28.0      -6.55     30.913832   0.88438654    29.840485   

   t2m_min_max  t2m_min_first  t2m_min_last  t2m_min_trend  ...  skt_mean_std  \
0  t2m_min_max  t2m_min_first  t2m_min_last  t2m_min_trend  ...  skt_mean_std   
1    14.110504      14.110504      8.194489      -5.916015  ...     2.5491722   
2    33.937164      30.088043     33.937164       3.849121  ...     0.9925241   
3   13.2586975      10.584625    13.2586975      2.6740725  ...     1.6790414   
4    32.292145      31.684967     32.292145       0.607178  ...     0.6153525   

   skt_mean_min  skt_mean_m