In [25]:
import os
import pandas as pd
import matplotlib.pyplot as plt

# ========== FILE PATHS ==========
# Four precipitation CSV inputs and one Excel workbook of observed values.
# All paths are relative to the directory the notebook is run from.
era5_path = "../results/floods/flood_1/gee_era5/daily_precipitation.csv"
gsmap_path = "../results/floods/flood_1/gee_gsmap/daily_precipitation.csv"
imerg_path = "../results/floods/flood_1/gee_imerg/daily_precipitation.csv"
highres_path = "../results/floods/flood_1/highres/high_resolution_precipitation.csv"
observed_path = "../data/filtered_observed_precip.xlsx"

# ========== OUTPUT DIR ==========
# Comparison figures are written here; the directory is created on demand.
output_dir = "precip_comparison_plots"
os.makedirs(output_dir, exist_ok=True)

# ========== LOAD DATA ==========
# The CSV inputs are read in the same order as before: ERA5, GSMaP, IMERG, high-res.
era5, gsmap, imerg, highres = (
    pd.read_csv(csv_path)
    for csv_path in (era5_path, gsmap_path, imerg_path, highres_path)
)
observed = pd.read_excel(observed_path)

# ========== TO DAILY FUNCTION ==========
def to_daily(df, time_col):
    """Aggregate every numeric column of *df* to calendar-day totals.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is copied, so the caller's frame is left untouched.
    time_col : str
        Name of the column holding timestamps (anything ``pd.to_datetime``
        can parse).

    Returns
    -------
    pandas.DataFrame
        One row per calendar day (index named ``"date"``, midnight-normalized)
        with the per-day sum of each numeric column. A day whose values for a
        column are *all* missing stays NaN instead of being reported as 0.
    """
    daily = df.copy()
    daily[time_col] = pd.to_datetime(daily[time_col])
    daily["date"] = daily[time_col].dt.normalize()
    numeric_cols = daily.select_dtypes(include="number").columns
    # min_count=1: a plain sum() turns an all-NaN day into 0.0, which would
    # make missing data indistinguishable from a dry day downstream.
    return daily.groupby("date")[numeric_cols].sum(min_count=1)

# ========== RESAMPLE TO DAILY ==========
# The high-resolution table is converted in place: its "datetime" column is
# parsed and a midnight-normalized "date" column is added to `highres` itself.
highres["datetime"] = pd.to_datetime(highres["datetime"])
highres["date"] = highres["datetime"].dt.normalize()
highres_daily = (
    highres
    .groupby("date")[highres.select_dtypes(include="number").columns]
    .sum()
)
# Days present in the high-res data define the comparison window used later.
daily_dates = pd.to_datetime(highres_daily.index).normalize()

# The three remaining products share the same "Date" column name.
era5_daily, gsmap_daily, imerg_daily = (
    to_daily(product, "Date") for product in (era5, gsmap, imerg)
)

# ========== ALIGN DATES ==========
def align_dates(df, valid_dates):
    """Return the rows of *df* whose (normalized) date is in *valid_dates*.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by dates or date-like values parseable by
        ``pd.to_datetime``.
    valid_dates : iterable of datetime-like
        Dates to keep; compared against the midnight-normalized index.

    Returns
    -------
    pandas.DataFrame
        A new frame with a midnight-normalized ``DatetimeIndex`` containing
        only the rows that fall on one of *valid_dates*. The input frame is
        not modified.
    """
    # Work on a copy so the caller's index is not silently rewritten.
    aligned = df.copy()
    aligned.index = pd.to_datetime(aligned.index).normalize()
    return aligned.loc[aligned.index.isin(valid_dates)]

# Restrict every daily table to the days present in the high-res data.
era5_daily, gsmap_daily, imerg_daily, highres_daily = (
    align_dates(daily_frame, daily_dates)
    for daily_frame in (era5_daily, gsmap_daily, imerg_daily, highres_daily)
)

# ========== OBSERVED DATA ==========
# Build a date from the separate year / month / days columns, index by it,
# and drop the now-redundant component columns.
observed["date"] = pd.to_datetime(observed[["year", "month", "days"]]).dt.normalize()
observed_daily = (
    observed
    .set_index("date")
    .drop(columns=["year", "month", "days"])
)
# Keep only the days covered by the high-res data.
observed_daily = observed_daily.loc[observed_daily.index.isin(daily_dates)]
# Column labels are converted to strings so they can be compared with the
# string ids extracted from the other tables' column names.
observed_daily.columns = observed_daily.columns.astype(str)

# ========== FIND COMMON STATIONS ==========
# A station id is whatever follows the last underscore in a column name.
hr_stations = [name.rsplit("_", 1)[-1] for name in highres_daily.columns]
era5_stations = [name.rsplit("_", 1)[-1] for name in era5_daily.columns]
observed_stations = list(observed_daily.columns)
# Only stations present in high-res, ERA5 and observed data are compared.
common_stations = sorted(
    set(hr_stations).intersection(era5_stations, observed_stations)
)

print(f"✅ Found {len(common_stations)} common stations for comparison: {common_stations}")

# ========== PLOTTING ==========
# One figure per common station, overlaying every dataset that has at least
# one non-missing value for that station. Figures are saved as PNG files in
# `output_dir`.
for sid in common_stations:
    colname = f"Station_{sid}"

    # (legend label, daily frame, column to read, line style) per dataset.
    # The observed table is keyed by the bare station id; the others by
    # "Station_<id>".
    sources = [
        ("High Res", highres_daily, colname, {"linewidth": 2}),
        ("ERA5", era5_daily, colname, {"linewidth": 1.5}),
        ("GSMaP", gsmap_daily, colname, {"linewidth": 1.5}),
        ("IMERG", imerg_daily, colname, {"linewidth": 1.5}),
        ("Observed", observed_daily, sid, {"linestyle": "--", "linewidth": 1.5}),
    ]

    # Debug: print data availability
    print(f"\nStation {sid} - Data availability:")
    for label, frame, column, _style in sources:
        tag = f"{label}:"
        available = frame[column].dropna().shape[0] if column in frame.columns else "N/A"
        print(f"  {tag:<10}{available} days")

    fig = None
    try:
        fig = plt.figure(figsize=(10, 5))

        for label, frame, column, style in sources:
            if column in frame.columns and frame[column].notna().any():
                plt.plot(frame.index, frame[column], label=label, **style)

        plt.title(f"Daily Precipitation Comparison - Station {sid}")
        plt.xlabel("Date")
        plt.ylabel("Precipitation (mm)")
        plt.legend()
        plt.grid(True)
        plt.tight_layout()

        outpath = os.path.join(output_dir, f"station_{sid}_comparison.png")
        plt.savefig(outpath, dpi=300)
    except Exception as e:
        # Best effort: one failing station must not stop the remaining plots.
        print(f"⚠️ Skipped station {sid} due to error: {e}")
    finally:
        # Always release the figure, including on failure, so open figures
        # do not pile up across stations.
        if fig is not None:
            plt.close(fig)

✅ Found 22 common stations for comparison: ['1007', '1015', '1022', '1029', '1030', '1035', '1039', '1043', '1052', '1059', '1060', '1071', '1073', '1074', '1075', '1076', '1077', '1079', '1080', '1081', '1082', '1083']

Station 1007 - Data availability:
  High Res: 20 days
  ERA5:     19 days
  GSMaP:    19 days
  IMERG:    19 days
  Observed: 20 days

Station 1015 - Data availability:
  High Res: 20 days
  ERA5:     19 days
  GSMaP:    19 days
  IMERG:    19 days
  Observed: 20 days

Station 1022 - Data availability:
  High Res: 20 days
  ERA5:     19 days
  GSMaP:    19 days
  IMERG:    19 days
  Observed: 20 days

Station 1029 - Data availability:
  High Res: 20 days
  ERA5:     19 days
  GSMaP:    19 days
  IMERG:    19 days
  Observed: 20 days

Station 1030 - Data availability:
  High Res: 20 days
  ERA5:     19 days
  GSMaP:    19 days
  IMERG:    19 days
  Observed: 20 days

Station 1035 - Data availability:
  High Res: 20 days
  ERA5:     19 days
  GSMaP:    19 days
  IMERG:  