In [2]:
import pandas as pd
import numpy as np

PATH_RAIN = "../data/weather/midas-open_uk-hourly-rain-obs_dv-202507_avon_62122_almondsbury_qcv-1_2024.csv"
OUT_RAIN  = "../data/weather/rain_2024_almondsbury_clean.csv"

# -----------------------------
# 1) Find start of BADC "data" section
# -----------------------------
start = None
with open(PATH_RAIN, "r", errors="ignore") as f:
    for i, line in enumerate(f):
        if line.strip().lower() == "data":
            start = i + 1
            break
if start is None:
    raise ValueError("Could not find exact 'data' marker in BADC-CSV file.")

# -----------------------------
# 2) Read table
# -----------------------------
rain_raw = pd.read_csv(PATH_RAIN, skiprows=start)
rain_raw.columns = [c.strip().lower() for c in rain_raw.columns]

print("Loaded rain shape:", rain_raw.shape)
print("Columns:", rain_raw.columns.tolist())

# -----------------------------
# 3) Identify time + rain columns (based on your file)
# -----------------------------
time_col = "ob_end_time" if "ob_end_time" in rain_raw.columns else None
if time_col is None:
    raise ValueError("Expected 'ob_end_time' not found. Paste columns again.")

rain_col = "prcp_amt" if "prcp_amt" in rain_raw.columns else None
if rain_col is None:
    raise ValueError("Expected 'prcp_amt' not found. Paste columns again.")

# Optional quality flag column (keep for diagnostics if you want)
q_col = "prcp_amt_q" if "prcp_amt_q" in rain_raw.columns else None

print("Using:", {"time": time_col, "rain": rain_col, "qc": q_col})

# -----------------------------
# 4) Build tidy dataframe
# -----------------------------
r = rain_raw[[time_col, rain_col] + ([q_col] if q_col else [])].copy()

r[rain_col] = pd.to_numeric(r[rain_col], errors="coerce")

# Parse datetime (UK local -> UTC), handle DST
r["datetime"] = pd.to_datetime(r[time_col], errors="coerce")
r["datetime"] = (
    r["datetime"]
      .dt.tz_localize("Europe/London", ambiguous="NaT", nonexistent="shift_forward")
      .dt.tz_convert("UTC")
)

# Drop bad times
r = r.dropna(subset=["datetime"])

# Rename cleanly
r = r.rename(columns={rain_col: "rain_mm"})

# If rain is missing, treat as 0 (common for precipitation series)
r["rain_mm"] = r["rain_mm"].fillna(0)

# Ensure one row per hour (mean handles duplicates safely)
r = (r.sort_values("datetime")
       .groupby("datetime", as_index=False)
       .agg(rain_mm=("rain_mm", "mean")))

print("Clean rain rows:", len(r))
print(r.head())

# -----------------------------
# 5) Save (new file; doesn't overwrite weather file)
# -----------------------------
r.to_csv(OUT_RAIN, index=False)
print("Saved:", OUT_RAIN)

# -----------------------------
# 6) Sanity checks
# -----------------------------
print("\nRain datetime range (UTC):", r["datetime"].min(), "to", r["datetime"].max())
print("Unique hours:", r["datetime"].nunique(), "Rows:", len(r))
print("Percent rainy hours (rain_mm>0):", (r["rain_mm"] > 0).mean())

Loaded rain shape: (8601, 15)
Columns: ['ob_end_time', 'id', 'id_type', 'ob_hour_count', 'version_num', 'met_domain_name', 'src_id', 'rec_st_ind', 'prcp_amt', 'prcp_dur', 'prcp_amt_q', 'prcp_dur_q', 'prcp_amt_j', 'meto_stmp_time', 'midas_stmp_etime']
Using: {'time': 'ob_end_time', 'rain': 'prcp_amt', 'qc': 'prcp_amt_q'}
Clean rain rows: 7935
                   datetime  rain_mm
0 2024-01-01 00:00:00+00:00      0.0
1 2024-01-01 01:00:00+00:00      0.0
2 2024-01-01 02:00:00+00:00      0.0
3 2024-01-01 03:00:00+00:00      0.0
4 2024-01-01 04:00:00+00:00      0.0
Saved: ../data/weather/rain_2024_almondsbury_clean.csv

Rain datetime range (UTC): 2024-01-01 00:00:00+00:00 to 2024-12-31 09:00:00+00:00
Unique hours: 7935 Rows: 7935
Percent rainy hours (rain_mm>0): 0.1454316320100819


In [3]:
big = pd.read_csv("../data/mdm2_data_files/big_table_with_dark.csv")
big["datetime"] = pd.to_datetime(big["datetime"], utc=True, errors="coerce")

w = pd.read_csv("../data/weather/weather_2024_almondsbury_clean.csv")
w["datetime"] = pd.to_datetime(w["datetime"], utc=True, errors="coerce")

r = pd.read_csv("../data/weather/rain_2024_almondsbury_clean.csv")
r["datetime"] = pd.to_datetime(r["datetime"], utc=True, errors="coerce")

merged = big.merge(w, on="datetime", how="left").merge(r, on="datetime", how="left")

print(merged[["temp_c","wind_ms","rain_mm"]].isna().mean())
print("Rainy hours %:", (merged["rain_mm"].fillna(0) > 0).mean())

temp_c     0.003867
wind_ms    0.001333
rain_mm    0.097825
dtype: float64
Rainy hours %: 0.13091866698713595


In [4]:
merged["rain_mm"] = merged["rain_mm"].fillna(0)

merged.to_csv(
    "../data/mdm2_data_files/big_table_with_weather_and_rain.csv",
    index=False
)

print("Saved merged full dataset.")

Saved merged full dataset.
