In [None]:
import os
from pathlib import Path

# Local working directory for every CSV this notebook reads or writes.
# exist_ok=True makes re-running the cell safe when the folder already exists.
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)

# ---- Date range for the task ----
# ISO "YYYY-MM-DD" strings. The date helpers further down parse them with
# "%Y-%m-%d" and treat both ends as inclusive.
START_DATE = "2022-05-01"
END_DATE   = "2022-05-30"

# ---- Route ----
# Station codes for the departure and destination; they are sent to the rail API
# and stored in the *_rail_station_crs output columns.
FROM_LOC = "LDS"
TO_LOC   = "KGX"

# ---- CEDA URL ----
# Per the URL path: MIDAS Open hourly rain observations, dataset version 202507,
# station 00534 Bramham (West Yorkshire), QC version 1, 2022 file.
CEDA_URL = "https://dap.ceda.ac.uk/badc/ukmo-midas-open/data/uk-hourly-rain-obs/dataset-version-202507/west-yorkshire/00534_bramham/qc-version-1/midas-open_uk-hourly-rain-obs_dv-202507_west-yorkshire_00534_bramham_qcv-1_2022.csv?download=1"

# ---- Output filenames ----
# The rail and final file names embed route and date range, so changing those
# settings yields separate cache files. The rainfall file name is fixed.
RAIN_CSV  = DATA_DIR / "rainfall_nightbefore.csv"
RAIL_CSV  = DATA_DIR / f"rail_metrics_{FROM_LOC}_{TO_LOC}_{START_DATE}_to_{END_DATE}.csv"
FINAL_CSV = DATA_DIR / f"final_{FROM_LOC}_{TO_LOC}_{START_DATE}_to_{END_DATE}.csv"

print("RAIN_CSV :", RAIN_CSV)
print("RAIL_CSV :", RAIL_CSV)
print("FINAL_CSV:", FINAL_CSV)

# Report only whether each credential is present; the secret values themselves
# are never printed.
print("CEDA_ACCESS_TOKEN set:", bool(os.getenv("CEDA_ACCESS_TOKEN")))
print("RAIL_USER set:", bool(os.getenv("RAIL_USER")))
print("RAIL_PASSWORD set:", bool(os.getenv("RAIL_PASSWORD")))


RAIN_CSV : data\rainfall_nightbefore.csv
RAIL_CSV : data\rail_metrics_LDS_KGX_2022-05-01_to_2022-05-30.csv
FINAL_CSV: data\final_LDS_KGX_2022-05-01_to_2022-05-30.csv
CEDA_ACCESS_TOKEN set: True
RAIL_USER set: True
RAIL_PASSWORD set: True


In [None]:
# Master switch for the Google Cloud Storage cache. When False,
# ensure_local_from_gcs() never contacts GCS and the later cells skip uploads.
USE_GCS_CACHE = True
# Bucket holding the cached and final CSVs.
BUCKET_NAME = "de-candidate-task-results-sp"
# Service-account key used to build the GCS client credentials (relative path).
SERVICE_ACCOUNT_FILE = "searchlab-bq-training-sp-key.json"  # <-- your JSON key file

def ensure_local_from_gcs(bucket_name, blob_name, local_path, service_account_file):
    """
    Make sure `local_path` exists, pulling it from the GCS cache if necessary.

    Lookup order:
      1. If the local file already exists, nothing is downloaded.
      2. If the module-level USE_GCS_CACHE flag is off, GCS is not contacted.
      3. Otherwise the blob is downloaded when it exists in the bucket.

    Parameters:
        bucket_name: name of the GCS bucket to look in.
        blob_name: object name inside the bucket.
        local_path: destination file (pathlib.Path or plain string).
        service_account_file: path to the service-account JSON key.

    Returns:
        True if the local file exists after this function, otherwise False.
    """
    from pathlib import Path

    # Accept plain strings as well as Path objects.
    local_path = Path(local_path)

    if local_path.exists():
        return True

    if not USE_GCS_CACHE:
        return False

    # Imported lazily so the GCS packages are only needed when the cache is used.
    from google.cloud import storage
    from google.oauth2 import service_account

    creds = service_account.Credentials.from_service_account_file(service_account_file)
    client = storage.Client(credentials=creds)
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(blob_name)

    if not blob.exists(client):
        print(f"[INFO] No existing blob found: gs://{bucket_name}/{blob_name}")
        return False

    # parents=True so a nested cache directory can be created in one go.
    local_path.parent.mkdir(parents=True, exist_ok=True)

    # Download to a sibling temp file and rename afterwards, so an interrupted
    # download cannot leave a truncated file that the existence check above
    # would accept as a valid cache on the next run.
    tmp_path = local_path.with_name(local_path.name + ".part")
    try:
        blob.download_to_filename(str(tmp_path))
        tmp_path.replace(local_path)
    finally:
        # No-op after a successful rename; removes leftovers after a failure.
        tmp_path.unlink(missing_ok=True)

    print(f"[INFO] Downloaded cache gs://{bucket_name}/{blob_name} -> {local_path}")
    return True


def upload_file_to_gcs(local_file_path, bucket_name, blob_name, service_account_file):
    """
    Upload one local file to a GCS bucket, authenticating with a service-account key.

    Parameters:
        local_file_path: file to upload (converted with str() before upload).
        bucket_name: destination bucket.
        blob_name: object name to write inside the bucket.
        service_account_file: path to the service-account JSON key.
    """
    # Lazy imports: the GCS packages are only needed when an upload happens.
    from google.cloud import storage
    from google.oauth2 import service_account

    credentials = service_account.Credentials.from_service_account_file(service_account_file)
    gcs_client = storage.Client(credentials=credentials)

    target_blob = gcs_client.bucket(bucket_name).blob(blob_name)
    target_blob.upload_from_filename(str(local_file_path))

    print(f"[INFO] Uploaded -> gs://{bucket_name}/{blob_name}")


In [None]:
import pandas as pd

# Object name of the rainfall cache in the bucket: the bare file name of
# RAIN_CSV, without the local "data" directory prefix.
RAIN_GCS_BLOB = RAIN_CSV.name  # rainfall_nightbefore.csv in the bucket

def compute_night_before_rainfall(weather_df: pd.DataFrame) -> pd.DataFrame:
    """
    Sum evening rainfall per station and attribute it to the following day.

    Rows whose `ob_end_time` hour is between 18 and 23 (inclusive) are summed
    per observation date and `weather_station`. The total is then labelled with
    the next calendar day, i.e. it is that day's "night before" rainfall.

    Unparseable timestamps become NaT and are dropped by the hour filter.
    Unparseable or missing `prcp_amt` values count as 0.0. The input frame is
    not modified.

    Returns:
        DataFrame with columns `date` ("YYYY-MM-DD" string), `weather_station`
        and `rainfall_nightbefore_mm`.
    """
    obs = weather_df.copy()
    obs["ob_end_time"] = pd.to_datetime(obs["ob_end_time"], errors="coerce")
    obs["prcp_amt"] = pd.to_numeric(obs["prcp_amt"], errors="coerce").fillna(0.0)

    # Keep evening observations only: end-time hour 18 through 23.
    is_evening = obs["ob_end_time"].dt.hour.between(18, 23)
    evening = obs[is_evening]

    # Group by the calendar date of the observation and by station.
    obs_day = evening["ob_end_time"].dt.date.rename("obs_date")
    totals = (
        evening.groupby([obs_day, "weather_station"])["prcp_amt"]
        .sum()
        .reset_index()
    )

    # Evening rain on day D is the "night before" rain for day D + 1.
    following_day = pd.to_datetime(totals["obs_date"]) + pd.Timedelta(days=1)

    return pd.DataFrame(
        {
            "date": following_day.dt.strftime("%Y-%m-%d"),
            "weather_station": totals["weather_station"],
            "rainfall_nightbefore_mm": totals["prcp_amt"],
        }
    )

# Prefer local rainfall cache; else download from GCS; else compute
# have_rain is True when RAIN_CSV is on disk after the local/GCS lookup.
have_rain = ensure_local_from_gcs(BUCKET_NAME, RAIN_GCS_BLOB, RAIN_CSV, SERVICE_ACCOUNT_FILE)

if have_rain:
    # Cache hit: reuse the previously saved night-before rainfall table.
    rainfall_df = pd.read_csv(RAIN_CSV)
    print(f"[INFO] Loaded rainfall cache -> {RAIN_CSV}")
else:
    # Cache miss: fetch the raw observations and derive the table from scratch.
    # ceda_utils is a project-local module, imported only on this path.
    from ceda_utils import get_weather_data

    # Fail fast with a clear message instead of sending an unauthenticated request.
    CEDA_ACCESS_TOKEN = os.getenv("CEDA_ACCESS_TOKEN")
    if not CEDA_ACCESS_TOKEN:
        raise ValueError("CEDA_ACCESS_TOKEN not set")

    weather_df = get_weather_data(url=CEDA_URL, access_token=CEDA_ACCESS_TOKEN)
    rainfall_df = compute_night_before_rainfall(weather_df)

    # Persist locally so the next run takes the cache-hit branch.
    rainfall_df.to_csv(RAIN_CSV, index=False)
    print(f"[INFO] Saved rainfall CSV -> {RAIN_CSV}")

    # Share the freshly computed table through the bucket when caching is enabled.
    if USE_GCS_CACHE:
        upload_file_to_gcs(RAIN_CSV, BUCKET_NAME, RAIN_GCS_BLOB, SERVICE_ACCOUNT_FILE)

# Last expression of the cell: display the first rows as a sanity check.
rainfall_df.head()


[INFO] Loaded rainfall cache -> data\rainfall_nightbefore.csv


Unnamed: 0,date,weather_station,rainfall_nightbefore_mm
0,2022-01-02,bramham,0.0
1,2022-01-03,bramham,1.6
2,2022-01-04,bramham,1.0
3,2022-01-05,bramham,0.0
4,2022-01-06,bramham,0.0


In [None]:
import os
import time
from base64 import b64encode
from datetime import datetime, timedelta
import pandas as pd
import requests

# Endpoint that get_daily_service_metrics() posts its per-day queries to.
HSP_URL = "https://hsp-prod.rockshore.net/api/v1/serviceMetrics"
# Object name of the rail cache in the bucket: the bare file name of RAIL_CSV.
RAIL_GCS_BLOB = RAIL_CSV.name

# Download rail cache only if local missing
# The boolean result is deliberately ignored: the caching logic further down
# re-checks the local file itself via load_cached_csv().
ensure_local_from_gcs(BUCKET_NAME, RAIL_GCS_BLOB, RAIL_CSV, SERVICE_ACCOUNT_FILE)

def call_hsp_api(url, headers, payload, retries=3):
    """
    POST `payload` as JSON to `url`, retrying on failure.

    Each attempt uses a 90-second timeout. A 200 response returns the decoded
    JSON body immediately. Any other status, a read timeout, or any other
    exception is logged and counts as a failed attempt. This is best-effort:
    errors are never raised to the caller.

    Parameters:
        url: endpoint to post to.
        headers: HTTP headers (including authorization) for the request.
        payload: JSON-serialisable request body.
        retries: maximum number of attempts.

    Returns:
        The decoded JSON response, or None when every attempt failed.
    """
    for attempt in range(retries):
        try:
            response = requests.post(
                url,
                headers=headers,
                json=payload,
                timeout=90
            )
            if response.status_code == 200:
                return response.json()

            # Truncate the body: error pages can be long HTML documents.
            print(f"[WARNING] Status {response.status_code}: {response.text[:200]}")

        except requests.exceptions.ReadTimeout:
            print(f"[WARNING] Timeout attempt {attempt + 1}")
        except Exception as e:
            # Deliberately broad: a failed day should not abort the whole run.
            print(f"[ERROR] Attempt {attempt + 1}: {e}")

        # Pause only between attempts; sleeping after the final failure would
        # just delay reporting it.
        if attempt < retries - 1:
            time.sleep(3)

    print("[ERROR] All retries failed")
    return None

def get_daily_service_metrics(from_loc, to_loc, start_date, end_date):
    """
    Collect per-day service counts for one route from the HSP serviceMetrics API.

    One request is made for each calendar day from `start_date` to `end_date`
    (both inclusive, "YYYY-MM-DD"). Each request uses from_time "0600", to_time
    "0959", days "WEEKDAY" and tolerance ["5"]. For every returned service the
    first metrics entry is used: `num_tolerance` counts as on time, and
    `num_tolerance + num_not_tolerance` as the total.

    Days whose request fails (call_hsp_api returns nothing) are logged and left
    out of the result, so callers can retry them later.

    NOTE(review): the "WEEKDAY" filter is also sent for Saturday/Sunday dates;
    presumably those days come back with zero services - confirm against the
    HSP API documentation.

    Parameters:
        from_loc: departure station code.
        to_loc: destination station code.
        start_date: first day to fetch, "YYYY-MM-DD".
        end_date: last day to fetch, "YYYY-MM-DD".

    Returns:
        DataFrame with columns date, departure_rail_station_crs,
        destination_rail_station_crs, service_count_total and
        service_count_ontime. These columns are present even when no day could
        be fetched, so filtering on "date" is always safe.

    Raises:
        ValueError: if RAIL_USER or RAIL_PASSWORD is not set in the environment.
    """
    username = os.getenv("RAIL_USER")
    password = os.getenv("RAIL_PASSWORD")
    if not username or not password:
        raise ValueError("RAIL_USER and RAIL_PASSWORD must be set")

    # HTTP Basic auth header built from the credentials.
    token = b64encode(f"{username}:{password}".encode()).decode()
    headers = {"Authorization": f"Basic {token}", "Content-Type": "application/json"}

    # Fixed output schema, applied even to an empty result.
    columns = [
        "date",
        "departure_rail_station_crs",
        "destination_rail_station_crs",
        "service_count_total",
        "service_count_ontime",
    ]

    current_date = datetime.strptime(start_date, "%Y-%m-%d")
    end_date_dt = datetime.strptime(end_date, "%Y-%m-%d")

    records = []
    while current_date <= end_date_dt:
        date_str = current_date.strftime("%Y-%m-%d")
        # Advance up front so the `continue` below can never stall the loop.
        current_date += timedelta(days=1)

        payload = {
            "from_loc": from_loc,
            "to_loc": to_loc,
            "from_time": "0600",
            "to_time": "0959",
            "from_date": date_str,
            "to_date": date_str,
            "days": "WEEKDAY",
            "tolerance": ["5"],
        }

        print(f"[INFO] Fetching rail data for {date_str}")
        data = call_hsp_api(HSP_URL, headers, payload)

        if not data:
            print(f"[ERROR] Skipping {date_str}")
            continue

        total = 0
        on_time = 0
        for svc in data.get("Services", []):
            metrics = svc.get("Metrics", [])
            if not metrics:
                continue
            # Only one tolerance is requested, so the first entry is used.
            m = metrics[0]
            total += int(m["num_not_tolerance"]) + int(m["num_tolerance"])
            on_time += int(m["num_tolerance"])

        records.append({
            "date": date_str,
            "departure_rail_station_crs": from_loc,
            "destination_rail_station_crs": to_loc,
            "service_count_total": total,
            "service_count_ontime": on_time,
        })

        time.sleep(1)  # polite delay

    # Passing `columns` keeps the schema stable when `records` is empty.
    return pd.DataFrame(records, columns=columns)

def daterange(start_date: str, end_date: str):
    """
    Yield each calendar day from `start_date` to `end_date`, both inclusive.

    Both arguments and every yielded value are "YYYY-MM-DD" strings. Nothing is
    yielded when `start_date` is later than `end_date`.
    """
    first_day = datetime.strptime(start_date, "%Y-%m-%d")
    last_day = datetime.strptime(end_date, "%Y-%m-%d")

    # +1 makes the end date inclusive; a negative span gives an empty range.
    day_count = (last_day - first_day).days + 1
    for offset in range(day_count):
        yield (first_day + timedelta(days=offset)).strftime("%Y-%m-%d")

def load_cached_csv(path: Path):
    """
    Read a cached CSV if it is available.

    Parameters:
        path: location of the cache file.

    Returns:
        The file contents as a DataFrame, or None when the file does not exist
        or contains no parseable columns (for example a zero-byte file).
    """
    if not path.exists():
        return None

    try:
        return pd.read_csv(path)
    except pd.errors.EmptyDataError:
        # An empty file holds no cached rows. Treat it like a missing cache so
        # the data is fetched again instead of crashing the cell.
        print(f"[WARNING] Ignoring empty cache file: {path}")
        return None

def cached_dates(df: pd.DataFrame | None):
    if df is None or df.empty or "date" not in df.columns:
        return set()
    return set(df["date"].astype(str))

def upsert(existing_df: pd.DataFrame | None, new_df: pd.DataFrame) -> pd.DataFrame:
    if existing_df is None or existing_df.empty:
        return new_df.copy()
    combined = pd.concat([existing_df, new_df], ignore_index=True)
    combined = combined.drop_duplicates(
        subset=["date","departure_rail_station_crs","destination_rail_station_crs"],
        keep="last"
    )
    return combined.sort_values("date").reset_index(drop=True)

# ---- caching logic ----
# Work out which days of the requested range are not yet in the local rail
# cache, fetch a range covering them, and merge the new rows into the cache.
existing_rail_df = load_cached_csv(RAIL_CSV)
done = cached_dates(existing_rail_df)

all_dates = set(daterange(START_DATE, END_DATE))
missing = sorted(all_dates - done)

print(f"[INFO] Cached days: {len(done)}")
print(f"[INFO] Missing days: {len(missing)}")
print("[INFO] Missing sample:", missing[:10])

# Start from whatever is already cached; only replaced when new rows arrive.
rail_df = existing_rail_df.copy() if existing_rail_df is not None else pd.DataFrame()
fetched_new_rows = False

if missing:
    # fetch smallest range covering missing dates, then filter down
    # (ISO date strings sort chronologically, so min/max on strings is correct)
    fetch_start = min(missing)
    fetch_end   = max(missing)

    fetched_df = get_daily_service_metrics(FROM_LOC, TO_LOC, fetch_start, fetch_end)

    if "date" in fetched_df.columns:
        fetched_df = fetched_df[fetched_df["date"].isin(missing)].reset_index(drop=True)
    else:
        # When every request failed the result can be a frame without columns;
        # indexing it by "date" would raise KeyError.
        fetched_df = fetched_df.iloc[0:0]

    if fetched_df.empty:
        print("[WARNING] No new rail data fetched; keeping existing cache unchanged")
    else:
        rail_df = upsert(existing_rail_df, fetched_df)
        fetched_new_rows = True

if rail_df.empty:
    # Writing an empty frame would leave a cache file that cannot be parsed
    # on the next run.
    print(f"[WARNING] No rail data available; not writing {RAIL_CSV}")
else:
    rail_df.to_csv(RAIL_CSV, index=False)
    print(f"[INFO] Saved rail CSV -> {RAIL_CSV}")

# Upload only if we actually fetched new data
if USE_GCS_CACHE and fetched_new_rows:
    upload_file_to_gcs(RAIL_CSV, BUCKET_NAME, RAIL_GCS_BLOB, SERVICE_ACCOUNT_FILE)

rail_df.head()


[INFO] Cached days: 0
[INFO] Missing days: 30
[INFO] Missing sample: ['2022-05-01', '2022-05-02', '2022-05-03', '2022-05-04', '2022-05-05', '2022-05-06', '2022-05-07', '2022-05-08', '2022-05-09', '2022-05-10']
<html><head>
<title>502 Proxy Error</title>
</head><body>
<h1>Proxy E
<html><head>
<title>502 Proxy Error</title>
</head><body>
<h1>Proxy E


In [None]:
def build_final_dataset(rail_df: pd.DataFrame, rainfall_df: pd.DataFrame) -> pd.DataFrame:
    """
    Join the daily rail metrics with night-before rainfall on `date`.

    Every rail row is kept (left join). Dates without a rainfall match get
    `rainfall_nightbefore_mm` 0.0 and a missing `weather_station`. The two
    service-count columns are coerced to integers, with unparseable or missing
    values becoming 0.

    Returns:
        DataFrame with columns date, departure_rail_station_crs,
        destination_rail_station_crs, weather_station, rainfall_nightbefore_mm,
        service_count_total and service_count_ontime, in that order.
    """
    final_columns = [
        "date",
        "departure_rail_station_crs",
        "destination_rail_station_crs",
        "weather_station",
        "rainfall_nightbefore_mm",
        "service_count_total",
        "service_count_ontime",
    ]

    joined = pd.merge(rail_df, rainfall_df, how="left", on="date")

    # Unmatched dates have NaN rainfall after the left join; report them as 0.0.
    rainfall = pd.to_numeric(joined["rainfall_nightbefore_mm"], errors="coerce")
    joined["rainfall_nightbefore_mm"] = rainfall.fillna(0.0)

    for count_column in ("service_count_total", "service_count_ontime"):
        counts = pd.to_numeric(joined[count_column], errors="coerce")
        joined[count_column] = counts.fillna(0).astype(int)

    return joined[final_columns]

# Join rail metrics with rainfall and persist the result locally.
final_df = build_final_dataset(rail_df, rainfall_df)
final_df.to_csv(FINAL_CSV, index=False)
print(f"[INFO] Saved final CSV -> {FINAL_CSV}")

# The final CSV is uploaded on every run when the GCS cache is enabled, using
# its bare file name as the object name.
if USE_GCS_CACHE:
    upload_file_to_gcs(FINAL_CSV, BUCKET_NAME, FINAL_CSV.name, SERVICE_ACCOUNT_FILE)

# Last expression of the cell: display the first rows as a sanity check.
final_df.head()


In [None]:
# Coverage check: list the days of the requested range that still have no row
# in rail_df (for example because their API request failed).
all_dates = set(daterange(START_DATE, END_DATE))
# Guard on .empty so an empty frame without a "date" column does not raise.
have_dates = set(rail_df["date"].astype(str)) if not rail_df.empty else set()
still_missing = sorted(all_dates - have_dates)

print("[INFO] Still missing rail dates:", len(still_missing))
# Show at most the first 20 to keep the output short.
print(still_missing[:20])
