In [None]:
!pip install requests pandas beautifulsoup4 lxml
!mkdir -p out

In [None]:
import os
import requests
import pandas as pd
from io import StringIO
from datetime import datetime, timezone

# Endpoint queried once per state; the state/district/station/lang values
# are supplied as query parameters by main().
URL = "https://publicinfobanjir.water.gov.my/aras-air/data-paras-air/aras-air-data/"

# Values sent, one request each, as the ``state`` query parameter.
STATE_CODES = [
    "PLS", "KDH", "PNG", "PRK",
    "SEL", "WLH", "PTJ", "NSN",
    "MLK", "JHR", "PHG", "TRG",
    "KEL", "SRK", "SAB", "WLP",
]

# Path of the CSV written by main().
OUTFILE = "waterlevel_combined_raw.csv"

# HTTP headers attached to every request.
HEADERS = {"User-Agent": "Mozilla/5.0"}

# Column names assigned BY POSITION to the combined table in main().
# The final entry lines up with the ``state_param`` column that main()
# appends to each per-state table.
TARGET_HEADER = [
    # identification
    "No.", "Station ID", "Station Name",
    # location
    "District", "Main Basin", "Sub River Basin",
    # reading
    "Last Updated", "Water Level (m) (Graph)",
    # thresholds
    "Threshold Normal", "Threshold Alert",
    "Threshold Warning", "Threshold Danger",
    # request parameter appended by the scraper
    "State",
]

#-------------------MAIN LOGIC-------------------
def utc_ts():
    """Return the current UTC time as a compact ``YYYYMMDDTHHMMSSZ`` string."""
    now_utc = datetime.now(tz=timezone.utc)
    return now_utc.strftime("%Y%m%dT%H%M%SZ")

def _fetch_state_table(session, state_code):
    """Download and parse the water-level table for a single state.

    Args:
        session: ``requests.Session`` used to issue the GET request.
        state_code: Value sent as the ``state`` query parameter.

    Returns:
        The first table found in the response as a DataFrame, with an extra
        ``state_param`` column holding ``state_code`` on every row.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If pandas finds no table in the response body.
    """
    params = {"state": state_code, "district": "ALL", "station": "ALL", "lang": "en"}
    resp = session.get(URL, params=params, headers=HEADERS, timeout=60)
    resp.raise_for_status()

    # Let pandas choose the header row; only the first table on the page is used.
    df = pd.read_html(StringIO(resp.text))[0]
    df["state_param"] = state_code
    return df


def main():
    """Fetch every state's water-level table and save them as one raw CSV.

    Requests ``URL`` once per entry in ``STATE_CODES``, concatenates the
    parsed tables, renames the columns to ``TARGET_HEADER`` and writes the
    result to ``OUTFILE``. Progress and a preview are printed to stdout.

    Raises:
        requests.RequestException: If any request fails or returns an error
            status; the run stops at the first failing state.
        ValueError: If a page contains no table, or if the combined table
            does not have exactly ``len(TARGET_HEADER)`` columns.
    """
    all_states = []

    # All requests go to the same host, so share one session and let the
    # underlying connection be reused instead of reconnecting every time.
    with requests.Session() as session:
        for s in STATE_CODES:
            df = _fetch_state_table(session, s)
            all_states.append(df)
            print(f"Fetched {s}: {len(df)} rows")

    combined = pd.concat(all_states, ignore_index=True)

    # The rename below is positional. If the site's table layout changes (or
    # states return differently labelled columns, which concat would union),
    # fail with a message that shows what was actually received.
    if len(combined.columns) != len(TARGET_HEADER):
        raise ValueError(
            f"Expected {len(TARGET_HEADER)} columns but got "
            f"{len(combined.columns)}: {list(combined.columns)}"
        )
    combined.columns = TARGET_HEADER

    # OUTFILE may be a bare filename; fall back to the current directory.
    os.makedirs(os.path.dirname(OUTFILE) or ".", exist_ok=True)
    combined.to_csv(OUTFILE, index=False)

    # Named escape keeps the check mark intact regardless of file encoding.
    print("\n\N{WHITE HEAVY CHECK MARK} Raw combined CSV saved:", os.path.abspath(OUTFILE))
    print("Total rows:", len(combined))
    print(combined.head())

# Entry point: run the scrape only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
