# In [None]:
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup  # pip install beautifulsoup4

# Endpoint queried for the rainfall search results (one GET per state code).
URL = (
    "https://publicinfobanjir.water.gov.my"
    "/wp-content/themes/shapely/agency/searchresultrainfall.php"
)

# Sent with every request so the client presents a browser-like User-Agent.
HEADERS = {
    "User-Agent": "Mozilla/5.0",
}

# Path of the combined CSV written at the end of the run.
OUTFILE = "rainfall_trend.csv"

# Values passed one at a time as the `state` query parameter.
# NOTE(review): presumably abbreviations of Malaysian states/territories —
# confirm against the site's state selector.
STATE_CODES = [
    "PLS",
    "KDH",
    "PNG",
    "PRK",
    "SEL",
    "WLH",
    "PTJ",
    "NSN",
    "MLK",
    "JHR",
    "PHG",
    "TRG",
    "KEL",
    "SRK",
    "SAB",
    "WLP",
]

#-----------------------------MAIN LOGIC-----------------------------
def _fetch_state_html(session, state):
    """Return the HTML of the rainfall search-result page for one state code.

    Args:
        session: ``requests.Session`` used to issue the GET request.
        state: Value sent as the ``state`` query parameter.

    Returns:
        The response body as text.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    params = {
        "state": state,
        "district": "ALL",
        "station": "ALL",
        "loginStatus": "0",
        "language": "1",
    }
    resp = session.get(URL, params=params, headers=HEADERS, timeout=60)
    resp.raise_for_status()
    return resp.text


def _parse_rainfall_table(html, state):
    """Turn the ``normaltable1`` table found in *html* into a DataFrame.

    Args:
        html: Page source returned for one state.
        state: State code, used only to make error/warning messages traceable.

    Returns:
        A ``pandas.DataFrame`` with one row per complete group of ``<td>``
        cells; every value is the cell's stripped text.

    Raises:
        RuntimeError: If the table or its header is missing, or the header
            has fewer cells than the expected layout needs.
    """
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table", id="normaltable1")
    if table is None:
        raise RuntimeError(
            f"Could not find table with id 'normaltable1' (state={state})"
        )

    # ---------- Build column names ----------
    thead = table.find("thead")
    if thead is None:
        raise RuntimeError(f"Table 'normaltable1' has no <thead> (state={state})")
    th_texts = [th.get_text(" ", strip=True) for th in thead.find_all("th")]

    # Expected <th> order:
    #   0: No.
    #   1: Station ID
    #   2: Station
    #   3: District
    #   4: Last Updated
    #   5: Daily Rainfall (group label)  -> skipped
    #   6: Rainfall from Midnight (...)
    #   7: Total 1 Hour (Now)
    #   8..: date columns (under the Daily Rainfall group)
    if len(th_texts) < 8:
        raise RuntimeError(
            f"Unexpected header layout for state={state}: "
            f"found {len(th_texts)} <th> cells, need at least 8"
        )

    base_cols = th_texts[0:5]   # No., Station ID, Station, District, Last Updated
    date_cols = th_texts[8:]    # e.g. 12/11/2025, 13/11/2025, ...
    tail_cols = th_texts[6:8]   # Rainfall from Midnight (...), Total 1 Hour (Now)

    # Body cells are consumed in this order: base, then dates, then the tail.
    columns = base_cols + date_cols + tail_cols
    ncols = len(columns)

    # ---------- Extract body rows ----------
    tbody = table.find("tbody")
    if tbody is None:
        cells = []
    else:
        cells = [td.get_text(" ", strip=True) for td in tbody.find_all("td")]

    # A trailing partial chunk (e.g. a single placeholder cell) cannot form a
    # row; passing it to DataFrame either raises or yields a padded junk row.
    leftover = len(cells) % ncols
    if leftover:
        print(
            f"Warning: {state}: ignoring {leftover} trailing cell(s) "
            f"that do not fill a {ncols}-column row"
        )
        cells = cells[: len(cells) - leftover]

    # Group every ncols <td> into one row.
    rows = [cells[i:i + ncols] for i in range(0, len(cells), ncols)]
    return pd.DataFrame(rows, columns=columns)


def main():
    """Scrape the rainfall table for every state code and save one combined CSV.

    For each entry of ``STATE_CODES`` the search-result page is downloaded and
    parsed, a ``state_param`` column holding the state code is added, and all
    frames are concatenated and written to ``OUTFILE``. Progress and a short
    preview are printed.

    Raises:
        requests.HTTPError: If any request returns an error status.
        RuntimeError: If a page does not contain the expected table layout.
    """
    all_states = []

    # One session for the whole run so the connection to the host is reused.
    with requests.Session() as session:
        for s in STATE_CODES:
            html = _fetch_state_html(session, s)
            df = _parse_rainfall_table(html, s)
            df["state_param"] = s

            all_states.append(df)
            print(f"Fetched {s}: {len(df)} rows")

    combined = pd.concat(all_states, ignore_index=True)
    # dirname is "" for a bare filename; fall back to the current directory.
    os.makedirs(os.path.dirname(OUTFILE) or ".", exist_ok=True)
    combined.to_csv(OUTFILE, index=False)

    print("\n✅ Raw combined CSV saved:", os.path.abspath(OUTFILE))
    print("Total rows:", len(combined))
    print(combined.head())

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()