In [None]:
import time
import requests
import pandas as pd
import xml.etree.ElementTree as ET
from urllib.parse import urljoin

# Base URL of the water-quality service. Endpoint names are appended with
# urljoin(), which only appends (rather than replaces the last path segment)
# because this URL ends with a trailing slash.
BASE_URL = "https://apis.data.go.kr/1480523/WaterQualityService/"

# Operation names probed in order by resolve_endpoint(); the first one that
# answers with a successful result is used for all later requests.
# NOTE(review): the entries look like spelling variants of the same operation
# name — confirm against the provider's API specification.
CANDIDATE_ENDPOINTS = [
    "getWaterMeasuringList",
    "getWaterMeasuring",
    "getWaterMesuringList",
    "getWQMeasuringList"
]

# Headers sent with every GET issued by _get_with_retry(). JSON is preferred,
# XML is accepted as a lower-priority alternative.
DEFAULT_HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "Accept": "application/json, text/xml;q=0.9, */*;q=0.8",
}

def _parse_xml_items(text: str):
    """Parse an XML API response into ``(items, total)``.

    Args:
        text: Raw XML document returned by the service.

    Returns:
        A tuple ``(items, total)``. ``items`` is a list with one dict per
        ``<item>`` element found under an ``<items>`` element, mapping each
        child tag to its text (``None`` for empty elements). ``total`` is the
        integer value of the first ``<totalCount>`` element, or ``None`` when
        that element is missing or empty.

    Raises:
        xml.etree.ElementTree.ParseError: If ``text`` is not well-formed XML.
        RuntimeError: If the ``<header>`` element is missing or its
            ``resultCode`` is anything other than ``"00"``.
        ValueError: If ``<totalCount>`` holds non-integer text.
    """
    tree = ET.fromstring(text)

    # An Element with no children is falsy, so the presence check must be an
    # explicit comparison against None.
    header_el = tree.find(".//header")
    if header_el is None:
        meta = {}
    else:
        meta = {field.tag: field.text for field in header_el}

    if meta.get("resultCode") != "00":
        raise RuntimeError(f"API 오류(XML): {meta.get('resultCode')} / {meta.get('resultMsg')}")

    count_text = tree.findtext(".//totalCount")
    if count_text:
        total = int(count_text)
    else:
        total = None

    rows = [
        {field.tag: field.text for field in record}
        for record in tree.findall(".//items/item")
    ]

    return rows, total

def _safe_get_items(res: requests.Response):
    """Extract ``(items, total)`` from an API response, trying JSON then XML.

    The body is first decoded as JSON. If it is JSON and carries the expected
    ``response.header.resultCode`` envelope, that envelope decides the
    outcome. Otherwise the raw text is handed to ``_parse_xml_items``.

    Args:
        res: The HTTP response to interpret.

    Returns:
        A tuple ``(items, total)``. ``items`` is whatever the payload holds
        under ``body.items`` (unwrapped from ``{"item": ...}`` when present),
        with any empty value normalised to ``[]``; it may be a single dict
        when the service returns exactly one record. ``total`` is
        ``body.totalCount`` as an int, or ``None`` when absent or unusable.

    Raises:
        requests.HTTPError: If the HTTP status indicates failure.
        RuntimeError: If the JSON envelope reports a ``resultCode`` other
            than ``"00"`` (message carries the API's code and message), or if
            the body can be interpreted neither as the JSON envelope nor as
            a successful XML response (message carries the status and the
            first 200 characters of the body; the XML-side cause is chained).
    """
    res.raise_for_status()
    text = res.text

    # Guard only the decoding step. Previously the whole JSON branch sat in a
    # broad try/except, so the deliberate "API error" RuntimeError below was
    # swallowed and replaced by a generic parse-failure message.
    try:
        data = res.json()
    except ValueError:
        data = None

    envelope = data.get("response") if isinstance(data, dict) else None
    header = envelope.get("header") if isinstance(envelope, dict) else None

    if isinstance(header, dict) and "resultCode" in header:
        if header.get("resultCode") != "00":
            raise RuntimeError(f"API 오류(JSON): {header.get('resultCode')} / {header.get('resultMsg')}")

        body = envelope.get("body")
        if not isinstance(body, dict):
            body = {}

        items = body.get("items", [])
        if isinstance(items, dict) and "item" in items:
            items = items["item"]
        # None, "" and {} all mean "no records on this page".
        if not items:
            items = []

        # A missing or non-numeric count is reported as unknown rather than
        # turning an otherwise valid page into an error.
        try:
            total = int(body.get("totalCount"))
        except (TypeError, ValueError):
            total = None

        return items, total

    # Not JSON, or JSON without the expected envelope: fall back to XML. Any
    # failure here is reported together with the start of the body so that
    # unexpected payloads can be diagnosed from the message.
    try:
        return _parse_xml_items(text)
    except Exception as e_xml:
        raise RuntimeError(
            "[ERROR] 응답을 JSON/XML로 모두 파싱 실패.\n"
            f"status={res.status_code}\n"
            f"text(head)={text[:200]}"
        ) from e_xml

def _get_with_retry(url, params, max_retry=3, sleep=0.8):
    """Issue a GET request, retrying when the request itself fails.

    Only transport-level failures raised by ``requests`` (connection errors,
    timeouts, ...) trigger a retry. A response with an error status code is
    returned as-is; checking the status is left to the caller.

    Args:
        url: Full request URL.
        params: Query parameters passed to ``requests.get``.
        max_retry: Maximum number of attempts; must be at least 1.
        sleep: Seconds to wait between two attempts.

    Returns:
        The ``requests.Response`` of the first attempt that completes.

    Raises:
        ValueError: If ``max_retry`` is smaller than 1.
        requests.RequestException: The error of the last attempt when every
            attempt failed.
    """
    # With zero attempts there would be no response and no error to re-raise;
    # the old code ended in ``raise None`` (a confusing TypeError).
    if max_retry < 1:
        raise ValueError(f"max_retry must be >= 1, got {max_retry}")

    last_err = None
    for attempt in range(1, max_retry + 1):
        try:
            return requests.get(url, params=params, headers=DEFAULT_HEADERS, timeout=30)
        except requests.RequestException as e:
            last_err = e
            # Waiting after the final attempt would only delay the error.
            if attempt < max_retry:
                time.sleep(sleep)
    raise last_err

def resolve_endpoint(service_key: str, test_year: int = 2024):
    """Find the first candidate endpoint that answers a probe successfully.

    Each name in ``CANDIDATE_ENDPOINTS`` is probed in order with a one-row
    request for ``test_year``. A candidate is accepted when the request
    completes and ``_safe_get_items`` accepts the response.

    Args:
        service_key: API service key sent as ``serviceKey``.
        test_year: Year used as the ``wmyrList`` filter of the probe request.

    Returns:
        The name of the first working endpoint.

    Raises:
        RuntimeError: If no candidate works. The message lists the failure
            reason of every candidate and the last failure is chained as the
            cause, so an invalid key or a network outage can be told apart
            from a wrong endpoint name.
    """
    failures = []
    last_err = None

    for ep in CANDIDATE_ENDPOINTS:
        url = urljoin(BASE_URL, ep)
        params = {
            "serviceKey": service_key,
            "pageNo": 1,
            "numOfRows": 1,
            "returnType": "json",
            "wmyrList": str(test_year)
        }
        try:
            res = _get_with_retry(url, params)
            _safe_get_items(res)
        except Exception as e:
            # Probing boundary: any failure only disqualifies this candidate,
            # but the reason is kept instead of being silently dropped.
            failures.append(f"{ep}: {type(e).__name__}: {e}")
            last_err = e
            continue
        return ep

    details = "; ".join(failures) if failures else "no candidate endpoints configured"
    raise RuntimeError(
        "유효한 요청주소를 찾지 못했음. "
        f"({details})"
    ) from last_err

def fetch_water_measuring_year(year: int, service_key: str, num_of_rows=1000, verbose=True, endpoint=None):
    """Download every record of one year, page by page, into a DataFrame.

    Args:
        year: Year sent as the ``wmyrList`` filter.
        service_key: API service key sent as ``serviceKey``.
        num_of_rows: Page size requested from the service.
        verbose: When true, print the chosen endpoint (if it had to be
            resolved here) and a progress line every fifth page.
        endpoint: Endpoint name to use; resolved via ``resolve_endpoint``
            when ``None``.

    Returns:
        A ``pandas.DataFrame`` with one row per collected record (empty when
        the service returns no records).

    Raises:
        Whatever ``resolve_endpoint``, ``_get_with_retry`` or
        ``_safe_get_items`` raise; rows collected so far are discarded.
    """
    if endpoint is None:
        endpoint = resolve_endpoint(service_key, test_year=year)
        if verbose:
            print("[INFO] Using endpoint:", endpoint)

    url = urljoin(BASE_URL, endpoint)

    page_no = 1
    all_items = []

    while True:
        params = {
            "serviceKey": service_key,
            "pageNo": page_no,
            "numOfRows": num_of_rows,
            "returnType": "json",
            "wmyrList": str(year)
        }

        res = _get_with_retry(url, params)
        items, total = _safe_get_items(res)

        # A single record may arrive as a bare dict instead of a list.
        if isinstance(items, dict):
            items = [items]
        if not items:
            break

        all_items.extend(items)

        if verbose and page_no % 5 == 0:
            print(f"[INFO] year={year} page={page_no} collected={len(all_items)} / total={total}")

        # Stop on the number of rows actually received. Comparing
        # page_no * num_of_rows against the total assumes the server honoured
        # the requested page size; if it returns fewer rows per page, that
        # check ends the loop early and silently drops data.
        if total is not None and len(all_items) >= total:
            break

        page_no += 1
        # Short pause between pages to avoid hammering the service.
        time.sleep(0.1)

    return pd.DataFrame(all_items)

def fetch_water_measuring_2023_2025(service_key: str, num_of_rows=1000, verbose=True):
    """Download the records of 2023, 2024 and 2025 and return them as one DataFrame.

    The endpoint is resolved once (probing with year 2024) and reused for all
    three yearly downloads.

    Args:
        service_key: API service key.
        num_of_rows: Page size forwarded to each yearly download.
        verbose: When true, print the resolved endpoint and let the yearly
            downloads print their progress.

    Returns:
        The three yearly frames concatenated in year order with a fresh
        0..n-1 index.
    """
    endpoint = resolve_endpoint(service_key, test_year=2024)
    if verbose:
        print("[INFO] Final endpoint:", endpoint)

    yearly_frames = [
        fetch_water_measuring_year(
            target_year,
            service_key,
            num_of_rows=num_of_rows,
            verbose=verbose,
            endpoint=endpoint,
        )
        for target_year in (2023, 2024, 2025)
    ]

    return pd.concat(yearly_frames, ignore_index=True)


In [None]:
import os
from getpass import getpass

from google.colab import drive

# Mount Google Drive so the resulting CSV outlives the Colab session.
drive.mount("/content/drive")

# The service key is a credential and must not be stored in the notebook
# source. It is read from the DATA_GO_KR_SERVICE_KEY environment variable,
# falling back to a hidden interactive prompt. Any key that has been
# committed in plain text should be treated as exposed and reissued.
SERVICE_KEY = os.environ.get("DATA_GO_KR_SERVICE_KEY") or getpass("data.go.kr service key: ")

df = fetch_water_measuring_2023_2025(
    service_key=SERVICE_KEY,
    num_of_rows=1000,
    verbose=True
)

# utf-8-sig writes a BOM at the start of the file.
# NOTE(review): presumably chosen so spreadsheet tools detect UTF-8 and show
# the Korean text correctly — confirm.
save_path = "/content/drive/MyDrive/water_measuring_2023_2025.csv"
df.to_csv(save_path, index=False, encoding="utf-8-sig")

print("[DONE] saved ->", save_path)
print("rows:", len(df))
df.head()


Mounted at /content/drive
[INFO] Final endpoint: getWaterMeasuringList
[INFO] year=2023 page=5 collected=5000 / total=41081
[INFO] year=2023 page=10 collected=10000 / total=41081
[INFO] year=2023 page=15 collected=15000 / total=41081
[INFO] year=2023 page=20 collected=20000 / total=41081
[INFO] year=2023 page=25 collected=25000 / total=41081
[INFO] year=2023 page=30 collected=30000 / total=41081
[INFO] year=2023 page=35 collected=35000 / total=41081
[INFO] year=2023 page=40 collected=40000 / total=41081
[INFO] year=2024 page=5 collected=5000 / total=35005
[INFO] year=2024 page=10 collected=10000 / total=35005
[INFO] year=2024 page=15 collected=15000 / total=35005
[INFO] year=2024 page=20 collected=20000 / total=35005
[INFO] year=2024 page=25 collected=25000 / total=35005
[INFO] year=2024 page=30 collected=30000 / total=35005
[INFO] year=2024 page=35 collected=35000 / total=35005
[INFO] year=2025 page=5 collected=5000 / total=24206
[INFO] year=2025 page=10 collected=10000 / total=24206


Unnamed: 0,rowno,ptNo,ptNm,addr,orgNm,wmyr,wmod,wmwk,lonDgr,lonMin,...,itemChcl3,itemToc,itemDehp,itemAntimon,itemDiox,itemHcho,itemHcb,itemNi,itemBa,itemSe
0,1,2002A36,길안천1,경상북도 청송군 안덕면 고와리 고와2교,낙동강물환경연구소,2023,6,2회차,128,58,...,,7.1,,,,,,,,
1,2,2002A36,길안천1,경상북도 청송군 안덕면 고와리 고와2교,낙동강물환경연구소,2023,6,1회차,128,58,...,,3.8,,,,,,,,
2,3,2002A36,길안천1,경상북도 청송군 안덕면 고와리 고와2교,낙동강물환경연구소,2023,7,3회차,128,58,...,,4.1,,,,,,,,
3,4,2002A36,길안천1,경상북도 청송군 안덕면 고와리 고와2교,낙동강물환경연구소,2023,7,2회차,128,58,...,,5.4,,,,,,,,
4,5,2002A36,길안천1,경상북도 청송군 안덕면 고와리 고와2교,낙동강물환경연구소,2023,7,1회차,128,58,...,,3.6,,,,,,,,
