In [None]:
import os, json, math, time
import requests
import pandas as pd
from pathlib import Path
from requests.adapters import HTTPAdapter, Retry


# Repository root: two directories above this file when `__file__` is defined,
# otherwise the parent of the current working directory (presumably the
# interactive / notebook case, where `__file__` does not exist).
REPO_ROOT = (
    Path(__file__).resolve().parent.parent
    if '__file__' in globals()
    else Path.cwd().parent
)

# Output locations; the data directory is created up front so later writes
# (CSV output, debug dumps of bad responses) never fail on a missing folder.
DATA_DIR = REPO_ROOT.joinpath("data")
DATA_DIR.mkdir(parents=True, exist_ok=True)
CSV_PATH = DATA_DIR.joinpath("avante_cn7_sample.csv")

# Base API configuration: the search endpoint every request is sent to.
BASE_URL = "https://api.encar.com/search/car/list/premium"

# Browser-like headers attached to every request made through the session.
HEADERS = {
    "user-agent": " ".join((
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/140.0.0.0 Safari/537.36",
    )),
    "accept": "application/json, text/plain, */*",
    "accept-language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
    "origin": "https://www.encar.com",
    "referer": "https://www.encar.com/",
}

# Parameters for the first page. For later pages only `sr` is rewritten, as
# "|ModifiedDate|<offset>|<page size>".
QUERY_PARAMS = {
    "count": "true",
    "q": "(And.Hidden.N._.(C.CarType.Y._.(C.Manufacturer.현대._.(C.ModelGroup.아반떼._.Model.아반떼 (CN7_).))))",
    "sr": "|ModifiedDate|0|50",
}

# Session / request utilities
def make_session() -> requests.Session:
    """Create a session with the default headers and a retry policy.

    The retry policy allows up to 5 retries with a backoff factor of 1.2,
    restricted to GET requests; responses with status 429, 500, 502, 503 or
    504 are treated as retryable. The adapter is mounted for ``https://``
    URLs only.

    Returns:
        A ``requests.Session`` whose headers include ``HEADERS``.
    """
    retry_policy = Retry(
        total=5,  # maximum number of retries
        backoff_factor=1.2,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"],
    )
    adapter = HTTPAdapter(max_retries=retry_policy)

    session = requests.Session()
    session.headers.update(HEADERS)
    session.mount("https://", adapter)
    return session

def get_json(s: requests.Session, params: dict, error_prefix: str = "encar_error"):
    """Send a GET request to ``BASE_URL`` and return the decoded JSON body.

    Args:
        s: Session used to perform the request.
        params: Query-string parameters for the request.
        error_prefix: File-name prefix for the debug dump written when the
            response is not JSON.

    Returns:
        The parsed JSON payload of the response.

    Raises:
        RuntimeError: The response status code is not 200.
        ValueError: The response Content-Type is not JSON; the body is saved
            to ``DATA_DIR`` as ``<error_prefix>_<unix time>.html`` first.
    """
    response = s.get(BASE_URL, params=params, timeout=15)
    content_type = response.headers.get("Content-Type", "").lower()

    if response.status_code != 200:
        raise RuntimeError(f"HTTP {response.status_code} {response.text[:200]}")

    if "application/json" in content_type:
        return response.json()

    # Unexpected (non-JSON) body: keep a copy in the data folder for debugging.
    dump_path = DATA_DIR / f"{error_prefix}_{int(time.time())}.html"
    dump_path.write_text(response.text, encoding="utf-8")
    raise ValueError(f"Non-JSON response saved to {dump_path}")

# Main logic
def main():
    """Download every listing matching ``QUERY_PARAMS`` and save it as CSV.

    The first request returns both the total item count and the first page of
    results, so that page is reused rather than requested a second time. The
    remaining pages are fetched one by one with a short pause between
    requests. A page that fails is reported and skipped (best effort), and
    the skipped pages are summarised at the end so an incomplete CSV is not
    mistaken for a complete one.

    Raises:
        RuntimeError, ValueError: Propagated from ``get_json`` when the very
            first request fails, since nothing can be collected without it.
    """
    session = make_session()
    items_per_page = 50

    def page_params(offset: int) -> dict:
        """Return a copy of QUERY_PARAMS with `sr` pointing at *offset*."""
        params = dict(QUERY_PARAMS)  # keep the module-level original intact
        params["sr"] = f"|ModifiedDate|{offset}|{items_per_page}"
        return params

    # First call: total count plus the first page of results.
    first_page = get_json(session, page_params(0), "encar_first_error")
    total_count = int(first_page.get("Count", 0) or 0)
    if total_count == 0:
        print("No items found.")
        return
    print(f"Total items to fetch: {total_count}")

    num_pages = math.ceil(total_count / items_per_page)

    # `or []` guards against a present-but-null "SearchResults" field.
    all_items = list(first_page.get("SearchResults") or [])
    failed_pages = []

    # Sleeping *before* each request avoids a useless wait after the last
    # page. 0.6 s between requests was confirmed to work without issues;
    # after a failure the pause is extended to 3 s.
    delay = 0.6
    for page in range(1, num_pages):
        time.sleep(delay)
        offset = page * items_per_page
        try:
            page_data = get_json(session, page_params(offset), "encar_page_error")
            all_items.extend(page_data.get("SearchResults") or [])
            delay = 0.6
        except Exception as e:  # deliberate best effort: skip the page, keep going
            print(f"[WARN] page={page+1} offset={offset} error={e}")
            failed_pages.append(page + 1)
            delay = 3

    if failed_pages:
        print(
            f"[WARN] {len(failed_pages)} page(s) failed and are missing "
            f"from the output: {failed_pages}"
        )

    df = pd.json_normalize(all_items, max_level=1)
    df.to_csv(CSV_PATH, index=False, encoding="utf-8-sig")
    print(f"총 {len(df)}개 데이터 저장 완료 -> [저장경로]: {CSV_PATH}")

# Entry point: run the crawl only when this module is executed directly,
# not when it is imported.
if __name__ == "__main__":
    main()


Total items to fetch: 1358
Saved 1358 rows to c:\Users\User\Desktop\Project\backend\data-pipeline\data\all_items.csv
