### 경차

In [11]:
import os, json, time
import requests
import pandas as pd
from pathlib import Path
from urllib.parse import urlparse, unquote
from requests.adapters import HTTPAdapter, Retry

if '__file__' in globals():
    REPO_ROOT = Path(__file__).resolve().parent.parent
else:
    REPO_ROOT = Path.cwd().parent
DATA_DIR = REPO_ROOT / "data"
CSV_PATH = DATA_DIR / "light_cars.csv"
DATA_DIR.mkdir(parents=True, exist_ok=True)

BASE_URL = "https://api.encar.com/search/car/list/general"
HEADERS = {
    "user-agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/140.0.0.0 Safari/537.36"),
    "accept": "application/json, text/plain, */*",
    "accept-language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
    "origin": "https://www.encar.com",
    "referer": "https://www.encar.com/dc/dc_carsearchlist.do",
}

def make_session():
    s = requests.Session()
    retries = Retry(total=5, backoff_factor=1.2,
                    status_forcelist=[429, 500, 502, 503, 504],
                    allowed_methods=["GET"])
    s.mount("https://", HTTPAdapter(max_retries=retries))
    s.headers.update(HEADERS)
    return s

def get_json(s, params, tag):
    r = s.get(BASE_URL, params=params, timeout=15)
    if r.status_code != 200:
        raise RuntimeError(f"HTTP {r.status_code} url={r.url} body={r.text[:200]}")
    if "application/json" not in r.headers.get("Content-Type","").lower():
        (DATA_DIR / f"{tag}_{int(time.time())}.html").write_text(r.text, encoding="utf-8")
        raise ValueError("Non-JSON")
    return r.json()

def normalize_sort(x):
    if not x: return "ModifiedDate"
    t = str(x).strip()
    if t.lower() == "modifieddate" or t == "ModifiedDate": return "ModifiedDate"
    return "ModifiedDate"

def parse_dc_url(url: str):
    frag = urlparse(url).fragment
    if not frag: raise ValueError("no #! fragment")
    decoded = unquote(frag)
    if "!" in decoded:
        decoded = decoded.split("!")[-1]
        decoded = unquote(decoded)
    i, j = decoded.find("{"), decoded.rfind("}")
    if i == -1 or j == -1: raise ValueError("no JSON in fragment")
    obj = json.loads(decoded[i:j+1])
    obj = {k.lower(): v for k, v in obj.items()}
    action = obj.get("action")
    sort = normalize_sort(obj.get("sort"))
    if not action: raise ValueError("action missing")
    return action, sort

def build_action(categories, car_type="Y", inspection=False, record=False):
    names = [str(c).strip() for c in categories if c and str(c).strip()]
    names = list(dict.fromkeys(names))
    elems = [f"C.CarType.{car_type}."]
    if names:
        if len(names) == 1:
            elems.append(f"Category.{names[0]}.")
            inner = "._.".join(elems)
            if inspection: inner += "._.Condition.Inspection."
            if record: inner += "._.Condition.Record."
            return f"(And.Hidden.N._.({inner}))"
        joined = "Category." + "._.Category.".join(names) + "."
        elems.append(f"(Or.{joined})")
    inner = "._.".join(elems)
    if inspection: inner += "._.Condition.Inspection."
    if record: inner += "._.Condition.Record."
    return f"(And.Hidden.N._.({inner}))"

def get_total_count(s, action, sort):
    j = get_json(s, {"count":"true","q":action,"sr":f"|{sort}|0|1"}, "count")
    return int(j.get("Count", 0) or 0)

def crawl_all_from_action(action, sort="ModifiedDate", limit=50, csv_path=CSV_PATH, sleep_sec=0.6):
    s = make_session()
    total = get_total_count(s, action, sort)
    if total == 0:
        print("No items found"); return
    if csv_path.exists(): csv_path.unlink()
    wrote_header = False
    for offset in range(0, total, limit):
        params = {"count":"false","q":action,"sr":f"|{sort}|{offset}|{limit}"}
        data = get_json(s, params, "page")
        rows = data.get("SearchResults", [])
        if not rows: break
        df = pd.json_normalize(rows, max_level=1)
        df.to_csv(csv_path, mode="a", index=False, encoding="utf-8-sig", header=not wrote_header)
        wrote_header = True
        time.sleep(sleep_sec)
    print(f"완료: {csv_path}")

def crawl_all_from_dc_url(dc_url, limit=50, csv_path=CSV_PATH, sleep_sec=0.6):
    action, sort = parse_dc_url(dc_url)
    crawl_all_from_action(action, sort=sort, limit=limit, csv_path=csv_path, sleep_sec=sleep_sec)

def main():
    use_dc_url = True
    if use_dc_url:
        dc_url = r"https://www.encar.com/dc/dc_carsearchlist.do#!%7B%22action%22%3A%22(And.Hidden.N._.C.CarType.Y._.Category.%EA%B2%BD%EC%B0%A8._.Condition.Inspection._.Condition.Record.)%22%2C%22toggle%22%3A%7B%7D%2C%22layer%22%3A%22%22%2C%22sort%22%3A%22ModifiedDate%22%2C%22page%22%3A1%2C%22limit%22%3A20%2C%22searchKey%22%3A%22%22%2C%22loginCheck%22%3Afalse%7D"
        crawl_all_from_dc_url(dc_url, limit=50, csv_path=CSV_PATH)
    else:
        action = build_action(["경차"], car_type="Y", inspection=True, record=True)
        crawl_all_from_action(action, sort="ModifiedDate", limit=50, csv_path=CSV_PATH)

if __name__ == "__main__":
    main()


RuntimeError: HTTP 400 url=https://api.encar.com/search/car/list/general?count=true&q=%28And.Hidden.N._.C.CarType.Y._.Category.%EA%B2%BD%EC%B0%A8._.Condition.Inspection._.Condition.Record.%29&sr=%7CModifiedDate%7C0%7C1 body=

In [10]:
import os, json, math, time
import requests
import pandas as pd
from pathlib import Path
from urllib.parse import urlparse, unquote
from requests.adapters import HTTPAdapter, Retry

if '__file__' in globals():
    REPO_ROOT = Path(__file__).resolve().parent.parent
else:
    REPO_ROOT = Path.cwd().parent
DATA_DIR = REPO_ROOT / "data"
CSV_PATH = DATA_DIR / "domestic_cars.csv"
DATA_DIR.mkdir(parents=True, exist_ok=True)

BASE_URL = "https://api.encar.com/search/car/list/premium"
HEADERS = {
    "user-agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/140.0.0.0 Safari/537.36"),
    "accept": "application/json, text/plain, */*",
    "accept-language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
    "origin": "https://www.encar.com",
    "referer": "https://www.encar.com/dc/dc_carsearchlist.do",
}

def make_session() -> requests.Session:
    s = requests.Session()
    retries = Retry(total=5, backoff_factor=1.2,
                    status_forcelist=[429, 500, 502, 503, 504],
                    allowed_methods=["GET"])
    s.mount("https://", HTTPAdapter(max_retries=retries))
    s.headers.update(HEADERS)
    return s

def get_json(s: requests.Session, params: dict, error_prefix: str):
    r = s.get(BASE_URL, params=params, timeout=15)
    ct = r.headers.get("Content-Type", "")
    if r.status_code != 200:
        raise RuntimeError(f"HTTP {r.status_code} url={r.url} body={r.text[:200]}")
    if "application/json" not in ct.lower():
        fname = DATA_DIR / f"{error_prefix}_{int(time.time())}.html"
        fname.write_text(r.text, encoding="utf-8")
        raise ValueError(f"Non-JSON response saved to {fname}")
    return r.json()

def parse_dc_url(url: str):
    frag = urlparse(url).fragment
    if not frag:
        raise ValueError("No hash fragment found in URL.")
    decoded = unquote(frag)
    if "!%7B" in frag or decoded.count("{") == 0:
        parts = decoded.split("!")
        decoded = unquote(parts[-1])
    if not decoded.strip().startswith("{"):
        i, j = decoded.find("{"), decoded.rfind("}")
        if i == -1 or j == -1:
            raise ValueError("No JSON found in fragment.")
        decoded = decoded[i:j+1]
    obj = json.loads(decoded)
    obj = {k.lower(): v for k, v in obj.items()}
    action = obj.get("action")
    sort = obj.get("sort") or "ModifiedDate"
    page = int(obj.get("page") or 1)
    limit = int(obj.get("limit") or 20)
    if not action:
        raise ValueError("action not found in fragment JSON.")
    return action, sort, page, limit

def crawl_with_action(action: str, sort: str = "ModifiedDate", limit: int = 50, csv_path: Path = CSV_PATH):
    s = make_session()
    base_params = {"count": "false", "q": action}
    first = get_json(s, {**base_params, "sr": f"|{sort}|0|1"}, "encar_first_error")
    total = int(first.get("Count", 0) or 0)
    if total == 0:
        print("No items found.")
        return
    items_per_page = limit
    pages = math.ceil(total / items_per_page)
    all_items = []
    for p in range(pages):
        offset = p * items_per_page
        params = {**base_params, "sr": f"|{sort}|{offset}|{items_per_page}"}
        try:
            data = get_json(s, params, "encar_page_error")
            all_items.extend(data.get("SearchResults", []))
        except Exception as e:
            print(f"[WARN] page={p+1} offset={offset} error={e}")
            time.sleep(3)
            continue
        time.sleep(0.6)
    df = pd.json_normalize(all_items, max_level=1)
    df.to_csv(csv_path, index=False, encoding="utf-8-sig")
    print(f"총 {len(df)}개 저장 -> {csv_path}")

def main():
    dc_url = r"https://www.encar.com/dc/dc_carsearchlist.do#!%7B%22action%22%3A%22(And.Hidden.N._.(C.CarType.Y._.(Or.Category.%EA%B2%BD%EC%B0%A8._.Category.%EC%86%8C%ED%98%95%EC%B0%A8._.Category.%EC%A4%80%EC%A4%91%ED%98%95%EC%B0%A8._.Category.%EC%A4%91%ED%98%95%EC%B0%A8._.Category.%EB%8C%80%ED%98%95%EC%B0%A8._.Category.SUV.)))%22,%22sort%22:%22ModifiedDate%22,%22page%22:1,%22limit%22:20,%22layer%22:%22%22,%22loginCheck%22:false%7D"
    action, sort, page, limit = parse_dc_url(dc_url)
    crawl_with_action(action, sort=sort, limit=50, csv_path=CSV_PATH)

if __name__ == "__main__":
    main()


KeyboardInterrupt: 

In [4]:
import os, json, math, time
import requests
import pandas as pd
from pathlib import Path
from urllib.parse import urlparse, unquote
from requests.adapters import HTTPAdapter, Retry

if '__file__' in globals():
    REPO_ROOT = Path(__file__).resolve().parent.parent
else:
    REPO_ROOT = Path.cwd().parent
DATA_DIR = REPO_ROOT / "data"
CSV_PATH = DATA_DIR / "domestic_cars.csv"
DATA_DIR.mkdir(parents=True, exist_ok=True)

BASE_URL = "https://api.encar.com/search/car/list/premium"
HEADERS = {
    "user-agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/140.0.0.0 Safari/537.36"),
    "accept": "application/json, text/plain, */*",
    "accept-language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
    "origin": "https://www.encar.com",
    "referer": "https://www.encar.com/dc/dc_carsearchlist.do",
}

def make_session() -> requests.Session:
    s = requests.Session()
    retries = Retry(total=5, backoff_factor=1.2,
                    status_forcelist=[429, 500, 502, 503, 504],
                    allowed_methods=["GET"])
    s.mount("https://", HTTPAdapter(max_retries=retries))
    s.headers.update(HEADERS)
    return s

def get_json(s: requests.Session, params: dict, error_prefix: str):
    r = s.get(BASE_URL, params=params, timeout=15)
    ct = r.headers.get("Content-Type", "")
    if r.status_code != 200:
        raise RuntimeError(f"HTTP {r.status_code} url={r.url} body={r.text[:200]}")
    if "application/json" not in ct.lower():
        fname = DATA_DIR / f"{error_prefix}_{int(time.time())}.html"
        fname.write_text(r.text, encoding="utf-8")
        raise ValueError(f"Non-JSON response saved to {fname}")
    return r.json()

def parse_dc_url(url: str):
    frag = urlparse(url).fragment
    if not frag:
        raise ValueError("No hash fragment found in URL.")
    decoded = unquote(frag)
    if "!%7B" in frag or decoded.count("{") == 0:
        parts = decoded.split("!")
        decoded = unquote(parts[-1])
    if not decoded.strip().startswith("{"):
        i, j = decoded.find("{"), decoded.rfind("}")
        if i == -1 or j == -1:
            raise ValueError("No JSON found in fragment.")
        decoded = decoded[i:j+1]
    obj = json.loads(decoded)
    obj = {k.lower(): v for k, v in obj.items()}
    action = obj.get("action")
    sort = obj.get("sort") or "ModifiedDate"
    page = int(obj.get("page") or 1)
    limit = int(obj.get("limit") or 20)
    if not action:
        raise ValueError("action not found in fragment JSON.")
    return action, sort, page, limit

def build_action_from_categories(categories, car_type="Y"):
    cats = [str(c).strip() for c in categories if c and str(c).strip()]
    cats = sorted(set(cats))
    if not cats:
        return f"(And.Hidden.N._.(C.CarType.{car_type}.))"
    inner = "._.".join([f"Category.{c}." for c in cats])
    return f"(And.Hidden.N._.(C.CarType.{car_type}._.(Or.{inner})))"

def crawl_one_page(action: str, sort: str = "ModifiedDate", page: int = 1, limit: int = 20, csv_path: Path = CSV_PATH):
    s = make_session()
    offset = max(page - 1, 0) * limit
    params = {"count": "false", "q": action, "sr": f"|{sort}|{offset}|{limit}"}
    data = get_json(s, params, "encar_page_error")
    rows = data.get("SearchResults", [])
    df = pd.json_normalize(rows, max_level=1)
    df.to_csv(csv_path, index=False, encoding="utf-8-sig")
    print(f"총 {len(df)}개 저장 -> {csv_path}")

def main():
    categories = ["경차", "소형차", "중형차", "대형차", "SUV"]
    action = build_action_from_categories(categories, car_type="Y")
    sort = "ModifiedDate"
    page = 1
    limit = 20
    crawl_one_page(action, sort=sort, page=page, limit=limit, csv_path=CSV_PATH)

if __name__ == "__main__":
    main()


RuntimeError: HTTP 400 url=https://api.encar.com/search/car/list/premium?count=false&q=%28And.Hidden.N._.%28C.CarType.Y._.%28Or.Category.SUV.._.Category.%EA%B2%BD%EC%B0%A8.._.Category.%EB%8C%80%ED%98%95%EC%B0%A8.._.Category.%EC%86%8C%ED%98%95%EC%B0%A8.._.Category.%EC%A4%91%ED%98%95%EC%B0%A8.%29%29%29&sr=%7CModifiedDate%7C0%7C20 body=

In [21]:
import os, json, time
import requests
import pandas as pd
from pathlib import Path
from requests.adapters import HTTPAdapter, Retry

if '__file__' in globals():
    REPO_ROOT = Path(__file__).resolve().parent.parent
else:
    REPO_ROOT = Path.cwd().parent
DATA_DIR = REPO_ROOT / "data"
CSV_PATH = DATA_DIR / "compact_cars.csv"
DATA_DIR.mkdir(parents=True, exist_ok=True)

BASE_URL = "https://api.encar.com/search/car/list/premium"
HEADERS = {
    "user-agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/140.0.0.0 Safari/537.36"),
    "accept": "application/json, text/plain, */*",
    "accept-language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
    "origin": "https://www.encar.com",
    "referer": "https://www.encar.com/dc/dc_carsearchlist.do",
}

def make_session():
    s = requests.Session()
    retries = Retry(total=5, backoff_factor=1.2,
                    status_forcelist=[429, 500, 502, 503, 504],
                    allowed_methods=["GET"])
    s.mount("https://", HTTPAdapter(max_retries=retries))
    s.headers.update(HEADERS)
    return s

def get_json(s, params, tag):
    r = s.get(BASE_URL, params=params, timeout=15)
    if r.status_code != 200:
        raise RuntimeError(f"HTTP {r.status_code} url={r.url} body={r.text[:200]}")
    if "application/json" not in r.headers.get("Content-Type","").lower():
        (DATA_DIR / f"{tag}_{int(time.time())}.html").write_text(r.text, encoding="utf-8")
        raise ValueError("Non-JSON")
    return r.json()

def build_action_from_categories(categories, car_type="Y"):
    names = [str(c).strip() for c in categories if c and str(c).strip()]
    names = list(dict.fromkeys(names))
    if not names:
        return f"(And.Hidden.N._.(C.CarType.{car_type}.))"
    joined = "Category." + "._.Category.".join(names) + "."
    return f"(And.Hidden.N._.(C.CarType.{car_type}._.(Or.{joined})))"

def get_total_count(s, action, sort):
    j = get_json(s, {"count":"true", "q":action, "sr":f"|{sort}|0|1"}, "count")
    return int(j.get("Count", 0) or 0)

def crawl_all_pages(action, sort="ModifiedDate", limit=50, csv_path=CSV_PATH, sleep_sec=0.6):
    s = make_session()
    total = get_total_count(s, action, sort)
    if total == 0:
        print("No items found"); return
    if csv_path.exists():
        csv_path.unlink()
    saved = 0
    wrote_header = False
    for offset in range(0, total, limit):
        params = {"count":"false", "q":action, "sr":f"|{sort}|{offset}|{limit}"}
        data = get_json(s, params, "encar_page_error")
        rows = data.get("SearchResults", [])
        if not rows:
            break
        df = pd.json_normalize(rows, max_level=1)
        df.to_csv(csv_path, mode="a", index=False, encoding="utf-8-sig", header=not wrote_header)
        wrote_header = True
        saved += len(df)
        time.sleep(sleep_sec)
    print(f"총 {saved}개 저장 -> {csv_path}")

def main():
    categories = ["소형차"]
    action = build_action_from_categories(categories, car_type="Y")
    crawl_all_pages(action, sort="ModifiedDate", limit=50, csv_path=CSV_PATH)

if __name__ == "__main__":
    main()


총 1155개 저장 -> c:\Users\User\Desktop\Project\backend\data-pipeline\data\compact_cars.csv


In [20]:
import os, json, time
import requests
import pandas as pd
from pathlib import Path
from requests.adapters import HTTPAdapter, Retry

if '__file__' in globals():
    REPO_ROOT = Path(__file__).resolve().parent.parent
else:
    REPO_ROOT = Path.cwd().parent
DATA_DIR = REPO_ROOT / "data"
CSV_PATH = DATA_DIR / "compact_cars.csv"
DATA_DIR.mkdir(parents=True, exist_ok=True)

BASE_URL = "https://api.encar.com/search/car/list/premium"
HEADERS = {
    "user-agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/140.0.0.0 Safari/537.36"),
    "accept": "application/json, text/plain, */*",
    "accept-language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
    "origin": "https://www.encar.com",
    "referer": "https://www.encar.com/dc/dc_carsearchlist.do",
}

def make_session():
    s = requests.Session()
    retries = Retry(total=5, backoff_factor=1.2,
                    status_forcelist=[429, 500, 502, 503, 504],
                    allowed_methods=["GET"])
    s.mount("https://", HTTPAdapter(max_retries=retries))
    s.headers.update(HEADERS)
    return s

def get_json(s, params, tag):
    r = s.get(BASE_URL, params=params, timeout=15)
    if r.status_code != 200:
        raise RuntimeError(f"HTTP {r.status_code} url={r.url} body={r.text[:200]}")
    if "application/json" not in r.headers.get("Content-Type","").lower():
        (DATA_DIR / f"{tag}_{int(time.time())}.html").write_text(r.text, encoding="utf-8")
        raise ValueError("Non-JSON")
    return r.json()

def build_action_from_categories(categories, car_type="Y"):
    names = [str(c).strip() for c in categories if c and str(c).strip()]
    names = list(dict.fromkeys(names))
    if not names:
        return f"(And.Hidden.N._.(C.CarType.{car_type}._.Condition.Record._.Condition.Inspection.))"
    joined = "Category." + "._.Category.".join(names) + "._.Condition.Record._.Condition.Inspection."
    return f"(And.Hidden.N._.(C.CarType.{car_type}._.(Or.{joined})))"

def get_total_count(s, action, sort):
    j = get_json(s, {"count":"true", "q":action, "sr":f"|{sort}|0|1"}, "count")
    return int(j.get("Count", 0) or 0)

def crawl_all_pages(action, sort="ModifiedDate", limit=50, csv_path=CSV_PATH, sleep_sec=0.6):
    s = make_session()
    total = get_total_count(s, action, sort)
    if total == 0:
        print("No items found"); return
    if csv_path.exists():
        csv_path.unlink()
    saved = 0
    wrote_header = False
    for offset in range(0, total, limit):
        params = {"count":"false", "q":action, "sr":f"|{sort}|{offset}|{limit}"}
        data = get_json(s, params, "encar_page_error")
        rows = data.get("SearchResults", [])
        if not rows:
            break
        df = pd.json_normalize(rows, max_level=1)
        df.to_csv(csv_path, mode="a", index=False, encoding="utf-8-sig", header=not wrote_header)
        wrote_header = True
        saved += len(df)
        time.sleep(sleep_sec)
    print(f"총 {saved}개 저장 -> {csv_path}")

def main():
    categories = ["소형차"]
    action = build_action_from_categories(categories, car_type="Y")
    crawl_all_pages(action, sort="ModifiedDate", limit=50, csv_path=CSV_PATH)

if __name__ == "__main__":
    main()


KeyboardInterrupt: 

In [None]:
import os, json, time
import requests
import pandas as pd
from pathlib import Path
from requests.adapters import HTTPAdapter, Retry

if '__file__' in globals():
    REPO_ROOT = Path(__file__).resolve().parent.parent
else:
    REPO_ROOT = Path.cwd().parent
DATA_DIR = REPO_ROOT / "data"
CSV_PATH = DATA_DIR / "compact_cars_kor.csv"
DATA_DIR.mkdir(parents=True, exist_ok=True)

BASE_URL = "https://api.encar.com/search/car/list/premium"
HEADERS = {
    "user-agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/140.0.0.0 Safari/537.36"),
    "accept": "application/json, text/plain, */*",
    "accept-language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
    "origin": "https://www.encar.com",
    "referer": "https://www.encar.com/dc/dc_carsearchlist.do",
}

def make_session():
    s = requests.Session()
    retries = Retry(total=5, backoff_factor=1.2,
                    status_forcelist=[429, 500, 502, 503, 504],
                    allowed_methods=["GET"])
    s.mount("https://", HTTPAdapter(max_retries=retries))
    s.headers.update(HEADERS)
    return s

def get_json(s, params, tag):
    r = s.get(BASE_URL, params=params, timeout=15)
    if r.status_code != 200:
        raise RuntimeError(f"HTTP {r.status_code} url={r.url} body={r.text[:200]}")
    if "application/json" not in r.headers.get("Content-Type","").lower():
        (DATA_DIR / f"{tag}_{int(time.time())}.html").write_text(r.text, encoding="utf-8")
        raise ValueError("Non-JSON")
    return r.json()

def build_action_from_categories(categories, car_type="Y"):
    names = [str(c).strip() for c in categories if c and str(c).strip()]
    names = list(dict.fromkeys(names))
    if not names:
        return f"(And.Hidden.N._.(C.CarType.{car_type}.))"
    joined = "Category." + "._.Category.".join(names) + "."
    return f"(And.Hidden.N._.(C.CarType.{car_type}._.(Or.{joined})))"

def get_total_count(s, action, sort):
    j = get_json(s, {"count":"true", "q":action, "sr":f"|{sort}|0|1"}, "count")
    return int(j.get("Count", 0) or 0)

def make_detail_url(cid: str) -> str:
    return f"https://fem.encar.com/cars/detail/{cid}?pageid=dc_carsearch&listAdvType=pic&carid={cid}&view_type=normal"

def crawl_all_pages(action, sort="ModifiedDate", limit=50, csv_path=CSV_PATH, sleep_sec=0.6):
    s = make_session()
    total = get_total_count(s, action, sort)
    if total == 0:
        print("No items found"); return
    if csv_path.exists():
        csv_path.unlink()
    saved = 0
    wrote_header = False
    for offset in range(0, total, limit):
        params = {"count":"false", "q":action, "sr":f"|{sort}|{offset}|{limit}"}
        data = get_json(s, params, "encar_page_error")
        rows = data.get("SearchResults", [])
        if not rows:
            break
        df = pd.json_normalize(rows, max_level=1)
        id_col = next((c for c in ["Id","id","carId","carid"] if c in df.columns), None)
        if id_col:
            df["detail_url"] = df[id_col].astype(str).map(make_detail_url)
        df.to_csv(csv_path, mode="a", index=False, encoding="utf-8-sig", header=not wrote_header)
        wrote_header = True
        saved += len(df)
        time.sleep(sleep_sec)
    print(f"총 {saved}개 저장 -> {csv_path}")

def main():
    categories = ["소형차"]
    action = build_action_from_categories(categories, car_type="Y")
    crawl_all_pages(action, sort="ModifiedDate", limit=50, csv_path=CSV_PATH)

if __name__ == "__main__":
    main()


총 1153개 저장 -> c:\Users\User\Desktop\Project\backend\data-pipeline\data\compact_cars.csv


In [None]:
import os, json, time
import requests
import pandas as pd
from pathlib import Path
from requests.adapters import HTTPAdapter, Retry

if '__file__' in globals():
    REPO_ROOT = Path(__file__).resolve().parent.parent
else:
    REPO_ROOT = Path.cwd().parent
DATA_DIR = REPO_ROOT / "data"
CSV_PATH = DATA_DIR / "small_cars_kor.csv"
DATA_DIR.mkdir(parents=True, exist_ok=True)

BASE_URL = "https://api.encar.com/search/car/list/premium"
HEADERS = {
    "user-agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/140.0.0.0 Safari/537.36"),
    "accept": "application/json, text/plain, */*",
    "accept-language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
    "origin": "https://www.encar.com",
    "referer": "https://www.encar.com/dc/dc_carsearchlist.do",
}

def make_session():
    s = requests.Session()
    retries = Retry(total=5, backoff_factor=1.2,
                    status_forcelist=[429, 500, 502, 503, 504],
                    allowed_methods=["GET"])
    s.mount("https://", HTTPAdapter(max_retries=retries))
    s.headers.update(HEADERS)
    return s

def get_json(s, params, tag):
    r = s.get(BASE_URL, params=params, timeout=15)
    if r.status_code != 200:
        raise RuntimeError(f"HTTP {r.status_code} url={r.url} body={r.text[:200]}")
    if "application/json" not in r.headers.get("Content-Type","").lower():
        (DATA_DIR / f"{tag}_{int(time.time())}.html").write_text(r.text, encoding="utf-8")
        raise ValueError("Non-JSON")
    return r.json()

def build_action_from_categories(categories, car_type="Y"):
    names = [str(c).strip() for c in categories if c and str(c).strip()]
    names = list(dict.fromkeys(names))
    if not names:
        return f"(And.Hidden.N._.(C.CarType.{car_type}.))"
    joined = "Category." + "._.Category.".join(names) + "."
    return f"(And.Hidden.N._.(C.CarType.{car_type}._.(Or.{joined})))"

def get_total_count(s, action, sort):
    j = get_json(s, {"count":"true", "q":action, "sr":f"|{sort}|0|1"}, "count")
    return int(j.get("Count", 0) or 0)

def make_detail_url(cid: str) -> str:
    return f"https://fem.encar.com/cars/detail/{cid}?pageid=dc_carsearch&listAdvType=pic&carid={cid}&view_type=normal"

def crawl_all_pages(action, sort="ModifiedDate", limit=50, csv_path=CSV_PATH, sleep_sec=0.6):
    s = make_session()
    total = get_total_count(s, action, sort)
    if total == 0:
        print("No items found"); return
    if csv_path.exists():
        csv_path.unlink()
    saved = 0
    wrote_header = False
    for offset in range(0, total, limit):
        params = {"count":"false", "q":action, "sr":f"|{sort}|{offset}|{limit}"}
        data = get_json(s, params, "encar_page_error")
        rows = data.get("SearchResults", [])
        if not rows:
            break
        df = pd.json_normalize(rows, max_level=1)
        id_col = next((c for c in ["Id","id","carId","carid"] if c in df.columns), None)
        if id_col:
            df["detail_url"] = df[id_col].astype(str).map(make_detail_url)
        df.to_csv(csv_path, mode="a", index=False, encoding="utf-8-sig", header=not wrote_header)
        wrote_header = True
        saved += len(df)
        time.sleep(sleep_sec)
    print(f"총 {saved}개 저장 -> {csv_path}")

def main():
    categories = ["경차"]
    action = build_action_from_categories(categories, car_type="Y")
    crawl_all_pages(action, sort="ModifiedDate", limit=50, csv_path=CSV_PATH)

if __name__ == "__main__":
    main()
