## Encar 크롤러 raw 버전

### 1. 국내차 목록

In [None]:
import os, json, time
import requests
import pandas as pd
from pathlib import Path
from requests.adapters import HTTPAdapter, Retry

# Resolve the repository root: two directories above this file when
# `__file__` is defined, otherwise the parent of the current working
# directory (e.g. when the code runs in an interactive session).
REPO_ROOT = (
    Path(__file__).resolve().parent.parent
    if '__file__' in globals()
    else Path.cwd().parent
)

# All crawler output lives under <repo>/data; create it up front so later
# writes cannot fail on a missing directory.
DATA_DIR = REPO_ROOT.joinpath("data")
DATA_DIR.mkdir(parents=True, exist_ok=True)
CSV_PATH = DATA_DIR.joinpath("korean_cars.csv")

# Search endpoint queried for both the count and the paged listing requests.
BASE_URL = "https://api.encar.com/search/car/list/premium"

# Browser-like request headers installed on the shared session.
HEADERS = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36"
    ),
    "accept": "application/json, text/plain, */*",
    "accept-language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
    "origin": "https://www.encar.com",
    "referer": "https://www.encar.com/dc/dc_carsearchlist.do",
}

# Maps lower-cased English category names to the Korean names used in the
# Encar query DSL.
ENG2KOR = {
    "small car": "경차",
    "compact car": "소형차",
    "semi-medium car": "준중형차",
    "medium car": "중형차",
    "large car": "대형차",
    "sports car": "스포츠카",
    "suv": "SUV",
}


def norm_cat_for_dsl(name: str) -> str:
    """Translate a category label into the name used in the query DSL.

    The label is stringified and stripped, then looked up case-insensitively
    in ``ENG2KOR``. Labels without an entry are returned stripped but
    otherwise unchanged.
    """
    stripped = str(name).strip()
    try:
        return ENG2KOR[stripped.lower()]
    except KeyError:
        return stripped

def make_session():
    """Create a ``requests.Session`` with retries and the default headers.

    HTTPS GET requests are retried (``total=5``, backoff factor 1.2) when the
    server answers 429, 500, 502, 503 or 504. ``HEADERS`` is merged into the
    session's default headers.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=1.2,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"],
    )
    session = requests.Session()
    session.mount("https://", HTTPAdapter(max_retries=retry_policy))
    session.headers.update(HEADERS)
    return session

def get_json(s, params, tag):
    """GET ``BASE_URL`` with ``params`` and return the decoded JSON body.

    Args:
        s: Session used to issue the request.
        params: Query-string parameters for the request.
        tag: Filename prefix for the dump written when the body is not JSON.

    Raises:
        RuntimeError: The response status is not 200.
        ValueError: The Content-Type is not JSON; the body is first saved to
            ``DATA_DIR/<tag>_<unix time>.html`` for inspection.
    """
    response = s.get(BASE_URL, params=params, timeout=15)
    status = response.status_code
    if status != 200:
        raise RuntimeError(f"HTTP {status} url={response.url} body={response.text[:200]}")

    content_type = response.headers.get("Content-Type", "").lower()
    if "application/json" in content_type:
        return response.json()

    # Keep the unexpected payload on disk before failing.
    dump_path = DATA_DIR / f"{tag}_{int(time.time())}.html"
    dump_path.write_text(response.text, encoding="utf-8")
    raise ValueError("Non-JSON")

def build_action_from_categories(categories, car_type="Y"):
    """Build the query expression for a car type and optional categories.

    Falsy and whitespace-only entries are dropped; the remaining names are
    stripped and de-duplicated in first-seen order. With no names left the
    expression filters on ``car_type`` only; otherwise the names are OR-ed
    together as ``Category.<name>.`` clauses.
    """
    unique_names = {}
    for raw in categories:
        if not raw:
            continue
        cleaned = str(raw).strip()
        if cleaned:
            # dict keys preserve insertion order and ignore repeats
            unique_names.setdefault(cleaned, None)

    car_clause = f"C.CarType.{car_type}."
    if not unique_names:
        return f"(And.Hidden.N._.({car_clause}))"

    category_clauses = "._.".join(f"Category.{label}" for label in unique_names) + "."
    return f"(And.Hidden.N._.({car_clause}_.(Or.{category_clauses})))"

def get_total_count(s, action, sort):
    """Return the number of listings matching ``action``.

    Issues a count-only request and reads the ``Count`` field of the JSON
    reply; a missing or falsy value yields 0.
    """
    query = {"count": "true", "q": action, "sr": f"|{sort}|0|1"}
    payload = get_json(s, query, "count")
    raw_count = payload.get("Count", 0)
    return int(raw_count) if raw_count else 0

def make_detail_url(cid: str) -> str:
    """Return the fem.encar.com detail-page URL for the listing id ``cid``."""
    page = f"https://fem.encar.com/cars/detail/{cid}"
    query = f"pageid=dc_carsearch&listAdvType=pic&carid={cid}&view_type=normal"
    return f"{page}?{query}"

def crawl_all_pages_single_category(cat_label_en: str, sort="ModifiedDate", limit=50, car_type="Y"):
    """Download every result page for one category into a DataFrame.

    Args:
        cat_label_en: Category label; translated with ``norm_cat_for_dsl``
            for the query and stored verbatim in the ``Category`` column.
        sort: Sort key placed in the ``sr`` request parameter.
        limit: Page size (rows requested per call); must be positive.
        car_type: Value for the ``CarType`` filter of the query. Defaults to
            ``"Y"``, the value this crawler has always used.

    Returns:
        One DataFrame with all pages concatenated, or an empty DataFrame when
        the category has no results.

    Raises:
        ValueError: ``limit`` is not positive, or a response is not JSON.
        RuntimeError: A request returned a non-200 status.
    """
    if limit <= 0:
        # range() would reject 0 and silently yield nothing for negatives.
        raise ValueError(f"limit must be positive, got {limit}")

    cat_kor = norm_cat_for_dsl(cat_label_en)
    action = build_action_from_categories([cat_kor], car_type=car_type)
    all_chunks = []

    # The session owns pooled connections; the context manager closes them
    # even when a request raises part-way through the crawl.
    with make_session() as s:
        total = get_total_count(s, action, sort)
        for offset in range(0, total, limit):
            params = {"count": "false", "q": action, "sr": f"|{sort}|{offset}|{limit}"}
            data = get_json(s, params, "encar_page_error")
            rows = data.get("SearchResults", [])
            if not rows:
                # Fewer rows than the reported total: stop paging.
                break
            df = pd.json_normalize(rows, max_level=1)
            # The id column name is probed because its spelling is not fixed here.
            id_col = next((c for c in ["Id", "id", "carId", "carid"] if c in df.columns), None)
            if id_col:
                df["detail_url"] = df[id_col].astype(str).map(make_detail_url)
            df["Category"] = cat_label_en
            all_chunks.append(df)
            # Pause between requests to stay gentle on the API.
            time.sleep(0.6)

    return pd.concat(all_chunks, ignore_index=True) if all_chunks else pd.DataFrame()

def main():
    """Crawl each configured category and write the combined rows to CSV_PATH.

    Any existing CSV is removed before crawling starts. Categories that
    return no rows are skipped; if none return rows, nothing is written.
    """
    # NOTE(review): "light car" has no entry in ENG2KOR, so norm_cat_for_dsl
    # passes it to the query untranslated — confirm this is intended.
    categories_en = ["light car", "small car", "semi-medium car", "large car", "SUV"]

    if CSV_PATH.exists():
        CSV_PATH.unlink()

    # Crawl categories one after another, keeping only non-empty results.
    crawled = (
        crawl_all_pages_single_category(label, sort="ModifiedDate", limit=50)
        for label in categories_en
    )
    frames = [frame for frame in crawled if not frame.empty]

    if not frames:
        print("No items found for given categories.")
        return

    merged = pd.concat(frames, ignore_index=True)
    merged.to_csv(CSV_PATH, index=False, encoding="utf-8-sig")
    print(f"총 {len(merged)}개 저장 -> {CSV_PATH}")

# Entry point: run the crawl when this code is executed directly rather than
# imported as a module.
if __name__ == "__main__":
    main()


총 87048개 저장 -> c:\Users\User\Desktop\Project\backend\data-pipeline\data\korean_cars.csv


### 2. 외제차 목록

In [None]:
import os, json, time
import requests
import pandas as pd
from pathlib import Path
from requests.adapters import HTTPAdapter, Retry

# Resolve the repository root: two directories above this file when
# `__file__` is defined, otherwise the parent of the current working
# directory (e.g. when the code runs in an interactive session).
REPO_ROOT = (
    Path(__file__).resolve().parent.parent
    if '__file__' in globals()
    else Path.cwd().parent
)

# All crawler output lives under <repo>/data; create it up front so later
# writes cannot fail on a missing directory.
DATA_DIR = REPO_ROOT.joinpath("data")
DATA_DIR.mkdir(parents=True, exist_ok=True)
CSV_PATH = DATA_DIR.joinpath("foreign_cars.csv")

# Search endpoint queried for both the count and the paged listing requests.
BASE_URL = "https://api.encar.com/search/car/list/premium"

# Browser-like request headers installed on the shared session.
HEADERS = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36"
    ),
    "accept": "application/json, text/plain, */*",
    "accept-language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
    "origin": "https://www.encar.com",
    "referer": "https://www.encar.com/fc/fc_carsearchlist.do",
}

# Maps lower-cased English category names to the Korean names used in the
# Encar query DSL.
ENG2KOR = {
    "small car": "경차",
    "compact car": "소형차",
    "semi-medium car": "준중형차",
    "medium car": "중형차",
    "large car": "대형차",
    "sports car": "스포츠카",
    "suv": "SUV",
}


def norm_cat_for_dsl(name: str) -> str:
    """Translate a category label into the name used in the query DSL.

    The label is stringified and stripped, then looked up case-insensitively
    in ``ENG2KOR``. Labels without an entry are returned stripped but
    otherwise unchanged.
    """
    stripped = str(name).strip()
    try:
        return ENG2KOR[stripped.lower()]
    except KeyError:
        return stripped

def make_session():
    """Create a ``requests.Session`` with retries and the default headers.

    HTTPS GET requests are retried (``total=5``, backoff factor 1.2) when the
    server answers 429, 500, 502, 503 or 504. ``HEADERS`` is merged into the
    session's default headers.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=1.2,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"],
    )
    session = requests.Session()
    session.mount("https://", HTTPAdapter(max_retries=retry_policy))
    session.headers.update(HEADERS)
    return session

def get_json(s, params, tag):
    """GET ``BASE_URL`` with ``params`` and return the decoded JSON body.

    Args:
        s: Session used to issue the request.
        params: Query-string parameters for the request.
        tag: Filename prefix for the dump written when the body is not JSON.

    Raises:
        RuntimeError: The response status is not 200.
        ValueError: The Content-Type is not JSON; the body is first saved to
            ``DATA_DIR/<tag>_<unix time>.html`` for inspection.
    """
    response = s.get(BASE_URL, params=params, timeout=15)
    status = response.status_code
    if status != 200:
        raise RuntimeError(f"HTTP {status} url={response.url} body={response.text[:200]}")

    content_type = response.headers.get("Content-Type", "").lower()
    if "application/json" in content_type:
        return response.json()

    # Keep the unexpected payload on disk before failing.
    dump_path = DATA_DIR / f"{tag}_{int(time.time())}.html"
    dump_path.write_text(response.text, encoding="utf-8")
    raise ValueError("Non-JSON")

def build_action_from_categories(categories, car_type="Y"):
    """Build the query expression for a car type and optional categories.

    Falsy and whitespace-only entries are dropped; the remaining names are
    stripped and de-duplicated in first-seen order. With no names left the
    expression filters on ``car_type`` only; otherwise the names are OR-ed
    together as ``Category.<name>.`` clauses.
    """
    unique_names = {}
    for raw in categories:
        if not raw:
            continue
        cleaned = str(raw).strip()
        if cleaned:
            # dict keys preserve insertion order and ignore repeats
            unique_names.setdefault(cleaned, None)

    car_clause = f"C.CarType.{car_type}."
    if not unique_names:
        return f"(And.Hidden.N._.({car_clause}))"

    category_clauses = "._.".join(f"Category.{label}" for label in unique_names) + "."
    return f"(And.Hidden.N._.({car_clause}_.(Or.{category_clauses})))"

def get_total_count(s, action, sort):
    """Return the number of listings matching ``action``.

    Issues a count-only request and reads the ``Count`` field of the JSON
    reply; a missing or falsy value yields 0.
    """
    query = {"count": "true", "q": action, "sr": f"|{sort}|0|1"}
    payload = get_json(s, query, "count")
    raw_count = payload.get("Count", 0)
    return int(raw_count) if raw_count else 0

def make_detail_url(cid: str) -> str:
    """Return the fem.encar.com detail-page URL for the listing id ``cid``."""
    # NOTE(review): pageid is "dc_carsearch" although this cell's referer
    # points at the fc_ search page — confirm whether foreign-car listings
    # should use a different page id.
    page = f"https://fem.encar.com/cars/detail/{cid}"
    query = f"pageid=dc_carsearch&listAdvType=pic&carid={cid}&view_type=normal"
    return f"{page}?{query}"

def crawl_all_pages_single_category(cat_label_en: str, sort="ModifiedDate", limit=50, car_type="N"):
    """Download every foreign-car result page for one category into a DataFrame.

    This cell previously queried with ``CarType.Y``, the same filter the
    domestic crawler uses, so both crawls returned essentially the same
    listing (the recorded runs saved 87048 and 87024 rows). The filter now
    defaults to ``"N"``.
    NOTE(review): ``"N"`` is taken to be Encar's imported-car value —
    confirm against the requests made by the site's fc_carsearchlist page.

    Args:
        cat_label_en: Category label; translated with ``norm_cat_for_dsl``
            for the query and stored verbatim in the ``Category`` column.
        sort: Sort key placed in the ``sr`` request parameter.
        limit: Page size (rows requested per call); must be positive.
        car_type: Value for the ``CarType`` filter of the query.

    Returns:
        One DataFrame with all pages concatenated, or an empty DataFrame when
        the category has no results.

    Raises:
        ValueError: ``limit`` is not positive, or a response is not JSON.
        RuntimeError: A request returned a non-200 status.
    """
    if limit <= 0:
        # range() would reject 0 and silently yield nothing for negatives.
        raise ValueError(f"limit must be positive, got {limit}")

    cat_kor = norm_cat_for_dsl(cat_label_en)
    action = build_action_from_categories([cat_kor], car_type=car_type)
    all_chunks = []

    # The session owns pooled connections; the context manager closes them
    # even when a request raises part-way through the crawl.
    with make_session() as s:
        total = get_total_count(s, action, sort)
        for offset in range(0, total, limit):
            params = {"count": "false", "q": action, "sr": f"|{sort}|{offset}|{limit}"}
            data = get_json(s, params, "encar_page_error")
            rows = data.get("SearchResults", [])
            if not rows:
                # Fewer rows than the reported total: stop paging.
                break
            df = pd.json_normalize(rows, max_level=1)
            # The id column name is probed because its spelling is not fixed here.
            id_col = next((c for c in ["Id", "id", "carId", "carid"] if c in df.columns), None)
            if id_col:
                df["detail_url"] = df[id_col].astype(str).map(make_detail_url)
            df["Category"] = cat_label_en
            all_chunks.append(df)
            # Pause between requests to stay gentle on the API.
            time.sleep(0.6)

    return pd.concat(all_chunks, ignore_index=True) if all_chunks else pd.DataFrame()

def main():
    """Crawl each configured category and write the combined rows to CSV_PATH.

    Any existing CSV is removed before crawling starts. Categories that
    return no rows are skipped; if none return rows, nothing is written.
    """
    # NOTE(review): "light car" has no entry in ENG2KOR, so norm_cat_for_dsl
    # passes it to the query untranslated — confirm this is intended.
    categories_en = ["light car", "small car", "semi-medium car", "large car", "SUV"]

    if CSV_PATH.exists():
        CSV_PATH.unlink()

    # Crawl categories one after another, keeping only non-empty results.
    crawled = (
        crawl_all_pages_single_category(label, sort="ModifiedDate", limit=50)
        for label in categories_en
    )
    frames = [frame for frame in crawled if not frame.empty]

    if not frames:
        print("No items found for given categories.")
        return

    merged = pd.concat(frames, ignore_index=True)
    merged.to_csv(CSV_PATH, index=False, encoding="utf-8-sig")
    print(f"총 {len(merged)}개 저장 -> {CSV_PATH}")

# Entry point: run the crawl when this code is executed directly rather than
# imported as a module.
if __name__ == "__main__":
    main()


총 87024개 저장 -> c:\Users\User\Desktop\Project\backend\data-pipeline\data\foreign_cars.csv
