### Encar 차량매물

In [None]:
import numbers
import re
import time

import pandas as pd
import requests
from requests.adapters import HTTPAdapter, Retry

# Category mapping: lowercase English label -> Korean category name.
# "compact car" and "small car" intentionally share the same Korean value.
ENG2KOR = {
    "light car": "경차",
    "compact car": "소형차",
    "small car": "소형차",
    "semi-medium car": "준중형차",
    "medium car": "중형차",
    "large car": "대형차",
    "sports car": "스포츠카",
    "suv": "SUV",
    "rv": "RV",
    "van": "승합차",
    "truck": "화물차",
}

def norm_cat_for_dsl(name: str) -> str:
    """Translate an English category label into its Korean name.

    The lookup in ``ENG2KOR`` ignores case and surrounding whitespace. A label
    with no entry is returned exactly as it was passed in (not stripped or
    lower-cased).
    """
    lookup_key = str(name).strip().lower()
    if lookup_key in ENG2KOR:
        return ENG2KOR[lookup_key]
    return name

# Per-market request settings: the CarType flag used in the query, the Referer
# header to send, and the pageid embedded in detail-page URLs.
MARKET = {
    "korean": {
        "car_type": "Y",
        "referer": "https://www.encar.com/dc/dc_carsearchlist.do",
        "pageid": "dc_carsearch",
    },
    "foreign": {
        "car_type": "N",
        "referer": "https://www.encar.com/fc/fc_carsearchlist.do",
        "pageid": "fc_carsearch",
    },
}

# Search endpoint queried by get_json().
BASE_URL = "https://api.encar.com/search/car/list/premium"

def make_session(referer: str) -> requests.Session:
    """Create a ``requests`` session configured for the search API.

    The session ignores environment proxy settings, retries GET requests up to
    five times on 429/5xx responses with exponential backoff, and sends
    browser-like headers.

    Args:
        referer: Value for the ``referer`` request header.

    Returns:
        The configured session; the caller owns it.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=1.2,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"],
    )
    browser_headers = {
        "user-agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/140.0.0.0 Safari/537.36"),
        "accept": "application/json, text/plain, */*",
        "accept-language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
        "origin": "https://www.encar.com",
        "referer": referer,
    }

    session = requests.Session()
    # Do not pick up proxies (or other settings) from the environment.
    session.trust_env = False
    session.proxies = {}
    session.mount("https://", HTTPAdapter(max_retries=retry_policy))
    session.headers.update(browser_headers)
    return session

def get_json(s: requests.Session, params: dict):
    """GET ``BASE_URL`` with ``params`` and return the decoded JSON body.

    Raises:
        requests.HTTPError: If the response status is an error.
        ValueError: If the response Content-Type is not JSON.
    """
    response = s.get(BASE_URL, params=params, timeout=15)
    response.raise_for_status()
    content_type = response.headers.get("Content-Type", "").lower()
    if "application/json" in content_type:
        return response.json()
    raise ValueError(f"Non-JSON: {response.url}")

def build_action_from_categories(categories, car_type="Y"):
    """Build the search query expression for a set of categories.

    Falsy and blank entries are dropped, names are stripped, and duplicates
    are removed while keeping first-seen order.

    Args:
        categories: Iterable of category names.
        car_type: Value placed in the ``CarType`` clause.

    Returns:
        The query string. With no usable categories it filters on ``CarType``
        only; otherwise the categories are OR-ed together.
    """
    unique_names = {}
    for raw in categories:
        if not raw:
            continue
        label = str(raw).strip()
        if label:
            # A dict keeps insertion order, so this de-duplicates stably.
            unique_names.setdefault(label, None)

    if not unique_names:
        return f"(And.Hidden.N._.(C.CarType.{car_type}.))"

    clauses = "._.".join(f"Category.{label}" for label in unique_names)
    return f"(And.Hidden.N._.(C.CarType.{car_type}._.(Or.{clauses}.)))"

def get_total_count(s, action, sort="ModifiedDate"):
    """Return the ``Count`` the search API reports for ``action``.

    A one-row page is requested with ``count=true``; a missing or falsy
    ``Count`` field yields 0.
    """
    query = {"count": "true", "q": action, "sr": f"|{sort}|0|1"}
    payload = get_json(s, query)
    reported = payload.get("Count", 0)
    return int(reported) if reported else 0

def make_detail_url(cid: int, pageid: str) -> str:
    """Return the detail-page URL for car ``cid`` tagged with ``pageid``."""
    page = f"https://fem.encar.com/cars/detail/{cid}"
    query = f"pageid={pageid}&listAdvType=pic&carid={cid}&view_type=normal"
    return f"{page}?{query}"

def to_int_safe(x):
    """Convert ``x`` to ``int``, returning ``None`` when that is not possible.

    * ``None`` -> ``None``.
    * Real numbers are truncated with ``int()``. This includes NumPy scalar
      types such as ``np.int64``, which are not ``int`` subclasses and would
      otherwise be rejected. NaN and infinities yield ``None``.
    * Strings: commas are removed, then every run of digits is concatenated
      and parsed (so ``"2021-06"`` becomes ``202106``). A string without
      digits yields ``None``.
    * Any other type -> ``None``.
    """
    if x is None:
        return None
    if isinstance(x, numbers.Real):
        try:
            return int(x)
        except (ValueError, OverflowError):
            # int(nan) raises ValueError, int(inf) raises OverflowError.
            return None
    if isinstance(x, str):
        digit_runs = re.findall(r"\d+", x.replace(",", ""))
        return int("".join(digit_runs)) if digit_runs else None
    return None

def extract_photo(row: pd.Series):
    """Pick a photo reference out of one raw result row.

    A non-empty string in ``Photo`` wins. Otherwise the first element of the
    ``Photos`` list is used: a string is returned as is, and for a dict the
    first non-empty value among a few known URL keys is returned.

    Returns:
        The photo value, or ``None`` when nothing usable is found.
    """
    direct = row.get("Photo")
    if isinstance(direct, str) and direct:
        return direct

    gallery = row.get("Photos")
    if not isinstance(gallery, list) or not gallery:
        return None

    head = gallery[0]
    if isinstance(head, str):
        return head
    if isinstance(head, dict):
        url_keys = ("url", "Url", "uri", "Uri", "imageUrl", "ImageUrl")
        return next((head[k] for k in url_keys if k in head and head[k]), None)
    return None

# Output columns, in the order shape_rows() returns them.
WANTED = [
    "vehicleId",
    "Market",
    "Category",
    "Manufacturer",
    "Model",
    "Badge",
    "BadgeDetail",
    "Transmission",
    "FuelType",
    "Year",
    "Mileage",
    "Price",
    "SellType",
    "OfficeCityState",
    "detail_url",
    "Photo",
]

def shape_rows(df_raw: pd.DataFrame, pageid: str, category_fallback: str, market_key: str) -> pd.DataFrame:
    """Project raw search results onto the fixed ``WANTED`` column layout.

    Args:
        df_raw: Flattened search results, one row per listing.
        pageid: Page id embedded in each generated detail URL.
        category_fallback: Category written to every row when ``df_raw`` has
            no populated ``Category`` / ``CategoryName`` column.
        market_key: Value written to the ``Market`` column.

    Returns:
        A frame with exactly the ``WANTED`` columns, indexed like ``df_raw``.
        Text columns use the ``string`` dtype and numeric columns the nullable
        ``Int64`` dtype. Source columns that are absent come back all-missing.

    Raises:
        KeyError: If none of the known id columns is present in ``df_raw``.
    """
    id_col = next((c for c in ["vehicleId", "VehicleId", "id", "Id", "carId", "carid"] if c in df_raw.columns), None)
    if id_col is None:
        raise KeyError("vehicleId column not found in SearchResults")

    # Start from df_raw's index so every assignment below aligns row-for-row,
    # including scalar broadcasts and zero-row inputs.
    df = pd.DataFrame(index=df_raw.index)

    # Cast to nullable Int64 immediately: a single unparseable id would
    # otherwise promote the column to float and leak "12345.0" into the URLs.
    df["vehicleId"] = df_raw[id_col].apply(to_int_safe).astype("Int64")
    df["Market"] = market_key

    # Prefer a populated Category column, then CategoryName, then the fallback.
    category = None
    for col in ("Category", "CategoryName"):
        if col in df_raw.columns and df_raw[col].notna().any():
            category = df_raw[col]
            break
    # The fallback is broadcast as a scalar so it cannot misalign with the index.
    df["Category"] = category_fallback if category is None else category

    # DataFrame.get returns None for a missing column, giving an all-missing column.
    for col in ("Manufacturer", "Model", "Badge", "BadgeDetail",
                "Transmission", "FuelType", "SellType", "OfficeCityState"):
        df[col] = df_raw.get(col)

    for col in ("Year", "Mileage", "Price"):
        df[col] = df_raw[col].apply(to_int_safe) if col in df_raw else None

    df["detail_url"] = [
        make_detail_url(int(v), pageid) if pd.notna(v) else None
        for v in df["vehicleId"]
    ]
    # Row-wise loop rather than DataFrame.apply(axis=1), which returns a
    # DataFrame (not a Series) when there are no rows.
    df["Photo"] = [extract_photo(row) for _, row in df_raw.iterrows()]

    for c in ["Market", "Manufacturer", "Model", "Category", "Badge", "BadgeDetail",
              "Transmission", "FuelType", "SellType", "OfficeCityState", "detail_url", "Photo"]:
        df[c] = df[c].astype("string")
    for c in ["vehicleId", "Year", "Mileage", "Price"]:
        df[c] = df[c].astype("Int64")
    return df[WANTED]

# Vehicle listing crawler
def crawl_market(market_key: str, categories_en, sort="ModifiedDate", limit=10):
    """Collect up to ``limit`` listings for one market.

    Categories are visited in the given order, and each one contributes its
    first page of results until ``limit`` rows have been gathered. Categories
    with a reported count of zero are announced on stdout and skipped.

    Args:
        market_key: Key into ``MARKET`` (``"korean"`` or ``"foreign"``).
        categories_en: Iterable of English category labels.
        sort: Sort field passed to the search API.
        limit: Maximum number of rows to return in total. Previously the
            result was always truncated to 10 rows regardless of this value;
            the default of 10 yields the same output as before.

    Returns:
        A frame in the ``shape_rows`` layout with at most ``limit`` rows, or
        an empty ``pd.DataFrame()`` when nothing was collected.

    Raises:
        KeyError: If ``market_key`` is not in ``MARKET``.
    """
    conf = MARKET[market_key]
    if limit <= 0:
        return pd.DataFrame()

    frames = []
    collected = 0
    # The context manager closes the session's connections when done.
    with make_session(conf["referer"]) as s:
        for cat_en in categories_en:
            remaining = limit - collected
            if remaining <= 0:
                # Enough rows already; skip requests whose rows would be discarded.
                break
            cat_kor = norm_cat_for_dsl(cat_en)
            action = build_action_from_categories([cat_kor], car_type=conf["car_type"])
            total = get_total_count(s, action, sort)
            if total == 0:
                print(f"[{market_key}] {cat_en} → 0건 (skip)")
                continue
            # Only the first page is fetched, sized to the rows still needed.
            params = {"count": "false", "q": action, "sr": f"|{sort}|0|{remaining}"}
            rows = get_json(s, params).get("SearchResults", [])
            if not rows:
                continue
            raw = pd.json_normalize(rows, max_level=1)
            shaped = shape_rows(raw, pageid=conf["pageid"], category_fallback=cat_en, market_key=market_key)
            frames.append(shaped)
            collected += len(shaped)

    if not frames:
        return pd.DataFrame()
    # head() guards against the API returning more rows than requested.
    return pd.concat(frames, ignore_index=True).head(limit)

# Regex capturing the value of a "vehicleNo" JSON string field in page text.
RE_VEH_NO = re.compile(
    r'"vehicleNo"\s*:\s*"([^"]+)"',
    flags=re.DOTALL,
)

def fetch_vehicle_no(url, session):
    """Fetch a detail page and extract its ``vehicleNo`` value.

    This is a best-effort lookup: request failures, non-OK responses and pages
    without a ``vehicleNo`` field all yield ``None`` instead of raising.

    Args:
        url: Detail-page URL. Anything that is not a non-empty string is
            treated as missing, and no request is made for it.
        session: Session object whose ``get`` method is used for the request.

    Returns:
        The captured ``vehicleNo`` string, or ``None``.
    """
    if not (isinstance(url, str) and url):
        return None
    try:
        resp = session.get(url, timeout=6)
    except requests.RequestException:
        # Swallow network-level failures only; KeyboardInterrupt and
        # programming errors must still propagate.
        return None
    if not resp.ok:
        return None
    m = RE_VEH_NO.search(resp.text)
    return m.group(1) if m else None

# Attach vehicleNo values looked up from each row's detail page
def attach_vehicle_no(df, max_count=10):
    """Return the first ``max_count`` rows of ``df`` with a ``vehicleNo`` column.

    Each row's ``detail_url`` is fetched through ``fetch_vehicle_no``, with a
    short pause after every request. Rows whose lookup fails get ``None``.

    Args:
        df: Frame containing a ``detail_url`` column. It is not modified.
        max_count: Maximum number of rows to process; negative values are
            treated as 0.

    Returns:
        A copy of the leading rows with ``vehicleNo`` as the second column.

    Raises:
        KeyError: If ``df`` has rows but no ``detail_url`` column.
    """
    out = df.head(max(max_count, 0)).copy()
    # An empty frame may have no columns at all, so only index it when rows exist.
    urls = out["detail_url"].tolist() if len(out) else []

    retries = Retry(total=1, backoff_factor=0.2,
                    status_forcelist=[429, 500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retries, pool_connections=10, pool_maxsize=10)

    vehicle_no_list = []
    # The context manager closes the session's pooled connections on exit.
    with requests.Session() as s:
        s.trust_env = False
        s.mount("https://", adapter)
        s.mount("http://", adapter)
        s.headers.update({
            "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                           "AppleWebKit/537.36 (KHTML, like Gecko) "
                           "Chrome/140.0.0.0 Safari/537.36"),
            "Referer": "https://fem.encar.com/",
            "Origin": "https://fem.encar.com",
        })
        for url in urls:
            vehicle_no_list.append(fetch_vehicle_no(url, s))
            time.sleep(0.1)  # brief pause between detail-page requests

    out["vehicleNo"] = vehicle_no_list

    # Move vehicleNo to the second position (right after vehicleId).
    cols = [c for c in out.columns if c != "vehicleNo"]
    cols.insert(1, "vehicleNo")
    return out[cols]

# Example run: crawl domestic ("korean") listings for five size categories,
# then look up vehicleNo for up to 10 of the returned rows.
# NOTE: both calls perform live HTTP requests.
categories_en = ["light car", "compact car", "semi-medium car", "medium car", "large car"]
df_vehicles = crawl_market("korean", categories_en, limit=10)
df_with_vehicleNo = attach_vehicle_no(df_vehicles, max_count=10)

# Bare expression: presumably shown as the cell output when run in a notebook.
df_with_vehicleNo


Unnamed: 0,vehicleId,vehicleNo,Market,Category,Manufacturer,Model,Badge,BadgeDetail,Transmission,FuelType,Year,Mileage,Price,SellType,OfficeCityState,detail_url,Photo
0,40324975,21다2154,korean,light car,쉐보레(GM대우),더 넥스트 스파크,LTZ,,오토,가솔린,201702,58738,800,일반,대구,https://fem.encar.com/cars/detail/40324975?pag...,/carpicture01/pic4031/40314369_
1,40420353,103모5461,korean,light car,기아,더 뉴 레이,밴,프레스티지,오토,가솔린,202106,62770,830,일반,서울,https://fem.encar.com/cars/detail/40420353?pag...,/carpicture01/pic4041/40419277_
2,40063189,151다1867,korean,light car,기아,더 뉴 레이,프레스티지,,오토,가솔린,202107,30501,1250,일반,경기,https://fem.encar.com/cars/detail/40063189?pag...,/carpicture05/pic4005/40056575_
3,40384049,115보9540,korean,light car,기아,더 뉴 레이,밴,스탠다드,오토,가솔린,202108,47748,1050,일반,경기,https://fem.encar.com/cars/detail/40384049?pag...,/carpicture08/pic4038/40382973_
4,40040739,195다8338,korean,light car,기아,모닝 어반 (JA),시그니처,(세부등급 없음),오토,가솔린,202008,4795,1430,일반,부산,https://fem.encar.com/cars/detail/40040739?pag...,/carpicture03/pic4003/40034268_
5,40387393,138소6636,korean,light car,기아,더 뉴 레이,프레스티지,,오토,가솔린,202109,24708,1320,일반,서울,https://fem.encar.com/cars/detail/40387393?pag...,/carpicture08/pic4038/40383468_
6,40537536,126모4017,korean,light car,현대,캐스퍼,터보 인스퍼레이션,,오토,가솔린,202311,40653,1780,일반,경기,https://fem.encar.com/cars/detail/40537536?pag...,/carpicture03/pic4053/40536721_
7,40527910,381노2695,korean,light car,기아,올 뉴 모닝 (JA),럭셔리,,오토,가솔린,202002,54716,820,일반,경기,https://fem.encar.com/cars/detail/40527910?pag...,/carpicture01/pic4051/40513756_
8,40503020,225서8049,korean,light car,기아,더 뉴 레이,밴,프레스티지,오토,가솔린,202106,63218,9999,일반,경기,https://fem.encar.com/cars/detail/40503020?pag...,/carpicture08/pic4048/40487759_
9,40504020,180저1335,korean,light car,쉐보레(GM대우),더 뉴 스파크,프리미어,,오토,가솔린,202004,57417,920,일반,서울,https://fem.encar.com/cars/detail/40504020?pag...,/carpicture10/pic4050/40501547_
