In [None]:
# 필요 라이브러리 가져오기
import re
import time
import requests
import pandas as pd
from requests.adapters import HTTPAdapter, Retry

# Category mapping: English label -> Korean category name.
ENG2KOR = {
    "light car": "경차",
    "compact car": "소형차",
    "small car": "소형차",
    "semi-medium car": "준중형차",
    "medium car": "중형차",
    "large car": "대형차",
    "sports car": "스포츠카",
    "suv": "SUV",
    "rv": "RV",
    "van": "승합차",
    "truck": "화물차",
}


def norm_cat_for_dsl(name: str) -> str:
    """Translate a category label through ``ENG2KOR``.

    The lookup key is ``str(name)`` stripped and lower-cased. A label with
    no mapping is returned exactly as it was passed in.
    """
    lookup_key = str(name).strip().lower()
    if lookup_key in ENG2KOR:
        return ENG2KOR[lookup_key]
    return name

# Per-market settings keyed by market name ("korean" / "foreign"). Each entry
# holds a car_type flag, a referer URL and a pageid.
MARKET = {
    "korean": {"car_type": "Y", "referer": "https://www.encar.com/dc/dc_carsearchlist.do", "pageid": "dc_carsearch"},
    "foreign": {"car_type": "N", "referer": "https://www.encar.com/fc/fc_carsearchlist.do", "pageid": "fc_carsearch"},
}

# Endpoint requested by get_json().
BASE_URL = "https://api.encar.com/search/car/list/premium"

def make_session(referer: str) -> requests.Session:
    """Build a ``requests.Session`` with retry and browser-like headers.

    ``trust_env`` is switched off and the proxy map is emptied. ``https://``
    URLs are served by an adapter that retries GET requests (up to 5 times,
    backoff factor 1.2) on 429/500/502/503/504 responses.

    Args:
        referer: Value sent as the ``referer`` header on every request.

    Returns:
        The configured session.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=1.2,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"],
    )
    default_headers = {
        "user-agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/140.0.0.0 Safari/537.36"),
        "accept": "application/json, text/plain, */*",
        "accept-language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
        "origin": "https://www.encar.com",
        "referer": referer,
    }

    session = requests.Session()
    session.trust_env = False
    session.proxies = {}
    session.mount("https://", HTTPAdapter(max_retries=retry_policy))
    session.headers.update(default_headers)
    return session

def get_json(s: requests.Session, params: dict):
    """GET ``BASE_URL`` with *params* and return the decoded JSON body.

    Raises:
        requests.HTTPError: From ``raise_for_status()`` on an error status.
        ValueError: If the Content-Type header does not contain
            ``application/json``.
    """
    response = s.get(BASE_URL, params=params, timeout=15)
    response.raise_for_status()
    content_type = response.headers.get("Content-Type", "").lower()
    if "application/json" in content_type:
        return response.json()
    raise ValueError(f"Non-JSON: {response.url}")

def build_action_from_categories(categories, car_type="Y"):
    """Compose the search query string for a car type and category names.

    Falsy and whitespace-only entries are dropped, the rest are stripped and
    de-duplicated in first-seen order. With no usable names the query only
    filters on the car type; otherwise the names are combined in an ``Or``
    clause.
    """
    unique_names = {}
    for raw in categories:
        if not raw:
            continue
        cleaned = str(raw).strip()
        if cleaned:
            # dict keys keep insertion order, giving an ordered de-dup
            unique_names.setdefault(cleaned, None)

    if not unique_names:
        return f"(And.Hidden.N._.(C.CarType.{car_type}.))"

    joined = "Category." + "._.Category.".join(unique_names) + "."
    return f"(And.Hidden.N._.(C.CarType.{car_type}._.(Or.{joined})))"

def get_total_count(s, action, sort="ModifiedDate"):
    """Return the ``Count`` reported by the API for query *action*.

    Sends a ``count=true`` request through ``get_json`` and returns 0 when the
    ``Count`` field is missing or falsy.
    """
    count_params = {"count": "true", "q": action, "sr": f"|{sort}|0|1"}
    payload = get_json(s, count_params)
    reported = payload.get("Count", 0)
    return int(reported) if reported else 0

def make_detail_url(cid: int, pageid: str) -> str:
    """Return the detail-page URL for car id *cid* tagged with *pageid*."""
    page_path = f"https://fem.encar.com/cars/detail/{cid}"
    query = f"pageid={pageid}&listAdvType=pic&carid={cid}&view_type=normal"
    return f"{page_path}?{query}"

def to_int_safe(x):
    """Best-effort conversion of *x* to ``int``; ``None`` when not possible.

    * Real numbers are truncated with ``int()``; NaN and infinities give
      ``None``.
    * Strings have commas removed, then all digit runs are concatenated and
      parsed (``"1,234km"`` -> ``1234``). Signs and decimal points are
      ignored, so ``"12.5"`` -> ``125``.
    * Everything else, including ``None``, gives ``None``.
    """
    import numbers  # local import keeps this block self-contained

    if isinstance(x, str):
        digits = "".join(re.findall(r"\d+", x.replace(",", "")))
        return int(digits) if digits else None
    # numbers.Real also covers NumPy integer scalars (e.g. np.int64), which are
    # not subclasses of the built-in int and used to fall through to None.
    if isinstance(x, numbers.Real):
        try:
            return int(x)
        except (TypeError, ValueError, OverflowError):
            return None  # NaN / infinity
    return None

def extract_photo(row: pd.Series):
    """Pick a photo reference out of one raw result row.

    A non-empty string under ``"Photo"`` wins. Otherwise the first element of
    a non-empty ``"Photos"`` list is used: a string is returned as-is, a dict
    yields the first truthy value among its known URL keys. Returns ``None``
    when nothing usable is found.
    """
    direct = row.get("Photo")
    if isinstance(direct, str) and direct:
        return direct

    gallery = row.get("Photos")
    if not isinstance(gallery, list) or not gallery:
        return None

    head = gallery[0]
    if isinstance(head, str):
        return head
    if isinstance(head, dict):
        url_keys = ("url", "Url", "uri", "Uri", "imageUrl", "ImageUrl")
        return next((head[k] for k in url_keys if k in head and head[k]), None)
    return None

# Columns, in this order, of the frame returned by shape_rows().
WANTED = [
    "vehicleId", "Market", "Manufacturer", "Model", "Category", "Badge", "BadgeDetail",
    "Transmission", "FuelType", "Year", "Mileage", "Price",
    "SellType", "OfficeCityState", "detail_url", "Photo"
]

def shape_rows(df_raw: pd.DataFrame, pageid: str, category_fallback: str, market_key: str) -> pd.DataFrame:
    """Project raw search results onto the ``WANTED`` column layout.

    Args:
        df_raw: Normalized ``SearchResults`` rows.
        pageid: Value passed to ``make_detail_url`` for every row.
        category_fallback: Category used when *df_raw* has no populated
            ``Category`` / ``CategoryName`` column.
        market_key: Value written to the ``Market`` column.

    Returns:
        A frame with the ``WANTED`` columns; text columns use the ``string``
        dtype and numeric columns the nullable ``Int64`` dtype.

    Raises:
        KeyError: If none of the known id columns is present.
    """
    id_candidates = ["vehicleId", "VehicleId", "id", "Id", "carId", "carid"]
    id_col = next((c for c in id_candidates if c in df_raw.columns), None)
    if id_col is None:
        raise KeyError("vehicleId column not found in SearchResults")

    text_cols = ["Market", "Manufacturer", "Model", "Category", "Badge", "BadgeDetail",
                 "Transmission", "FuelType", "SellType", "OfficeCityState", "detail_url", "Photo"]
    int_cols = ["vehicleId", "Year", "Mileage", "Price"]

    # Share df_raw's index so every column assigned below aligns row-for-row.
    df = pd.DataFrame(index=df_raw.index)
    # Cast ids to Int64 right away: a column mixing ints and missing values
    # could otherwise be float and produce ".0" suffixes in the detail URLs.
    df["vehicleId"] = df_raw[id_col].apply(to_int_safe).astype("Int64")
    df["Market"] = market_key

    # Copied through unchanged; a missing source column becomes all-NA.
    for col in ("Manufacturer", "Model", "Badge", "BadgeDetail",
                "Transmission", "FuelType", "SellType", "OfficeCityState"):
        df[col] = df_raw.get(col)

    category_source = next(
        (c for c in ("Category", "CategoryName")
         if c in df_raw.columns and df_raw[c].notna().any()),
        None,
    )
    # A scalar broadcasts over df's index, so the fallback cannot misalign the
    # way a freshly built, default-indexed Series could.
    df["Category"] = df_raw[category_source] if category_source else category_fallback

    for col in ("Year", "Mileage", "Price"):
        df[col] = df_raw[col].apply(to_int_safe) if col in df_raw else None

    df["detail_url"] = df["vehicleId"].map(
        lambda x: make_detail_url(x, pageid) if pd.notna(x) else None
    )
    df["Photo"] = df_raw.apply(extract_photo, axis=1)

    for col in text_cols:
        df[col] = df[col].astype("string")
    for col in int_cols:
        df[col] = df[col].astype("Int64")
    return df[WANTED]

def crawl_market(market_key: str, categories_en, sort="ModifiedDate", limit=10):
    """Collect up to *limit* listings for one market, category by category.

    Categories are visited in the given order, one page of *limit* rows each,
    and crawling stops as soon as *limit* rows have been gathered. The result
    is capped at *limit* rows (previously a hard-coded 10, which is the
    default *limit*).

    Args:
        market_key: Key into ``MARKET`` ("korean" or "foreign").
        categories_en: Iterable of category labels, translated with
            ``norm_cat_for_dsl``.
        sort: Sort field placed in the ``sr`` request parameter.
        limit: Page size and maximum number of rows returned.

    Returns:
        A frame shaped by ``shape_rows``, or an empty ``DataFrame`` when
        nothing was collected or *limit* is not positive.

    Raises:
        KeyError: If *market_key* is not in ``MARKET``.
    """
    conf = MARKET[market_key]
    if limit <= 0:
        return pd.DataFrame()

    frames = []
    collected = 0
    # The context manager closes the session's pooled connections on exit.
    with make_session(conf["referer"]) as s:
        for cat_en in categories_en:
            if collected >= limit:
                break  # enough rows already; skip the remaining requests
            cat_kor = norm_cat_for_dsl(cat_en)
            action = build_action_from_categories([cat_kor], car_type=conf["car_type"])
            if get_total_count(s, action, sort) == 0:
                print(f"[{market_key}] {cat_en} → 0건 (skip)")
                continue
            # Only the first page (offset 0) of each category is requested.
            params = {"count": "false", "q": action, "sr": f"|{sort}|0|{limit}"}
            rows = get_json(s, params).get("SearchResults", [])
            if not rows:
                continue
            raw = pd.json_normalize(rows, max_level=1)
            shaped = shape_rows(raw, pageid=conf["pageid"], category_fallback=cat_en, market_key=market_key)
            frames.append(shaped)
            collected += len(shaped)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True).head(limit)


categories_en = ["light car", "compact car", "semi-medium car", "medium car", "large car"]

# Test output: fetch only 10 listings from the Korean (domestic) market.
df_vehicles = crawl_market("korean", categories_en, limit=10)
df_vehicles


Unnamed: 0,vehicleId,Market,Manufacturer,Model,Category,Badge,BadgeDetail,Transmission,FuelType,Year,Mileage,Price,SellType,OfficeCityState,detail_url,Photo
0,39499636,korean,기아,레이,light car,EV,,오토,전기,201712,38543,1099,일반,경기,https://fem.encar.com/cars/detail/39499636?pag...,/carpicture09/pic3949/39499436_
1,39861475,korean,기아,올 뉴 모닝 (JA),light car,레이디,,오토,가솔린,201706,9453,870,일반,경기,https://fem.encar.com/cars/detail/39861475?pag...,/carpicture04/pic3984/39845981_
2,39674728,korean,쉐보레(GM대우),더 넥스트 스파크,light car,LT,플러스,오토,가솔린,201712,127994,460,일반,부산,https://fem.encar.com/cars/detail/39674728?pag...,/carpicture06/pic3966/39669838_
3,40174694,korean,기아,더 뉴 레이,light car,프레스티지,,오토,가솔린,201907,45843,1230,일반,서울,https://fem.encar.com/cars/detail/40174694?pag...,/carpicture06/pic4016/40166372_
4,40308949,korean,기아,더 뉴 레이,light car,럭셔리,,오토,가솔린,202003,63228,1160,일반,경기,https://fem.encar.com/cars/detail/40308949?pag...,/carpicture10/pic4030/40302367_
5,40423613,korean,기아,모닝 어반 (JA),light car,프레스티지,,오토,가솔린,202010,43773,1150,일반,경기,https://fem.encar.com/cars/detail/40423613?pag...,/carpicture01/pic4041/40410504_
6,40436866,korean,기아,더 뉴 기아 레이,light car,시그니처,,오토,가솔린,202411,3616,1870,일반,서울,https://fem.encar.com/cars/detail/40436866?pag...,/carpicture02/pic4042/40422366_
7,40404507,korean,쉐보레(GM대우),스파크,light car,LT,기본형,오토,가솔린,201105,151532,199,일반,경남,https://fem.encar.com/cars/detail/40404507?pag...,/carpicture10/pic4040/40404501_
8,40464199,korean,현대,캐스퍼,light car,터보 인스퍼레이션,,오토,가솔린,202311,11033,1890,일반,인천,https://fem.encar.com/cars/detail/40464199?pag...,/carpicture05/pic4045/40456246_
9,40468783,korean,쉐보레(GM대우),더 뉴 스파크,light car,마이핏 에디션,(세부등급 없음),오토,가솔린,201908,73345,899,일반,서울,https://fem.encar.com/cars/detail/40468783?pag...,/carpicture06/pic4046/40468766_


In [3]:
import re
import time
import requests
import pandas as pd
from requests.adapters import HTTPAdapter, Retry

# Category mapping: English label -> Korean category name.
ENG2KOR = {
    "light car": "경차",
    "compact car": "소형차",
    "small car": "소형차",
    "semi-medium car": "준중형차",
    "medium car": "중형차",
    "large car": "대형차",
    "sports car": "스포츠카",
    "suv": "SUV",
    "rv": "RV",
    "van": "승합차",
    "truck": "화물차",
}


def norm_cat_for_dsl(name: str) -> str:
    """Translate a category label through ``ENG2KOR``.

    The lookup key is ``str(name)`` stripped and lower-cased. A label with
    no mapping is returned exactly as it was passed in.
    """
    lookup_key = str(name).strip().lower()
    if lookup_key in ENG2KOR:
        return ENG2KOR[lookup_key]
    return name

# Per-market settings keyed by market name ("korean" / "foreign"). Each entry
# holds a car_type flag, a referer URL and a pageid.
MARKET = {
    "korean": {"car_type": "Y", "referer": "https://www.encar.com/dc/dc_carsearchlist.do", "pageid": "dc_carsearch"},
    "foreign": {"car_type": "N", "referer": "https://www.encar.com/fc/fc_carsearchlist.do", "pageid": "fc_carsearch"},
}

# Endpoint requested by get_json().
BASE_URL = "https://api.encar.com/search/car/list/premium"

def make_session(referer: str) -> requests.Session:
    """Build a ``requests.Session`` with retry and browser-like headers.

    ``trust_env`` is switched off and the proxy map is emptied. ``https://``
    URLs are served by an adapter that retries GET requests (up to 5 times,
    backoff factor 1.2) on 429/500/502/503/504 responses.

    Args:
        referer: Value sent as the ``referer`` header on every request.

    Returns:
        The configured session.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=1.2,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"],
    )
    default_headers = {
        "user-agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/140.0.0.0 Safari/537.36"),
        "accept": "application/json, text/plain, */*",
        "accept-language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
        "origin": "https://www.encar.com",
        "referer": referer,
    }

    session = requests.Session()
    session.trust_env = False
    session.proxies = {}
    session.mount("https://", HTTPAdapter(max_retries=retry_policy))
    session.headers.update(default_headers)
    return session

def get_json(s: requests.Session, params: dict):
    """GET ``BASE_URL`` with *params* and return the decoded JSON body.

    Raises:
        requests.HTTPError: From ``raise_for_status()`` on an error status.
        ValueError: If the Content-Type header does not contain
            ``application/json``.
    """
    response = s.get(BASE_URL, params=params, timeout=15)
    response.raise_for_status()
    content_type = response.headers.get("Content-Type", "").lower()
    if "application/json" in content_type:
        return response.json()
    raise ValueError(f"Non-JSON: {response.url}")

def build_action_from_categories(categories, car_type="Y"):
    """Compose the search query string for a car type and category names.

    Falsy and whitespace-only entries are dropped, the rest are stripped and
    de-duplicated in first-seen order. With no usable names the query only
    filters on the car type; otherwise the names are combined in an ``Or``
    clause.
    """
    unique_names = {}
    for raw in categories:
        if not raw:
            continue
        cleaned = str(raw).strip()
        if cleaned:
            # dict keys keep insertion order, giving an ordered de-dup
            unique_names.setdefault(cleaned, None)

    if not unique_names:
        return f"(And.Hidden.N._.(C.CarType.{car_type}.))"

    joined = "Category." + "._.Category.".join(unique_names) + "."
    return f"(And.Hidden.N._.(C.CarType.{car_type}._.(Or.{joined})))"

def get_total_count(s, action, sort="ModifiedDate"):
    """Return the ``Count`` reported by the API for query *action*.

    Sends a ``count=true`` request through ``get_json`` and returns 0 when the
    ``Count`` field is missing or falsy.
    """
    count_params = {"count": "true", "q": action, "sr": f"|{sort}|0|1"}
    payload = get_json(s, count_params)
    reported = payload.get("Count", 0)
    return int(reported) if reported else 0

def make_detail_url(cid: int, pageid: str) -> str:
    """Return the detail-page URL for car id *cid* tagged with *pageid*."""
    page_path = f"https://fem.encar.com/cars/detail/{cid}"
    query = f"pageid={pageid}&listAdvType=pic&carid={cid}&view_type=normal"
    return f"{page_path}?{query}"

def to_int_safe(x):
    """Best-effort conversion of *x* to ``int``; ``None`` when not possible.

    * Real numbers are truncated with ``int()``; NaN and infinities give
      ``None``.
    * Strings have commas removed, then all digit runs are concatenated and
      parsed (``"1,234km"`` -> ``1234``). Signs and decimal points are
      ignored, so ``"12.5"`` -> ``125``.
    * Everything else, including ``None``, gives ``None``.
    """
    import numbers  # local import keeps this block self-contained

    if isinstance(x, str):
        digits = "".join(re.findall(r"\d+", x.replace(",", "")))
        return int(digits) if digits else None
    # numbers.Real also covers NumPy integer scalars (e.g. np.int64), which are
    # not subclasses of the built-in int and used to fall through to None.
    if isinstance(x, numbers.Real):
        try:
            return int(x)
        except (TypeError, ValueError, OverflowError):
            return None  # NaN / infinity
    return None

def extract_photo(row: pd.Series):
    """Pick a photo reference out of one raw result row.

    A non-empty string under ``"Photo"`` wins. Otherwise the first element of
    a non-empty ``"Photos"`` list is used: a string is returned as-is, a dict
    yields the first truthy value among its known URL keys. Returns ``None``
    when nothing usable is found.
    """
    direct = row.get("Photo")
    if isinstance(direct, str) and direct:
        return direct

    gallery = row.get("Photos")
    if not isinstance(gallery, list) or not gallery:
        return None

    head = gallery[0]
    if isinstance(head, str):
        return head
    if isinstance(head, dict):
        url_keys = ("url", "Url", "uri", "Uri", "imageUrl", "ImageUrl")
        return next((head[k] for k in url_keys if k in head and head[k]), None)
    return None

# Columns, in this order, of the frame returned by shape_rows().
WANTED = [
    "vehicleId", "Market", "Manufacturer", "Model", "Category", "Badge", "BadgeDetail",
    "Transmission", "FuelType", "Year", "Mileage", "Price",
    "SellType", "OfficeCityState", "detail_url", "Photo"
]

def shape_rows(df_raw: pd.DataFrame, pageid: str, category_fallback: str, market_key: str) -> pd.DataFrame:
    """Project raw search results onto the ``WANTED`` column layout.

    Args:
        df_raw: Normalized ``SearchResults`` rows.
        pageid: Value passed to ``make_detail_url`` for every row.
        category_fallback: Category used when *df_raw* has no populated
            ``Category`` / ``CategoryName`` column.
        market_key: Value written to the ``Market`` column.

    Returns:
        A frame with the ``WANTED`` columns; text columns use the ``string``
        dtype and numeric columns the nullable ``Int64`` dtype.

    Raises:
        KeyError: If none of the known id columns is present.
    """
    id_candidates = ["vehicleId", "VehicleId", "id", "Id", "carId", "carid"]
    id_col = next((c for c in id_candidates if c in df_raw.columns), None)
    if id_col is None:
        raise KeyError("vehicleId column not found in SearchResults")

    text_cols = ["Market", "Manufacturer", "Model", "Category", "Badge", "BadgeDetail",
                 "Transmission", "FuelType", "SellType", "OfficeCityState", "detail_url", "Photo"]
    int_cols = ["vehicleId", "Year", "Mileage", "Price"]

    # Share df_raw's index so every column assigned below aligns row-for-row.
    df = pd.DataFrame(index=df_raw.index)
    # Cast ids to Int64 right away: a column mixing ints and missing values
    # could otherwise be float and produce ".0" suffixes in the detail URLs.
    df["vehicleId"] = df_raw[id_col].apply(to_int_safe).astype("Int64")
    df["Market"] = market_key

    # Copied through unchanged; a missing source column becomes all-NA.
    for col in ("Manufacturer", "Model", "Badge", "BadgeDetail",
                "Transmission", "FuelType", "SellType", "OfficeCityState"):
        df[col] = df_raw.get(col)

    category_source = next(
        (c for c in ("Category", "CategoryName")
         if c in df_raw.columns and df_raw[c].notna().any()),
        None,
    )
    # A scalar broadcasts over df's index, so the fallback cannot misalign the
    # way a freshly built, default-indexed Series could.
    df["Category"] = df_raw[category_source] if category_source else category_fallback

    for col in ("Year", "Mileage", "Price"):
        df[col] = df_raw[col].apply(to_int_safe) if col in df_raw else None

    df["detail_url"] = df["vehicleId"].map(
        lambda x: make_detail_url(x, pageid) if pd.notna(x) else None
    )
    df["Photo"] = df_raw.apply(extract_photo, axis=1)

    for col in text_cols:
        df[col] = df[col].astype("string")
    for col in int_cols:
        df[col] = df[col].astype("Int64")
    return df[WANTED]

def crawl_market(market_key: str, categories_en, sort="ModifiedDate", limit=10):
    """Collect up to *limit* listings for one market, category by category.

    Categories are visited in the given order, one page of *limit* rows each,
    and crawling stops as soon as *limit* rows have been gathered. The result
    is capped at *limit* rows (previously a hard-coded 10, which is the
    default *limit*).

    Args:
        market_key: Key into ``MARKET`` ("korean" or "foreign").
        categories_en: Iterable of category labels, translated with
            ``norm_cat_for_dsl``.
        sort: Sort field placed in the ``sr`` request parameter.
        limit: Page size and maximum number of rows returned.

    Returns:
        A frame shaped by ``shape_rows``, or an empty ``DataFrame`` when
        nothing was collected or *limit* is not positive.

    Raises:
        KeyError: If *market_key* is not in ``MARKET``.
    """
    conf = MARKET[market_key]
    if limit <= 0:
        return pd.DataFrame()

    frames = []
    collected = 0
    # The context manager closes the session's pooled connections on exit.
    with make_session(conf["referer"]) as s:
        for cat_en in categories_en:
            if collected >= limit:
                break  # enough rows already; skip the remaining requests
            cat_kor = norm_cat_for_dsl(cat_en)
            action = build_action_from_categories([cat_kor], car_type=conf["car_type"])
            if get_total_count(s, action, sort) == 0:
                print(f"[{market_key}] {cat_en} → 0건 (skip)")
                continue
            # Only the first page (offset 0) of each category is requested.
            params = {"count": "false", "q": action, "sr": f"|{sort}|0|{limit}"}
            rows = get_json(s, params).get("SearchResults", [])
            if not rows:
                continue
            raw = pd.json_normalize(rows, max_level=1)
            shaped = shape_rows(raw, pageid=conf["pageid"], category_fallback=cat_en, market_key=market_key)
            frames.append(shaped)
            collected += len(shaped)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True).head(limit)

# Regex capturing the value of a "vehicleNo" JSON field in a page body.
RE_VEH_NO = re.compile(r'"vehicleNo"\s*:\s*"([^"]+)"', re.S)


def fetch_vehicle_no(url, session):
    """Fetch *url* and return the first ``"vehicleNo"`` value in the body.

    Best-effort lookup: returns ``None`` when *url* is not a non-empty string,
    the request fails, the response is not OK, or the field is absent.

    Args:
        url: Page URL to request.
        session: Object exposing ``get(url, timeout=...)``, e.g. a
            ``requests.Session``.
    """
    if not isinstance(url, str) or not url:
        return None  # missing URL (None / NA): nothing to request
    try:
        resp = session.get(url, timeout=6)
    except requests.RequestException:
        # Only request-level failures are tolerated; KeyboardInterrupt and
        # programming errors propagate instead of being silently swallowed.
        return None
    if not resp.ok:
        return None
    match = RE_VEH_NO.search(resp.text)
    return match.group(1) if match else None

def attach_vehicle_no(df, max_count=10):
    """Return the first *max_count* rows of *df* with a ``vehicleNo`` column.

    Each row's ``detail_url`` is passed to ``fetch_vehicle_no``; rows whose
    lookup fails get ``None``. *df* itself is not modified.

    Args:
        df: Frame with a ``detail_url`` column (an empty frame is accepted).
        max_count: Maximum number of rows to process; negative values are
            treated as 0.

    Returns:
        A copy of the leading rows with ``vehicleNo`` appended.
    """
    subset = df.head(max(max_count, 0)).copy()
    # An empty frame may not have a detail_url column at all.
    urls = subset["detail_url"].tolist() if len(subset) else []

    retries = Retry(total=1, backoff_factor=0.2,
                    status_forcelist=[429, 500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retries, pool_connections=10, pool_maxsize=10)

    vehicle_nos = []
    # The context manager closes the session's pooled connections on exit.
    with requests.Session() as s:
        s.trust_env = False
        s.mount("https://", adapter)
        s.mount("http://", adapter)
        s.headers.update({
            "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                           "AppleWebKit/537.36 (KHTML, like Gecko) "
                           "Chrome/140.0.0.0 Safari/537.36"),
            "Referer": "https://fem.encar.com/",
            "Origin": "https://fem.encar.com",
        })
        for i, url in enumerate(urls):
            if i:
                time.sleep(0.1)  # pause between requests to ease server load
            vehicle_nos.append(fetch_vehicle_no(url, s))

    subset["vehicleNo"] = pd.Series(vehicle_nos, index=subset.index, dtype="object")
    return subset

# Example run: fetch 10 listings from the Korean market and add vehicleNo.
categories_en = ["light car", "compact car", "semi-medium car", "medium car", "large car"]
df_vehicles = crawl_market("korean", categories_en, limit=10)

df_with_vehicleNo = attach_vehicle_no(df_vehicles, max_count=10)

df_with_vehicleNo


Unnamed: 0,vehicleId,Market,Manufacturer,Model,Category,Badge,BadgeDetail,Transmission,FuelType,Year,Mileage,Price,SellType,OfficeCityState,detail_url,Photo,vehicleNo
0,39329953,korean,현대,캐스퍼,light car,터보 인스퍼레이션,,오토,가솔린,202206,59842,1710,일반,경기,https://fem.encar.com/cars/detail/39329953?pag...,/carpicture01/pic3931/39318951_,108거8113
1,39756391,korean,기아,올 뉴 모닝 (JA),light car,럭셔리,,오토,가솔린,201806,59849,790,일반,충북,https://fem.encar.com/cars/detail/39756391?pag...,/carpicture05/pic3975/39751560_,131누4277
2,39703083,korean,기아,올 뉴 모닝 (JA),light car,프레스티지,,오토,가솔린,202003,54850,950,일반,대구,https://fem.encar.com/cars/detail/39703083?pag...,/carpicture08/pic3968/39688698_,237보6827
3,39541273,korean,기아,올 뉴 모닝 (JA),light car,프레스티지,,오토,가솔린,201810,56072,830,일반,대구,https://fem.encar.com/cars/detail/39541273?pag...,/carpicture03/pic3953/39531401_,46머2214
4,40120952,korean,쉐보레(GM대우),더 뉴 스파크,light car,프리미어,,오토,가솔린,202012,65084,690,일반,인천,https://fem.encar.com/cars/detail/40120952?pag...,/carpicture01/pic4011/40117518_,134부6899
5,40034363,korean,기아,더 뉴 레이,light car,밴,럭셔리,오토,가솔린,202006,88733,659,일반,서울,https://fem.encar.com/cars/detail/40034363?pag...,/carpicture07/pic3867/38676060_,113노9257
6,40289658,korean,쉐보레(GM대우),더 넥스트 스파크,light car,LTZ,,오토,가솔린,201601,70667,690,일반,경기,https://fem.encar.com/cars/detail/40289658?pag...,/carpicture05/pic3995/39955536_,07노0423
7,40262866,korean,기아,올 뉴 모닝,light car,럭셔리,,오토,가솔린,201105,74913,540,일반,부산,https://fem.encar.com/cars/detail/40262866?pag...,/carpicture06/pic4026/40261452_,67루9061
8,40425125,korean,현대,캐스퍼,light car,터보 인스퍼레이션,,오토,가솔린,202309,11378,1989,일반,경기,https://fem.encar.com/cars/detail/40425125?pag...,/carpicture02/pic4042/40421652_,193머9791
9,40427948,korean,쉐보레(GM대우),더 넥스트 스파크,light car,LT,플러스,오토,가솔린,201608,30451,720,일반,대구,https://fem.encar.com/cars/detail/40427948?pag...,/carpicture01/pic4041/40419534_,52무0349


In [7]:
import re
import time
import requests
import pandas as pd
from requests.adapters import HTTPAdapter, Retry

# Category mapping: English label -> Korean category name.
ENG2KOR = {
    "light car": "경차",
    "compact car": "소형차",
    "small car": "소형차",
    "semi-medium car": "준중형차",
    "medium car": "중형차",
    "large car": "대형차",
    "sports car": "스포츠카",
    "suv": "SUV",
    "rv": "RV",
    "van": "승합차",
    "truck": "화물차",
}


def norm_cat_for_dsl(name: str) -> str:
    """Translate a category label through ``ENG2KOR``.

    The lookup key is ``str(name)`` stripped and lower-cased. A label with
    no mapping is returned exactly as it was passed in.
    """
    lookup_key = str(name).strip().lower()
    if lookup_key in ENG2KOR:
        return ENG2KOR[lookup_key]
    return name

# Per-market settings keyed by market name ("korean" / "foreign"). Each entry
# holds a car_type flag, a referer URL and a pageid.
MARKET = {
    "korean": {"car_type": "Y", "referer": "https://www.encar.com/dc/dc_carsearchlist.do", "pageid": "dc_carsearch"},
    "foreign": {"car_type": "N", "referer": "https://www.encar.com/fc/fc_carsearchlist.do", "pageid": "fc_carsearch"},
}

# Endpoint requested by get_json().
BASE_URL = "https://api.encar.com/search/car/list/premium"

def make_session(referer: str) -> requests.Session:
    """Build a ``requests.Session`` with retry and browser-like headers.

    ``trust_env`` is switched off and the proxy map is emptied. ``https://``
    URLs are served by an adapter that retries GET requests (up to 5 times,
    backoff factor 1.2) on 429/500/502/503/504 responses.

    Args:
        referer: Value sent as the ``referer`` header on every request.

    Returns:
        The configured session.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=1.2,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"],
    )
    default_headers = {
        "user-agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/140.0.0.0 Safari/537.36"),
        "accept": "application/json, text/plain, */*",
        "accept-language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
        "origin": "https://www.encar.com",
        "referer": referer,
    }

    session = requests.Session()
    session.trust_env = False
    session.proxies = {}
    session.mount("https://", HTTPAdapter(max_retries=retry_policy))
    session.headers.update(default_headers)
    return session

def get_json(s: requests.Session, params: dict):
    """GET ``BASE_URL`` with *params* and return the decoded JSON body.

    Raises:
        requests.HTTPError: From ``raise_for_status()`` on an error status.
        ValueError: If the Content-Type header does not contain
            ``application/json``.
    """
    response = s.get(BASE_URL, params=params, timeout=15)
    response.raise_for_status()
    content_type = response.headers.get("Content-Type", "").lower()
    if "application/json" in content_type:
        return response.json()
    raise ValueError(f"Non-JSON: {response.url}")

def build_action_from_categories(categories, car_type="Y"):
    """Compose the search query string for a car type and category names.

    Falsy and whitespace-only entries are dropped, the rest are stripped and
    de-duplicated in first-seen order. With no usable names the query only
    filters on the car type; otherwise the names are combined in an ``Or``
    clause.
    """
    unique_names = {}
    for raw in categories:
        if not raw:
            continue
        cleaned = str(raw).strip()
        if cleaned:
            # dict keys keep insertion order, giving an ordered de-dup
            unique_names.setdefault(cleaned, None)

    if not unique_names:
        return f"(And.Hidden.N._.(C.CarType.{car_type}.))"

    joined = "Category." + "._.Category.".join(unique_names) + "."
    return f"(And.Hidden.N._.(C.CarType.{car_type}._.(Or.{joined})))"

def get_total_count(s, action, sort="ModifiedDate"):
    """Return the ``Count`` reported by the API for query *action*.

    Sends a ``count=true`` request through ``get_json`` and returns 0 when the
    ``Count`` field is missing or falsy.
    """
    count_params = {"count": "true", "q": action, "sr": f"|{sort}|0|1"}
    payload = get_json(s, count_params)
    reported = payload.get("Count", 0)
    return int(reported) if reported else 0

def make_detail_url(cid: int, pageid: str) -> str:
    """Return the detail-page URL for car id *cid* tagged with *pageid*."""
    page_path = f"https://fem.encar.com/cars/detail/{cid}"
    query = f"pageid={pageid}&listAdvType=pic&carid={cid}&view_type=normal"
    return f"{page_path}?{query}"

def to_int_safe(x):
    """Best-effort conversion of *x* to ``int``; ``None`` when not possible.

    * Real numbers are truncated with ``int()``; NaN and infinities give
      ``None``.
    * Strings have commas removed, then all digit runs are concatenated and
      parsed (``"1,234km"`` -> ``1234``). Signs and decimal points are
      ignored, so ``"12.5"`` -> ``125``.
    * Everything else, including ``None``, gives ``None``.
    """
    import numbers  # local import keeps this block self-contained

    if isinstance(x, str):
        digits = "".join(re.findall(r"\d+", x.replace(",", "")))
        return int(digits) if digits else None
    # numbers.Real also covers NumPy integer scalars (e.g. np.int64), which are
    # not subclasses of the built-in int and used to fall through to None.
    if isinstance(x, numbers.Real):
        try:
            return int(x)
        except (TypeError, ValueError, OverflowError):
            return None  # NaN / infinity
    return None

def extract_photo(row: pd.Series):
    """Pick a photo reference out of one raw result row.

    A non-empty string under ``"Photo"`` wins. Otherwise the first element of
    a non-empty ``"Photos"`` list is used: a string is returned as-is, a dict
    yields the first truthy value among its known URL keys. Returns ``None``
    when nothing usable is found.
    """
    direct = row.get("Photo")
    if isinstance(direct, str) and direct:
        return direct

    gallery = row.get("Photos")
    if not isinstance(gallery, list) or not gallery:
        return None

    head = gallery[0]
    if isinstance(head, str):
        return head
    if isinstance(head, dict):
        url_keys = ("url", "Url", "uri", "Uri", "imageUrl", "ImageUrl")
        return next((head[k] for k in url_keys if k in head and head[k]), None)
    return None

# Columns, in this order, of the frame returned by shape_rows().
WANTED = [
    "vehicleId", "Market", "Category", "Manufacturer", "Model", "Badge", "BadgeDetail",
    "Transmission", "FuelType", "Year", "Mileage", "Price",
    "SellType", "OfficeCityState", "detail_url", "Photo"
]

def shape_rows(df_raw: pd.DataFrame, pageid: str, category_fallback: str, market_key: str) -> pd.DataFrame:
    """Project raw search results onto the ``WANTED`` column layout.

    Args:
        df_raw: Normalized ``SearchResults`` rows.
        pageid: Value passed to ``make_detail_url`` for every row.
        category_fallback: Category used when *df_raw* has no populated
            ``Category`` / ``CategoryName`` column.
        market_key: Value written to the ``Market`` column.

    Returns:
        A frame with the ``WANTED`` columns; text columns use the ``string``
        dtype and numeric columns the nullable ``Int64`` dtype.

    Raises:
        KeyError: If none of the known id columns is present.
    """
    id_candidates = ["vehicleId", "VehicleId", "id", "Id", "carId", "carid"]
    id_col = next((c for c in id_candidates if c in df_raw.columns), None)
    if id_col is None:
        raise KeyError("vehicleId column not found in SearchResults")

    text_cols = ["Market", "Manufacturer", "Model", "Category", "Badge", "BadgeDetail",
                 "Transmission", "FuelType", "SellType", "OfficeCityState", "detail_url", "Photo"]
    int_cols = ["vehicleId", "Year", "Mileage", "Price"]

    # Share df_raw's index so every column assigned below aligns row-for-row.
    df = pd.DataFrame(index=df_raw.index)
    # Cast ids to Int64 right away: a column mixing ints and missing values
    # could otherwise be float and produce ".0" suffixes in the detail URLs.
    df["vehicleId"] = df_raw[id_col].apply(to_int_safe).astype("Int64")
    df["Market"] = market_key

    # Copied through unchanged; a missing source column becomes all-NA.
    for col in ("Manufacturer", "Model", "Badge", "BadgeDetail",
                "Transmission", "FuelType", "SellType", "OfficeCityState"):
        df[col] = df_raw.get(col)

    category_source = next(
        (c for c in ("Category", "CategoryName")
         if c in df_raw.columns and df_raw[c].notna().any()),
        None,
    )
    # A scalar broadcasts over df's index, so the fallback cannot misalign the
    # way a freshly built, default-indexed Series could.
    df["Category"] = df_raw[category_source] if category_source else category_fallback

    for col in ("Year", "Mileage", "Price"):
        df[col] = df_raw[col].apply(to_int_safe) if col in df_raw else None

    df["detail_url"] = df["vehicleId"].map(
        lambda x: make_detail_url(x, pageid) if pd.notna(x) else None
    )
    df["Photo"] = df_raw.apply(extract_photo, axis=1)

    for col in text_cols:
        df[col] = df[col].astype("string")
    for col in int_cols:
        df[col] = df[col].astype("Int64")
    return df[WANTED]

# Vehicle-list crawling function.
def crawl_market(market_key: str, categories_en, sort="ModifiedDate", limit=10):
    """Collect up to *limit* listings for one market, category by category.

    Categories are visited in the given order, one page of *limit* rows each,
    and crawling stops as soon as *limit* rows have been gathered. The result
    is capped at *limit* rows (previously a hard-coded 10, which is the
    default *limit*).

    Args:
        market_key: Key into ``MARKET`` ("korean" or "foreign").
        categories_en: Iterable of category labels, translated with
            ``norm_cat_for_dsl``.
        sort: Sort field placed in the ``sr`` request parameter.
        limit: Page size and maximum number of rows returned.

    Returns:
        A frame shaped by ``shape_rows``, or an empty ``DataFrame`` when
        nothing was collected or *limit* is not positive.

    Raises:
        KeyError: If *market_key* is not in ``MARKET``.
    """
    conf = MARKET[market_key]
    if limit <= 0:
        return pd.DataFrame()

    frames = []
    collected = 0
    # The context manager closes the session's pooled connections on exit.
    with make_session(conf["referer"]) as s:
        for cat_en in categories_en:
            if collected >= limit:
                break  # enough rows already; skip the remaining requests
            cat_kor = norm_cat_for_dsl(cat_en)
            action = build_action_from_categories([cat_kor], car_type=conf["car_type"])
            if get_total_count(s, action, sort) == 0:
                print(f"[{market_key}] {cat_en} → 0건 (skip)")
                continue
            # Only the first page (offset 0) of each category is requested.
            params = {"count": "false", "q": action, "sr": f"|{sort}|0|{limit}"}
            rows = get_json(s, params).get("SearchResults", [])
            if not rows:
                continue
            raw = pd.json_normalize(rows, max_level=1)
            shaped = shape_rows(raw, pageid=conf["pageid"], category_fallback=cat_en, market_key=market_key)
            frames.append(shaped)
            collected += len(shaped)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True).head(limit)

# Regex capturing the value of a "vehicleNo" JSON field in a page body.
RE_VEH_NO = re.compile(r'"vehicleNo"\s*:\s*"([^"]+)"', re.S)


def fetch_vehicle_no(url, session):
    """Fetch *url* and return the first ``"vehicleNo"`` value in the body.

    Best-effort lookup: returns ``None`` when *url* is not a non-empty string,
    the request fails, the response is not OK, or the field is absent.

    Args:
        url: Page URL to request.
        session: Object exposing ``get(url, timeout=...)``, e.g. a
            ``requests.Session``.
    """
    if not isinstance(url, str) or not url:
        return None  # missing URL (None / NA): nothing to request
    try:
        resp = session.get(url, timeout=6)
    except requests.RequestException:
        # Only request-level failures are tolerated; KeyboardInterrupt and
        # programming errors propagate instead of being silently swallowed.
        return None
    if not resp.ok:
        return None
    match = RE_VEH_NO.search(resp.text)
    return match.group(1) if match else None

# Attach vehicleNo and place it as the second column (right after vehicleId
# when that is the first column).
def attach_vehicle_no(df, max_count=10):
    """Return the first *max_count* rows of *df* with a ``vehicleNo`` column.

    Each row's ``detail_url`` is passed to ``fetch_vehicle_no``; rows whose
    lookup fails get ``None``. The new column is moved to position 1. *df*
    itself is not modified.

    Args:
        df: Frame with a ``detail_url`` column (an empty frame is accepted).
        max_count: Maximum number of rows to process; negative values are
            treated as 0.

    Returns:
        A copy of the leading rows with ``vehicleNo`` as the second column.
    """
    subset = df.head(max(max_count, 0)).copy()
    # An empty frame may not have a detail_url column at all.
    urls = subset["detail_url"].tolist() if len(subset) else []

    retries = Retry(total=1, backoff_factor=0.2,
                    status_forcelist=[429, 500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retries, pool_connections=10, pool_maxsize=10)

    vehicle_nos = []
    # The context manager closes the session's pooled connections on exit.
    with requests.Session() as s:
        s.trust_env = False
        s.mount("https://", adapter)
        s.mount("http://", adapter)
        s.headers.update({
            "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                           "AppleWebKit/537.36 (KHTML, like Gecko) "
                           "Chrome/140.0.0.0 Safari/537.36"),
            "Referer": "https://fem.encar.com/",
            "Origin": "https://fem.encar.com",
        })
        for i, url in enumerate(urls):
            if i:
                time.sleep(0.1)  # pause between requests to ease server load
            vehicle_nos.append(fetch_vehicle_no(url, s))

    subset["vehicleNo"] = pd.Series(vehicle_nos, index=subset.index, dtype="object")

    # Move vehicleNo to position 1 (list.insert appends when the list is shorter).
    ordered = [c for c in subset.columns if c != "vehicleNo"]
    ordered.insert(1, "vehicleNo")
    return subset[ordered]

# Example run.
categories_en = ["light car", "compact car", "semi-medium car", "medium car", "large car"]
df_vehicles = crawl_market("korean", categories_en, limit=10)
df_with_vehicleNo = attach_vehicle_no(df_vehicles, max_count=10)

df_with_vehicleNo


Unnamed: 0,vehicleId,vehicleNo,Market,Category,Manufacturer,Model,Badge,BadgeDetail,Transmission,FuelType,Year,Mileage,Price,SellType,OfficeCityState,detail_url,Photo
0,40324975,21다2154,korean,light car,쉐보레(GM대우),더 넥스트 스파크,LTZ,,오토,가솔린,201702,58738,800,일반,대구,https://fem.encar.com/cars/detail/40324975?pag...,/carpicture01/pic4031/40314369_
1,40420353,103모5461,korean,light car,기아,더 뉴 레이,밴,프레스티지,오토,가솔린,202106,62770,830,일반,서울,https://fem.encar.com/cars/detail/40420353?pag...,/carpicture01/pic4041/40419277_
2,40063189,151다1867,korean,light car,기아,더 뉴 레이,프레스티지,,오토,가솔린,202107,30501,1250,일반,경기,https://fem.encar.com/cars/detail/40063189?pag...,/carpicture05/pic4005/40056575_
3,40384049,115보9540,korean,light car,기아,더 뉴 레이,밴,스탠다드,오토,가솔린,202108,47748,1050,일반,경기,https://fem.encar.com/cars/detail/40384049?pag...,/carpicture08/pic4038/40382973_
4,40040739,195다8338,korean,light car,기아,모닝 어반 (JA),시그니처,(세부등급 없음),오토,가솔린,202008,4795,1430,일반,부산,https://fem.encar.com/cars/detail/40040739?pag...,/carpicture03/pic4003/40034268_
5,40387393,138소6636,korean,light car,기아,더 뉴 레이,프레스티지,,오토,가솔린,202109,24708,1320,일반,서울,https://fem.encar.com/cars/detail/40387393?pag...,/carpicture08/pic4038/40383468_
6,40537536,126모4017,korean,light car,현대,캐스퍼,터보 인스퍼레이션,,오토,가솔린,202311,40653,1780,일반,경기,https://fem.encar.com/cars/detail/40537536?pag...,/carpicture03/pic4053/40536721_
7,40527910,381노2695,korean,light car,기아,올 뉴 모닝 (JA),럭셔리,,오토,가솔린,202002,54716,820,일반,경기,https://fem.encar.com/cars/detail/40527910?pag...,/carpicture01/pic4051/40513756_
8,40503020,225서8049,korean,light car,기아,더 뉴 레이,밴,프레스티지,오토,가솔린,202106,63218,9999,일반,경기,https://fem.encar.com/cars/detail/40503020?pag...,/carpicture08/pic4048/40487759_
9,40504020,180저1335,korean,light car,쉐보레(GM대우),더 뉴 스파크,프리미어,,오토,가솔린,202004,57417,920,일반,서울,https://fem.encar.com/cars/detail/40504020?pag...,/carpicture10/pic4050/40501547_


In [None]:
import re
import time
import requests
import pandas as pd
from requests.adapters import HTTPAdapter, Retry

# Category mapping: English label -> Korean category name.
ENG2KOR = {
    "light car": "경차",
    "compact car": "소형차",
    "small car": "소형차",
    "semi-medium car": "준중형차",
    "medium car": "중형차",
    "large car": "대형차",
    "sports car": "스포츠카",
    "suv": "SUV",
    "rv": "RV",
    "van": "승합차",
    "truck": "화물차",
}


def norm_cat_for_dsl(name: str) -> str:
    """Translate a category label through ``ENG2KOR``.

    The lookup key is ``str(name)`` stripped and lower-cased. A label with
    no mapping is returned exactly as it was passed in.
    """
    lookup_key = str(name).strip().lower()
    if lookup_key in ENG2KOR:
        return ENG2KOR[lookup_key]
    return name

# Per-market settings keyed by market name ("korean" / "foreign"). Each entry
# holds a car_type flag, a referer URL and a pageid.
MARKET = {
    "korean": {"car_type": "Y", "referer": "https://www.encar.com/dc/dc_carsearchlist.do", "pageid": "dc_carsearch"},
    "foreign": {"car_type": "N", "referer": "https://www.encar.com/fc/fc_carsearchlist.do", "pageid": "fc_carsearch"},
}

# Endpoint requested by get_json().
BASE_URL = "https://api.encar.com/search/car/list/premium"

def make_session(referer: str) -> requests.Session:
    """Build a ``requests.Session`` with retry and browser-like headers.

    ``trust_env`` is switched off and the proxy map is emptied. ``https://``
    URLs are served by an adapter that retries GET requests (up to 5 times,
    backoff factor 1.2) on 429/500/502/503/504 responses.

    Args:
        referer: Value sent as the ``referer`` header on every request.

    Returns:
        The configured session.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=1.2,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"],
    )
    default_headers = {
        "user-agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/140.0.0.0 Safari/537.36"),
        "accept": "application/json, text/plain, */*",
        "accept-language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
        "origin": "https://www.encar.com",
        "referer": referer,
    }

    session = requests.Session()
    session.trust_env = False
    session.proxies = {}
    session.mount("https://", HTTPAdapter(max_retries=retry_policy))
    session.headers.update(default_headers)
    return session

def get_json(s: requests.Session, params: dict):
    """GET ``BASE_URL`` with *params* and return the decoded JSON body.

    Raises:
        requests.HTTPError: From ``raise_for_status()`` on an error status.
        ValueError: If the Content-Type header does not contain
            ``application/json``.
    """
    response = s.get(BASE_URL, params=params, timeout=15)
    response.raise_for_status()
    content_type = response.headers.get("Content-Type", "").lower()
    if "application/json" in content_type:
        return response.json()
    raise ValueError(f"Non-JSON: {response.url}")

def build_action_from_categories(categories, car_type="Y"):
    """Compose the search query string for a car type and category names.

    Falsy and whitespace-only entries are dropped, the rest are stripped and
    de-duplicated in first-seen order. With no usable names the query only
    filters on the car type; otherwise the names are combined in an ``Or``
    clause.
    """
    unique_names = {}
    for raw in categories:
        if not raw:
            continue
        cleaned = str(raw).strip()
        if cleaned:
            # dict keys keep insertion order, giving an ordered de-dup
            unique_names.setdefault(cleaned, None)

    if not unique_names:
        return f"(And.Hidden.N._.(C.CarType.{car_type}.))"

    joined = "Category." + "._.Category.".join(unique_names) + "."
    return f"(And.Hidden.N._.(C.CarType.{car_type}._.(Or.{joined})))"

def get_total_count(s, action, sort="ModifiedDate"):
    """Return the ``Count`` reported by the API for query *action*.

    Sends a ``count=true`` request through ``get_json`` and returns 0 when the
    ``Count`` field is missing or falsy.
    """
    count_params = {"count": "true", "q": action, "sr": f"|{sort}|0|1"}
    payload = get_json(s, count_params)
    reported = payload.get("Count", 0)
    return int(reported) if reported else 0

def make_detail_url(cid: int, pageid: str) -> str:
    """Return the detail-page URL for car id *cid* tagged with *pageid*."""
    page_path = f"https://fem.encar.com/cars/detail/{cid}"
    query = f"pageid={pageid}&listAdvType=pic&carid={cid}&view_type=normal"
    return f"{page_path}?{query}"

def to_int_safe(x):
    """Best-effort conversion of *x* to ``int``; ``None`` when not possible.

    * Real numbers are truncated with ``int()``; NaN and infinities give
      ``None``.
    * Strings have commas removed, then all digit runs are concatenated and
      parsed (``"1,234km"`` -> ``1234``). Signs and decimal points are
      ignored, so ``"12.5"`` -> ``125``.
    * Everything else, including ``None``, gives ``None``.
    """
    import numbers  # local import keeps this block self-contained

    if isinstance(x, str):
        digits = "".join(re.findall(r"\d+", x.replace(",", "")))
        return int(digits) if digits else None
    # numbers.Real also covers NumPy integer scalars (e.g. np.int64), which are
    # not subclasses of the built-in int and used to fall through to None.
    if isinstance(x, numbers.Real):
        try:
            return int(x)
        except (TypeError, ValueError, OverflowError):
            return None  # NaN / infinity
    return None

def extract_photo(row: pd.Series):
    """Pick a single photo reference out of one result row.

    Lookup order:
      1. ``row["Photo"]`` when it is a non-empty string.
      2. The first element of ``row["Photos"]`` when that is a non-empty
         list: a string element is returned as is; for a dict element the
         first truthy value among the known URL keys is returned.

    Returns:
        The photo reference, or ``None`` when nothing usable is found.
    """
    direct = row.get("Photo")
    if isinstance(direct, str) and direct:
        return direct

    gallery = row.get("Photos")
    if not (isinstance(gallery, list) and gallery):
        return None

    head = gallery[0]
    if isinstance(head, str):
        return head
    if isinstance(head, dict):
        url_keys = ("url", "Url", "uri", "Uri", "imageUrl", "ImageUrl")
        return next((head[key] for key in url_keys if key in head and head[key]), None)
    return None

def shape_rows(df_raw: pd.DataFrame, pageid: str, category_fallback: str, market_key: str) -> pd.DataFrame:
    """Convert normalized search results into the fixed listing table.

    Args:
        df_raw: Frame built from the ``SearchResults`` records.
        pageid: Forwarded to ``make_detail_url`` for every row.
        category_fallback: Category written to every row when ``df_raw`` has
            neither a ``Category`` nor a ``CategoryName`` column holding at
            least one non-null value.
        market_key: Value stored in the ``Market`` column.

    Returns:
        A frame with the columns ``vehicleId, Market, Category, Manufacturer,
        Model, Transmission, FuelType, Year, Mileage, Price, SellType,
        OfficeCityState, detail_url, Photo``. The text columns are cast to
        the pandas ``"string"`` dtype and ``vehicleId``/``Year``/``Mileage``/
        ``Price`` to ``"Int64"``.

    Raises:
        KeyError: If ``df_raw`` contains none of the recognised id columns.
    """
    id_col = None
    for candidate in ("vehicleId", "VehicleId", "id", "Id", "carId", "carid"):
        if candidate in df_raw.columns:
            id_col = candidate
            break
    if id_col is None:
        raise KeyError("vehicleId column not found in SearchResults")

    out = pd.DataFrame()
    out["vehicleId"] = df_raw[id_col].apply(to_int_safe)
    out["Market"] = market_key
    # Built from the ids before they are cast to Int64 further down.
    out["detail_url"] = out["vehicleId"].map(
        lambda vid: make_detail_url(vid, pageid) if pd.notna(vid) else None
    )

    # Join Model, Badge and BadgeDetail into one display name.
    name_fields = ("Model", "Badge", "BadgeDetail")

    def merge_model(row):
        pieces = []
        for field in name_fields:
            value = row[field]
            text = str(value).strip() if pd.notna(value) else ""
            if text and text != "<NA>":
                pieces.append(text)
        return " ".join(pieces)

    name_parts = pd.DataFrame({field: df_raw.get(field) for field in name_fields})
    out["Model"] = name_parts.apply(merge_model, axis=1)

    # Prefer a category column from the response; otherwise use the fallback.
    for source in ("Category", "CategoryName"):
        if source in df_raw.columns and df_raw[source].notna().any():
            out["Category"] = df_raw[source]
            break
    else:
        out["Category"] = pd.Series([category_fallback] * len(df_raw), dtype="string")

    # Columns copied through unchanged (None when absent from df_raw).
    for passthrough in ("Manufacturer", "Transmission", "FuelType", "SellType", "OfficeCityState"):
        out[passthrough] = df_raw.get(passthrough)

    # Numeric columns are cleaned with to_int_safe when present.
    for numeric in ("Year", "Mileage", "Price"):
        out[numeric] = df_raw[numeric].apply(to_int_safe) if numeric in df_raw else None

    out["Photo"] = df_raw.apply(extract_photo, axis=1)

    text_cols = ["Market", "Manufacturer", "Model", "Category",
                 "Transmission", "FuelType", "SellType",
                 "OfficeCityState", "detail_url", "Photo"]
    int_cols = ["vehicleId", "Year", "Mileage", "Price"]
    for name in text_cols:
        out[name] = out[name].astype("string")
    for name in int_cols:
        out[name] = out[name].astype("Int64")

    wanted_cols = [
        "vehicleId", "Market", "Category", "Manufacturer", "Model",
        "Transmission", "FuelType", "Year", "Mileage", "Price",
        "SellType", "OfficeCityState", "detail_url", "Photo"
    ]
    return out[wanted_cols]

# Crawl the vehicle listings of one market
def crawl_market(market_key: str, categories_en, sort="ModifiedDate", limit=10):
    """Collect up to ``limit`` listings for a market, category by category.

    For each category the function asks for the match count and, when it is
    non-zero, fetches a single page of ``limit`` results starting at offset 0.
    Categories are processed in the given order and the first ``limit`` rows
    of the concatenated pages are returned.

    Args:
        market_key: Key into ``MARKET``.
        categories_en: Category names; each is mapped through
            ``norm_cat_for_dsl`` before being put into the query.
        sort: Sort key placed in the ``sr`` parameter.
        limit: Page size of each request and maximum number of rows returned.

    Returns:
        The shaped rows, or an empty ``DataFrame`` when nothing was fetched.

    Raises:
        ValueError: If ``limit`` is not positive.
        KeyError: If ``market_key`` is not in ``MARKET``.
    """
    if limit <= 0:
        raise ValueError(f"limit must be a positive integer, got {limit!r}")
    conf = MARKET[market_key]
    frames = []
    collected = 0
    # The context manager closes the session's pooled connections on exit.
    with make_session(conf["referer"]) as s:
        for cat_en in categories_en:
            if collected >= limit:
                # Only the first `limit` rows are returned, so later
                # categories cannot contribute; skip their requests.
                break
            cat_kor = norm_cat_for_dsl(cat_en)
            action = build_action_from_categories([cat_kor], car_type=conf["car_type"])
            total = get_total_count(s, action, sort)
            if total <= 0:
                print(f"[{market_key}] {cat_en} → 0건 (skip)")
                continue
            # One page per category, starting at offset 0.
            params = {"count": "false", "q": action, "sr": f"|{sort}|0|{limit}"}
            rows = get_json(s, params).get("SearchResults", [])
            if not rows:
                continue
            raw = pd.json_normalize(rows, max_level=1)
            shaped = shape_rows(raw, pageid=conf["pageid"], category_fallback=cat_en, market_key=market_key)
            frames.append(shaped)
            collected += len(shaped)
    if not frames:
        return pd.DataFrame()
    # Truncate to `limit` (previously a hard-coded 10 that ignored `limit`).
    return pd.concat(frames, ignore_index=True).head(limit)

# Regex for extracting vehicleNo: captures the quoted value that follows a
# "vehicleNo" key. The pattern contains no ".", which is the only thing the
# re.S flag affects.
RE_VEH_NO = re.compile(r'"vehicleNo"\s*:\s*"([^"]+)"', re.S)

def fetch_vehicle_no(url, session):
    """Fetch a detail page and extract its ``vehicleNo`` value.

    This is a best-effort lookup: request failures, non-OK responses and
    pages without a match all yield ``None`` instead of raising.

    Args:
        url: Detail-page URL. Anything that is not a non-empty string is
            treated as "no URL".
        session: Session used for the GET request (6 second timeout).

    Returns:
        The first ``RE_VEH_NO`` capture in the response text, or ``None``.
    """
    if not isinstance(url, str) or not url:
        # Missing URL (None, pd.NA, ""): nothing to request.
        return None
    try:
        resp = session.get(url, timeout=6)
    except requests.RequestException:
        # Only request-level failures are swallowed; a bare ``except`` would
        # also hide KeyboardInterrupt/SystemExit.
        return None
    if not resp.ok:
        return None
    m = RE_VEH_NO.search(resp.text)
    return m.group(1) if m else None

def attach_vehicle_no(df, max_count=10):
    """Return the first ``max_count`` rows of ``df`` plus a ``vehicleNo`` column.

    Each kept row's ``detail_url`` is fetched with ``fetch_vehicle_no``, with
    a 0.1 second pause after every request.

    Args:
        df: Frame with a ``detail_url`` column. An empty frame is accepted
            even when that column is missing.
        max_count: Number of leading rows to keep and look up.

    Returns:
        A copy of ``df.head(max_count)`` with ``vehicleNo`` inserted at
        column position 1. The input frame is not modified.
    """
    limited = df.head(max_count).copy()

    retries = Retry(total=1, backoff_factor=0.2,
                    status_forcelist=[429, 500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retries, pool_connections=10, pool_maxsize=10)

    vehicle_no_list = []
    # The context manager closes the session even if a lookup raises.
    with requests.Session() as s:
        s.trust_env = False
        s.mount("https://", adapter)
        s.mount("http://", adapter)
        s.headers.update({
            "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                           "AppleWebKit/537.36 (KHTML, like Gecko) "
                           "Chrome/140.0.0.0 Safari/537.36"),
            "Referer": "https://fem.encar.com/",
            "Origin": "https://fem.encar.com",
        })

        # Iterate exactly the rows that are kept, so the list of results and
        # the frame always have the same length.
        urls = limited["detail_url"] if len(limited) else []
        for url in urls:
            vehicle_no_list.append(fetch_vehicle_no(url, s))
            time.sleep(0.1)  # brief pause to go easy on the server

    limited["vehicleNo"] = pd.Series(vehicle_no_list, index=limited.index, dtype="object")

    # Move the vehicleNo column to position 1, right after vehicleId.
    cols = limited.columns.tolist()
    cols.insert(1, cols.pop(cols.index("vehicleNo")))
    return limited[cols]

# Example run
categories_en = ["light car", "compact car", "semi-medium car", "medium car", "large car"]
df_vehicles = crawl_market("korean", categories_en, limit=10)
df_with_vehicleNo = attach_vehicle_no(df_vehicles, max_count=10)

# Bare expression as the last statement of the cell
df_with_vehicleNo


Unnamed: 0,vehicleId,vehicleNo,Market,Category,Manufacturer,Model,Transmission,FuelType,Year,Mileage,Price,SellType,OfficeCityState,detail_url,Photo
0,40487083,375누5819,korean,suv,KG모빌리티(쌍용),뷰티풀 코란도 가솔린 1.5 2WD C7,오토,가솔린,202005,62100,1950,일반,경기,https://fem.encar.com/cars/detail/40487083?pag...,/carpicture08/pic4048/40487070_
1,40481518,153모4980,korean,suv,쉐보레(GM대우),트레일블레이저 1.3 터보 2WD RS,오토,가솔린,202303,16741,2200,일반,경기,https://fem.encar.com/cars/detail/40481518?pag...,/carpicture07/pic4047/40479313_
2,40460618,215더6652,korean,suv,KG모빌리티(쌍용),토레스 가솔린 1.5 2WD T7,오토,가솔린,202211,40473,2699,일반,경기,https://fem.encar.com/cars/detail/40460618?pag...,/carpicture04/pic4044/40447930_
3,40464465,63나2043,korean,suv,현대,코나 1.6 터보 2WD 모던 아트,오토,가솔린,201811,39910,1660,일반,서울,https://fem.encar.com/cars/detail/40464465?pag...,/carpicture06/pic4046/40461716_
4,40461013,42무3457,korean,suv,기아,더 뉴 쏘렌토 디젤 2.2 2WD 프레스티지,오토,디젤,201711,112522,1520,일반,경기,https://fem.encar.com/cars/detail/40461013?pag...,/carpicture05/pic4045/40454440_
5,40457765,342머1509,korean,suv,르노코리아(삼성),XM3 1.3 TCe RE 시그니처,오토,가솔린,202109,70185,1699,일반,대구,https://fem.encar.com/cars/detail/40457765?pag...,/carpicture04/pic4044/40444651_
6,40456675,369서9066,korean,suv,현대,싼타페 (MX5) HEV 1.6 2WD 캘리그래피,오토,가솔린+전기,202402,22423,4680,일반,부산,https://fem.encar.com/cars/detail/40456675?pag...,/carpicture04/pic4044/40447801_
7,40517248,249가8301,korean,suv,기아,스포티지 5세대 가솔린 1.6 터보 2WD 시그니처,오토,가솔린,202407,19805,3450,일반,경기,https://fem.encar.com/cars/detail/40517248?pag...,/carpicture01/pic4051/40512730_
8,40513896,351가6531,korean,suv,KG모빌리티(쌍용),토레스 가솔린 1.5 2WD T7,오토,가솔린,202212,42298,2699,일반,경기,https://fem.encar.com/cars/detail/40513896?pag...,/carpicture10/pic4050/40500064_
9,40502597,285도9518,korean,suv,르노코리아(삼성),XM3 1.6 GTe LE Plus,오토,가솔린,202004,59116,1390,일반,경기,https://fem.encar.com/cars/detail/40502597?pag...,/carpicture10/pic4050/40502277_


In [None]:
import re
import time
import requests
import pandas as pd
from requests.adapters import HTTPAdapter, Retry

# Korean display label for each market key (read by shape_rows for the
# "Market" column)
MARKET_LABEL = {
    "korean": "국산",
    "foreign": "수입"
}

# Search endpoint that get_json() sends its GET requests to
BASE_URL = "https://api.encar.com/search/car/list/premium"

def make_session(referer: str) -> requests.Session:
    """Create the session used for the search API requests.

    The session ignores environment settings (``trust_env = False``), has no
    proxies, sends browser-like default headers, and retries GET requests on
    ``https://`` URLs up to 5 times for the listed status codes.

    Args:
        referer: Value of the ``referer`` header.

    Returns:
        The configured ``requests.Session``.
    """
    browser_headers = {
        "user-agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/140.0.0.0 Safari/537.36"),
        "accept": "application/json, text/plain, */*",
        "accept-language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
        "origin": "https://www.encar.com",
        "referer": referer,
    }
    retry_policy = Retry(
        total=5,
        backoff_factor=1.2,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"],
    )

    session = requests.Session()
    session.trust_env = False
    session.proxies = {}
    # Only the https:// prefix gets the retrying adapter.
    session.mount("https://", HTTPAdapter(max_retries=retry_policy))
    session.headers.update(browser_headers)
    return session

def get_json(s: requests.Session, params: dict):
    """Issue a GET to ``BASE_URL`` and return the decoded JSON body.

    Args:
        s: Session used to send the request.
        params: Query-string parameters for the request.

    Returns:
        The object produced by the response's ``json()`` method.

    Raises:
        requests.HTTPError: From ``raise_for_status()`` on an error status.
        ValueError: If the ``Content-Type`` header does not contain
            ``application/json``.
    """
    r = s.get(BASE_URL, params=params, timeout=15)
    r.raise_for_status()
    content_type = r.headers.get("Content-Type", "").lower()
    if "application/json" in content_type:
        return r.json()
    # Anything else (e.g. an HTML page) is reported together with the final URL.
    raise ValueError(f"Non-JSON: {r.url}")

def build_action_from_categories(categories, car_type="Y"):
    """Build the search query string that is sent as the ``q`` parameter.

    Args:
        categories: Category names to combine in an ``Or`` group. May be an
            iterable of names, a single name given as a plain string, or
            ``None``. Falsy and blank entries are dropped; duplicates are
            removed while keeping first-seen order.
        car_type: Value placed in the ``CarType`` clause.

    Returns:
        The query with only the ``Hidden``/``CarType`` clauses when no usable
        category remains, otherwise the same query extended with one
        ``Category`` clause per name.
    """
    if categories is None:
        categories = ()
    elif isinstance(categories, str):
        # A bare string would otherwise be iterated character by character,
        # yielding one bogus Category clause per character.
        categories = (categories,)

    stripped = (str(c).strip() for c in categories if c)
    # dict.fromkeys de-duplicates while preserving insertion order.
    names = list(dict.fromkeys(name for name in stripped if name))
    if not names:
        return f"(And.Hidden.N._.(C.CarType.{car_type}.))"
    joined = "Category." + "._.Category.".join(names) + "."
    return f"(And.Hidden.N._.(C.CarType.{car_type}._.(Or.{joined})))"

def get_total_count(s, action, sort="ModifiedDate"):
    """Ask the search endpoint for the ``Count`` value of a query.

    Args:
        s: Session forwarded to ``get_json``.
        action: Query string sent as the ``q`` parameter.
        sort: Sort key embedded in the ``sr`` parameter.

    Returns:
        The response's ``Count`` field as an int, or 0 when the field is
        missing or falsy.
    """
    query = {"count": "true", "q": action, "sr": f"|{sort}|0|1"}
    payload = get_json(s, query)
    count = payload.get("Count", 0)
    return int(count) if count else 0

def make_detail_url(cid: int, pageid: str) -> str:
    """Return the fem.encar.com detail-page URL for one listing.

    Args:
        cid: Listing id; it appears both in the path and as the ``carid``
            query parameter.
        pageid: Value of the ``pageid`` query parameter.
    """
    detail_url = f"https://fem.encar.com/cars/detail/{cid}?pageid={pageid}&listAdvType=pic&carid={cid}&view_type=normal"
    return detail_url

def to_int_safe(x):
    """Best-effort conversion of a scraped value to ``int``.

    Args:
        x: Value to convert.

    Returns:
        * ``None`` for ``None`` and for any unsupported type.
        * ``int(x)`` for real numbers, including NumPy integer and float
          scalars; ``None`` when that conversion fails (NaN, infinity).
        * For strings: commas are removed, then every run of digits is
          concatenated and converted, so ``"1,950"`` gives 1950. Signs and
          decimal points are ignored (``"12.5"`` gives 125). ``None`` when
          the string has no digits.
    """
    import numbers  # local import: only this helper needs the numeric ABCs

    if x is None:
        return None
    # numbers.Real covers int/float plus NumPy scalars; np.int64 is not an
    # ``int`` subclass, so a plain isinstance(x, (int, float)) misses it.
    if isinstance(x, numbers.Real):
        try:
            return int(x)
        except (TypeError, ValueError, OverflowError):
            # int(nan) raises ValueError, int(inf) raises OverflowError.
            return None
    if isinstance(x, str):
        digit_runs = re.findall(r"\d+", x.replace(",", ""))
        return int("".join(digit_runs)) if digit_runs else None
    return None

def extract_photo(row: pd.Series):
    """Pick a single photo reference out of one result row.

    Lookup order:
      1. ``row["Photo"]`` when it is a non-empty string.
      2. The first element of ``row["Photos"]`` when that is a non-empty
         list: a string element is returned as is; for a dict element the
         first truthy value among the known URL keys is returned.

    Returns:
        The photo reference, or ``None`` when nothing usable is found.
    """
    direct = row.get("Photo")
    if isinstance(direct, str) and direct:
        return direct

    gallery = row.get("Photos")
    if not (isinstance(gallery, list) and gallery):
        return None

    head = gallery[0]
    if isinstance(head, str):
        return head
    if isinstance(head, dict):
        url_keys = ("url", "Url", "uri", "Uri", "imageUrl", "ImageUrl")
        return next((head[key] for key in url_keys if key in head and head[key]), None)
    return None

def shape_rows(df_raw: pd.DataFrame, pageid: str, category_fallback: str, market_key: str) -> pd.DataFrame:
    """Convert normalized search results into the fixed listing table.

    Args:
        df_raw: Frame built from the ``SearchResults`` records.
        pageid: Forwarded to ``make_detail_url`` for every row.
        category_fallback: Category written to every row when ``df_raw`` has
            neither a ``Category`` nor a ``CategoryName`` column holding at
            least one non-null value.
        market_key: Looked up in ``MARKET_LABEL`` for the ``Market`` column;
            the key itself is stored when it has no label.

    Returns:
        A frame with the columns ``vehicleId, Market, Category, Manufacturer,
        Model, Transmission, FuelType, Year, Mileage, Price, SellType,
        OfficeCityState, detail_url, Photo``. The text columns are cast to
        the pandas ``"string"`` dtype and ``vehicleId``/``Year``/``Mileage``/
        ``Price`` to ``"Int64"``.

    Raises:
        KeyError: If ``df_raw`` contains none of the recognised id columns.
    """
    id_col = None
    for candidate in ("vehicleId", "VehicleId", "id", "Id", "carId", "carid"):
        if candidate in df_raw.columns:
            id_col = candidate
            break
    if id_col is None:
        raise KeyError("vehicleId column not found in SearchResults")

    out = pd.DataFrame()
    out["vehicleId"] = df_raw[id_col].apply(to_int_safe)
    # Convert the market key to its Korean label.
    out["Market"] = MARKET_LABEL.get(market_key, market_key)
    # Built from the ids before they are cast to Int64 further down.
    out["detail_url"] = out["vehicleId"].map(
        lambda vid: make_detail_url(vid, pageid) if pd.notna(vid) else None
    )

    # Join Model, Badge and BadgeDetail into one display name.
    name_fields = ("Model", "Badge", "BadgeDetail")

    def merge_model(row):
        pieces = []
        for field in name_fields:
            value = row[field]
            text = str(value).strip() if pd.notna(value) else ""
            if text and text != "<NA>":
                pieces.append(text)
        return " ".join(pieces)

    name_parts = pd.DataFrame({field: df_raw.get(field) for field in name_fields})
    out["Model"] = name_parts.apply(merge_model, axis=1)

    # Use the category value from the response as is; otherwise the fallback.
    for source in ("Category", "CategoryName"):
        if source in df_raw.columns and df_raw[source].notna().any():
            out["Category"] = df_raw[source]
            break
    else:
        out["Category"] = pd.Series([category_fallback] * len(df_raw), dtype="string")

    # Columns copied through unchanged (None when absent from df_raw).
    for passthrough in ("Manufacturer", "Transmission", "FuelType", "SellType", "OfficeCityState"):
        out[passthrough] = df_raw.get(passthrough)

    # Numeric columns are cleaned with to_int_safe when present.
    for numeric in ("Year", "Mileage", "Price"):
        out[numeric] = df_raw[numeric].apply(to_int_safe) if numeric in df_raw else None

    out["Photo"] = df_raw.apply(extract_photo, axis=1)

    text_cols = ["Market", "Manufacturer", "Model", "Category",
                 "Transmission", "FuelType", "SellType",
                 "OfficeCityState", "detail_url", "Photo"]
    int_cols = ["vehicleId", "Year", "Mileage", "Price"]
    for name in text_cols:
        out[name] = out[name].astype("string")
    for name in int_cols:
        out[name] = out[name].astype("Int64")

    wanted_cols = [
        "vehicleId", "Market", "Category", "Manufacturer", "Model",
        "Transmission", "FuelType", "Year", "Mileage", "Price",
        "SellType", "OfficeCityState", "detail_url", "Photo"
    ]
    return out[wanted_cols]

def crawl_market(market_key: str, categories_kor, sort="ModifiedDate", limit=10):
    """Collect up to ``limit`` listings for a market, category by category.

    For each category the function asks for the match count and, when it is
    non-zero, fetches a single page of ``limit`` results starting at offset 0.
    Categories are processed in the given order and the first ``limit`` rows
    of the concatenated pages are returned.

    Args:
        market_key: ``"korean"`` or ``"foreign"``.
        categories_kor: Category names, used in the query exactly as given
            (Korean names are expected).
        sort: Sort key placed in the ``sr`` parameter.
        limit: Page size of each request and maximum number of rows returned.

    Returns:
        The shaped rows, or an empty ``DataFrame`` when nothing was fetched.

    Raises:
        ValueError: If ``limit`` is not positive.
        KeyError: If ``market_key`` is not a known market.
    """
    if limit <= 0:
        raise ValueError(f"limit must be a positive integer, got {limit!r}")
    conf = {
        "korean": {"car_type": "Y", "referer": "https://www.encar.com/dc/dc_carsearchlist.do", "pageid": "dc_carsearch"},
        "foreign": {"car_type": "N", "referer": "https://www.encar.com/fc/fc_carsearchlist.do", "pageid": "fc_carsearch"},
    }[market_key]
    frames = []
    collected = 0
    # The context manager closes the session's pooled connections on exit.
    with make_session(conf["referer"]) as s:
        for cat_kor in categories_kor:
            if collected >= limit:
                # Only the first `limit` rows are returned, so later
                # categories cannot contribute; skip their requests.
                break
            action = build_action_from_categories([cat_kor], car_type=conf["car_type"])
            total = get_total_count(s, action, sort)
            if total <= 0:
                print(f"[{market_key}] {cat_kor} → 0건 (skip)")
                continue
            # One page per category, starting at offset 0.
            params = {"count": "false", "q": action, "sr": f"|{sort}|0|{limit}"}
            rows = get_json(s, params).get("SearchResults", [])
            if not rows:
                continue
            raw = pd.json_normalize(rows, max_level=1)
            shaped = shape_rows(raw, pageid=conf["pageid"], category_fallback=cat_kor, market_key=market_key)
            frames.append(shaped)
            collected += len(shaped)
    if not frames:
        return pd.DataFrame()
    # Truncate to `limit` (previously a hard-coded 10 that ignored `limit`).
    return pd.concat(frames, ignore_index=True).head(limit)

# Captures the quoted value that follows a "vehicleNo" key. The pattern
# contains no ".", which is the only thing the re.S flag affects.
RE_VEH_NO = re.compile(r'"vehicleNo"\s*:\s*"([^"]+)"', re.S)

def fetch_vehicle_no(url, session):
    """Fetch a detail page and extract its ``vehicleNo`` value.

    This is a best-effort lookup: request failures, non-OK responses and
    pages without a match all yield ``None`` instead of raising.

    Args:
        url: Detail-page URL. Anything that is not a non-empty string is
            treated as "no URL".
        session: Session used for the GET request (6 second timeout).

    Returns:
        The first ``RE_VEH_NO`` capture in the response text, or ``None``.
    """
    if not isinstance(url, str) or not url:
        # Missing URL (None, pd.NA, ""): nothing to request.
        return None
    try:
        resp = session.get(url, timeout=6)
    except requests.RequestException:
        # Only request-level failures are swallowed; a bare ``except`` would
        # also hide KeyboardInterrupt/SystemExit.
        return None
    if not resp.ok:
        return None
    m = RE_VEH_NO.search(resp.text)
    return m.group(1) if m else None

def attach_vehicle_no(df, max_count=10):
    """Return the first ``max_count`` rows of ``df`` plus a ``vehicleNo`` column.

    Each kept row's ``detail_url`` is fetched with ``fetch_vehicle_no``, with
    a 0.1 second pause after every request.

    Args:
        df: Frame with a ``detail_url`` column. An empty frame is accepted
            even when that column is missing.
        max_count: Number of leading rows to keep and look up.

    Returns:
        A copy of ``df.head(max_count)`` with ``vehicleNo`` inserted at
        column position 1. The input frame is not modified.
    """
    limited = df.head(max_count).copy()

    retries = Retry(total=1, backoff_factor=0.2,
                    status_forcelist=[429, 500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retries, pool_connections=10, pool_maxsize=10)

    vehicle_no_list = []
    # The context manager closes the session even if a lookup raises.
    with requests.Session() as s:
        s.trust_env = False
        s.mount("https://", adapter)
        s.mount("http://", adapter)
        s.headers.update({
            "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                           "AppleWebKit/537.36 (KHTML, like Gecko) "
                           "Chrome/140.0.0.0 Safari/537.36"),
            "Referer": "https://fem.encar.com/",
            "Origin": "https://fem.encar.com",
        })

        # Iterate exactly the rows that are kept, so the list of results and
        # the frame always have the same length.
        urls = limited["detail_url"] if len(limited) else []
        for url in urls:
            vehicle_no_list.append(fetch_vehicle_no(url, s))
            time.sleep(0.1)  # brief pause to go easy on the server

    limited["vehicleNo"] = pd.Series(vehicle_no_list, index=limited.index, dtype="object")

    # Move the vehicleNo column to position 1.
    cols = limited.columns.tolist()
    cols.insert(1, cols.pop(cols.index("vehicleNo")))
    return limited[cols]

# Usage example: pass the category list in Korean
categories_kor = ["경차", "소형차", "준중형차", "중형차", "대형차"]
df_vehicles = crawl_market("korean", categories_kor, limit=100)
df_with_vehicleNo = attach_vehicle_no(df_vehicles, max_count=100)
# Bare expression as the last statement of the cell
df_with_vehicleNo


KeyError: 'SUV'

In [None]:
import re
import time
import requests
import pandas as pd
from requests.adapters import HTTPAdapter, Retry

# Korean display label for each market key (read by shape_rows for the
# "Market" column)
MARKET_LABEL = {
    "korean": "국산",
    "foreign": "수입"
}

# Search endpoint that get_json() sends its GET requests to
BASE_URL = "https://api.encar.com/search/car/list/premium"

def make_session(referer: str) -> requests.Session:
    """Create the session used for the search API requests.

    The session ignores environment settings (``trust_env = False``), has no
    proxies, sends browser-like default headers, and retries GET requests on
    ``https://`` URLs up to 5 times for the listed status codes.

    Args:
        referer: Value of the ``referer`` header.

    Returns:
        The configured ``requests.Session``.
    """
    browser_headers = {
        "user-agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/140.0.0.0 Safari/537.36"),
        "accept": "application/json, text/plain, */*",
        "accept-language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
        "origin": "https://www.encar.com",
        "referer": referer,
    }
    retry_policy = Retry(
        total=5,
        backoff_factor=1.2,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"],
    )

    session = requests.Session()
    session.trust_env = False
    session.proxies = {}
    # Only the https:// prefix gets the retrying adapter.
    session.mount("https://", HTTPAdapter(max_retries=retry_policy))
    session.headers.update(browser_headers)
    return session

def get_json(s: requests.Session, params: dict):
    """Issue a GET to ``BASE_URL`` and return the decoded JSON body.

    Args:
        s: Session used to send the request.
        params: Query-string parameters for the request.

    Returns:
        The object produced by the response's ``json()`` method.

    Raises:
        requests.HTTPError: From ``raise_for_status()`` on an error status.
        ValueError: If the ``Content-Type`` header does not contain
            ``application/json``.
    """
    r = s.get(BASE_URL, params=params, timeout=15)
    r.raise_for_status()
    content_type = r.headers.get("Content-Type", "").lower()
    if "application/json" in content_type:
        return r.json()
    # Anything else (e.g. an HTML page) is reported together with the final URL.
    raise ValueError(f"Non-JSON: {r.url}")

def build_action_from_categories(categories, car_type="Y"):
    """Build the search query string that is sent as the ``q`` parameter.

    Args:
        categories: Category names to combine in an ``Or`` group. May be an
            iterable of names, a single name given as a plain string, or
            ``None``. Falsy and blank entries are dropped; duplicates are
            removed while keeping first-seen order.
        car_type: Value placed in the ``CarType`` clause.

    Returns:
        The query with only the ``Hidden``/``CarType`` clauses when no usable
        category remains, otherwise the same query extended with one
        ``Category`` clause per name.
    """
    if categories is None:
        categories = ()
    elif isinstance(categories, str):
        # A bare string would otherwise be iterated character by character,
        # yielding one bogus Category clause per character.
        categories = (categories,)

    stripped = (str(c).strip() for c in categories if c)
    # dict.fromkeys de-duplicates while preserving insertion order.
    names = list(dict.fromkeys(name for name in stripped if name))
    if not names:
        return f"(And.Hidden.N._.(C.CarType.{car_type}.))"
    joined = "Category." + "._.Category.".join(names) + "."
    return f"(And.Hidden.N._.(C.CarType.{car_type}._.(Or.{joined})))"

def get_total_count(s, action, sort="ModifiedDate"):
    """Ask the search endpoint for the ``Count`` value of a query.

    Args:
        s: Session forwarded to ``get_json``.
        action: Query string sent as the ``q`` parameter.
        sort: Sort key embedded in the ``sr`` parameter.

    Returns:
        The response's ``Count`` field as an int, or 0 when the field is
        missing or falsy.
    """
    query = {"count": "true", "q": action, "sr": f"|{sort}|0|1"}
    payload = get_json(s, query)
    count = payload.get("Count", 0)
    return int(count) if count else 0

def make_detail_url(cid: int, pageid: str) -> str:
    """Return the fem.encar.com detail-page URL for one listing.

    Args:
        cid: Listing id; it appears both in the path and as the ``carid``
            query parameter.
        pageid: Value of the ``pageid`` query parameter.
    """
    detail_url = f"https://fem.encar.com/cars/detail/{cid}?pageid={pageid}&listAdvType=pic&carid={cid}&view_type=normal"
    return detail_url

def to_int_safe(x):
    """Best-effort conversion of a scraped value to ``int``.

    Args:
        x: Value to convert.

    Returns:
        * ``None`` for ``None`` and for any unsupported type.
        * ``int(x)`` for real numbers, including NumPy integer and float
          scalars; ``None`` when that conversion fails (NaN, infinity).
        * For strings: commas are removed, then every run of digits is
          concatenated and converted, so ``"1,950"`` gives 1950. Signs and
          decimal points are ignored (``"12.5"`` gives 125). ``None`` when
          the string has no digits.
    """
    import numbers  # local import: only this helper needs the numeric ABCs

    if x is None:
        return None
    # numbers.Real covers int/float plus NumPy scalars; np.int64 is not an
    # ``int`` subclass, so a plain isinstance(x, (int, float)) misses it.
    if isinstance(x, numbers.Real):
        try:
            return int(x)
        except (TypeError, ValueError, OverflowError):
            # int(nan) raises ValueError, int(inf) raises OverflowError.
            return None
    if isinstance(x, str):
        digit_runs = re.findall(r"\d+", x.replace(",", ""))
        return int("".join(digit_runs)) if digit_runs else None
    return None

def extract_photo(row: pd.Series):
    """Pick a single photo reference out of one result row.

    Lookup order:
      1. ``row["Photo"]`` when it is a non-empty string.
      2. The first element of ``row["Photos"]`` when that is a non-empty
         list: a string element is returned as is; for a dict element the
         first truthy value among the known URL keys is returned.

    Returns:
        The photo reference, or ``None`` when nothing usable is found.
    """
    direct = row.get("Photo")
    if isinstance(direct, str) and direct:
        return direct

    gallery = row.get("Photos")
    if not (isinstance(gallery, list) and gallery):
        return None

    head = gallery[0]
    if isinstance(head, str):
        return head
    if isinstance(head, dict):
        url_keys = ("url", "Url", "uri", "Uri", "imageUrl", "ImageUrl")
        return next((head[key] for key in url_keys if key in head and head[key]), None)
    return None

def shape_rows(df_raw: pd.DataFrame, pageid: str, category_fallback: str, market_key: str) -> pd.DataFrame:
    """Convert normalized search results into the fixed listing table.

    Args:
        df_raw: Frame built from the ``SearchResults`` records.
        pageid: Forwarded to ``make_detail_url`` for every row.
        category_fallback: Category written to every row when ``df_raw`` has
            neither a ``Category`` nor a ``CategoryName`` column holding at
            least one non-null value.
        market_key: Looked up in ``MARKET_LABEL`` for the ``Market`` column;
            the key itself is stored when it has no label.

    Returns:
        A frame with the columns ``vehicleId, Market, Category, Manufacturer,
        Model, Transmission, FuelType, Year, Mileage, Price, SellType,
        OfficeCityState, detail_url, Photo``. The text columns are cast to
        the pandas ``"string"`` dtype and ``vehicleId``/``Year``/``Mileage``/
        ``Price`` to ``"Int64"``.

    Raises:
        KeyError: If ``df_raw`` contains none of the recognised id columns.
    """
    id_col = None
    for candidate in ("vehicleId", "VehicleId", "id", "Id", "carId", "carid"):
        if candidate in df_raw.columns:
            id_col = candidate
            break
    if id_col is None:
        raise KeyError("vehicleId column not found in SearchResults")

    out = pd.DataFrame()
    out["vehicleId"] = df_raw[id_col].apply(to_int_safe)
    # Convert the market key to its Korean label.
    out["Market"] = MARKET_LABEL.get(market_key, market_key)
    # Built from the ids before they are cast to Int64 further down.
    out["detail_url"] = out["vehicleId"].map(
        lambda vid: make_detail_url(vid, pageid) if pd.notna(vid) else None
    )

    # Join Model, Badge and BadgeDetail into one display name.
    name_fields = ("Model", "Badge", "BadgeDetail")

    def merge_model(row):
        pieces = []
        for field in name_fields:
            value = row[field]
            text = str(value).strip() if pd.notna(value) else ""
            if text and text != "<NA>":
                pieces.append(text)
        return " ".join(pieces)

    name_parts = pd.DataFrame({field: df_raw.get(field) for field in name_fields})
    out["Model"] = name_parts.apply(merge_model, axis=1)

    # Use the category value from the response as is; otherwise the fallback.
    for source in ("Category", "CategoryName"):
        if source in df_raw.columns and df_raw[source].notna().any():
            out["Category"] = df_raw[source]
            break
    else:
        out["Category"] = pd.Series([category_fallback] * len(df_raw), dtype="string")

    # Columns copied through unchanged (None when absent from df_raw).
    for passthrough in ("Manufacturer", "Transmission", "FuelType", "SellType", "OfficeCityState"):
        out[passthrough] = df_raw.get(passthrough)

    # Numeric columns are cleaned with to_int_safe when present.
    for numeric in ("Year", "Mileage", "Price"):
        out[numeric] = df_raw[numeric].apply(to_int_safe) if numeric in df_raw else None

    out["Photo"] = df_raw.apply(extract_photo, axis=1)

    text_cols = ["Market", "Manufacturer", "Model", "Category",
                 "Transmission", "FuelType", "SellType",
                 "OfficeCityState", "detail_url", "Photo"]
    int_cols = ["vehicleId", "Year", "Mileage", "Price"]
    for name in text_cols:
        out[name] = out[name].astype("string")
    for name in int_cols:
        out[name] = out[name].astype("Int64")

    wanted_cols = [
        "vehicleId", "Market", "Category", "Manufacturer", "Model",
        "Transmission", "FuelType", "Year", "Mileage", "Price",
        "SellType", "OfficeCityState", "detail_url", "Photo"
    ]
    return out[wanted_cols]

def crawl_market(market_key: str, categories_kor, sort="ModifiedDate", limit=10):
    """Collect up to ``limit`` listings for a market, category by category.

    For each category the function asks for the match count and, when it is
    non-zero, fetches a single page of ``limit`` results starting at offset 0.
    Categories are processed in the given order and the first ``limit`` rows
    of the concatenated pages are returned.

    Args:
        market_key: ``"korean"`` or ``"foreign"``.
        categories_kor: Category names, used in the query exactly as given
            (Korean names are expected).
        sort: Sort key placed in the ``sr`` parameter.
        limit: Page size of each request and maximum number of rows returned.

    Returns:
        The shaped rows, or an empty ``DataFrame`` when nothing was fetched.

    Raises:
        ValueError: If ``limit`` is not positive.
        KeyError: If ``market_key`` is not a known market.
    """
    if limit <= 0:
        raise ValueError(f"limit must be a positive integer, got {limit!r}")
    conf = {
        "korean": {"car_type": "Y", "referer": "https://www.encar.com/dc/dc_carsearchlist.do", "pageid": "dc_carsearch"},
        "foreign": {"car_type": "N", "referer": "https://www.encar.com/fc/fc_carsearchlist.do", "pageid": "fc_carsearch"},
    }[market_key]
    frames = []
    collected = 0
    # The context manager closes the session's pooled connections on exit.
    with make_session(conf["referer"]) as s:
        for cat_kor in categories_kor:
            if collected >= limit:
                # Only the first `limit` rows are returned, so later
                # categories cannot contribute; skip their requests.
                break
            action = build_action_from_categories([cat_kor], car_type=conf["car_type"])
            total = get_total_count(s, action, sort)
            if total <= 0:
                print(f"[{market_key}] {cat_kor} → 0건 (skip)")
                continue
            # One page per category, starting at offset 0.
            params = {"count": "false", "q": action, "sr": f"|{sort}|0|{limit}"}
            rows = get_json(s, params).get("SearchResults", [])
            if not rows:
                continue
            raw = pd.json_normalize(rows, max_level=1)
            shaped = shape_rows(raw, pageid=conf["pageid"], category_fallback=cat_kor, market_key=market_key)
            frames.append(shaped)
            collected += len(shaped)
    if not frames:
        return pd.DataFrame()
    # Truncate to `limit` (previously a hard-coded 10 that ignored `limit`).
    return pd.concat(frames, ignore_index=True).head(limit)

# Captures the quoted value that follows a "vehicleNo" key. The pattern
# contains no ".", which is the only thing the re.S flag affects.
RE_VEH_NO = re.compile(r'"vehicleNo"\s*:\s*"([^"]+)"', re.S)

def fetch_vehicle_no(url, session):
    """Fetch a detail page and extract its ``vehicleNo`` value.

    This is a best-effort lookup: request failures, non-OK responses and
    pages without a match all yield ``None`` instead of raising.

    Args:
        url: Detail-page URL. Anything that is not a non-empty string is
            treated as "no URL".
        session: Session used for the GET request (6 second timeout).

    Returns:
        The first ``RE_VEH_NO`` capture in the response text, or ``None``.
    """
    if not isinstance(url, str) or not url:
        # Missing URL (None, pd.NA, ""): nothing to request.
        return None
    try:
        resp = session.get(url, timeout=6)
    except requests.RequestException:
        # Only request-level failures are swallowed; a bare ``except`` would
        # also hide KeyboardInterrupt/SystemExit.
        return None
    if not resp.ok:
        return None
    m = RE_VEH_NO.search(resp.text)
    return m.group(1) if m else None

def attach_vehicle_no(df, max_count=10):
    """Return the first ``max_count`` rows of ``df`` plus a ``vehicleNo`` column.

    Each kept row's ``detail_url`` is fetched with ``fetch_vehicle_no``, with
    a 0.1 second pause after every request.

    Args:
        df: Frame with a ``detail_url`` column. An empty frame is accepted
            even when that column is missing.
        max_count: Number of leading rows to keep and look up.

    Returns:
        A copy of ``df.head(max_count)`` with ``vehicleNo`` inserted at
        column position 1. The input frame is not modified.
    """
    limited = df.head(max_count).copy()

    retries = Retry(total=1, backoff_factor=0.2,
                    status_forcelist=[429, 500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retries, pool_connections=10, pool_maxsize=10)

    vehicle_no_list = []
    # The context manager closes the session even if a lookup raises.
    with requests.Session() as s:
        s.trust_env = False
        s.mount("https://", adapter)
        s.mount("http://", adapter)
        s.headers.update({
            "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                           "AppleWebKit/537.36 (KHTML, like Gecko) "
                           "Chrome/140.0.0.0 Safari/537.36"),
            "Referer": "https://fem.encar.com/",
            "Origin": "https://fem.encar.com",
        })

        # Iterate exactly the rows that are kept, so the list of results and
        # the frame always have the same length.
        urls = limited["detail_url"] if len(limited) else []
        for url in urls:
            vehicle_no_list.append(fetch_vehicle_no(url, s))
            time.sleep(0.1)  # brief pause to go easy on the server

    limited["vehicleNo"] = pd.Series(vehicle_no_list, index=limited.index, dtype="object")

    # Move the vehicleNo column to position 1.
    cols = limited.columns.tolist()
    cols.insert(1, cols.pop(cols.index("vehicleNo")))
    return limited[cols]

# NOTE: categories_kor is not assigned anywhere in this cell; it must already
# exist in the session (an earlier cell assigns it), otherwise this raises
# NameError.
df_vehicles = crawl_market("korean", categories_kor, limit=10)
df_with_vehicleNo = attach_vehicle_no(df_vehicles, max_count=10)
# Bare expression as the last statement of the cell
df_with_vehicleNo


Unnamed: 0,vehicleId,vehicleNo,Market,Category,Manufacturer,Model,Transmission,FuelType,Year,Mileage,Price,SellType,OfficeCityState,detail_url,Photo
0,39978807,109버2830,국산,경차,기아,더 뉴 레이 밴 고급형,오토,가솔린,201907,66751,750,일반,서울,https://fem.encar.com/cars/detail/39978807?pag...,/carpicture06/pic3996/39960846_
1,40481733,152호4628,국산,경차,기아,모닝 어반 (JA) 프레스티지,오토,가솔린,202206,55155,1310,일반,서울,https://fem.encar.com/cars/detail/40481733?pag...,/carpicture07/pic4047/40476629_
2,39368385,290구7822,국산,경차,기아,더 뉴 레이 밴 스탠다드,오토,가솔린,202101,94556,750,일반,경기,https://fem.encar.com/cars/detail/39368385?pag...,/carpicture06/pic3936/39367887_
3,39368528,213로5228,국산,경차,기아,더 뉴 레이 밴 스탠다드,오토,가솔린,202009,60045,750,일반,경기,https://fem.encar.com/cars/detail/39368528?pag...,/carpicture06/pic3936/39367895_
4,39793494,192두4853,국산,경차,기아,더 뉴 레이 스탠다드,오토,가솔린,202201,41293,1360,일반,대전,https://fem.encar.com/cars/detail/39793494?pag...,/carpicture08/pic3978/39786844_
5,40154328,153노2744,국산,경차,기아,모닝 어반 (JA) 시그니처 (세부등급 없음),오토,가솔린,202106,34787,1540,일반,인천,https://fem.encar.com/cars/detail/40154328?pag...,/carpicture04/pic4014/40143190_
6,39967714,329마7327,국산,경차,기아,더 뉴 레이 밴 럭셔리 스페셜,오토,가솔린,202004,52049,920,일반,경기,https://fem.encar.com/cars/detail/39967714?pag...,/carpicture06/pic3996/39965246_
7,40316831,280버9625,국산,경차,기아,더 뉴 레이 프레스티지,오토,가솔린,202104,93422,1099,일반,대구,https://fem.encar.com/cars/detail/40316831?pag...,/carpicture10/pic4030/40303370_
8,40267920,378어2255,국산,경차,기아,더 뉴 레이 밴 프레스티지 스페셜(1인승),오토,가솔린,202204,7324,1190,일반,경기,https://fem.encar.com/cars/detail/40267920?pag...,/carpicture06/pic4026/40266829_
9,40166386,348버1233,국산,경차,기아,더 뉴 레이 밴 고급형,오토,가솔린,201903,84883,850,일반,울산,https://fem.encar.com/cars/detail/40166386?pag...,/carpicture06/pic4016/40164516_
