## 기존 CSV 데이터의 URL에 접속해 필요한 데이터만 크롤링
- 포지션명, 경력, 주요업무, 자격요건, 우대사항

In [2]:
import json, re
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# 전처리 함수 
def preprocess_text(text: str) -> str:
    """Normalize scraped text and drop repeated lines.

    Bullet characters ('•' and 'ㆍ') are replaced with spaces, runs of
    whitespace inside a line are collapsed to a single space, every line is
    stripped, and empty lines are discarded. Only the first occurrence of
    each line is kept; the original order is preserved.

    Args:
        text: Raw text whose items are separated by newline characters.

    Returns:
        The unique, non-empty, stripped lines joined by single newline
        characters. An empty string if nothing remains.
    """
    text = re.sub(r'[•ㆍ]', ' ', text)
    # Collapse whitespace *within* a line only. Matching the newline here
    # would merge all lines into one, so the per-line de-duplication below
    # would never remove anything.
    text = re.sub(r'[^\S\n]+', ' ', text)

    # dict.fromkeys keeps the first occurrence of each line in order.
    unique_lines = dict.fromkeys(line.strip() for line in text.split('\n'))
    return "\n".join(line for line in unique_lines if line)

def scrape_jobs_to_json(csv_file_path, output_json_path):
    """Scrape the job postings listed in a CSV file and save them as JSON.

    For every row of the CSV, the page at the row's "URL" value is opened in
    Chrome. The position name (first <h1>), the experience text, and the
    text of the "주요업무" / "자격요건" / "우대사항" sections are collected,
    cleaned with ``preprocess_text`` and appended to the result list. The
    list is written to ``output_json_path`` once, after all rows have been
    processed.

    The position name and experience fall back to "N/A" when they cannot be
    read. A posting whose description container cannot be read is reported
    on stdout and left out of the output.

    Args:
        csv_file_path: Path of a CSV file that has a "URL" column.
        output_json_path: Path of the JSON file to write (UTF-8).
    """
    CHROMEDRIVER_PATH = r"./data/chromedriver.exe"

    # Keyword looked for in an <h3> heading -> output field that the
    # following <span> texts belong to. Order matters: first match wins.
    section_by_keyword = {
        "주요업무": "MainTask",
        "자격요건": "Qualification",
        "우대사항": "Preferred",
    }

    # Read the CSV *before* launching the browser so that a missing or
    # malformed file cannot leave a Chrome process running.
    df = pd.read_csv(csv_file_path)
    total = len(df)
    job_list = []

    service = Service(CHROMEDRIVER_PATH)
    driver = webdriver.Chrome(service=service)

    try:
        wait = WebDriverWait(driver, 10)

        # Count rows with enumerate: the DataFrame index label is not
        # guaranteed to be a 0-based integer.
        for row_number, (_, row) in enumerate(df.iterrows(), start=1):
            job_url = row["URL"]
            print(f"\n🔍 크롤링 중 ({row_number}/{total}): {job_url}")

            driver.get(job_url)
            time.sleep(2)

            # 1) Position name and experience. `except Exception` (not a
            #    bare except) keeps Ctrl-C / SystemExit working.
            try:
                position_name = driver.find_element(By.CSS_SELECTOR, "h1").text.strip()
            except Exception:
                position_name = "N/A"

            try:
                # NOTE(review): the class name looks build-generated and may
                # change when the site is redeployed - confirm periodically.
                exp_elements = driver.find_elements(
                    By.CSS_SELECTOR,
                    "span.JobHeader_JobHeader__Tools__Company__Info__yT4OD",
                )
                experience = exp_elements[1].text.strip() if len(exp_elements) > 1 else "N/A"
            except Exception:
                experience = "N/A"

            # 2) Expand the description if a "show more details" button is
            #    present. Best effort: a missing button is not an error.
            try:
                more_btn = wait.until(
                    EC.element_to_be_clickable(
                        (By.XPATH, "//button[span[contains(text(),'상세 정보 더 보기')]]")
                    )
                )
                driver.execute_script("arguments[0].click();", more_btn)
                time.sleep(2)
            except Exception:
                pass

            # 3) Walk the description in document order. An <h3> selects the
            #    current section (or none); each <span> after it is added to
            #    that section. All other tags are ignored.
            sections = {name: [] for name in section_by_keyword.values()}
            current_section = None

            try:
                desc_container = driver.find_element(
                    By.CSS_SELECTOR, "article.JobDescription_JobDescription__dq8G5"
                )

                for el in desc_container.find_elements(By.XPATH, ".//*"):
                    tag_name = el.tag_name.lower()

                    # Text is read only for the tags that are used; each
                    # read is a separate call to the browser.
                    if tag_name == "h3":
                        heading = el.text.strip()
                        current_section = next(
                            (
                                name
                                for keyword, name in section_by_keyword.items()
                                if keyword in heading
                            ),
                            None,
                        )
                    elif tag_name == "span" and current_section:
                        sections[current_section].append(el.text.strip())

            except Exception as e:
                print(f"오류 발생: {e}")
                continue

            # 4) Clean each section and store the record.
            job_list.append({
                "PositionName": position_name,
                "Experience": experience,
                "MainTask": preprocess_text("\n".join(sections["MainTask"])),
                "Qualification": preprocess_text("\n".join(sections["Qualification"])),
                "Preferred": preprocess_text("\n".join(sections["Preferred"])),
            })

        # Save the collected records as JSON.
        with open(output_json_path, "w", encoding="utf-8") as f:
            json.dump(job_list, f, ensure_ascii=False, indent=4)

        print("\n✅ 크롤링 완료 및 JSON 저장 성공")

    finally:
        # Always close the browser, even when scraping or saving fails.
        driver.quit()

# Run the scraper at top level: read the job URLs from `input_csv` and write
# the scraped records to `output_json`.
input_csv = "./wanted_merged(1).csv"
output_json = "./jobs.json"
scrape_jobs_to_json(input_csv, output_json)


🔍 크롤링 중 (1/3): https://www.wanted.co.kr/wd/270338

🔍 크롤링 중 (2/3): https://www.wanted.co.kr/wd/263703

🔍 크롤링 중 (3/3): https://www.wanted.co.kr/wd/268265

✅ 크롤링 완료 및 JSON 저장 성공
