In [22]:
from bs4 import BeautifulSoup
from util.file_handler import load_file, saveAsJson, make_file_path, iterator_dir_children, create_directory_if_not_exists

In [23]:
# Fetch every file under the source directory (the original note describes the
# result of iterator_dir_children as a generator).
SOURCE_DIR = "wanted"

# Two separate iterators over the same directory, created in the same order as
# before: one feeds the HTML -> JSON parsing, the other the output file names.
file_children_for_json, file_children_for_file_name = (
    iterator_dir_children(SOURCE_DIR),
    iterator_dir_children(SOURCE_DIR),
)

In [24]:
# Prepare the storage location: ROOT_DIR is the folder the parsed JSON files
# are written into by the save step.
# NOTE(review): presumably the helper creates the directory only when it is
# absent (going by its name) — confirm in util.file_handler.
ROOT_DIR = "wanted_json"

create_directory_if_not_exists(make_file_path(ROOT_DIR))

In [25]:
def html_to_soup(file_path):
    """Load the file at ``file_path`` and parse it into a BeautifulSoup tree.

    Whatever ``load_file`` returns for the path is handed to BeautifulSoup
    together with the built-in ``"html.parser"`` backend.
    """
    return BeautifulSoup(load_file(file_path), "html.parser")

In [26]:
def get_image_list(section_image):
    """Return the ``src`` URLs of every ``<img>`` inside the image section.

    Args:
        section_image: The parsed image section, or ``None`` when the page
            has no such section.

    Returns:
        ``None`` when the section is missing, otherwise a list of ``src``
        values in document order. Images without a (non-empty) ``src``
        attribute are skipped instead of raising ``KeyError``, so one
        malformed tag cannot abort the whole run.
    """
    if not section_image:
        return None

    sources = []
    for image_tag in section_image.find_all(name="img"):
        # .get() returns None for a missing attribute where [] would raise.
        src = image_tag.get("src")
        if src:
            sources.append(src)
    return sources

def get_job_name(section_job_header):
    """Return the job title taken from the header section's first ``<h2>``.

    Args:
        section_job_header: The parsed job-header section, or ``None``.

    Returns:
        The ``.string`` of the ``<h2>`` element, or ``None`` when either the
        section or the ``<h2>`` is missing (previously a missing ``<h2>``
        raised ``AttributeError``).
    """
    if not section_job_header:
        return None

    heading = section_job_header.h2
    if heading is None:
        return None
    return heading.string

def get_company_name(section_job_header):
    """Return the company name from the header section's company link.

    The link is the ``<a>`` element whose ``data-attribute-id`` attribute is
    ``"company__click"``.

    Args:
        section_job_header: The parsed job-header section, or ``None``.

    Returns:
        The ``.string`` of the company link, or ``None`` when either the
        section or the link is missing (previously a missing link raised
        ``AttributeError``).
    """
    if not section_job_header:
        return None

    company_link = section_job_header.find(
        name="a", attrs={"data-attribute-id": "company__click"}
    )
    if company_link is None:
        return None
    return company_link.string

def get_tags(section_job_header):
    """Return the tag labels listed in the job-header section.

    Tags are read from the ``<li>`` items inside the ``div`` with class
    ``"Tags_tagsClass__mvehZ"``; each label is the ``.string`` of the item's
    ``<a>`` element.

    Args:
        section_job_header: The parsed job-header section, or ``None``.

    Returns:
        ``None`` when the section is missing; an empty list when the section
        has no tag container; otherwise the labels in document order. Items
        without an ``<a>`` child are skipped. Previously both of those cases
        raised ``AttributeError``.
    """
    if not section_job_header:
        return None

    tag_container = section_job_header.find(
        name="div", attrs={"class": "Tags_tagsClass__mvehZ"}
    )
    if tag_container is None:
        return []

    return [
        item.a.string
        for item in tag_container.find_all(name="li")
        if item.a is not None
    ]

def get_job_description(section_job_description):
    """Return the job-description section rendered with ``str()``.

    A falsy argument (e.g. ``None`` for a missing section) yields ``None``.
    """
    if section_job_description:
        return str(section_job_description)
    return None

def get_location(section_location):
    """Return the workplace location text from the location section.

    The location is read from the ``<span>`` with class ``"body"``.

    Args:
        section_location: The parsed work-place section, or ``None``.

    Returns:
        The span's text with surrounding whitespace stripped, or ``None``
        when either the section or the span is missing.

    Note:
        This used to return the span element itself rather than its text.
        The other getters return text, and an element object cannot be
        encoded by the standard JSON encoder, so the text is returned now.
    """
    if not section_location:
        return None

    location_span = section_location.find(name="span", attrs={"class": "body"})
    if location_span is None:
        return None
    return location_span.get_text(strip=True)


In [27]:
def soup_to_json(soup):
    """Convert a parsed job-posting page into a plain dict of its fields.

    Looks up the job-content ``div`` and, inside it, the image, header,
    description and work-place sections, then delegates each field to its
    ``get_*`` helper. Returns an empty dict when the content ``div`` is not
    found.
    """
    content = soup.find(name="div", attrs={"class": "JobContent_className___ca57"})
    if not content:
        return {}

    def find_section(css_class):
        # All sections are looked up the same way, by their CSS class.
        return content.find(name="section", attrs={"class": css_class})

    image_section = find_section("JobImage_JobImage__OFUyr")
    header_section = find_section("JobHeader_className__HttDA")
    description_section = find_section("JobDescription_JobDescription__VWfcb")
    location_section = find_section("JobWorkPlace_className__ra6rp")

    record = {}
    record["images"] = get_image_list(image_section)
    record["job_name"] = get_job_name(header_section)
    record["company_name"] = get_company_name(header_section)
    record["tags"] = get_tags(header_section)
    record["job_description"] = get_job_description(description_section)
    record["location"] = get_location(location_section)
    return record

In [28]:
def extract_id_from_url(url : str):
    """Return the id part of a file path: the base name up to its first dot.

    Both ``\\`` and ``/`` are treated as path separators, so the result is
    the same whether the path comes from Windows or a POSIX system.
    Previously only ``\\`` was recognised, and a path such as
    ``wanted/123.html`` kept its directory prefix.

    Args:
        url: Path of a source file, e.g. ``wanted\\123.html``.

    Returns:
        The file name without directory and without anything after the
        first ``.``, e.g. ``"123"``.
    """
    # Normalise separators first so one rpartition handles either style.
    base_name = url.replace("\\", "/").rpartition("/")[-1]
    return base_name.partition(".")[0]

def get_file_name_from_id(id):
    """Build the output JSON file name for the given posting id."""
    file_name = f"wanted_parsed_json_{id}.json"
    return file_name

In [29]:
# Build the processing pipelines. map() is lazy, so nothing is read or parsed
# until the iterators are consumed.
#
# Each pipeline must consume its OWN directory iterator. Feeding both from the
# same one-shot iterator makes them steal entries from each other: the JSON of
# one file ends up paired with the name of the next, and only half the files
# are processed.
# NOTE(review): this relies on both iterator_dir_children(SOURCE_DIR) calls
# yielding the files in the same order — confirm in util.file_handler.

# Pipeline 1: file path -> soup -> dict of extracted fields.
iter_url_for_json = file_children_for_json
iter_soup = map(html_to_soup, iter_url_for_json)
iter_json = map(soup_to_json, iter_soup)

# Pipeline 2: file path -> posting id -> output file name.
iter_url_for_file_name = file_children_for_file_name
iter_id = map(extract_id_from_url, iter_url_for_file_name)
iter_file_name = map(get_file_name_from_id, iter_id)


In [30]:
# Save: pair each parsed record with its generated file name and write it
# under ROOT_DIR. The map() pipelines feeding zip() are lazy, so loading and
# parsing of each file actually happens here, during this loop.
# NOTE(review): the loop variable `json` has the same name as the stdlib
# `json` module and would shadow it if that module is imported elsewhere in
# the notebook — consider renaming.
for json, file_name in zip(iter_json, iter_file_name):
    saveAsJson(make_file_path(f"{ROOT_DIR}/{file_name}"), json)