# Python Scrap

In [1]:
import requests

In [2]:
import csv

In [3]:
from bs4 import BeautifulSoup as bs

## Indeed Job Page Crawling

In [4]:
# Value sent as the `limit` query parameter of the Indeed search URL;
# also used as the step for the `start` offset when paging through results.
LIMIT = 50
# Search keyword interpolated into the job-search URLs.
Q = "python"
# Indeed (kr.indeed.com) search URL for keyword Q with LIMIT as the `limit` parameter.
INDEED_URL = f"https://kr.indeed.com/jobs?q={Q}&limit={LIMIT}"

In [5]:
def get_indeed_last_page():
    """Return the highest page number listed in Indeed's pagination bar.

    Fetches INDEED_URL, parses the response and collects the numeric labels
    of the links inside ``div.pagination``.

    Returns:
        int: The largest numeric page label found, or 1 when the page has no
        pagination bar or no numeric page links (a single page of results).
    """
    response = requests.get(INDEED_URL)
    soup = bs(response.text, 'html.parser')

    pagination = soup.find("div", {"class": "pagination"})
    if pagination is None:
        # No pagination bar: treat the results as a single page.
        return 1

    page_numbers = []
    for link in pagination.find_all('a'):
        label = link.get_text(strip=True)
        # Keep only numeric labels instead of assuming the last link is
        # "Next": on the final page that link is not guaranteed to exist.
        if label.isdecimal():
            page_numbers.append(int(label))

    return max(page_numbers, default=1)

In [6]:
def extract_ineed_job(html):
    """Build a job record from one Indeed result card.

    Args:
        html: Parsed element for a single result card. It must contain an
            ``h2.title`` whose link has a ``title`` attribute, a
            ``div.recJobLoc`` with a ``data-rc-loc`` attribute, and carry a
            ``data-jk`` attribute itself. A ``span.company`` is optional.

    Returns:
        dict: ``title``, ``company``, ``location`` and ``link`` strings.
        ``company`` is an empty string when the card has no company span.
    """
    title = html.find("h2", {"class": "title"}).find("a")["title"]

    company = ""
    span_company = html.find("span", {"class": "company"})
    if span_company is not None:
        # Prefer the linked company name; otherwise use the span's own text.
        a_company = span_company.find("a")
        company_node = span_company if a_company is None else a_company
        # get_text() never returns None, so nested markup can no longer
        # produce the literal string "None" the way str(node.string) did.
        company = company_node.get_text(strip=True)

    location = html.find("div", {"class": "recJobLoc"})["data-rc-loc"]
    job_id = html["data-jk"]

    return {
        "title": title,
        "company": company,
        "location": location,
        "link": f"https://kr.indeed.com/viewjob?jk={job_id}",
    }

In [7]:
def extract_ineed_jobs(last_page):
    """Scrape every Indeed result page and collect the job records.

    Args:
        last_page: Number of result pages to fetch; pages are requested with
            ``start`` offsets 0, LIMIT, 2 * LIMIT, ...

    Returns:
        list: One dict per result card, as produced by extract_ineed_job.
        Empty when ``last_page`` is 0.
    """
    jobs = []
    for page in range(last_page):
        print(f" scrapping {page}")

        # The request and parsing must happen inside the loop; otherwise only
        # the final page is fetched.
        # e.g. https://kr.indeed.com/jobs?q=python&limit=50&start=0
        response = requests.get(f"{INDEED_URL}&start={page * LIMIT}")
        soup = bs(response.text, 'html.parser')
        cards = soup.find_all("div", {"class": "jobsearch-SerpJobCard"})

        jobs.extend(extract_ineed_job(card) for card in cards)
    return jobs

In [8]:
def get_indeed_jobs():
    """Return the Indeed job records: look up the page count, then scrape."""
    page_count = get_indeed_last_page()
    return extract_ineed_jobs(page_count)

## Stack Overflow Job Page Crawling

In [9]:
# Stack Overflow Jobs search URL for keyword Q (passes `sort=i` as the sort parameter).
STACK_OF_FLOW_URL = f"https://stackoverflow.com/jobs?q={Q}&sort=i"

In [10]:
def get_stack_last_page():
    """Return the highest page number listed in Stack Overflow's pagination.

    Fetches STACK_OF_FLOW_URL, parses the response and collects the numeric
    labels of the links inside ``div.s-pagination``.

    Returns:
        int: The largest numeric page label found, or 1 when the page has no
        pagination block or no numeric page links.
    """
    response = requests.get(STACK_OF_FLOW_URL)
    soup = bs(response.text, "html.parser")

    pagination = soup.find("div", {"class": "s-pagination"})
    if pagination is None:
        # No pagination block: treat the results as a single page.
        return 1

    labels = (link.get_text(strip=True) for link in pagination.find_all("a"))
    # Take the maximum numeric label rather than indexing pages[-2], which
    # breaks when there are fewer than two links or no trailing "next" link.
    page_numbers = [int(label) for label in labels if label.isdecimal()]
    return max(page_numbers, default=1)

In [11]:
def extract_stack_job(html):
    """Build a job record from one Stack Overflow result element.

    Args:
        html: Parsed element for a single job listing; it carries a
            ``data-jobid`` attribute and contains a ``div.grid--cell.fl1``
            holding the title heading and the company/location heading.

    Returns:
        dict: ``title``, ``company``, ``location`` and ``link`` strings.
    """
    body = html.find("div", {"class": "grid--cell fl1"})

    # The job title is stored in the "title" attribute of the heading link.
    title = body.find("h2", {"class": "fs-body3"}).find("a")["title"]

    # The h3 must have exactly two direct child spans: company, then location.
    heading = body.find("h3", {"class": "fs-body1"})
    company_span, location_span = heading.find_all("span", recursive=False)

    return {
        "title": title,
        "company": company_span.get_text(strip=True),
        "location": location_span.get_text(strip=True),
        "link": f"https://stackoverflow.com/jobs/{html['data-jobid']}",
    }

In [12]:
def extract_stack_jobs(last_page):
    """Scrape Stack Overflow result pages 1..last_page and collect job records.

    Args:
        last_page: Number of result pages to request.

    Returns:
        list: One dict per listing, as produced by extract_stack_job.
    """
    collected = []
    # The `pg` query parameter is 1-based, while the progress message is 0-based.
    for page_number in range(1, last_page + 1):
        print(f" scrapping stack_of_flow page: {page_number - 1}")
        response = requests.get(f"{STACK_OF_FLOW_URL}&pg={page_number}")
        soup = bs(response.text, "html.parser")
        listings = soup.find_all("div", {"class": "-job"})
        collected += [extract_stack_job(listing) for listing in listings]
    return collected

In [13]:
def get_stack_jobs():
    """Return the Stack Overflow job records: look up the page count, then scrape."""
    page_count = get_stack_last_page()
    return extract_stack_jobs(page_count)

## Job Crawling

In [14]:
def save_to_file(jobs, path="jobs.csv"):
    """Write job records to a UTF-8 CSV file with a header row.

    Args:
        jobs: Iterable of dicts with the keys ``title``, ``company``,
            ``location`` and ``link``.
        path: Destination file; defaults to ``jobs.csv`` in the current
            working directory. An existing file is overwritten.
    """
    header = ["title", "company", "location", "link"]
    # The context manager guarantees the file is flushed and closed, even if
    # writing a row fails part-way through.
    with open(path, mode="w", encoding='utf-8', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(header)
        for job in jobs:
            # Select values by column name so each row lines up with the
            # header regardless of the dict's key order.
            writer.writerow([job.get(column, "") for column in header])

In [15]:
# Scrape both sources, then merge them into one list (Indeed records first).
indeed = get_indeed_jobs()
stack = get_stack_jobs()
all_jobs = [*indeed, *stack]

print(f" Scraping total data: {len(all_jobs)}")

 scrapping 0
 scrapping 1
 scrapping 2
 scrapping 3
 scrapping 4
 scrapping 5
 scrapping 6
 scrapping 7
 scrapping 8
 scrapping 9
 scrapping 10
 scrapping 11
 scrapping 12
 scrapping stack_of_flow page: 0
 scrapping stack_of_flow page: 1
 scrapping stack_of_flow page: 2
 scrapping stack_of_flow page: 3
 scrapping stack_of_flow page: 4
 scrapping stack_of_flow page: 5
 scrapping stack_of_flow page: 6
 scrapping stack_of_flow page: 7
 scrapping stack_of_flow page: 8
 scrapping stack_of_flow page: 9
 scrapping stack_of_flow page: 10
 scrapping stack_of_flow page: 11
 scrapping stack_of_flow page: 12
 scrapping stack_of_flow page: 13
 scrapping stack_of_flow page: 14
 scrapping stack_of_flow page: 15
 scrapping stack_of_flow page: 16
 scrapping stack_of_flow page: 17
 scrapping stack_of_flow page: 18
 scrapping stack_of_flow page: 19
 scrapping stack_of_flow page: 20
 scrapping stack_of_flow page: 21
 scrapping stack_of_flow page: 22
 scrapping stack_of_flow page: 23
 scrapping stack_of_fl

## Save CSV File

In [16]:
# Write the combined job list to a CSV file via save_to_file.
save_to_file(all_jobs)