# Python Web Crawling

In [1]:
import csv
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
from urllib import parse

In [2]:
OK_URL = "https://remoteok.io"
STACK_OF_FLOW_URL = f"https://stackoverflow.com/jobs?sort=i"
WEWORK_URL = "https://weworkremotely.com"

## remoeok Web Crawling

In [3]:
# job 만들기
def extract_ok_job(tr_job):
    title = tr_job.find("td", {"class": "company_and_position"}).find("h2").get_text(strip=True)
    company = tr_job['data-company']
    link = tr_job['data-url']
    
    # job dict 만들기
    job = {
        "link": f"{OK_URL}/{link}",
        "title": title,
        "company": company
    }
    return job

# job List 가져오기
def get_ok_jobs(word):
    OK_JOB_URL = f"{OK_URL}/remote-{word}-jobs"
    URL = parse.urlparse(OK_JOB_URL)
    print(URL.geturl())
    get_request = requests.get(URL.geturl())
    
    if get_request.status_code == 200:
        html_parse = bs(get_request.text, "html.parser")
        container = html_parse.find("div", {"class": "container"})
        table = container.find("table", {"id": "jobsboard"})
    
        tr_jobs = table.find_all("tr", {"class", "job"})
        jobs = []
        # job List만들기
        for tr_job in tr_jobs:
            job = extract_ok_job(tr_job)
            jobs.append(job)

    return jobs

# remoteok 사이트 Crawling
def get_remoteok_jobs(word):
    print(f"renmoteok scrapper...")
    jobs = get_ok_jobs(word)
    return jobs

## Stack of Flow Web Crawling

In [4]:
# 페이징을 통한 큰 데이터 가져오기
def get_stack_last_page(word):
    try:
        request = requests.get(f"{STACK_OF_FLOW_URL}&q={word}")
        html_parse = bs(request.text, "html.parser")
        # pagination
        pagination = html_parse.find("div", {"class": "s-pagination"})
        pages = pagination.find_all("a")
        # next 제거후 마지막 page
        last_page = pages[-2].get_text(strip=True)
    except AttributeError:
        return 0
    return int(last_page)

# job 만들기
def extract_stack_job(html):
  
    div = html.find("div", {"class": "grid--cell fl1"})
    # title  
    title = div.find("h2").find("a")["title"]
    # company, location
    company, location = div.find("h3", {"class": "fs-body1"}).find_all("span", recursive=False)
    company = company.get_text(strip=True)
  
    # link
    job_id = html['data-jobid']

    return {
        "link": f"https://stackoverflow.com/jobs/{job_id}",
        "title": title, 
        "company": company
    }

# job List 가져오기
def get_stack_jobs(word, last_page):
    jobs = []
    for page in range(last_page):
        STACK_URL = f"{STACK_OF_FLOW_URL}&q={word}&pg={page + 1}"
        URL = parse.urlparse(STACK_URL)
        print(URL.geturl())
        
        get_request = requests.get(URL.geturl())
        html_parse = bs(get_request.text, "html.parser")
        results = html_parse.find_all("div", {"class": "-job"})
        
        for result in results:
            job = extract_stack_job(result)
            jobs.append(job)

    return jobs

# stack_of_flow 사이트 Crawling
def get_stack_of_flow_jobs(word):
    print(f"stack_of_flow scrapper...")
    last_page = get_stack_last_page(word)
    if last_page == 0:
        return
    jobs = get_stack_jobs(word, last_page)
    return jobs

## weworkremotely Web Crawling

In [5]:
# job 가져오기
def extract_wework_job(job_section):
    article = job_section.find("article")
    if article:
        job_list = article.find("ul").find_all("li")[:-1]
        jobs = []
            # li tags
        for job in job_list:
            a_section = job.select_one("li > a")
            if a_section:
                link = a_section['href'].strip()
                title = a_section.find("span", {"class": "title"}).get_text(strip=True)
                try:
                    (company, _, _,) = a_section.find_all("span", {"class": "company"})
                except ValueError:
                    (company, _,) = a_section.find_all("span", {"class": "company"})
          
                company = company.get_text(strip=True)

                cleaned_job = {
                    "link": f"{WEWORK_URL}/{link}",
                    "title": title,
                    "company": company
                }
                jobs.append(cleaned_job)

    return jobs

# job List 가져오기
def get_wework_jobs(word):
    WE_JOB_URL = f"{WEWORK_URL}/remote-jobs/search?term={word}"
    URL = parse.urlparse(WE_JOB_URL)
    print(URL.geturl())
    get_request = requests.get(URL.geturl())
    if get_request.status_code == 200:
        html_parse = bs(get_request.text, "html.parser")
        # jobs-container
        jobs_container = html_parse.find("div", {"class": "jobs-container"})
        # jobs
        job_section_list = jobs_container.find_all("section", {"class": "jobs"})

        aggregated_wework = []
        # section tags
        for job_section in job_section_list:
            job_list = extract_wework_job(job_section);
            aggregated_wework = aggregated_wework + job_list

        return aggregated_wework

# wework 사이트 Crawling
def get_weworkremotely_jobs(word):
    """ weworkremotely site scrapper """
    print(f"weworkremotely scrapper...")
    jobs = get_wework_jobs(word)

    return jobs

## Aggregate Web Crawling 

In [6]:
word = "python"
we = get_weworkremotely_jobs(word)
ok = get_remoteok_jobs(word)
stack = get_stack_of_flow_jobs(word)
jobs = we + ok + stack

weworkremotely scrapper...
https://weworkremotely.com/remote-jobs/search?term=python
renmoteok scrapper...
https://remoteok.io/remote-python-jobs
stack_of_flow scrapper...
https://stackoverflow.com/jobs?sort=i&q=python&pg=1
https://stackoverflow.com/jobs?sort=i&q=python&pg=2
https://stackoverflow.com/jobs?sort=i&q=python&pg=3
https://stackoverflow.com/jobs?sort=i&q=python&pg=4
https://stackoverflow.com/jobs?sort=i&q=python&pg=5
https://stackoverflow.com/jobs?sort=i&q=python&pg=6
https://stackoverflow.com/jobs?sort=i&q=python&pg=7
https://stackoverflow.com/jobs?sort=i&q=python&pg=8
https://stackoverflow.com/jobs?sort=i&q=python&pg=9
https://stackoverflow.com/jobs?sort=i&q=python&pg=10
https://stackoverflow.com/jobs?sort=i&q=python&pg=11
https://stackoverflow.com/jobs?sort=i&q=python&pg=12
https://stackoverflow.com/jobs?sort=i&q=python&pg=13
https://stackoverflow.com/jobs?sort=i&q=python&pg=14
https://stackoverflow.com/jobs?sort=i&q=python&pg=15
https://stackoverflow.com/jobs?sort=i&q=py

https://stackoverflow.com/jobs?sort=i&q=python&pg=152
https://stackoverflow.com/jobs?sort=i&q=python&pg=153
https://stackoverflow.com/jobs?sort=i&q=python&pg=154
https://stackoverflow.com/jobs?sort=i&q=python&pg=155
https://stackoverflow.com/jobs?sort=i&q=python&pg=156
https://stackoverflow.com/jobs?sort=i&q=python&pg=157
https://stackoverflow.com/jobs?sort=i&q=python&pg=158
https://stackoverflow.com/jobs?sort=i&q=python&pg=159
https://stackoverflow.com/jobs?sort=i&q=python&pg=160
https://stackoverflow.com/jobs?sort=i&q=python&pg=161
https://stackoverflow.com/jobs?sort=i&q=python&pg=162
https://stackoverflow.com/jobs?sort=i&q=python&pg=163


## Save CSV File

In [7]:
def save_to_file(term, jobs):
    file = open(f"csv_file/{term}.csv", mode="w", encoding='UTF-8', newline='')
    write = csv.writer(file)
    write.writerow([*jobs[0].keys()])
    for job in jobs:
        write.writerow([*job.values()])
    file.close()

In [8]:
save_to_file(word, jobs)

## Read CSV File

In [9]:
df = pd.read_csv(f"csv_file/{word}.csv", low_memory=False, encoding='UTF-8')
df

Unnamed: 0,link,title,company
0,https://weworkremotely.com//remote-jobs/good-e...,Senior Software Engineer - Mobile,Good Eggs
1,https://weworkremotely.com//remote-jobs/qualio...,Senior Software Developer,Qualio
2,https://weworkremotely.com//remote-jobs/mokriy...,Senior Scala Engineer,Mokriya Inc
3,https://weworkremotely.com//remote-jobs/fyi-se...,Senior Integrations Engineer,FYI
4,https://weworkremotely.com//remote-jobs/clickf...,Senior Fullstack Engineer (VueJS + Rails API /...,ClickFlow
...,...,...,...
4300,https://stackoverflow.com/jobs/286010,C++ Embedded Engineer,OSPIN GmbH
4301,https://stackoverflow.com/jobs/356110,Teamlead Frontend Engineering,Make.TV
4302,https://stackoverflow.com/jobs/369459,Full Stack -kehittäjä Sitowisen Smart City -ra...,Sitowise
4303,https://stackoverflow.com/jobs/377821,Information Security Professional Lead Analyst,"Citibank, N.A.viaPandoLogic"
