In [1]:
import requests
from bs4 import BeautifulSoup as BS
import pandas as pd
#from tqdm import tqdm
import json
import time
import re
import datetime

In [5]:
# rocketpunch crawler
def rocketpunch_crawler(url, headers):
    session = requests.Session()
    res = session.get(url.format(1), headers=headers)
    res = json.loads(res.text)
    soup = BS(res['data']['template'], 'html.parser')

    page_size = soup.find('div', {'class': 'tablet computer large screen widescreen only'}).find_all('a', {'class': 'item'})[-1].text.strip()

    data_list = parse_page(soup)

    for i in range(2, int(page_size) + 1):
        res = session.get(url.format(i), headers=headers)
        res = json.loads(res.text)
        soup = BS(res['data']['template'], 'html.parser')
        data_list.extend(parse_page(soup))
        time.sleep(2) # for sake of politeness

    return data_list

# API 호출
# company_id, company_name, job_id, description, job_title, job_career
def parse_page(soup):
    data_list = []
    for company in soup.find_all('div', {'class': 'company item'}):
        company_data = {}
        company_data['company_id'] = company['data-company_id']
        for content in company.find_all('div', {'class': 'content'}):
            company_data['company_name'] = content.find('a', {'class': 'company-name nowrap header name'}).text.strip()
            company_data['description'] = content.find('div', {'class': 'description'}).text.strip()
            
            for job_detail in content.find_all('div', {'class': 'job-detail'}):
                job_data = company_data.copy()
                job_data['job_id'] = job_detail.find('a', {'class': 'nowrap job-title'})['href'].split('/')[2]
                job_data['job_title'] = job_detail.find('a', {'class': 'nowrap job-title'}).text.strip()
                job_data['job_career'] = job_detail.find('div', {'class': 'job-stat-info'}).text.strip().split(' / ')
                job_data['timestamp'] = datetime.datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
                job_data['crawl_domain'] = 'www.rocketpunch.com'
                data_list.append(job_data)
                
    return data_list


In [6]:

# 공고 크롤링
# job_task, job_detail, job_industry, job_specialties, date_start, date_end, timestamp
def parse_job_page(data, headers):
    job_url = 'https://www.rocketpunch.com/jobs/{}'
    session = requests.Session()
    pattern = re.compile('[ㄱ-ㅎ가-힣]+')

    for job in data:
        res = session.get(job_url.format(job['job_id']), headers=headers)
        soup = BS(res.text, 'html.parser')
        
        # 주요 업무(업무 내용) : job_task
        job_task_div = soup.find('div', class_='duty break')
        
        task_span_hidden = job_task_div.find('span', class_='hide full-text')
        task_span_short = job_task_div.find('span', class_='short-text') if not task_span_hidden else None
        task_span = task_span_hidden.text if task_span_hidden else (task_span_short.text if task_span_short else "")
        job['job_task'] = task_span.strip() if task_span else ""
        
        # 업무 기술/활동분야 : job_specialties
        specialties_raw = soup.find('div', class_='job-specialties')
        specialties = [a.text for a in specialties_raw.find_all('a')]
        job['job_specialties'] = ', '.join(specialties)
        
        # 채용 상세 : job_detail
        detail_div = soup.find('div', class_='content break')
        detail_span_hidden = detail_div.find('span', class_='hide full-text')
        detail_span_short = detail_div.find('span', class_='short-text') if not detail_span_hidden else None
        detail_span = detail_span_hidden.text if detail_span_hidden else detail_span_short.text
        job['job_detail'] = detail_span.strip() if detail_span else ""
        
        # 산업 분야 : job_industry
        industry_div = soup.find('div', class_='job-company-areas')
        industry_text = [a.text for a in industry_div.find_all('a')]
        job['job_industry'] = ', '.join(industry_text)
        
        # 채용 시작일/만료일 : date_start, date_end
        job_date = soup.find('div', class_='job-dates')
        date_span = job_date.find_all('span')
        
        #수시채용, 상시채용 예외처리
        if any(pattern.search(span.text) for span in date_span):
            job['date_start'] = datetime.datetime.now().strftime('%Y-%m-%d')
            job['date_end'] = None
            
        else:
            if len(date_span) > 1:
                job['date_start'] = datetime.datetime.strptime(date_span[0].text.strip(), '%Y.%m.%d').date()
                job['date_end'] = datetime.datetime.strptime(date_span[1].text.strip(), '%Y.%m.%d').date()
            elif len(date_span) == 1:
                job['date_start'] = datetime.datetime.strptime(date_span[0].text.strip(), '%Y.%m.%d').date()

    return data

In [7]:
if __name__ == "__main__" :
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    }
    url = 'https://www.rocketpunch.com/api/jobs/template?page={}'
    data_dic = rocketpunch_crawler(url, headers)
    detailed_data = parse_job_page(data_dic, headers)

AttributeError: 'NoneType' object has no attribute 'text'

# 수정 코드

In [10]:
def parse_job_page(data, headers):
    job_url = 'https://www.rocketpunch.com/jobs/{}'
    session = requests.Session()
    pattern = re.compile('[ㄱ-ㅎ가-힣]+')

    for job in data:
        res = session.get(job_url.format(job['job_id']), headers=headers)
        soup = BS(res.text, 'html.parser')
        
        # 주요 업무(업무 내용) : job_task
        job_task_div = soup.find('div', class_='duty break')
        task_span_hidden = job_task_div.find('span', class_='hide full-text') if job_task_div else None
        task_span_short = job_task_div.find('span', class_='short-text') if job_task_div and not task_span_hidden else None
        task_span = task_span_hidden.text if task_span_hidden else (task_span_short.text if task_span_short else "")
        job['job_task'] = task_span.strip() if task_span else ""
        
        # 업무 기술/활동분야 : job_specialties
        specialties_raw = soup.find('div', class_='job-specialties')
        specialties = [a.text for a in specialties_raw.find_all('a')] if specialties_raw else []
        job['job_specialties'] = ', '.join(specialties)
        
        # 채용 상세 : job_detail
        detail_div = soup.find('div', class_='content break')
        detail_span_hidden = detail_div.find('span', class_='hide full-text') if detail_div else None
        detail_span_short = detail_div.find('span', class_='short-text') if detail_div and not detail_span_hidden else None
        detail_span = detail_span_hidden.text if detail_span_hidden else (detail_span_short.text if detail_span_short else "")
        job['job_detail'] = detail_span.strip() if detail_span else ""
        
        # 산업 분야 : job_industry
        industry_div = soup.find('div', class_='job-company-areas')
        industry_text = [a.text for a in industry_div.find_all('a')] if industry_div else []
        job['job_industry'] = ', '.join(industry_text)
        
        # 채용 시작일/만료일 : date_start, date_end
        job_date = soup.find('div', class_='job-dates')
        date_span = job_date.find_all('span') if job_date else []
        
        # 수시채용, 상시채용 예외처리
        if any(pattern.search(span.text) for span in date_span):
            job['date_start'] = datetime.datetime.now().strftime('%Y-%m-%d')
            job['date_end'] = None
        else:
            if len(date_span) > 1:
                job['date_start'] = datetime.datetime.strptime(date_span[0].text.strip(), '%Y.%m.%d').date()
                job['date_end'] = datetime.datetime.strptime(date_span[1].text.strip(), '%Y.%m.%d').date()
            elif len(date_span) == 1:
                job['date_start'] = datetime.datetime.strptime(date_span[0].text.strip(), '%Y.%m.%d').date()

    return data


In [11]:
if __name__ == "__main__" :
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    }
    url = 'https://www.rocketpunch.com/api/jobs/template?page={}'
    data_dic = rocketpunch_crawler(url, headers)
    detailed_data = parse_job_page(data_dic, headers)

### 수정사항 
parse_job_page(data, headers) 함수안의 변수마다 예외처리 코드 추가

In [13]:
import pandas as pd

pd.DataFrame(data_list).head(3)

Unnamed: 0,company_id,company_name,description,job_id,job_title,job_career,timestamp,crawl_domain,job_task,job_specialties,job_detail,job_industry,date_start,date_end
0,6311,다날,데이터베이스 및 온라인정보 제공업체,151507,다날 결제시스템 BE개발 경력 채용,[경력],2024-08-08_03:08:55,www.rocketpunch.com,,"Java, Java Spring Framework, Spring Boot, spri...","[담당업무]- 결제 내/외부 연계 시스템 개발 및 운영 [지원자격]- JAVA, 객...","모바일게임, 메신저, 휴대폰결제, 통화연결음",2024-08-08,
1,6311,다날,데이터베이스 및 온라인정보 제공업체,151190,다날 결제시스템 서버 개발자 채용,[경력],2024-08-08_03:08:55,www.rocketpunch.com,,"백엔드 개발, Java, Sping framework, JPA, Gradle, EL...",[담당업무]- 안정적인 시스템 제공을 위해 레거시를 신규 프로젝트로 전환하는 등 여...,"모바일게임, 메신저, 휴대폰결제, 통화연결음",2024-08-08,
2,6311,다날,데이터베이스 및 온라인정보 제공업체,151200,다날 정보시스템 개발 경력 채용,[경력],2024-08-08_03:08:55,www.rocketpunch.com,,"웹 백엔드 개발(Java/Spring/Oracle/MySQL), mssql, SQL...",[담당업무] - 정산 시스템 개발 및 정산 자동화 개발 - 정산/ 매출/ 회계/ 대...,"모바일게임, 메신저, 휴대폰결제, 통화연결음",2024-08-08,


In [18]:
test = '상시채용'
pattern = re.compile('[ㄱ-ㅎ가-힣]+')
if pattern.search(test):
    print('true')

true


In [None]:
rocketpunch_list = rocketpunch_crawler()

In [22]:
rocketpunch_list.__len__()

148

In [23]:
import pandas as pd

In [24]:
df=pd.DataFrame(rocketpunch_list)

In [25]:
df.head(3)

Unnamed: 0,company_id,company_name,description,job_list,job_id,job_title,job_info,job_date
0,6532,마카롱팩토리,대한민국 1등 운전자 차량관리 필수앱 - 마이클,"[{'company_id': '6532', 'company_name': '마카롱팩토...",151375,프론트엔드 개발 (5년 이상),[경력],09/30 마감
1,6311,다날,데이터베이스 및 온라인정보 제공업체,"[{'company_id': '6311', 'company_name': '다날', ...",151201,다날 테스트 자동화 개발 담당 경력 채용,[경력],09/29 마감
2,160928,에이치디메디,쉽고 편한 의료 서비스 i약,"[{'company_id': '160928', 'company_name': '에이치...",151453,i약 SaaS 서버 엔지니어(Java/SpringBoot),"[5,000 - 8,000만원, 0.5% - 3.0%, 경력]",08/29 마감


In [26]:
# 유료광고와 겹치는지 확인용 -> 유료공고와 겹침으로 유료쪽 광고 크롤링x
df.loc[df['company_name'].isin(['제이제이앤컴퍼니스'])]

Unnamed: 0,company_id,company_name,description,job_list,job_id,job_title,job_info,job_date
64,203363,제이제이앤컴퍼니스,인공지능과 공정제어 기반의 해양 엔지니어링 전문 기업,"[{'company_id': '203363', 'company_name': '제이제...",151418,백엔드 엔지니어 정규직 채용 공고,[경력],08/23 마감


In [18]:
# 공고 크롤링
def parse_job_page():
    #job_url = 'https://www.rocketpunch.com/jobs/151267'
    job_url = 'https://www.rocketpunch.com/jobs/150226'
    session = requests.Session()
    #print(data['job_id'])
    #res = session.get(job_url.format(data['job_id']), headers=headers)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    }
    res = session.get(job_url, headers=headers)
    soup = BS(res.text, 'html.parser')
    
    div = soup.find('div', class_='duty break')
    #print(div)
    if div:
      span_hidden = div.find('span', class_='hide full-text')
      span_short = div.find('span', class_='short-text') if not span_hidden else None
      span = span_hidden.text if span_hidden else span_short
    #print(span)
    
    pattern = re.compile('[ㄱ-힣]+')
    job_date = soup.find('div', class_='job-dates')
    date_span = job_date.find_all('span')
    only_date_span = [re.sub(pattern, '', span.get_text(strip=True)) for span in date_span]
    
    print(only_date_span)
    
    current_year = datetime.datetime.now().year
    
    for mmdd in only_date_span:
      
      if mmdd =="" :
        print(mmdd)        
      else:
        mmdd = mmdd.strip()
        date_obj = datetime.datetime.strptime(f'{current_year}/{mmdd}', '%Y/%m/%d')
        formatted_date = date_obj.strftime('%Y.%m.%d')

      print(formatted_date)
    
    # if (span.isnull() for span in only_date_span) == True:
    #   date_end = datetime.datetime.strptime(only_date_span, '%Y.%m.%d').date()
    #   date_start = None
    # else:
    #   if len(date_span) > 1:
    #     date_end = datetime.datetime.strptime(only_date_span[0].strip(), '%Y.%m.%d').date()
    #     date_start = datetime.datetime.strptime(only_date_span[1].strip(), '%Y.%m.%d').date()
    #   elif len(date_span) == 1:
    #     date_end = datetime.datetime.strptime(only_date_span[0].strip(), '%Y.%m.%d').date()
    
    #print(date_start)
    #print(date_end)
    
    # for i, date in enumerate(date_span):
    #   text = date.get_text(strip=True)
    #   date_only = text.split()[0]
      
    #   year = datetime.datetime.now().year
    #   mon = date_only.split('/')[0]
    #   day = date_only.split('/')[1]
      
    #   if i == 0 :
    #     date_start = datetime.datetime.strptime(f'{year}-{mon}-{day}', '%Y-%m-%d').date()
    #     print(date_start)
    #   else:
    #     date_end = datetime.datetime.strptime(f'{year}-{mon}-{day}', '%Y-%m-%d').date()
    #     print(date_end)
      
      

In [19]:
parse_job_page()

['08/31 ', '07/31 ']
2024.08.31
2024.07.31


'2024-08-07'

In [30]:
import requests
from bs4 import BeautifulSoup as bs
import re

In [36]:
r=requests.get("https://www.rocketpunch.com/jobs/151583")
rbs=bs(r.text, 'html.parser')
#find_all("div", class_="ui job-infoset-content items")
item=rbs.find('div', class_='title', text='마감일').find_next_sibling('div').get_text(strip=True)
pattern = re.compile(r'\d{4}-\d{2}-\d{2}')
if pattern.match(item):
    print("true")
#print(item)
register_date = rbs.find('div', class_='title', text='등록일')
if not register_date :
    register_date = rbs.find('div', class_='title', text='수정일').find_next_sibling('div').get_text(strip=True)
print(register_date)
#item_ch=item.find_parent('div', class_='item').find('div', class_='content').get_text(strip=True)
#print(item_ch)
# date=item.bs.find_all('div', class_='title', text='마감일').find_next_sibling('div').get_text(strip=True)

true
2024-08-19


  item=rbs.find('div', class_='title', text='마감일').find_next_sibling('div').get_text(strip=True)
  register_date = rbs.find('div', class_='title', text='등록일')
  register_date = rbs.find('div', class_='title', text='수정일').find_next_sibling('div').get_text(strip=True)
