# Items.py

In [14]:
import scrapy               

'''pip install Scrapy'''
'''Run `scrapy startproject jobscrapper` on the command line to create a new project'''
class JobOpeningItem(scrapy.Item):
    """Container for one scraped job opening."""

    # Declared fields of a job opening.
    id = scrapy.Field()
    title = scrapy.Field()
    company = scrapy.Field()
    department = scrapy.Field()
    location = scrapy.Field()
    description = scrapy.Field()
    requirements = scrapy.Field()
    link = scrapy.Field()

    def __repr__(self):
        """Return a short representation that shows only the title.

        ``get`` is used instead of indexing so that an item whose title was
        never set can still be printed rather than raising ``KeyError``.
        """
        return repr({"title": self.get('title')})

# recruitee_v1.py

In [17]:
import jobscrapper

'''pip install jobs-scrapper'''

class RecruiteeSpider(scrapy.Spider):
    """First version of the Recruitee spider: follow every job link.

    NOTE(review): ``parse`` schedules ``self.parse_job`` as the callback, but
    this version of the class does not define ``parse_job``; it has to be
    added before the spider can process a listing page that contains jobs.
    """

    name = "recruitee"

    start_urls = [
        # Replace with your list of recruitee-backed job pages
        'https://someone.recruitee.com/',
    ]

    def parse(self, response):
        """Yield one request per job linked from a listing page.

        Args:
            response: The downloaded listing page.

        Yields:
            A ``scrapy.Request`` for each job card that has a title link.
        """
        for job in response.css('div.job'):
            href = job.css('h5.job-title a::attr(href)').get()
            if not href:
                # A card without a title link cannot be followed; skip it
                # instead of failing the whole page with an IndexError.
                continue
            yield scrapy.Request(url=response.urljoin(href),
                                 callback=self.parse_job)

# recruitee.py

In [18]:
import scrapy
import jobscrapper

class RecruiteeSpider(scrapy.Spider):
    """Scrape job openings from Recruitee-hosted career pages.

    ``parse`` walks a listing page and schedules one request per job;
    ``parse_job`` turns each job page into a ``JobOpeningItem``.
    """

    name = "recruitee"

    start_urls = [
        # Replace with your list of recruitee-backed job pages
        'https://someone.recruitee.com/',
    ]

    def parse(self, response):
        """Yield one job-page request per job card on a listing page.

        The company name, department and location are read from the listing
        page and handed to ``parse_job`` through the request ``meta``.

        Args:
            response: The downloaded listing page.

        Yields:
            A ``scrapy.Request`` for each job card that has a title link.
        """
        # Imported here so the method does not depend on a file-level import.
        from urllib.parse import urlparse

        # The company is the first label of the host name. Parsing the URL
        # avoids assuming the scheme prefix is exactly eight characters long.
        company = urlparse(response.url).netloc.split('.')[0]
        for job in response.css('div.job'):
            href = job.css('h5.job-title a::attr(href)').get()
            if not href:
                # A card without a title link cannot be followed; skip it
                # instead of failing the whole page with an IndexError.
                continue
            yield scrapy.Request(
                url=response.urljoin(href),
                meta={
                    'company': company,
                    'department': job.css('div.department::text').get(),
                    'location': job.css('li.job-location::text').get(),
                },
                callback=self.parse_job,
            )

    def parse_job(self, response):
        """Build a ``JobOpeningItem`` from a single job page.

        Args:
            response: The downloaded job page; its ``meta`` carries the
                values collected by ``parse``.

        Yields:
            One ``JobOpeningItem``. ``description`` and ``requirements`` are
            empty strings when the corresponding section is not on the page.
        """
        sections = response.xpath('//div[@class="description"]')
        # NOTE(review): ``parse`` never puts an 'id' into meta, so indexing
        # meta['id'] always raised KeyError. The last path segment of the job
        # URL is used as a fallback identifier - confirm this is the id wanted.
        job_id = (response.meta.get('id')
                  or response.url.rstrip('/').rsplit('/', 1)[-1])
        yield JobOpeningItem(
            id=job_id,
            title=response.css('h2.title::text').get(),
            company=response.meta['company'],
            department=response.meta['department'],
            location=response.meta['location'],
            description=self._section_html(sections, 0),
            requirements=self._section_html(sections, 1),
            link=response.url
        )

    @staticmethod
    def _section_html(sections, index):
        """Join the child elements of ``sections[index]``, or return ''."""
        if index >= len(sections):
            return ''
        return '\n'.join(sections[index].xpath('./*').getall())

# workable_v1.py

In [20]:
import scrapy
import scrapy.http

from urllib.parse import urljoin

import jobscrapper

# Base URL that the two endpoint prefixes below are derived from.
WORKABLE_API_URL = 'https://careers-page.workable.com'
# v3 accounts prefix: used in this file for the job-listing requests.
WORKABLE_ACCOUNTS_API_URL = f'{WORKABLE_API_URL}/api/v3/accounts'
# v2 accounts prefix: used in this file for the per-job detail requests.
WORKABLE_JOB_API_URL = f'{WORKABLE_API_URL}/api/v2/accounts'


class WorkableSpider(scrapy.Spider):
    """Scrape job openings for companies hosted on Workable.

    NOTE(review): ``parse`` schedules ``self.parse_job`` as the callback, but
    ``parse_job`` is not defined inside this class body - confirm it is added
    to the class where the spider is assembled.
    """

    name = "workable"
    start_urls = [
        # Replace with your list of URLS
        'https://apply.workable.com/someone/',
    ]

    def start_requests(self):
        """Yield the first job-listing request for every start URL.

        Yields:
            A POST ``scrapy.Request`` to the accounts endpoint of each
            company, carrying the normalized start URL as ``base_url``.
        """
        for url in self.start_urls:
            # Make the trailing slash explicit: both the company lookup below
            # and any later urljoin() against base_url rely on it, and a URL
            # entered without it would otherwise resolve to the host name.
            base_url = url if url.endswith('/') else f'{url}/'
            company = base_url.split('/')[-2]
            yield scrapy.Request(url=f'{WORKABLE_ACCOUNTS_API_URL}/{company}/jobs',
                                 meta={'base_url': base_url},
                                 method='POST',
                                 callback=self.parse)

    def parse(self, response):
        """Yield one job-detail request per entry in the listing response.

        Args:
            response: JSON response of the job-listing request.

        Yields:
            A ``scrapy.Request`` per job whose ``meta`` merges the job entry,
            the incoming ``meta`` and the company name.
        """
        # The listing URL ends in ".../<company>/jobs".
        company = response.url.split("/")[-2]
        data = response.json()
        for job in data.get('results', []):
            yield scrapy.Request(url=f'{WORKABLE_JOB_API_URL}/{company}/jobs/{job["shortcode"]}',
                                 meta={
                                     **job,
                                     **response.meta,
                                     'company': company
                                 },
                                 callback=self.parse_job)

# workable_v2.py

In [32]:
def parse_job(self, response):
        """Build a ``JobOpeningItem`` from a job-detail JSON response.

        Args:
            response: JSON response for one job; its ``meta`` must contain
                ``company``, ``base_url`` and ``shortcode``.

        Yields:
            One ``JobOpeningItem``. ``department`` is the first entry of the
            payload's ``department`` value, or '' when that value is empty.
        """
        job = response.json()
        departments = job.get('department')
        # Covers both a missing 'location' key and an explicit null value;
        # a ``{}`` default alone does not help when the key holds None.
        location = job.get('location') or {}
        yield JobOpeningItem(
            title=job.get('title'),
            company=response.meta['company'],
            department=departments[0] if departments else '',
            location=location.get('city'),
            description=job.get('description'),
            requirements=job.get('requirements'),
            link=urljoin(response.meta['base_url'], f'j/{response.meta["shortcode"]}')
        )

# workable_parse.py

In [21]:
 def parse(self, response):
        """Schedule job-detail requests and, if present, the next listing page.

        Yields one ``scrapy.Request`` per entry under ``results`` and then,
        when the payload has a truthy ``nextPage`` token, one ``JsonRequest``
        that posts that token back to the listing endpoint.
        """
        # The listing URL ends in ".../<company>/jobs".
        company = response.url.split("/")[-2]
        payload = response.json()

        for posting in payload.get('results', []):
            detail_url = f'{WORKABLE_JOB_API_URL}/{company}/jobs/{posting["shortcode"]}'
            # Later sources win: incoming meta overrides the job entry, and
            # the company name overrides both.
            detail_meta = dict(posting)
            detail_meta.update(response.meta)
            detail_meta['company'] = company
            yield scrapy.Request(url=detail_url,
                                 meta=detail_meta,
                                 callback=self.parse_job)

        # Handle paging
        if not payload.get('nextPage'):
            return

        paging_body = {
            "token": payload['nextPage'],
            "query": "",
            "location": [],
            "department": [],
            "worktype": [],
            "remote": [],
        }
        yield scrapy.http.JsonRequest(url=f'{WORKABLE_ACCOUNTS_API_URL}/{company}/jobs',
                                      meta=response.meta,
                                      data=paging_body,
                                      callback=self.parse)


# pipelines.py

In [22]:
class JobOpeningsPipeline:
    """Item pipeline that normalizes whitespace in selected text fields."""

    def _clean(self, value):
        """Collapse whitespace runs in ``value`` to single spaces.

        Falsy values (``None``, ``''``) are returned unchanged.
        """
        return ' '.join(value.split()) if value else value

    def process_item(self, item, spider):
        """Return a ``JobOpeningItem`` copy of ``item`` with cleaned fields."""
        cleaned = JobOpeningItem(item)
        for name in ('title', 'location', 'department'):
            cleaned[name] = self._clean(cleaned[name])
        return cleaned


# settings_v1.py

In [23]:
# Register the whitespace-cleaning pipeline with order value 300.
ITEM_PIPELINES = {"jobscrapper.pipelines.JobOpeningsPipeline": 300}

In [24]:
'''# Run Recruitee spider
scrapy runspider jobscrapper/spiders/recruitee.py -o jobs.json
# Run Workable spider
scrapy runspider jobscrapper/spiders/workable.py -o jobs.json'''

'# Run Recruitee spider\nscrapy runspider jobscrapper/spiders/recruitee.py -o jobs.json\n# Run Workable spider\nscrapy runspider jobscrapper/spiders/workable.py -o jobs.json'

# piplines_exporter.py

In [25]:
import os

from itemadapter import ItemAdapter
from scrapy.exporters import JsonItemExporter


class PerCompanyExportPipeline:
    """Write items to one JSON file per company.

    Every distinct value of an item's ``company`` field gets its own
    ``<company>.json`` file inside ``path``.
    """

    def __init__(self, path=None):
        """Remember the output directory and create it when missing.

        Args:
            path: Directory that receives the JSON files. ``None`` (or an
                empty string) means the current directory, matching the
                ``JOBS_PATH`` fallback used by ``from_crawler``.
        """
        self.path = path or '.'
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(self.path, exist_ok=True)
        self.company_to_exporter = {}
        # Open file handles, kept so that close_spider can close them.
        self._company_to_file = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline using the ``JOBS_PATH`` setting ('.' if unset)."""
        settings = crawler.settings
        return cls(settings.get('JOBS_PATH', '.'))

    def open_spider(self, spider):
        """Start with no exporters; they are created on demand per company."""
        self.company_to_exporter = {}
        self._company_to_file = {}

    def close_spider(self, spider):
        """Finish every exporter and close the file it was writing to."""
        for company, exporter in self.company_to_exporter.items():
            try:
                exporter.finish_exporting()
            finally:
                # Close the handle even if finishing the export fails.
                self._company_to_file[company].close()

    def _exporter_for_item(self, item):
        """Return the exporter for the item's company, creating it if needed.

        Raises:
            KeyError: If the item has no ``company`` value.
        """
        adapter = ItemAdapter(item)
        company = adapter['company']
        exporter = self.company_to_exporter.get(company)
        if exporter is None:
            output = open(os.path.join(self.path, f'{company}.json'), 'wb')
            try:
                exporter = JsonItemExporter(output, indent=4)
                exporter.start_exporting()
            except Exception:
                # Do not leak the handle when the exporter cannot be set up.
                output.close()
                raise
            self._company_to_file[company] = output
            self.company_to_exporter[company] = exporter
        return exporter

    def process_item(self, item, spider):
        """Export ``item`` to its company's file and pass it on unchanged."""
        exporter = self._exporter_for_item(item)
        exporter.export_item(item)
        return item

# settings_v2.py

In [26]:
# Directory read by PerCompanyExportPipeline.from_crawler for its output files.
JOBS_PATH = "data"

# Cleaning pipeline (order value 300) and per-company export (order value 600).
ITEM_PIPELINES = {
    "jobscrapper.pipelines.JobOpeningsPipeline": 300,
    "jobscrapper.pipelines.PerCompanyExportPipeline": 600,
}

In [28]:
'''# Run Recruitee spider
scrapy runspider jobscrapper/spiders/recruitee.py
# Run Workable spider
scrapy runspider jobscrapper/spiders/workable.py'''

'# Run Recruitee spider\nscrapy runspider jobscrapper/spiders/recruitee.py\n# Run Workable spider\nscrapy runspider jobscrapper/spiders/workable.py'

# scrape.yml

# Workflow that runs both spiders and commits whatever files changed as a result.
name: Scrape latest data

# Triggers: every push, a manual run, and the cron schedule below.
on:
  push:
  workflow_dispatch:
  schedule:
    # minute 0, hour 13, any day of month, any month, day-of-week 1 (Monday)
    - cron:  '0 13 * * 1'

jobs:
  scheduled:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Single-entry matrix; the value is reused by the setup-python step.
        python-version: [ 3.8 ]

    steps:
    # NOTE(review): checkout@v2 / setup-python@v2 are old major versions -
    # confirm whether they should be bumped.
    - uses: actions/checkout@v2
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v2
      with:
        python-version: ${{ matrix.python-version }}
    # Project dependencies are installed only when requirements.txt exists;
    # without that file nothing beyond the pip upgrade is installed.
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
    # `&&`: the Recruitee spider runs only if the Workable spider succeeds.
    - name: Fetch latest data
      run: |-
        scrapy runspider jobscrapper/spiders/workable.py && scrapy runspider jobscrapper/spiders/recruitee.py
    # Stages every change in the working tree. When `git commit` fails (for
    # example, nothing to commit), `|| exit 0` ends the step successfully and
    # `git push` is skipped.
    - name: Commit and push if it changed
      run: |-
        git config user.name "Automated"
        git config user.email "actions@users.noreply.github.com"
        git add -A
        timestamp=$(date -u)
        git commit -m "Latest data: ${timestamp}" || exit 0
        git push