# Indeed – Job Scraping

## Requirements 

In [2]:
# Load Packages:
from bs4 import BeautifulSoup 
import requests
import numpy as np

## Functions

In [48]:
def get_job_description(link, timeout=10):
    """
    Function downloads a job page and returns the text of its full description

    Parameters:
        link: URL of the job posting page.
        timeout: seconds to wait for the server before giving up.

    Returns:
        The text of the <div> with id "jobDescriptionText", or None when the
        page contains no such element.

    Raises:
        requests.RequestException: on connection problems, timeouts or an
            HTTP error status (4xx/5xx).
    """
    # without a timeout requests.get() may block forever on a stalled server:
    webpage = requests.get(link, timeout=timeout)
    # fail loudly on an HTTP error instead of parsing the error page:
    webpage.raise_for_status()
    soup = BeautifulSoup(webpage.content, "html.parser")

    # find() returns None when the element is absent; report that as None
    # rather than crashing with an AttributeError on `.text`:
    description = soup.find('div', id="jobDescriptionText")
    if description is None:
        return None
    return description.text

In [53]:
def create_list_of_job_info_dict(job_listings, jobs=None, description=False):
    """
    Function creates a list of job info dictionaries

    Parameters:
        job_listings: iterable of parsed job cards; each card must expose a
            BeautifulSoup-style `find(name, class_=...)` method.
        jobs: optional existing list to append to (it is extended in place
            and returned). When omitted, a new empty list is created.
        description: if True, the full description is downloaded from the
            job's own page with `get_job_description`; otherwise the short
            summary shown on the search results card is used.

    Returns:
        The list of dictionaries with the keys 'company', 'location',
        'title', 'posted', 'link' and 'description'.

    Only 'company' falls back to None when it is missing; the other fields
    are read without a fallback, so a card lacking one of them raises.
    """

    # a `jobs=[]` default is created once and shared by every call, which
    # makes results leak between calls; create a fresh list per call instead:
    if jobs is None:
        jobs = []

    for job in job_listings:

        # create job info dict:
        job_info = dict()

        try:
            job_info['company'] = job.find("span", class_='company').text.strip()
        except AttributeError:
            # find() returned None: the card has no company element
            job_info['company'] = None

        # the first <a> of the card carries both the title and the link:
        anchor = job.find('a')

        job_info['location'] = job.find('div', class_="recJobLoc")['data-rc-loc']
        job_info['title'] = anchor['title'].strip()
        job_info['posted'] = job.find('span', class_='date').text
        job_info['link'] = 'https://il.indeed.com' + anchor['href']

        # full description or summary:
        if description:
            job_info['description'] = get_job_description(job_info['link'])
        else:
            job_info['description'] = job.find('div', class_='summary').text.strip()

        # append dictionary to job list:
        jobs.append(job_info)

    return jobs

In [54]:
# search parameters used to build the Indeed (Israel) results url:
query = 'data'       # value of the `q` parameter
location = 'israel'  # value of the `l` parameter
days_ago = 2         # value of the `fromage` parameter

# url of the first results page (further pages need an extra &start=<offset>):
url = (
    f"https://il.indeed.com/jobs?q={query}"
    f"&l={location}"
    f"&fromage={days_ago}"
    f"&sort=date"
)

In [55]:
# create soup from url:
# (a timeout keeps a stalled connection from hanging the notebook forever,
#  and raise_for_status() stops on a 4xx/5xx answer instead of parsing it)
webpage = requests.get(url, timeout=10)
webpage.raise_for_status()
soup = BeautifulSoup(webpage.content, "html.parser")

# how many jobs were found:
results_col = soup.find(id="resultsCol")
count_element = results_col.find(id="searchCountPages") if results_col is not None else None
if count_element is None:
    # find() returns None for a missing element; fail with a clear message
    # instead of an AttributeError on None:
    raise RuntimeError("could not find the search result count on the page")
search_count = count_element.text

# the largest number in the count text is taken as the total number of jobs;
# default=0 covers a count text that holds no number at all:
job_count = max(
    (int(s) for s in search_count.replace(',', "").split() if s.isdigit()),
    default=0,
)

# NOTE(review): assumes 15 listings per results page — TODO confirm
RESULTS_PER_PAGE = 15
page_count = int(np.ceil(job_count / RESULTS_PER_PAGE))

print(f"found {job_count} jobs")
print(f'{page_count} to scrape')

found 120 jobs
8 to scrape


In [56]:
# loop over all pages:
# NOTE(review): the "+1" makes the range include start=page_count*10, so
# page_count + 1 pages are requested — confirm the extra page is intended.
jobs = []
for page_number, page in enumerate(range(0, page_count*10+1, 10)):
    print(f'scraping page number {page_number}...')

    # `page` is the value of the `start` query parameter (steps of 10):
    url = f"https://il.indeed.com/jobs?q={query}&l={location}&fromage={days_ago}&sort=date&start={page}"
    print(url)

    # a timeout keeps a stalled connection from hanging the loop forever;
    # raise_for_status() stops on a 4xx/5xx answer instead of parsing it:
    webpage = requests.get(url, timeout=10)
    webpage.raise_for_status()
    soup = BeautifulSoup(webpage.content, "html.parser")

    # find() returns None when the results container is missing; stop and
    # keep the jobs collected so far instead of crashing on None:
    results_col = soup.find(id="resultsCol")
    if results_col is None:
        print(f'no results found on page number {page_number}, stopping')
        break

    job_listings = results_col.find_all('div', class_='jobsearch-SerpJobCard')
    jobs = create_list_of_job_info_dict(job_listings, jobs)


scraping page number 0...
https://il.indeed.com/jobs?q=data&l=israel&fromage=2&sort=date&start=0
scraping page number 1...
https://il.indeed.com/jobs?q=data&l=israel&fromage=2&sort=date&start=10
scraping page number 2...
https://il.indeed.com/jobs?q=data&l=israel&fromage=2&sort=date&start=20
scraping page number 3...
https://il.indeed.com/jobs?q=data&l=israel&fromage=2&sort=date&start=30
scraping page number 4...
https://il.indeed.com/jobs?q=data&l=israel&fromage=2&sort=date&start=40
scraping page number 5...
https://il.indeed.com/jobs?q=data&l=israel&fromage=2&sort=date&start=50
scraping page number 6...
https://il.indeed.com/jobs?q=data&l=israel&fromage=2&sort=date&start=60
scraping page number 7...
https://il.indeed.com/jobs?q=data&l=israel&fromage=2&sort=date&start=70
scraping page number 8...
https://il.indeed.com/jobs?q=data&l=israel&fromage=2&sort=date&start=80


In [57]:
# number of job postings collected over all scraped pages:
len(jobs)

121

In [58]:
# preview the first few postings only; displaying the whole list floods the
# notebook output:
jobs[:5]

[{'company': 'Melio Payments',
  'location': 'תל אביב -יפו, מחוז תל אביב',
  'title': 'Data Scientist',
  'posted': 'פורסם זה עתה',
  'link': 'https://il.indeed.com/rc/clk?jk=3188545285eb7054&fccid=546397f536d1e87a&vjs=3',
  'description': 'players. As a \nData Scientist at Melio you will be working with colleagues from various disciplines - Research Engineers, Analysts, and other \nData Scientists...'},
 {'company': 'Riskified',
  'location': 'תל אביב -יפו, מחוז תל אביב',
  'title': 'SRE',
  'posted': 'פורסם זה עתה',
  'link': 'https://il.indeed.com/rc/clk?jk=28e5463de0da737b&fccid=c0cccca6dc7f0bd0&vjs=3',
  'description': 'flow, involving a variety of \ndata sources and complex... behavioral, social, geographical, and other types of \ndata, and use machine learning algorithms to create...'},
 {'company': 'ironSource',
  'location': 'תל אביב -יפו, מחוז תל אביב',
  'title': 'Supersonic-Creative Business Analyst',
  'posted': 'פורסם זה עתה',
  'link': 'https://il.indeed.com/rc/clk?jk=9a