# Indeed – Job Scraping

## Requirements 

In [59]:
# Load Packages:
from bs4 import BeautifulSoup 
import requests
import numpy as np

## Functions

In [60]:
def get_job_description(link):
    webpage = requests.get(link)
    soup = BeautifulSoup(webpage.content, "html.parser")
    
    return soup.find('div', id="jobDescriptionText").text

In [61]:
def create_list_of_job_info_dict(job_listings, jobs=[], description=False):
    """
    Function creates a list of job info dictionaries
    """
    
    jobs_list = []
    
    for job in job_listings:
        
        # create job info dict:
        job_info = dict()
        
        try:
            job_info['company'] = job.find("span", class_='company').text.strip()
        except AttributeError:
            job_info['company'] = None
            
        job_info['location'] = job.find('div', class_="recJobLoc")['data-rc-loc']
        job_info['title'] = job.find('a')['title'].strip()
        job_info['posted'] = job.find('span', class_='date').text
        job_info['link'] = 'https://il.indeed.com' + job.find('a')['href']        
        
        # full description or summary:
        if description:
            job_info['description'] = get_job_description(job_info['link'])
        else:
            job_info['description'] = job.find('div', class_='summary').text.strip()
        
        # append dictionary to job list:
        jobs.append(job_info)
    
    return jobs

In [62]:
# create url to scrape:
query = 'data'
location = 'israel'
days_ago = 2
url = f"https://il.indeed.com/jobs?q={query}&l={location}&fromage={days_ago}&sort=date" # add &start=1

In [55]:
# create soup from url:
webpage = requests.get(url)
soup = BeautifulSoup(webpage.content, "html.parser")

# how many jobs where found:
search_count = soup.find(id="resultsCol").find(id="searchCountPages").text
job_count = max([int(s) for s in search_count.replace(',', "").split() if s.isdigit()]) 
page_count = int(np.ceil(job_count / 15))

print(f"found {job_count} jobs")
print(f'{page_count} to scrape')

found 120 jobs
8 to scrape


In [63]:
# loop over all pages:
jobs = []
for page in range(0, page_count*10+1, 10):
    print(f'scraping page number {int(page/10)}...')
    
    url = f"https://il.indeed.com/jobs?q={query}&l={location}&fromage={days_ago}&sort=date&start={page}" # add &start=1
    webpage = requests.get(url)
    soup = BeautifulSoup(webpage.content, "html.parser")
    job_listings = soup.find(id="resultsCol").find_all('div', class_='jobsearch-SerpJobCard')
    print(job_listings[0])
    jobs = create_list_of_job_info_dict(job_listings, jobs)


scraping page number 0...
https://il.indeed.com/jobs?q=data&l=israel&fromage=2&sort=date&start=0
<div class="jobsearch-SerpJobCard unifiedRow row result" data-jk="8c014c696301560c" data-tn-component="organicJob" id="p_8c014c696301560c">
<h2 class="title">
<a class="jobtitle turnstileLink" data-tn-element="jobTitle" href="/rc/clk?jk=8c014c696301560c&amp;fccid=e5fa6ac4ea4b01e9&amp;vjs=3" id="jl_8c014c696301560c" onclick="setRefineByCookie([]); return rclk(this,jobmap[0],true,0);" onmousedown="return rclk(this,jobmap[0],0);" rel="noopener nofollow" target="_blank" title="לחברת טלדור דרוש/ה ארכיטקט/ית DATA לירושלים">
לחברת טלדור דרוש/ה ארכיטקט/ית <b>DATA</b> לירושלים</a>
<span class="new">חדש</span></h2>
<div class="sjcl">
<div>
<span class="company">
טלדור מערכות מחשבים (1986) בע''מ</span>
</div>
<div class="recJobLoc" data-rc-loc="תל אביב -יפו, מחוז תל אביב" id="recJobLoc_8c014c696301560c" style="display: none"></div>
<span class="location accessible-contrast-color-location">תל אביב -יפו

In [57]:
len(jobs)

121

In [94]:
jobs[0:15]

[{'company': "טלדור מערכות מחשבים (1986) בע''מ",
  'location': 'תל אביב -יפו, מחוז תל אביב',
  'title': 'לחברת טלדור דרוש/ה ארכיטקט/ית DATA לירושלים',
  'posted': 'פורסם זה עתה',
  'link': 'https://il.indeed.com/rc/clk?jk=8c014c696301560c&fccid=e5fa6ac4ea4b01e9&vjs=3',
  'description': 'לחברת טלדור דרוש/ה ארכיטקט/ית \nDATA לארגון מוביל בירושלים... ליישום ארכיטקטורה ועוד. * ניסיון מעשי בעולמות ה \nDATA - הקמה וניהול מגוון מוצרים לאיסוף וניהול מידע...'},
 {'company': 'Intel',
  'location': 'ישראל',
  'title': 'Data Scientists for Advanced Analytics',
  'posted': 'פורסם זה עתה',
  'link': 'https://il.indeed.com/rc/clk?jk=479200135571eac4&fccid=f1374be6a45f4b8a&vjs=3',
  'description': '& big \ndata at Intel, we’re one of the biggest \nData... as well as organizing the \nData Science Summit conference; If you’re a \ndata scientist in heart and soul...'},
 {'company': 'DSP Group',
  'location': 'הרצליה, מחוז תל אביב',
  'title': 'Financial Analyst',
  'posted': 'פורסם זה עתה',
  'link': 'ht

'6355a22290592ee0'

In [66]:
for i in job_listings[0].find_all('span'):
    print(i, '\n\n')

<span class="new">חדש</span> 


<span class="company">
SQlink</span> 


<span class="location accessible-contrast-color-location">ישראל</span> 


<span class="date">לפני יום 1</span> 


<span class="tt_set" id="tt_set_0"><span class="result-link-bar-separator">·</span><a class="sl resultLink save-job-link" href="#" id="sj_b401c77d02d0b424" onclick="changeJobState('b401c77d02d0b424', 'save', 'linkbar', false, ''); return false;" title="שמירת משרה זו ל-my.indeed">שמירת משרה</a><span class="result-link-bar-separator">·</span><button aria-expanded="false" class="sl resultLink more-link" id="tog_0" onclick="toggleMoreLinks('b401c77d02d0b424', '0'); return false;">עוד...</button></span> 


<span class="result-link-bar-separator">·</span> 


<span class="result-link-bar-separator">·</span> 


<span class="mat">הצגת כל המשרות <a href="/Sqlink-jobs" rel="nofollow">משרות SQlink</a> - <a href="/jobs-in-%D7%99%D7%A9%D7%A8%D7%90%D7%9C">ישראל</a></span> 


