In [66]:
%matplotlib inline
import pandas as pd
import seaborn as sns
sns.set_style('whitegrid')
sns.set_context('poster')
import requests
import bs4
import time

In [67]:
#request web page and parse with Beautiful Soup
# A timeout keeps the cell from hanging indefinitely if the server never
# answers, and raise_for_status() stops here on an HTTP error instead of
# silently parsing an error page as if it were search results.
req = requests.get('https://www.indeed.com/jobs?q=web+developer&l=Boston%2C+MA', timeout=30)
req.raise_for_status()
page = req.text
soup = bs4.BeautifulSoup(page,'html.parser')

#check that the parser worked
print(soup.title)
print(soup.prettify()[:1000])

<title>Web Developer Jobs, Employment in Boston, MA | Indeed.com</title>
<!DOCTYPE html>
<html lang="en">
 <head>
  <meta content="text/html;charset=utf-8" http-equiv="content-type"/>
  <script src="/s/f42152d/en_US.js" type="text/javascript">
  </script>
  <link href="/s/ecdfb5e/jobsearch_all.css" rel="stylesheet" type="text/css"/>
  <link href="http://rss.indeed.com/rss?q=web+developer&amp;l=Boston%2C+MA" rel="alternate" title="Web Developer Jobs, Employment in Boston, MA" type="application/rss+xml"/>
  <link href="/m/jobs?q=web+developer&amp;l=Boston%2C+MA" media="only screen and (max-width: 640px)" rel="alternate"/>
  <link href="/m/jobs?q=web+developer&amp;l=Boston%2C+MA" media="handheld" rel="alternate"/>
  <script type="text/javascript">
   if (typeof window['closureReadyCallbacks'] == 'undefined') {
        window['closureReadyCallbacks'] = [];
    }

    function call_when_jsall_loaded(cb) {
        if (window['closureReady']) {
            cb();
        } else {
            w

In [68]:
#getting job titles

def extract_job_title_from_result(soup):
    """Collect the job titles listed on a parsed results page.

    Every ``<div class="row">`` under ``soup`` is searched for
    ``<a data-tn-element="jobTitle">`` links, and the ``title`` attribute of
    each such link is collected.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style
            ``find_all(name=..., attrs=...)``.

    Returns:
        list: One title per matching link, in the order ``find_all`` yields them.
    """
    return [
        link['title']
        for row in soup.find_all(name='div', attrs={'class': 'row'})
        for link in row.find_all(name='a', attrs={'data-tn-element': 'jobTitle'})
    ]

# Display the titles extracted from the results page parsed into `soup` above.
extract_job_title_from_result(soup)

['Front End Developer',
 'Front-end Web Developer (E-Commerce)',
 'Transportation Designer/Engineer',
 'Frontend Software Developer (Javascript / Ember JS)',
 'ServiceNow Admin',
 'Web / Front End Developer',
 'Junior Back-End Development Position (remote)',
 'Front-End Developer',
 'Front End Developer',
 'Full Stack Web Developer',
 'Junior Developer - Sheprd',
 'Full Stack Web Developer',
 'Front-end Developer',
 'Lead Full Stack Web Developer',
 'Senior Web Developer']

In [69]:
#getting company names

def extract_company_from_result(soup):
    """Collect the company names listed on a parsed results page.

    For each ``<div class="row">`` the text of every ``<span class="company">``
    is taken. Only when a row has no such span are its
    ``<span class="result-link-source">`` elements used instead.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style
            ``find_all(name=..., attrs=...)``.

    Returns:
        list: Whitespace-stripped company names in the order they were found.
    """
    companies = []
    for row in soup.find_all(name='div', attrs={'class': 'row'}):
        # `or` only evaluates the second lookup when the first one is empty.
        name_spans = (
            row.find_all(name='span', attrs={'class': 'company'})
            or row.find_all(name='span', attrs={'class': 'result-link-source'})
        )
        companies.extend(tag.text.strip() for tag in name_spans)
    return companies
 
# Display the company names extracted from the results page parsed into `soup` above.
extract_company_from_result(soup)

['RWS Group',
 'Digital Management, LLC',
 'Nitsch Engineering, Inc.',
 'Humanyze',
 'Netpro System',
 'FEN Learning / StoryArc Media',
 'Kuvio Creative, LLC.',
 'Coresecure, Inc.',
 'Abt Associates',
 'Boston',
 'InMotion Ventures',
 'Artaic',
 'CareDash',
 'Liberty Mutual',
 'Pluralsight']

In [70]:
#getting location

def extract_location_from_result(soup):
    """Collect the location strings listed on a parsed results page.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style
            ``find_all(name, attrs=...)``.

    Returns:
        list: The text of every ``<span class="location">`` under ``soup``,
        exactly as it appears (no whitespace stripping).
    """
    return [tag.text for tag in soup.find_all('span', attrs={'class': 'location'})]

# Display the locations extracted from the results page parsed into `soup` above.
extract_location_from_result(soup)

['Boston, MA',
 'Burlington, MA',
 'Boston, MA 02108',
 'Boston, MA 02108 (Back Bay-Beacon Hill area)',
 'Boston, MA',
 'Boston, MA',
 'Boston, MA',
 'Cambridge, MA 02142 (East Cambridge area)',
 'Cambridge, MA 02138 (West Cambridge area)',
 'Boston, MA 02113 (Central area)',
 'Boston, MA',
 'Boston, MA',
 'Cambridge, MA',
 'Boston, MA',
 'Boston, MA']

In [71]:
def extract_salary_from_result(soup):
    """Collect one salary string per job row on a parsed results page.

    For every ``<div class="row">`` the salary is looked up in two places:

    1. the first ``<nobr>`` element in the row;
    2. otherwise the first ``<span class="no-wrap">`` inside the first
       ``<div>`` nested in the row's ``<div class="sjcl">``.

    The fallback previously called ``find_all`` and then read ``.text`` on the
    returned collection, which always raised and was hidden by a bare
    ``except``, so it could never yield a salary. It now uses ``find`` and
    explicit ``None`` checks, so unrelated errors are no longer swallowed.

    Args:
        soup: Parsed page exposing BeautifulSoup-style ``find_all`` / ``find``.

    Returns:
        list: One entry per row, either the whitespace-stripped salary text or
        the placeholder ``'Nothing_found'`` when neither lookup matches.
    """
    salaries = []
    for div in soup.find_all(name='div', attrs={'class': 'row'}):
        salary_tag = div.find('nobr')
        if salary_tag is None:
            # Walk the fallback path one step at a time; any missing level
            # simply means this row advertises no salary.
            container = div.find(name='div', attrs={'class': 'sjcl'})
            inner = container.find('div') if container is not None else None
            if inner is not None:
                salary_tag = inner.find(name='span', attrs={'class': 'no-wrap'})
        if salary_tag is None:
            salaries.append('Nothing_found')
        else:
            salaries.append(salary_tag.text.strip())
    return salaries

# Display the per-row salary entries extracted from the results page parsed into `soup` above.
extract_salary_from_result(soup)

['Nothing_found',
 'Nothing_found',
 'Nothing_found',
 'Nothing_found',
 'Nothing_found',
 'Nothing_found',
 'Nothing_found',
 'Nothing_found',
 'Nothing_found',
 'Nothing_found',
 'Nothing_found',
 'Nothing_found',
 'Nothing_found',
 'Nothing_found',
 'Nothing_found']

In [76]:
# Schema of the results table: one column per field scraped from a posting.
columns = [
    'job_title',
    'company_name',
    'location',
    'salary',
]
# Empty frame with that schema, ready to receive one row per job posting.
sample_df = pd.DataFrame(data=None, columns=columns)

In [77]:
#scraping code:

# Indeed's `start` query parameter is treated here as a result offset rather
# than a page number, so consecutive pages are requested ten results apart.
# NOTE(review): ten results per page is assumed - confirm against the live site.
RESULTS_PER_PAGE = 10
PAGES_TO_SCRAPE = 3
NOT_FOUND = 'Nothing_found'  # placeholder for any field a posting does not provide


def _first_text(tags, default=NOT_FOUND):
    """Return the stripped text of the first tag in ``tags``, or ``default`` if there is none."""
    return tags[0].text.strip() if tags else default


def _parse_posting(div):
    """Turn one result row into ``[title, company, location, salary]``.

    Returns ``None`` when the row has no job-title link. Every other missing
    field becomes ``NOT_FOUND`` so each record always has four values.
    """
    title_link = div.find(name='a', attrs={'data-tn-element': 'jobTitle'})
    if title_link is None:
        return None
    job_title = title_link.get('title') or title_link.text.strip()

    # Company name: prefer the dedicated span, fall back to the link source.
    company_tags = (
        div.find_all(name='span', attrs={'class': 'company'})
        or div.find_all(name='span', attrs={'class': 'result-link-source'})
    )
    company = _first_text(company_tags)

    location = _first_text(div.find_all('span', attrs={'class': 'location'}))

    # Salary: a <nobr> element if present, otherwise the "no-wrap" span nested
    # under the row's "sjcl" container.
    salary_tag = div.find('nobr')
    if salary_tag is None:
        container = div.find(name='div', attrs={'class': 'sjcl'})
        inner = container.find('div') if container is not None else None
        if inner is not None:
            salary_tag = inner.find(name='span', attrs={'class': 'no-wrap'})
    salary = salary_tag.text.strip() if salary_tag is not None else NOT_FOUND

    return [job_title, company, location, salary]


#getting data from multiple pages
records = []
for page_number in range(PAGES_TO_SCRAPE):
    start = page_number * RESULTS_PER_PAGE
    page = requests.get('https://www.indeed.com/jobs?q=web+developer&l=Boston%2C+MA'+ '&start='+ str(start), timeout=30)
    page.raise_for_status()  # stop on an HTTP error instead of parsing an error page
    time.sleep(1)  #ensuring at least 1 second between page grabs

    # Parse the page that was just downloaded; the `soup` built in an earlier
    # cell only ever holds the first page.
    page_soup = bs4.BeautifulSoup(page.text, 'html.parser')

#populating the list of records, one per job posting
    for div in page_soup.find_all(name='div', attrs={'class':'row'}):
        job_post = _parse_posting(div)
        if job_post is not None:
            records.append(job_post)

# Build the frame once from all records (1-based index, as before). Rebuilding
# instead of appending keeps the cell idempotent when it is re-run.
sample_df = pd.DataFrame(records, columns=columns, index=range(1, len(records) + 1))

sample_df

#saving sample_df as a local csv file — define your own local path to save contents 
#sample_df.to_csv("[filepath].csv", encoding='utf-8')
 


Unnamed: 0,job_title,company_name,location,salary
1,Front End Developer,RWS Group,"Boston, MA",Nothing_found
2,Front-end Web Developer (E-Commerce),"Digital Management, LLC","Burlington, MA",Nothing_found
3,Transportation Designer/Engineer,"Nitsch Engineering, Inc.","Boston, MA 02108",Nothing_found
4,Frontend Software Developer (Javascript / Embe...,Humanyze,"Boston, MA 02108 (Back Bay-Beacon Hill area)",Nothing_found
5,ServiceNow Admin,Netpro System,"Boston, MA",Nothing_found
6,Web / Front End Developer,FEN Learning / StoryArc Media,"Boston, MA",Nothing_found
7,Junior Back-End Development Position (remote),"Kuvio Creative, LLC.","Boston, MA",Nothing_found
8,Front-End Developer,"Coresecure, Inc.","Cambridge, MA 02142 (East Cambridge area)",Nothing_found
9,Front End Developer,Abt Associates,"Cambridge, MA 02138 (West Cambridge area)",Nothing_found
10,Full Stack Web Developer,Boston,"Boston, MA 02113 (Central area)",Nothing_found
