In [20]:
import re
from datetime import datetime
from random import randint
from time import sleep
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [21]:
def get_job_title(job):
    """
    Return the job title of a search-result card.

    Takes a beautiful soup job object from a list of job_objects returned from:
    jobs = soup.find_all('a','tapItem')

    The title is read from the ``title`` attribute of the <span> elements
    inside the card's <h2>. When several spans carry a ``title`` attribute,
    the last one wins. If no span carries one, the stripped text of the <h2>
    is used instead.

    Returns:
        str or None: the stripped title, or None when the card has no <h2>
        or no title text can be found.
    """
    heading = job.h2
    if heading is None:
        return None

    title = None
    for span in heading.find_all('span'):
        # Not every span carries a title attribute; .get() returns None for
        # those instead of raising KeyError.
        candidate = span.get('title')
        if candidate is not None:
            title = candidate.strip()

    if title is None:
        # No span had a title attribute: fall back to the visible heading
        # text rather than failing on an unbound local variable.
        title = heading.get_text(strip=True) or None

    return title


def get_job_link(job):
    """
    Build the absolute URL of a job posting from a search-result card.

    Takes a beautiful soup job object from a list of job_objects returned from:
    jobs = soup.find_all('a','tapItem')

    Returns:
        str: the card's ``href`` resolved against https://www.indeed.com.

    Raises:
        ValueError: if the element has no (or an empty) ``href`` attribute.
    """
    href = job.get('href')
    if not href:
        raise ValueError("job element has no 'href' attribute; cannot build a job link")

    # urljoin resolves site-relative hrefs against the base and leaves hrefs
    # that are already absolute untouched, which plain string concatenation
    # would corrupt.
    return urljoin('https://www.indeed.com', href)


In [22]:

def _text_or_default(node, default='Unspecified'):
    """Return the text of a BeautifulSoup node, or `default` when the node is None."""
    return node.text if node is not None else default


def get_record(job, timeout=10):
    """
    Build one record (a dict) describing a job from a search-result card.

    Takes a beautiful soup job object from a list of job_objects returned from:
    jobs = soup.find_all('a','tapItem')

    The title, company name, location and link come from the card itself.
    The full posting is then downloaded from the job link to read the job
    description plus the "Salary" and "Job Type" entries of its details
    section. Any field that cannot be found is reported as 'Unspecified'
    instead of raising.

    Args:
        job: the search-result card element.
        timeout: seconds to wait for the posting page before giving up.

    Returns:
        dict with the keys job_title, company_name, company_location,
        job_link, job_type, salary, full_job_description and extract_date
        (today's date formatted as YYYY-MM-DD).
    """
    job_title = get_job_title(job)
    company_name = _text_or_default(job.find('span', 'companyName'))
    company_location = _text_or_default(job.find('div', 'companyLocation'))
    job_link = get_job_link(job)
    salary = 'Unspecified'
    job_type = 'Unspecified'
    full_job_description = 'Unspecified'
    job_details = []

    try:
        # Without a timeout a stalled connection would block the scrape forever.
        response = requests.get(job_link, timeout=timeout)
    except requests.RequestException as exc:
        # Keep the data already read from the card; only the detail fields
        # stay 'Unspecified'.
        print("Could not fetch job posting: ", job_link, repr(exc))
    else:
        full_job_post = BeautifulSoup(response.text, 'html.parser')
        full_job_description = _text_or_default(
            full_job_post.find('div', {'id': "jobDescriptionText"})
        )

        # The details section is looked up defensively: find() returns None
        # when the posting page does not contain it.
        job_details_section = full_job_post.find('div', {'id': "jobDetailsSection"})
        if job_details_section is not None:
            job_details = job_details_section.find_all(
                'div', 'jobsearch-JobDescriptionSection-sectionItem'
            )

    # parse Salary and Job Type
    for detail in job_details:
        detail_text = detail.text

        if "Salary" in detail_text:
            salary = detail_text.replace('Salary', '')

        if "Job Type" in detail_text:
            job_type = detail_text.replace('Job Type', '')
            # The values are run together in the page text; split before each
            # capital letter and drop the empty pieces re.split produces.
            list_of_job_types = [s for s in re.split("([A-Z][^A-Z]*)", job_type) if s]
            job_type = ", ".join(list_of_job_types)

    # Get Extract Date
    extract_date = datetime.today().strftime('%Y-%m-%d')

    job_record = dict(
        job_title=job_title,
        company_name=company_name,
        company_location=company_location,
        job_link=job_link,
        job_type=job_type,
        salary=salary,
        full_job_description=full_job_description,
        extract_date=extract_date,
    )

    return job_record

In [23]:
def _with_start_offset(url, start):
    """Return `url` with its `start` query parameter set to `start`, adding it if absent."""
    updated, replacements = re.subn(
        r'([?&]start=)\d*',
        lambda match: match.group(1) + str(start),
        url,
        count=1,
    )
    if replacements:
        return updated
    separator = '&' if '?' in url else '?'
    return f'{url}{separator}start={start}'


def scrape_indeed_url(url, timeout=10):
    """
    Scrape every page of a search, starting at `url`, into a DataFrame.

    Each page is downloaded, every 'tapItem' anchor on it is turned into a
    record with get_record(), and the link labelled 'Next' is followed until
    a page has none. Between pages the function sleeps for a random 1-10
    seconds. Progress is reported with print().

    Args:
        url: URL of the first results page.
        timeout: seconds to wait for each results page before giving up.

    Returns:
        pandas.DataFrame with one row per successfully parsed job. It is
        empty when nothing could be scraped. If a results page cannot be
        downloaded, the records collected so far are returned.
    """
    records = []
    start = 0

    while True:
        print("Requesting: ", url)
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # Stop paging but keep what was already scraped.
            print("Request failed, stopping: ", repr(exc))
            break

        print("Repsonse Code: ", response.status_code)
        print('\n')

        soup = BeautifulSoup(response.text, 'html.parser')
        jobs = soup.find_all('a', 'tapItem')
        for job in jobs:
            try:
                record = get_record(job)
            except Exception as exc:
                # Deliberately broad: one malformed card must not discard the
                # records already collected. The reason is reported, not hidden.
                print("Skipping a job that could not be parsed: ", repr(exc))
                continue
            records.append(record)
            print("Successfully added: ", record['job_title'])

        next_link = soup.find('a', {'aria-label': 'Next'})
        next_href = next_link.get('href') if next_link is not None else None
        if not next_href:
            # No 'Next' link: this was the last page.
            break

        start += 10
        # Set the offset through the query string instead of slicing the last
        # two characters, which breaks once the offset reaches three digits or
        # when `start` is not the final parameter.
        url = _with_start_offset('https://www.indeed.com' + next_href, start)

        delay = randint(1, 10)
        print('\n')
        print(f'Sleeping for {delay} seconds before starting the next request.')
        sleep(delay)
        print('\n')

    print("Creating a pandas dataframe...")
    records_df = pd.DataFrame(records)

    return records_df


In [32]:
# Trial run: scrape the "park ranger" internship search (q=park%20ranger, jt=internship)
# and keep the result in `test_two`.
test_two = scrape_indeed_url('https://www.indeed.com/jobs?q=park%20ranger&jt=internship&vjk=01dbabd30c208150')



Requesting:  https://www.indeed.com/jobs?q=park%20ranger&jt=internship&vjk=01dbabd30c208150
Repsonse Code:  200


Successfully added:  Wilderness Ranger Crew
Successfully added:  Assistant Park Ranger
Successfully added:  Park Interpreter
Successfully added:  Seasonal Park Maintenance Associate
Successfully added:  Seasonal Park Interpreter
Successfully added:  Park Customer Service Seasonal Associate
Successfully added:  Seasonal Park Ranger
Creating a pandas dataframe...


In [31]:
# Save the park-ranger scrape to a CSV file (relative path, so it lands in the working directory).
# NOTE(review): to_csv writes the DataFrame index as an extra first column by default;
# pass index=False if that column is not wanted.
test_two.to_csv('park_ranger_internships.csv')

In [9]:
# NOTE(review): this rebinds `scrape_indeed_url` to the version in the `indeed_scraper`
# module, replacing the function defined earlier in this notebook on a top-to-bottom run.
# Confirm which implementation the cells below are meant to use.
from indeed_scraper import scrape_indeed_url

In [17]:
# Trial run with an advanced-search URL: internships (jt=internship) matching any of the
# ecology/conservation/wildlife/fisheries keywords in `as_any`, radius=25, limit=10.
# NOTE(review): the recorded output for this cell shows a 200 response but no
# "Successfully added" lines, i.e. no 'tapItem' anchors were found on the page.
test_three = scrape_indeed_url('https://www.indeed.com/jobs?as_and&as_phr&as_any=ecology%20ecological%20conservation%20conservancy%20wildlife%20fisheries%20fishery%20&as_not&as_ttl&as_cmp&jt=internship&st&salary&radius=25&l&fromage=any&limit=10&sort&psf=advsrch&from=advancedsearch&vjk=95e576aa42ce3a31')

Requesting:  https://www.indeed.com/jobs?as_and&as_phr&as_any=ecology%20ecological%20conservation%20conservancy%20wildlife%20fisheries%20fishery%20&as_not&as_ttl&as_cmp&jt=internship&st&salary&radius=25&l&fromage=any&limit=10&sort&psf=advsrch&from=advancedsearch&vjk=95e576aa42ce3a31
Repsonse Code:  200


Creating a pandas dataframe...


Indeed Search Terms

In [None]:
# Search terms to query, one per line so additions and removals diff cleanly.
indeed_search_terms = [
    'conservation',
    'ecology',
    'husbandry',
    'land management',
    'biology',
    'natural resources',
    'public water',
    'park ranger',
    'agriculture',
    'forestry',
    'botany',
    'environmental',
    'species',
    'fisheries',
    'outdoors',
    'wildlife',
    'marine',
    'aqua',
    'horticulture',
]

Land Management- 266
Public Water- 180
Species- 152
Conservation- 120
Agriculture- 105
Biology- 82
Marine- 77
Environmental- 70
Ecology- 69
Wildlife- 65
Outdoors- 63
Natural Resources- 53
Horticulture- 42
Fisheries- 26
Husbandry- 20
Botany- 19
Forestry- 6
Park Ranger- 5
Aqua- 4