In [14]:
# Import dependencies: splinter (browser automation), Selenium key codes, pandas, and time
from splinter import Browser
from selenium.webdriver.common.keys import Keys
import pandas as pd
import time

In [2]:
# Create the splinter browser object, requesting the 'chrome' driver.
# Later cells use this same `browser` and eventually call browser.quit() on it.
browser = Browser('chrome')

In [3]:
# Open Indeed (the `url` name is reused by later cells)
url = "https://www.indeed.com/"
browser.visit(url)

# Give the browser time to respond: a fixed 10-second pause before the next cell runs
time.sleep(10)

In [4]:
# Show the url without its last character (the trailing "/").
# Display only: the result is not assigned, so `url` itself is unchanged.
url[:-1]

'https://www.indeed.com'

In [5]:
# List that collects the scraped results. scrape_one_page appends one
# list of fields per job posting to it, so it grows in place.
data = []

In [6]:
# Helper used by scrape_one_page to try a series of fallback lookups for one field
def _first_success(*extractors):
    """Return the result of the first extractor that does not raise.

    Each extractor is a zero-argument callable. They are tried in order; if
    every one of them raises an Exception, None is returned. Only Exception
    is caught, so KeyboardInterrupt / SystemExit still stop the scraper.
    """
    for extract in extractors:
        try:
            return extract()
        except Exception:
            # This lookup failed; fall through to the next fallback (if any)
            continue
    return None


# Function to scrape all the posts on a single page
def scrape_one_page(data, browser):
    """Append one row per job posting on the current results page to `data`.

    Args:
        data: List that is extended in place. Each appended row is a list of
            [job id, title, company, location, job type, salary, job url].
        browser: Browser-like object exposing `find_by_css`, already showing
            a page of search results.

    Returns:
        None. Results are delivered by mutating `data`.

    Any field that cannot be located on a posting (including the job url) is
    stored as None, so a single malformed posting never aborts the page.
    """
    posts_in_page = browser.find_by_css(".jobsearch-ResultsList").find_by_css(".job_seen_beacon")

    for post in posts_in_page:
        # Fields are evaluated top to bottom, which is also the column order
        post_data = [
            # Job id: "data-jk" attribute of the posting's first link
            _first_success(lambda: post.find_by_css("a").first["data-jk"]),
            # Job title: text of the first <span>
            _first_success(lambda: post.find_by_css("span").first.text),
            # Company
            _first_success(lambda: post.find_by_css(".companyName").first.text),
            # Location
            _first_success(lambda: post.find_by_css(".companyLocation").first.text),
            # Job type (if available): text of the labelled element's parent
            _first_success(
                lambda: post.find_by_css("[aria-label='Job type']").first.find_by_xpath("..").first.text
            ),
            # Salary (if available): labelled salary first, estimated salary as fallback
            _first_success(
                lambda: post.find_by_css("[aria-label='Salary']").first.find_by_xpath("..").first.text,
                lambda: post.find_by_css(".estimated-salary").first.find_by_css("span").first.text,
            ),
            # Job url: "href" attribute of the posting's first link
            _first_success(lambda: post.find_by_css("a").first["href"]),
        ]

        data.append(post_data)

In [7]:
# Function to scrape all the posts on all pages
def indeed_scraper(data, search_params, browser):
    """Run a job search on Indeed and scrape every page of results into `data`.

    Args:
        data: List that receives one row per posting (filled in place by
            scrape_one_page).
        search_params: Mapping with the search inputs:
            "what"  - text typed into the job-title box,
            "where" - text typed into the location box,
            "date_posted" (optional) - label of the "Date posted" option to
                click; defaults to "Last 24 hours". It must match the text
                shown in that menu (e.g. "Last 3 days", "Last 7 days",
                "Last 14 days").
        browser: splinter browser, expected to already be on the Indeed
            home page.

    Returns:
        The same `data` list that was passed in.

    Each page is scraped exactly once. The "Next Page" control is looked up
    before scraping; when it is absent the page is treated as the final one.
    Errors raised while scraping propagate to the caller instead of being
    mistaken for "no more pages".
    """
    # Find the what box & search the job title
    what = browser.find_by_id("text-input-what").first
    what.fill(search_params["what"])

    # Find the where box & search for the location
    where = browser.find_by_id("text-input-where").first
    # Select-all + backspace clears whatever is already in the box
    where.type(Keys.CONTROL + "a")
    where.type(Keys.BACKSPACE)
    where.fill(search_params["where"])

    # Submit the search
    where.type(Keys.RETURN)

    time.sleep(1)

    # Open the "Date posted" filter menu
    date_posted = browser.find_by_text("Date posted").first
    date_posted.click()

    time.sleep(1)

    # Pick the requested date range (defaults to the last 24 hours)
    date_filter = search_params.get("date_posted", "Last 24 hours")
    browser.find_by_text(date_filter).first.click()

    page_count = 0

    while True:
        page_count += 1

        # find_by_css returns an empty list (it does not raise) when there is
        # no "Next Page" control, so check emptiness explicitly.
        next_page_button = browser.find_by_css("[aria-label='Next Page']")

        if next_page_button.is_empty():
            # No next-page control: this is the last page of results
            print(f"Scraping final page of {page_count} total pages...")
            scrape_one_page(data, browser)
            print("Final page has been scraped.")
            break

        print(f"Scraping page {page_count}...")
        scrape_one_page(data, browser)

        print("Clicking to next page.")
        next_page_button.first.click()

    return data

In [8]:
# Set "what" for the words to search, and "where" for the location search
search_params = {"what": "Data Scientist", "where": "United States"}

# Run the webscraper to collect all listings from all pages for the last 24 hours
print("Beginning scraping process...")
try:
    indeed_scraper(data, search_params, browser)
    print("Scraping completed successfully!")
except Exception as error:
    # Report what actually failed instead of hiding it; rows scraped before
    # the failure are still in `data`.
    print("Something went wrong with the scraper.. :(")
    print(f"{type(error).__name__}: {error}")
finally:
    # Always release the browser, whether scraping succeeded, failed, or was interrupted
    browser.quit()
    print("Browser has been closed.")

Beginning scraping process...
Scraping page 1...
Clicking to next page.
Scraping page 2...
Clicking to next page.
Scraping page 3...
Clicking to next page.
Scraping page 4...
Clicking to next page.
Scraping page 5...
Clicking to next page.
Scraping final page of 6 total pages...
Final page has been scraped.
Scraping completed successfully!


In [22]:
# Column labels, listed in the same order scrape_one_page appends each field
columns = [
    "id",
    "title",
    "company",
    "location",
    "job_type",
    "salary",
    "url",
]

# Load this run's scraped rows into a DataFrame and show it as the cell output
listings_new_df = pd.DataFrame(data=data, columns=columns)
listings_new_df

Unnamed: 0,id,title,company,location,job_type,salary,url
0,80b9309315126177,Education Data Scientist,,"Pasadena, CA",Full-time,"$110,000 - $150,000 a year",https://www.indeed.com/rc/clk?jk=80b9309315126...
1,e1fc69199481f0a9,Data Scientist III,Prudent Technology,Remote,Full-time,"$125,000 - $150,000 a year",https://www.indeed.com/rc/clk?jk=e1fc69199481f...
2,15aaa9050d486064,Data Scientist,MASS MUTUAL FINANCIAL GROUP,"Boston, MA",,"$121,800 - $159,800 a year",https://www.indeed.com/rc/clk?jk=15aaa9050d486...
3,ff1be6d86a1eb4a7,Bioinformatics & Data Scientist,"Infinity Bio, Inc.","Baltimore, MD",Full-time,"$98,000 - $118,000 a year","https://www.indeed.com/company/Infinity-Bio,-I..."
4,cca8a90974f634b2,Data Scientist,Noble Drilling Services Inc.,"Sugar Land, TX",Full-time,Estimated $115K - $146K a year,https://www.indeed.com/rc/clk?jk=cca8a90974f63...
...,...,...,...,...,...,...,...
59,d456d658226db277,AVP Machine Learning Engineering,HCA Healthcare,"Nashville, TN 37203",Full-time,,https://www.indeed.com/rc/clk?jk=d456d658226db...
60,3ac2df2573895006,Data Scientist I - Energy Applications,Pacific Northwest National Laboratory,United States,Full-time,"$136,400 a year",https://www.indeed.com/rc/clk?jk=3ac2df2573895...
61,896d30aff0b7509e,"Principal Software Engineer - Cloud, AI/Machin...",Oracle,United States,,Estimated $141K - $179K a year,https://www.indeed.com/rc/clk?jk=896d30aff0b75...
62,3ac2df2573895006,Data Scientist I - Energy Applications,Pacific Northwest National Laboratory,United States,Full-time,"$136,400 a year",https://www.indeed.com/rc/clk?jk=3ac2df2573895...


In [36]:
# Append the newly scraped listings to the .csv file which contains all the
# previously collected listings.
file_path = "../data/listings_all.csv"

try:
    # The archive is saved with the row number as its first column; read it
    # as the index and discard it so it doesn't load as a new data column.
    listings_archive_df = pd.read_csv(file_path, index_col=0).reset_index(drop=True)
except (FileNotFoundError, pd.errors.EmptyDataError):
    # No usable archive yet: this run's listings become the archive.
    # Any other failure (e.g. a parse error) is deliberately NOT caught, so
    # an existing archive is never overwritten with only the new rows.
    print("No existing archive could be loaded.\nNew results are being saved.")
    listings_all_df = listings_new_df
    saved_message = f"New results saved to file {file_path}"
else:
    # Append the new listings to the existing ones; ignore_index gives every
    # row a unique row number in the saved file.
    listings_all_df = pd.concat([listings_archive_df, listings_new_df], ignore_index=True)
    saved_message = f"New results added and saved to file {file_path}"

# Save the updated DataFrame as a .csv file
listings_all_df.to_csv(file_path)
print(saved_message)

    

New results added and saved to file ../data/listings_all.csv
