In [1]:
# Import splinter
from splinter import Browser
from selenium.webdriver.common.keys import Keys
import pandas as pd
import time

In [2]:
# Start a Chrome session through splinter; later cells drive this `browser` object
browser = Browser('chrome')

In [3]:
# Open the Indeed home page (note that `url` ends with a trailing slash)
url = "https://www.indeed.com/"
browser.visit(url)

# Fixed 10-second pause so the page can respond before any elements are looked up
time.sleep(10)

In [4]:
# Displays the url without its final character (the trailing "/").
# This is only a preview expression: `url` itself is not modified.
url[:-1]

'https://www.indeed.com'

In [5]:
# List that accumulates the scraped results: scrape_one_page appends one
# list of fields to it for every job post it processes
data = []

In [6]:
# Helper: run one element lookup and turn a failed lookup into None
def _lookup_or_none(lookup):
    """Call the zero-argument ``lookup`` and return its result, or None if it raises.

    Only ``Exception`` subclasses are swallowed, so KeyboardInterrupt and
    SystemExit still stop the scrape.
    """
    try:
        return lookup()
    except Exception:
        # A failed lookup simply means this field is absent for the post
        return None


# Function to scrape all the posts on a single page
def scrape_one_page(data, browser):
    """Append one row per job post found on the current results page to ``data``.

    Every row is a list in the order
    ``[id, title, company, location, job_type, salary, url]``. A field whose
    lookup fails is stored as ``None``, so every row always has seven entries.

    Args:
        data: list that receives the rows; it is mutated in place.
        browser: object exposing ``find_by_css`` (the splinter browser).

    Returns:
        None. Results are delivered through ``data``.
    """
    # Collect every job card inside the results list
    posts_in_page = browser.find_by_css(".jobsearch-ResultsList").find_by_css(".job_seen_beacon")

    for post in posts_in_page:
        # Job id and job url both come from the first anchor of the card
        job_id = _lookup_or_none(lambda: post.find_by_css("a").first["data-jk"])
        title = _lookup_or_none(lambda: post.find_by_css("span").first.text)
        company = _lookup_or_none(lambda: post.find_by_css(".companyName").first.text)
        location = _lookup_or_none(lambda: post.find_by_css(".companyLocation").first.text)

        # Job type (if available): text of the parent of the labelled element
        job_type = _lookup_or_none(
            lambda: post.find_by_css("[aria-label='Job type']").first.find_by_xpath("..").first.text
        )

        # Salary (if available): prefer the labelled salary, fall back to the estimate
        salary = _lookup_or_none(
            lambda: post.find_by_css("[aria-label='Salary']").first.find_by_xpath("..").first.text
        )
        if salary is None:
            salary = _lookup_or_none(
                lambda: post.find_by_css(".estimated-salary").first.find_by_css("span").first.text
            )

        # Guarded like every other field: a card without a link must not abort the page
        job_url = _lookup_or_none(lambda: post.find_by_css("a").first["href"])

        data.append([job_id, title, company, location, job_type, salary, job_url])

In [7]:
# Function to scrape all the posts on all pages
def indeed_scraper(data, search_params, browser):
    """Run a job search on the open Indeed page and scrape every results page.

    Args:
        data: list that receives one row per job post (filled by scrape_one_page).
        search_params: dict with the keys ``"what"`` (search words) and
            ``"where"`` (location). An optional ``"date_posted"`` key selects
            the date filter by its visible text; it defaults to "Last 7 days".
            Options = ["Last 24 Hours", "Last 3 days", "Last 7 days", "Last 14 days"]
        browser: splinter browser already showing the Indeed search form.

    Returns:
        The same ``data`` list that was passed in.
    """
    # Find the what box & search the job title
    what = browser.find_by_id("text-input-what").first
    what.fill(search_params["what"])

    # Find the where box, clear whatever is in it, & search for the location
    where = browser.find_by_id("text-input-where").first
    where.type(Keys.CONTROL + "a")
    where.type(Keys.BACKSPACE)
    where.fill(search_params["where"])
    where.type(Keys.RETURN)

    time.sleep(1)

    # Open the "Date posted" filter menu
    browser.find_by_text("Date posted").first.click()

    time.sleep(1)

    # Pick the requested date range from the menu
    date_filter = search_params.get("date_posted", "Last 7 days")
    browser.find_by_text(date_filter).first.click()

    page_count = 0

    while True:
        # find_by_css returns an empty list (it does not raise) when there is no
        # next-page button, so the last page is detected by checking its length
        next_page_button = browser.find_by_css("[aria-label='Next Page']")
        is_last_page = len(next_page_button) == 0

        page_count += 1
        if is_last_page:
            print(f"Scraping final page of {page_count} total pages...")
        else:
            print(f"Scraping page {page_count}...")

        # Each page is scraped exactly once; errors propagate to the caller
        # instead of being mistaken for "this was the last page"
        scrape_one_page(data, browser)

        if is_last_page:
            print("Final page has been scraped.")
            break

        print("Clicking to next page.")
        next_page_button.first.click()

    return data

In [8]:
# Set "what" for the words to search, and "where" for the location search
search_params = {"what": "Data Scientist", "where": "United States"}

# Run the webscraper to collect all listings from all pages, using the
# date filter that indeed_scraper applies
print("Beginning scraping process...")
try:
    indeed_scraper(data, search_params, browser)
    print("Scraping completed successfully!")
except Exception as err:
    # Rows collected before the failure are still in `data`; report the cause
    print("Something went wrong with the scraper.. :(")
    print(f"{type(err).__name__}: {err}")
finally:
    # Always release the browser, whether the scrape succeeded, failed or was interrupted
    browser.quit()
    print("Browser has been closed.")

Beginning scraping process...
Scraping page 1...
Clicking to next page.
Scraping page 2...
Clicking to next page.
Scraping page 3...
Clicking to next page.
Scraping page 4...
Clicking to next page.
Scraping page 5...
Clicking to next page.
Scraping page 6...
Clicking to next page.
Scraping page 7...
Clicking to next page.
Scraping page 8...
Clicking to next page.
Scraping page 9...
Clicking to next page.
Scraping page 10...
Clicking to next page.
Scraping page 11...
Clicking to next page.
Scraping page 12...
Clicking to next page.
Scraping page 13...
Clicking to next page.
Scraping page 14...
Clicking to next page.
Scraping page 15...
Clicking to next page.
Scraping page 16...
Clicking to next page.
Scraping page 17...
Clicking to next page.
Scraping page 18...
Clicking to next page.
Scraping page 19...
Clicking to next page.
Scraping page 20...
Clicking to next page.
Scraping page 21...
Clicking to next page.
Scraping page 22...
Clicking to next page.
Scraping page 23...
Clicking to n

In [13]:
# Manual cleanup: closes the browser. The scraping cell also calls
# browser.quit() itself, so this cell repeats that cleanup.
browser.quit()
print("Browser has been closed.")

Browser has been closed.


In [14]:
# Column labels, listed in the same order the scraper appends fields to each row
columns = [
    "id",
    "title",
    "company",
    "location",
    "job_type",
    "salary",
    "url",
]

# Turn the scraped rows into a Pandas DataFrame and display it
listings_new_df = pd.DataFrame(data=data, columns=columns)
listings_new_df

Unnamed: 0,id,title,company,location,job_type,salary,url
0,7512ba2b05e1ba52,Data Scientist,"CorTech, LLC","Hybrid remote in Research Triangle Park, NC 27709",,$39.37 an hour,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...
1,688e16f6ae2c5cce,Mid-Level ORSA / Data Scientist,Peraton,"Fort Meade, MD 20755",,"$146,000 - $234,000 a year",https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...
2,b0a069bb20f180da,Consultant SME (Principal Data Scientist-Data ...,Experis,"Remote in Raleigh, NC 27609",,Estimated $128K - $162K a year,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...
3,ca739b233c19204c,Data Scientist,Pacer Staffing,"Durham, NC 27709",,$39 an hour,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...
4,3cd0c6a8ff3d8df5,Vice President Data Architect Lead Analyst - R...,Citi,"Rutherford, NJ 07070",,"$137,610 - $206,420 a year",https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...
...,...,...,...,...,...,...,...
98,c9448c293d6fdd9f,"Sr Data Analyst, Growth",Ollie Pet,Remote,,Estimated $71.4K - $90.4K a year,https://www.indeed.com/rc/clk?jk=c9448c293d6fd...
99,bb61cca8c13df816,Data Scientist,Galen College of Nursing,"Hybrid remote in Louisville, KY 40207",,,https://www.indeed.com/rc/clk?jk=bb61cca8c13df...
100,de9595496ad68832,Applied Data Scientist,WebstaurantStore,"Remote in Lititz, PA 17543",,"$70,000 - $110,000 a year",https://www.indeed.com/rc/clk?jk=de9595496ad68...
101,59c17ef70e1fe45a,Senior Data Scientist,Neo.Tax,"Mountain View, CA",,"$170,000 - $190,000 a year",https://www.indeed.com/rc/clk?jk=59c17ef70e1fe...


In [15]:
# Open the .csv file which contains all the previously collected listings
# and append the newly scraped listings to it
file_path = "../data/listings_all.csv"

try:
    # The archive is saved with its row index as the first column, so read that
    # column back as the index instead of loading it as a data column
    listings_archive_df = pd.read_csv(file_path, index_col=0)
except (FileNotFoundError, pd.errors.EmptyDataError):
    # Only a missing or empty archive means "start fresh". Any other failure
    # (parse error, permissions, ...) propagates so an existing archive is
    # never overwritten with just the new rows.
    print("Something went wrong with loading and appending to the archived file.\nPossibly no previous file existed.\nNew results are being saved.")

    # Save the new listings as the first version of the archive
    listings_new_df.to_csv(file_path)
    print(f"New results saved to file {file_path}")
else:
    # Append the new listings to the existing ones; ignore_index keeps the
    # saved row index unique instead of restarting at 0 for the new rows
    listings_all_df = pd.concat([listings_archive_df, listings_new_df], ignore_index=True)

    # Save the updated DataFrame as a .csv file
    listings_all_df.to_csv(file_path)
    print(f"New results added and saved to file {file_path}")

    

New results added and saved to file ../data/listings_all.csv
