In [4]:
# Import splinter
from splinter import Browser
from selenium.webdriver.common.keys import Keys
import pandas as pd
import time

In [5]:
# Print the local time at which this run started, formatted as YYYY-MM-DD HH:MM:SS
t = time.localtime()
current_time = time.strftime("%Y-%m-%d %H:%M:%S", t)
print(current_time)

2023-07-10 15:09:34


In [6]:
# Create browser object (it is closed again by browser.quit() in the cell that runs the scraper)
# NOTE(review): 'chrome' presumably requires a matching Chrome driver to be available locally — confirm
browser = Browser('chrome')

In [7]:
# Open Indeed
# `url` keeps its trailing "/"; it is also read as a global inside indeed_scraper
url = "https://www.indeed.com/"
browser.visit(url)

# Give the browser time to respond
# Fixed 10-second pause: indeed_scraper looks up the search boxes by id without any wait of its own
time.sleep(10)

In [8]:
# Preview the url without its trailing "/".
# This only displays the sliced string; `url` itself is not modified.
url[:-1]

'https://www.indeed.com'

In [9]:
# List to store results in: scrape_one_page appends one row (a list of fields) per posting.
# The list is mutated in place, so re-running the scraper without re-running this cell
# adds new rows after the ones already collected.
data = []

In [10]:
# Helper: evaluate optional lookups on a posting without aborting the whole page
def _first_success(post, *extractors):
    """Return the value of the first extractor that succeeds on ``post``.

    Each extractor is called as ``extractor(post)``, in order. If it raises an
    ordinary ``Exception`` (for example because the element is missing on this
    posting) the next extractor is tried. ``None`` is returned when every
    extractor fails. ``KeyboardInterrupt`` / ``SystemExit`` are deliberately
    NOT caught, so a running scrape can still be interrupted.
    """
    for extractor in extractors:
        try:
            return extractor(post)
        except Exception:
            continue
    return None


# Function to scrape all the posts on a single page
def scrape_one_page(data, browser):
    """Append one row per job posting on the current results page to ``data``.

    Postings are the ``.job_seen_beacon`` elements found inside
    ``.jobsearch-ResultsList``. Each appended row is a list of eight items,
    in this order: job id, title, company, location, job type, salary,
    scrape timestamp ("YYYY-MM-DD HH:MM:SS", local time), job url.
    Any field whose lookup fails is stored as ``None``.

    Parameters
    ----------
    data : list
        List that receives the rows; it is mutated in place.
    browser
        Browser object currently showing a results page.

    Returns
    -------
    None
    """
    posts_in_page = browser.find_by_css(".jobsearch-ResultsList").find_by_css(".job_seen_beacon")

    for post in posts_in_page:
        # Fields are evaluated top to bottom, so the timestamp is still taken
        # after the salary lookup and before the url lookup.
        post_data = [
            # Job id is stored on the posting's first link
            _first_success(post, lambda p: p.find_by_css("a").first["data-jk"]),
            # Job title is the text of the first <span>
            _first_success(post, lambda p: p.find_by_css("span").first.text),
            # Company
            _first_success(post, lambda p: p.find_by_css(".companyName").first.text),
            # Location
            _first_success(post, lambda p: p.find_by_css(".companyLocation").first.text),
            # Job type (if available): text of the parent of the labelled element
            _first_success(
                post,
                lambda p: p.find_by_css("[aria-label='Job type']").first.find_by_xpath("..").first.text,
            ),
            # Salary (if available): explicit salary first, estimated salary as fallback
            _first_success(
                post,
                lambda p: p.find_by_css("[aria-label='Salary']").first.find_by_xpath("..").first.text,
                lambda p: p.find_by_css(".estimated-salary").first.find_by_css("span").first.text,
            ),
            # Time at which this posting was scraped
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
            # Job url comes from the same first link as the id
            _first_success(post, lambda p: p.find_by_css("a").first["href"]),
        ]

        data.append(post_data)

In [11]:
# Function to scrape all the posts on all pages
def indeed_scraper(data, search_params, browser):
    """Run a search and scrape every page of results into ``data``.

    The function fills in the "what" and "where" search boxes (looked up by
    the ids ``text-input-what`` and ``text-input-where``), submits the search,
    applies a "Date posted" filter, then calls ``scrape_one_page`` once per
    results page, following the "Next Page" button until there is none.

    Parameters
    ----------
    data : list
        Passed to ``scrape_one_page`` for each page, which appends rows to it
        in place.
    search_params : dict
        ``"what"``: text typed into the job-title box.
        ``"where"``: text typed into the location box.
        ``"date_posted"`` (optional): visible text of the "Date posted"
        option to click, e.g. "Last 24 hours", "Last 3 days", "Last 7 days",
        "Last 14 days". Defaults to "Last 24 hours".
    browser
        Browser object already showing a page that contains the search boxes.

    Returns
    -------
    list
        The same ``data`` object that was passed in.
    """
    # Find the what box & search the job title
    what = browser.find_by_id("text-input-what").first
    what.fill(search_params["what"])

    # Find the where box & search for the location.
    # Select-all + backspace clears whatever is already in the box first.
    where = browser.find_by_id("text-input-where").first
    where.type(Keys.CONTROL + "a")
    where.type(Keys.BACKSPACE)
    where.fill(search_params["where"])

    where.type(Keys.RETURN)

    # Short fixed pauses give the page time to react between UI steps
    time.sleep(1)

    date_posted = browser.find_by_text("Date posted").first
    date_posted.click()

    time.sleep(1)

    # Pick the requested "Date posted" option (previously hard-coded)
    date_option = search_params.get("date_posted", "Last 24 hours")
    browser.find_by_text(date_option).first.click()

    page_count = 0

    while True:
        page_count += 1

        # Decide up front whether this is the last page. Looking up a missing
        # button yields an empty result rather than an error, so emptiness has
        # to be checked explicitly. Relying on the later click() to fail made
        # the last page get scraped (and counted) twice.
        try:
            next_page_button = browser.find_by_css("[aria-label='Next Page']")
            has_next_page = not next_page_button.is_empty()
        except Exception:
            has_next_page = False

        if not has_next_page:
            print(f"Scraping final page of {page_count} total pages...")
            scrape_one_page(data, browser)
            print("Final page has been scraped.")
            break

        print(f"Scraping page {page_count}...")
        scrape_one_page(data, browser)

        print("Clicking to next page.")
        try:
            next_page_button.click()
        except Exception as exc:
            # Best effort: keep what has been collected so far, but say why
            # pagination stopped instead of silently re-scraping this page.
            print(f"Could not advance past page {page_count} ({type(exc).__name__}: {exc}); stopping.")
            break

    return data

In [12]:
# Set "what" for the words to search, and "where" for the location search
search_params = {"what": "Data Scientist", "where": "United States"}

# Run the webscraper to collect all listings from all pages for the last 24 hours

t = time.localtime()
current_time = time.strftime("%Y-%m-%d %H:%M:%S", t)

print(f"Beginning scraping process...{current_time}")
try:
    indeed_scraper(data, search_params, browser)
    t = time.localtime()
    current_time = time.strftime("%Y-%m-%d %H:%M:%S", t)
    print(f"Scraping completed successfully! {current_time}")

except Exception as exc:
    # Rows appended to `data` before the failure are kept (the list is filled in place).
    t = time.localtime()
    current_time = time.strftime("%Y-%m-%d %H:%M:%S", t)
    print(f"Something went wrong with the scraper.. :(\n{current_time}")
    # Show what actually failed instead of hiding it
    print(f"{type(exc).__name__}: {exc}")

finally:
    # Always release the browser, whether the scrape succeeded, failed,
    # or was interrupted from the keyboard.
    browser.quit()
    print("Browser has been closed.")

Beginning scraping process...2023-07-10 15:09:48
Scraping page 1...
Clicking to next page.
Scraping page 2...
Clicking to next page.
Scraping page 3...
Clicking to next page.
Scraping page 4...
Clicking to next page.
Scraping page 5...
Clicking to next page.
Scraping page 6...
Clicking to next page.
Scraping page 7...
Clicking to next page.
Scraping final page of 8 total pages...
Final page has been scraped.
Scraping completed successfully! 2023-07-10 15:14:47
Browser has been closed.


In [13]:
# Column labels, listed in the same order scrape_one_page appends each field
columns = [
    "id",
    "title",
    "company",
    "location",
    "job_type",
    "salary",
    "time",
    "url",
]

# Build the DataFrame from the scraped rows and display it
listings_new_df = pd.DataFrame(data=data, columns=columns)
listings_new_df

Unnamed: 0,id,title,company,location,job_type,salary,time,url
0,8fa76a75a2adabe3,Lead Data Scientist,Target,"Remote in Minneapolis, MN 55403",,"$124,100 - $223,400 a year",2023-07-10 15:09:55,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...
1,fc81cdfe57348569,Data Scientist,Argo Data,"Richardson, TX 75081",Full-time,"$80,696 - $150,547 a year",2023-07-10 15:09:55,https://www.indeed.com/company/Argo-Data/jobs/...
2,4ce85b280bb2c908,Senior Data Scientist,Artera,"Remote in Los Angeles, CA",,Estimated $128K - $162K a year,2023-07-10 15:09:59,https://www.indeed.com/rc/clk?jk=4ce85b280bb2c...
3,969e54404d4ecc45,Data Scientist,Technocore360,"Orlando, FL 32801 \n(Central Business District...",Contract,$80 an hour,2023-07-10 15:09:59,https://www.indeed.com/company/Technocore360/j...
4,3510d68ef1c6f67a,Data Scientist,Intellipro Group Inc,"Clark, NJ 07066",Full-time\n+1,$24 an hour,2023-07-10 15:09:59,https://www.indeed.com/company/Intellipro-Tech...
...,...,...,...,...,...,...,...,...
109,f20d735962ef3766,Principal Software Engineer - Java / MapReduce...,Captivation Software,"Annapolis Junction, MD 20701",Full-time,"$125,000 - $250,000 a year",2023-07-10 15:14:31,https://www.indeed.com/rc/clk?jk=f20d735962ef3...
110,81caef9e5196a940,Director - Data Science Outsourcing,Novo Nordisk,"Lexington, MA",,Estimated $185K - $234K a year,2023-07-10 15:14:35,https://www.indeed.com/rc/clk?jk=81caef9e5196a...
111,16e44f5c11c2c659,A/AI Machine Learning Engineering Manager,Lockheed Martin,"Moorestown, NJ 08057",Full-time,,2023-07-10 15:14:39,https://www.indeed.com/rc/clk?jk=16e44f5c11c2c...
112,c748868cba2ec18a,Computer Vision AI/ML Engineer,Medtronic,"Brooklyn Center, MN",,,2023-07-10 15:14:45,https://www.indeed.com/rc/clk?jk=c748868cba2ec...


In [None]:
# Save the updated DataFrame as a .csv file

# The timestamp is taken before the write, so it marks when the save started
t = time.localtime()
current_time = time.strftime("%Y-%m-%d %H:%M:%S", t)

# Relative path: resolved against the notebook's current working directory
file_path = '../data/listings_new.csv'

# NOTE(review): to_csv is called with default options, so presumably the index is written
# as an unnamed first column and an existing file at this path is replaced rather than
# appended to — confirm this matches the "added" wording in the message below.
listings_new_df.to_csv(file_path)
print(f"New results added and saved to file {file_path}\n{current_time}")