In [1]:
import csv
import datetime
import os
import uuid
from urllib.parse import quote

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

In [2]:
# CSV file the company list is read from (its 'Security' column is used).
COMPANIES_FILENAME = 'sp_500.csv'
# Base of the Indeed company-search URL; the query string is appended to it.
INDEED_BASE_URL = 'https://www.indeed.com/cmp'
# Directory that receives one results file per scrape, named by the scrape id.
INDEED_RATINGS_DIR = 'indeed_ratings'
# Append-only log file recording each scrape id together with a timestamp.
LOG_FILENAME = 'data_acquisition_log'
# Keys of each scraped record; also the header and column order of the results file.
RESULT_COLUMNS = ('company_name', 'url', 'indeed_rating')

In [3]:
# Random UUID (as a string) identifying this run; used as the results file name
# and recorded in the acquisition log.
scrape_id = str(uuid.uuid4())

In [4]:
# Load the companies CSV and keep only its 'Security' column; each value is
# later passed to scrape_indeed as the company name to search for.
company_names = pd.read_csv(COMPANIES_FILENAME)['Security']

In [5]:
def indeed_query_url(company_name):
    """Build the Indeed company-search URL for ``company_name``.

    The name is percent-encoded with ``urllib.parse.quote`` and appended to
    ``INDEED_BASE_URL`` as the value of the ``q`` query parameter.
    """
    encoded_name = quote(company_name)
    return '{}?q={}'.format(INDEED_BASE_URL, encoded_name)

def extract_rating(soup):
    """Return the text of the page's average-rating element.

    Looks up the first ``<span>`` whose class is
    ``cmp-header-rating-average`` in ``soup``. When no such element is found,
    the (falsy) lookup result itself is returned instead of the text.
    """
    rating_el = soup.find('span', {'class': 'cmp-header-rating-average'})
    if not rating_el:
        return rating_el
    return rating_el.text

def extract_url(soup):
    """Return the ``href`` of the page's company link.

    Looks up the first ``<a>`` whose ``data-tn-element`` attribute is
    ``companyLink[]`` in ``soup``. When no such element is found, the (falsy)
    lookup result itself is returned instead of the link target.
    """
    link_el = soup.find('a', {'data-tn-element': 'companyLink[]'})
    if not link_el:
        return link_el
    return link_el['href']

def scrape_indeed(company_name):
    """Scrape the Indeed company page for ``company_name``.

    Drives the module-level Selenium ``driver``: loads the company-search page,
    clicks the first element with class ``cmp-CompanyWidget-name``, then clicks
    the element marked ``data-tn-element='about-tab'`` and parses the resulting
    page source.

    Args:
        company_name: Name to search for; it is also printed as progress output.

    Returns:
        A dict with the keys ``company_name``, ``url`` and ``indeed_rating``
        (the latter two may be ``None`` if the page lacks those elements), or
        ``None`` when either element to click cannot be found.
    """
    print(company_name)
    driver.get(indeed_query_url(company_name))
    try:
        # click() returns None, so there is nothing worth binding to a name.
        # find_element(By, value) is used because the find_element_by_*
        # helpers were removed in Selenium 4; this form works on 3 and 4.
        driver.find_element(By.CLASS_NAME, 'cmp-CompanyWidget-name').click()
        driver.find_element(By.XPATH, "//*[@data-tn-element='about-tab']").click()
    except NoSuchElementException:
        # No search hit or no about tab: report it and let the caller skip it.
        print(f'{company_name} entry is fucked')
        return None
    soup = BeautifulSoup(driver.page_source, 'lxml')
    return {
        'company_name': company_name,
        'url': extract_url(soup),
        'indeed_rating': extract_rating(soup)
    }

In [6]:
# Location of the local chromedriver binary used to start Chrome.
driver_path = '/Applications/chromedriver'
os.environ['webdriver.chrome.driver'] = driver_path
driver = webdriver.Chrome(driver_path)

# Collect results incrementally so that whatever was scraped before an
# unexpected error is still available in `scrape_results` for inspection.
scrape_results = []
try:
    for company_name in company_names:
        scrape_results.append(scrape_indeed(company_name))
finally:
    # quit() ends the WebDriver session and stops the chromedriver process;
    # close() would only close the current browser window.
    driver.quit()

# scrape_indeed returns None for companies it could not resolve; drop those.
urls_and_ratings = [el for el in scrape_results if el is not None]

3M Company
Abbott Laboratories
AbbVie Inc.
ABIOMED Inc
Accenture plc
Activision Blizzard
Adobe Systems Inc
Advanced Micro Devices Inc
Advance Auto Parts
AES Corp
Affiliated Managers Group Inc
AFLAC Inc
Agilent Technologies Inc
Air Products & Chemicals Inc
Akamai Technologies Inc
Alaska Air Group Inc
Albemarle Corp
Alexandria Real Estate Equities Inc
Alexion Pharmaceuticals
Align Technology
Allegion
Allergan, Plc
Alliance Data Systems
Alliant Energy Corp
Allstate Corp
Alphabet Inc Class A
Alphabet Inc Class A entry is fucked
Alphabet Inc Class C
Alphabet Inc Class C entry is fucked
Altria Group Inc
Amazon.com Inc.
Ameren Corp
American Airlines Group
American Electric Power
American Express Co
American International Group, Inc.
American Tower Corp.
American Water Works Company Inc
Ameriprise Financial
AmerisourceBergen Corp
AMETEK Inc.
Amgen Inc.
Amphenol Corp
Anadarko Petroleum Corp
Analog Devices, Inc.
ANSYS
Anthem Inc.
Aon plc
A.O. Smith Corp
Apache Corporation
Apartment Investment & 

T. Rowe Price Group
Take-Two Interactive
Tapestry, Inc.
Target Corp.
TE Connectivity Ltd.
TechnipFMC
Teleflex Inc
Texas Instruments
Textron Inc.
Thermo Fisher Scientific
Tiffany & Co.
Twitter, Inc.
TJX Companies Inc.
Torchmark Corp.
Total System Services
Tractor Supply Company
TransDigm Group
The Travelers Companies Inc.
TripAdvisor
Twenty-First Century Fox Class A
Twenty-First Century Fox Class A entry is fucked
Twenty-First Century Fox Class B
Twenty-First Century Fox Class B entry is fucked
Tyson Foods
UDR Inc
Ulta Beauty
U.S. Bancorp
Under Armour Class A
Under Armour Class A entry is fucked
Under Armour Class C
Under Armour Class C entry is fucked
Union Pacific
United Continental Holdings
United Health Group Inc.
United Parcel Service
United Rentals, Inc.
United Technologies
Universal Health Services, Inc.
Unum Group
V.F. Corp.
Valero Energy
Varian Medical Systems
Ventas Inc
Verisign Inc.
Verisk Analytics
Verizon Communications
Vertex Pharmaceuticals Inc
Viacom Inc.
Visa Inc.
Vorna

In [7]:
# Write one CSV per scrape, named by the scrape id, with RESULT_COLUMNS as header.
# Create the output directory first so a fresh checkout does not fail on open().
os.makedirs(INDEED_RATINGS_DIR, exist_ok=True)
# newline='' is required for files handed to csv.writer; without it, platforms
# that translate line endings emit blank rows between records.
with open(os.path.join(INDEED_RATINGS_DIR, scrape_id), 'w', newline='') as results_file:
    csv_writer = csv.writer(results_file)
    csv_writer.writerow(RESULT_COLUMNS)
    csv_writer.writerows(
        [result[col_name] for col_name in RESULT_COLUMNS]
        for result in urls_and_ratings
    )

In [8]:
# Append this run's id and the current local timestamp to the acquisition log.
# newline='' is required for files handed to csv.writer so that line endings
# are not translated a second time by the text layer.
with open(LOG_FILENAME, 'a', newline='') as log:
    csv_writer = csv.writer(log)
    csv_writer.writerow([scrape_id, datetime.datetime.now()])