In [49]:
import csv
import datetime
import os
import uuid

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [51]:
# CSV file listing the companies to look up; its 'name' column is read below.
COMPANIES_FILENAME = 'top_ten.csv'
# Endpoint queried once per company, with the company name as the 'q' parameter.
INDEED_BASE_URL = 'https://www.indeed.com/cmp'
# Directory that receives one results file per scrape run.
INDEED_RATINGS_DIR = 'indeed_ratings'
# File in which the scrape id and timestamp of a run are recorded.
LOG_FILENAME = 'data_acquisition_log'

In [35]:
# Random identifier for this run; used as the results filename and written
# to the acquisition log.
scrape_id = str(uuid.uuid4())

In [36]:
# The 'name' column of the companies file, as a pandas Series; every later
# step iterates over it in this order.
company_names = pd.read_csv(COMPANIES_FILENAME)['name']

In [37]:
def query_indeed(company_name, timeout=30):
    """Fetch the Indeed company-search page for a single company.

    Args:
        company_name: Name sent as the ``q`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole notebook.

    Returns:
        The ``requests.Response`` for the GET request. The HTTP status is
        not checked here.

    Raises:
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout`` seconds.
    """
    return requests.get(
        url=INDEED_BASE_URL,
        params={'q': company_name},
        timeout=timeout,
    )

In [38]:
# One HTTP response per company, in the same order as company_names.
# Despite the name, these are response objects; the page text is read from
# their .text attribute when they are parsed.
result_pagetexts = list(map(query_indeed, company_names))

In [39]:
# Parse every fetched page body with the lxml parser, keeping the order of
# result_pagetexts.
result_soups = [
    BeautifulSoup(response.text, 'lxml') for response in result_pagetexts
]

In [40]:
def extract_rating(soup):
    """Return the text of the rating element in a parsed Indeed page.

    Looks up the first ``<div>`` whose class is ``icl-Ratings-rating``.

    Args:
        soup: Parsed page to search (anything exposing BeautifulSoup's
            ``find`` method).

    Returns:
        The element's text, or ``None`` when the page contains no such
        element.
    """
    rating_div = soup.find(
        'div',
        {'class': 'icl-Ratings-rating'}
    )
    # find() returns None on a miss; dereferencing .text unguarded raised
    # AttributeError and aborted the run for every company.
    if rating_div is None:
        return None
    return rating_div.text

In [41]:
# Rating for each parsed page, aligned position-by-position with
# result_soups.
ratings = list(map(extract_rating, result_soups))

In [43]:
# Pair each company name with its rating. Materialised as a list because a
# bare zip object is a one-shot iterator: once consumed, re-running any cell
# that reads it would silently see no rows.
names_to_ratings = list(zip(company_names, ratings))

In [47]:
# Write this run's (name, rating) pairs to a CSV file named after the
# scrape id inside INDEED_RATINGS_DIR.
# Create the output directory first so the write cannot fail on a fresh
# checkout after all the scraping work has already been done.
os.makedirs(INDEED_RATINGS_DIR, exist_ok=True)
# newline='' is required by the csv module; without it, platforms that
# translate line endings emit a blank line after every row.
with open(f'{INDEED_RATINGS_DIR}/{scrape_id}', 'w', newline='') as results_file:
    csv_writer = csv.writer(results_file)
    csv_writer.writerow(['name', 'rating'])
    # Each (name, rating) pair is already a valid row.
    csv_writer.writerows(names_to_ratings)

In [52]:
# Record this run (scrape id, local timestamp) in the acquisition log.
# Append rather than truncate: opening with 'w' erased the entries of every
# earlier run, leaving the log with only the most recent scrape.
# newline='' is required by the csv module to avoid spurious blank lines.
with open(LOG_FILENAME, 'a', newline='') as log:
    csv_writer = csv.writer(log)
    csv_writer.writerow([scrape_id, datetime.datetime.now()])