In [25]:
import uuid
import pandas as pd
import requests
import re
import csv
import datetime

In [26]:
# Input and output locations.
COMPANIES_FILENAME = 'top_ten.csv'
LOG_FILENAME = 'data_acquisition_log'
PAGESPEED_RESULTS_DIR = 'pagespeed_results'

# PageSpeed endpoint and the API key, which is loaded from a local file
# named PAGESPEED_API_KEY rather than written into the notebook.
PAGESPEED_BASE_URL = 'https://www.googleapis.com/pagespeedonline/v5/runPagespeed'
with open('PAGESPEED_API_KEY') as key_file:
    # Assigned inside a `with`, but treated as a constant for the whole run.
    PAGESPEED_API_KEY = key_file.read().strip()

# Audit entries looked up in each response, in output-column order.
AUDIT_KEYS = [
    'first-contentful-paint',
    'speed-index',
    'interactive',
    'first-meaningful-paint',
    'first-cpu-idle',
    'estimated-input-latency',
]
METADATA_KEYS = ['url']
# Column order of the results CSV: metadata first, then the audits.
RESULT_KEYS = [*METADATA_KEYS, *AUDIT_KEYS]
# Optional cap on how many companies are selected (None = no cap).
TRUNCATE_COMPANIES = None

In [27]:
# Load the company list and discard the ticker column; only `web_url` is
# consumed by the cells visible in this notebook.
companies = (
    pd.read_csv(COMPANIES_FILENAME)
    .drop(columns='ticker_symbol')
)
# Two-row preview, shown as the cell output.
sample_companies = companies.iloc[:2]
sample_companies

Unnamed: 0,name,web_url
0,Microsoft Corp,https://www.microsoft.com/en-us/
1,Apple Inc.,https://www.apple.com/


In [28]:
# Random identifier for this run; used as the results file name and written
# to the acquisition log.
fetch_id = f'{uuid.uuid4()}'

In [29]:
# NOTE(review): `pagespeed_query_url` is not defined in any cell visible in
# this notebook, so this line raises NameError on a fresh kernel (Restart &
# Run All). It presumably relied on a since-deleted cell -- TODO confirm, then
# either restore the helper or remove this cell.
# The trailing `;` suppresses the cell's output.
[pagespeed_query_url(page) for page in companies[:3]['web_url']];

In [5]:
def make_pagespeed_request(page, timeout=120):
    """Request a PageSpeed report for a single page.

    Args:
        page: URL of the page to analyse; sent as the `url` query parameter.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to `requests.get`; use `None` to wait indefinitely
            (the behaviour before this parameter existed).

    Returns:
        The `requests.Response` from the PageSpeed endpoint. The status code
        is not checked here.

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    query = {
        'url': page,
        'key': PAGESPEED_API_KEY,
    }
    # Without a timeout a single stalled connection would block the whole
    # acquisition run forever.
    return requests.get(url=PAGESPEED_BASE_URL, params=query, timeout=timeout)

In [6]:
def unit(display_value):
    """Return the trailing word of a display string, i.e. its unit.

    Args:
        display_value: Text such as ``'1.2 s'`` or ``'30 ms'``.

    Returns:
        The final run of word characters (``'s'``, ``'ms'``, ...).

    Raises:
        TypeError: if the string does not end in a word character, because
            the failed match is ``None`` and cannot be subscripted.
    """
    # Raw string: in a plain literal '\w' is an invalid escape sequence,
    # which newer Python versions warn about.
    return re.search(r'\w+$', display_value)[0]

def verify_units(audits):
    """Check that every tracked audit reports the unit the parser expects.

    'estimated-input-latency' must be reported in ``ms``; every other key in
    AUDIT_KEYS must be reported in ``s``.

    Args:
        audits: Mapping from audit key to an entry that has a
            ``'displayValue'`` string.

    Raises:
        ValueError: if any tracked audit uses a different unit. (Previously
            a bare BaseException, which slipped past ``except Exception``.)
        KeyError: if a tracked audit or its ``'displayValue'`` is missing.
    """
    for key in AUDIT_KEYS:
        if key == 'estimated-input-latency':
            expected_unit, unit_name = 'ms', 'milliseconds'
        else:
            expected_unit, unit_name = 's', 'seconds'
        if unit(audits[key]['displayValue']) != expected_unit:
            raise ValueError(f'expected to always receive {unit_name}')
            

In [7]:
def displayToFloat(displayValue):
    """Parse the leading number of a display string into a float.

    Thousands separators are accepted, so ``'1,230 ms'`` yields ``1230.0``
    rather than being cut off at the comma and read as ``1.0``.

    Args:
        displayValue: Text that starts with a number, e.g. ``'1.2 s'``.

    Returns:
        The leading number as a float; the unit suffix is ignored.

    Raises:
        TypeError: if the string does not start with a digit, because the
            failed match is ``None`` and cannot be subscripted.
    """
    # Raw string avoids the invalid '\d' / '\.' escapes of a plain literal.
    leading_number = re.search(r'^\d[\d,]*\.?\d*', displayValue)[0]
    return float(leading_number.replace(',', ''))

In [8]:
def audits(response):
    """Pull the tracked audit values out of a PageSpeed response.

    Args:
        response: Object whose ``.json()`` returns a mapping containing
            ``['lighthouseResult']['audits']``.

    Returns:
        Dict mapping each key in AUDIT_KEYS to its display value as a float.

    Raises:
        Whatever `verify_units` raises when a unit is not the expected one.
    """
    lighthouse_audits = response.json()['lighthouseResult']['audits']
    # Fail before parsing if any unit differs from what the parser assumes.
    verify_units(lighthouse_audits)
    metrics = {}
    for audit_key in AUDIT_KEYS:
        display_value = lighthouse_audits[audit_key]['displayValue']
        metrics[audit_key] = displayToFloat(display_value)
    return metrics

In [9]:
def site_row(web_url):
    """Fetch one site's audit metrics and tag them with the site's URL.

    Args:
        web_url: Address of the page to audit.

    Returns:
        Dict of the audit metrics plus a ``'url'`` entry set to `web_url`.
    """
    response = make_pagespeed_request(web_url)
    return {**audits(response), 'url': web_url}

In [10]:
# Optionally restrict the run to the first TRUNCATE_COMPANIES rows; a falsy
# value (None) keeps every company.
target_companies = companies
if TRUNCATE_COMPANIES:
    target_companies = target_companies[:TRUNCATE_COMPANIES]

# One PageSpeed request per selected company. This iterates over
# `target_companies` (not `companies`) so the truncation above takes effect.
results = [
    site_row(web_url)
    for web_url in target_companies['web_url']
]

In [11]:
# Write this run's rows to a file named after the fetch id: a header row of
# RESULT_KEYS followed by one row per site in the same column order.
# newline='' is what the csv module requires for files handed to csv.writer;
# without it, platforms that translate '\n' produce blank lines between rows.
with open(f'{PAGESPEED_RESULTS_DIR}/{fetch_id}', 'w', newline='') as results_file:
    csv_writer = csv.writer(results_file)
    csv_writer.writerow(RESULT_KEYS)
    csv_writer.writerows(
        [result[key] for key in RESULT_KEYS]
        for result in results
    )

In [12]:
# Record this fetch (id and local timestamp) in the acquisition log.
# Opened in append mode so earlier runs stay in the log; mode 'w' would
# leave only the most recent fetch on disk.
# newline='' is what the csv module requires for files handed to csv.writer.
with open(LOG_FILENAME, 'a', newline='') as log:
    csv_writer = csv.writer(log)
    csv_writer.writerow([fetch_id, datetime.datetime.now()])