In [234]:
from selectorlib import Extractor
import requests
import json
import csv
from dateutil import parser as dateparser
import pandas as pd
import datetime as dt

# 1. Gather Review Data

## Gather product page ASIN numbers from TXT file

In [248]:
# Read the ASIN list; the context manager closes the file even if the read fails
with open("./data/asins.txt", "r") as asins_file:
    content = asins_file.read()

# One ASIN per line. Surrounding whitespace is stripped and blank lines are
# dropped (a trailing newline would otherwise add an empty entry that gets
# scraped as a product id).
asins = [line.strip() for line in content.splitlines() if line.strip()]
print(f"Imported {len(asins)} ASINS")

Imported 106 ASINS


## Gather reviews from product page URL

In [322]:
# Build the selectorlib Extractor from the selector definitions stored in YAML
selectors_file = './src/selectors.yml'
extractor = Extractor.from_yaml_file(selectors_file)

In [235]:
def scrape_page(url, timeout=30):
    """Download a page and run the module-level ``extractor`` over its HTML.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``; how long to wait for the server before
        giving up. Defaults to 30 seconds.

    Returns
    -------
    The value returned by ``extractor.extract`` for the response body.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (connection failure, timeout, ...).
    """
    headers = {
        'authority': 'www.amazon.com',
        'pragma': 'no-cache',
        'cache-control': 'no-cache',
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'none',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-dest': 'document',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }

    # Download the page using requests. Without an explicit timeout, requests
    # waits indefinitely on a stalled connection and the whole scrape hangs.
    r = requests.get(url, headers=headers, timeout=timeout)

    # Pass the HTML of the page to the extractor and return the extracted data
    return extractor.extract(r.text)

In [236]:
def get_page_url(asin, page_no, limit):
    """Build the review-listing URL for one page of a product's reviews.

    Page 0 maps to the bare product-reviews URL, pages 1..limit get a
    ``pageNumber`` query parameter, and any other page number yields the
    sentinel string "FAILS CONDITION CHECK" instead of a URL.
    """
    base_url = f"https://www.amazon.com/product-reviews/{asin}"

    # Page 0 is the un-paginated landing page
    if page_no == 0:
        return base_url

    # Numbered pages are only valid up to and including the limit
    if 0 < page_no <= limit:
        return base_url + f"/?pageNumber={page_no}"

    # Negative or past-the-limit page numbers
    return "FAILS CONDITION CHECK"

In [237]:
# Start from an empty frame holding one column per scraped review field
review_columns = ["product_title", "asin", "author", "review", "rating", "date", "url"]
reviews = pd.DataFrame(columns=review_columns)

# Upper bound on the review pages requested per product (assume 10 reviews per page)
limit = 20

In [238]:
# Start timer
tick = dt.datetime.now()

print("Scraping...")

# Errors a malformed review entry can raise while its fields are parsed
# (missing key, None where text is expected, unparseable rating or date)
parse_errors = (KeyError, TypeError, AttributeError, ValueError, OverflowError)

# Rows are buffered in a plain list and turned into a frame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
records = []

# Scrape urls for reviews and collect them as records
for asin in asins:
    # Skip blank entries (e.g. produced by a trailing newline in the ASIN file)
    asin = asin.strip()
    if not asin:
        continue

    # NOTE(review): page 0 (no pageNumber) and page 1 may serve the same
    # reviews, which would duplicate rows — confirm against the site.
    page_no = 0

    while True:
        curr_url = get_page_url(asin, page_no, limit)

        # get_page_url answers with this sentinel once page_no is outside 0..limit
        if curr_url == "FAILS CONDITION CHECK":
            break

        # A network failure ends this product only, not the whole run
        try:
            data = scrape_page(curr_url)
        except requests.RequestException as err:
            print(f"Request failed for {curr_url}: {err!r}")
            break

        if not data:
            print(f"Page flag thrown at page {page_no}")
            break

        # No reviews extracted: stop paging this product
        # (presumably past the last page or a blocked response — TODO confirm)
        page_reviews = data.get('reviews')
        if not page_reviews:
            break

        # Parse every review on the page; a malformed entry stops paging for
        # this product, but the reason is reported instead of being swallowed.
        try:
            product_title = data['product_title']
            for r in page_reviews:
                records.append({
                    "product_title": product_title,
                    "asin": asin,
                    "author": r['author'],
                    "review": r['content'],
                    "rating": float(r['rating'].split(' out of')[0]),
                    "date": dateparser.parse(r['date'].split('on ')[-1]).strftime('%B %d, %Y'),
                    # The page the review came from (the original referenced an undefined `url`)
                    "url": curr_url,
                })
        except parse_errors as err:
            print(f"Could not parse a review on {curr_url}: {err!r}")
            break

        # Increment page number
        page_no = page_no + 1

# Add the gathered rows to the reviews frame in one step
if records:
    scraped = pd.DataFrame(records, columns=reviews.columns)
    reviews = scraped if reviews.empty else pd.concat([reviews, scraped], ignore_index=True)

# Stop timer
tock = dt.datetime.now() - tick

# Print success message (total_seconds also counts whole days, unlike .seconds)
print(f"Done gathering {reviews.shape[0]} reviews in {int(tock.total_seconds())} seconds.")

Gathered B000WUFVR0
Gathered B001D6HB0M
Gathered B000W3V8S8
Gathered B007FSDIJA
Gathered B01MTS599T
Gathered B000X457HO
Gathered B000ORX6WI
Gathered B000JDGC78
Gathered B0032JHSP6
Gathered B078VJPW4Z
Gathered B07WDYMGZN
Gathered B004USM1A0
Gathered B000C2TB6U
Gathered B004G9DV66
Gathered B0057I5TIS
Gathered B001PAPPKY
Gathered B004G9C0SQ
Gathered B002UE6YQ8
Gathered B001U40C6W
Gathered B000P6THK8
Gathered B07PD1GJDN
Gathered B00J6CH1VO
Gathered B08TZT1N2C
Gathered B08TZ7VTNX
Gathered B07D37WXHH
Gathered B00069ZDJS
Gathered B07F82KQPZ
Gathered B07T94X7GB
Gathered B08TZRRBMZ
Gathered B07D39RV2K
Gathered B08TZS82PY
Gathered B0044R5L6S
Gathered B07D396V8Y
Gathered B07D39HSRQ
Gathered B00O60M2D8
Gathered B01NAAQLZ1
Gathered B06X6H95GW
Gathered B000WQZ5PC
Gathered B005OMNPII
Gathered B07P9S8VZR
Gathered B001U3WTP0
Gathered B0099X1DFU
Gathered B07D3B3Q84
Gathered B00O60L28E
Gathered B000TVJ6XW
Gathered B07D37ZFBH
Gathered B0044R1W98
Gathered B000CB4UQM
Gathered B07ZQQVC9S
Gathered B019PN2DX4


## Output Data

In [319]:
# Persist the scraped reviews as a CSV inside the "raw" data folder
raw_reviews_path = "./data/raw/reviews.csv"
reviews.to_csv(raw_reviews_path)

In [320]:
# Count how many reviews were posted on each date.
# The two output columns are named explicitly ("date", "freq") rather than by
# renaming the default labels: those defaults changed in pandas 2.0
# (value_counts now labels its result "count" and keeps the source column name
# on the index), which made the old rename mislabel the columns.
reviews_by_day = (
    reviews['date']
    .value_counts()
    .rename_axis("date")
    .reset_index(name="freq")
)
reviews_by_day.to_csv("./data/raw/reviews_by_day.csv")

# 2. Import and set up COVID-19 Data

## Load in COVID-19 Data from Our World in Data (OWID)

In [297]:
# Fetch the all-time COVID-19 dataset from the OWID GitHub repository
owid_csv_url = 'https://github.com/owid/covid-19-data/blob/master/public/data/owid-covid-data.csv?raw=true'
covid_data = pd.read_csv(owid_csv_url)

In [318]:
# Keep a copy of the downloaded data as a CSV in the "raw" folder
owid_raw_path = "./data/raw/owid_covid_19_data.csv"
covid_data.to_csv(owid_raw_path)

In [298]:
# Show the first row as a quick look at the data; dates are formatted as YYYY-MM-DD
covid_data.head(n=1)

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-02-24,5.0,5.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,


## Filter COVID-19 Data

In [302]:
# Keep only the columns we need
needed_columns = ['date', 'iso_code', 'location', 'total_cases', 'new_cases']
covid_data = covid_data[needed_columns]

In [308]:
# Keep the rows reported for the USA
usa_rows = covid_data.loc[covid_data['iso_code'] == "USA"]

# Keep the rows dated 2020-03-01 through 2022-01-20 (both ends inclusive);
# dates are compared as strings
date_filter = usa_rows['date'].between("2020-03-01", "2022-01-20")
covid_data = usa_rows.loc[date_filter]

In [313]:
# Show the first row of the filtered data
covid_data.head(n=1)

Unnamed: 0,date,iso_code,location,total_cases,new_cases
148027,2020-03-01,USA,United States,32.0,7.0


## Output COVID-19 data to data folder

In [321]:
# Write the filtered COVID-19 data as a CSV into the "interim" folder
covid_interim_path = "./data/interim/covid_data.csv"
covid_data.to_csv(covid_interim_path)