# Import libraries

In [330]:
from selectorlib import Extractor
import requests
import json
import csv
from dateutil import parser as dateparser
import pandas as pd
import datetime as dt
import os

# 1. Gather Review Data

## Gather product page ASIN numbers from TXT file

In [331]:
asins_file = open("./data/asins.txt", "r")
content = asins_file.read()
asins = content.split("\n")
asins_file.close()
print(f"Imported {len(asins)} ASINS")

Imported 106 ASINS


## Gather reviews from product page URL

In [332]:
# Create an Extractor by reading from the YAML file
extractor = Extractor.from_yaml_file('./src/selectors.yml')

In [334]:
def scrape_page(url):    
    headers = {
        'authority': 'www.amazon.com',
        'pragma': 'no-cache',
        'cache-control': 'no-cache',
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'none',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-dest': 'document',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    
    # Download the page
    r = requests.get(url, headers=headers)
    
    # Return the HTML as text 
    return extractor.extract(r.text)

In [354]:
# Create a review URL from the ASIN number
def get_page_url(asin, page_no, limit):
    # Define the template URL
    t_url = f"https://www.amazon.com/product-reviews/{asin}"
    
    # Check if the target page is after page 1
    if page_no > 0:
        # Check that the target page is within the limit
        if page_no <= limit:
            t_url = t_url + f"/?pageNumber={page_no}"
            return t_url
    # If these conditions are not met, return the template URL as-is
    elif page_no == 0:
        return t_url
    
    # Else return the flag string
    return "FAILS CONDITION CHECK"

In [368]:
# Create an empty data frame with column names
reviews = pd.DataFrame(columns = ["product_title", "asin", "author", "review", "rating", "date", "url"])

# Max. number of pages to scrape (assume 10 reviews per page)
limit = 30

In [369]:
# Start timer
tick = dt.datetime.now()

# Print start message
print("Scraping...")

# Scrape urls for reviews and append to data
for asin in asins:
    # Reset loop variables
    page_flag = True
    page_no = 0
    
    # Loop through a product's reviews until the loop variables are flagged
    while page_flag:
        curr_url = get_page_url(asin, page_no, limit)
        
        # Check for a valid URL
        if "FAILS CONDITION CHECK" in curr_url:
            page_flag = False
            break
        
        # Gather data from scraper
        data = scrape_page(curr_url)
        
        # Try to parse data, set flag to false otherwise
        try:
            # Proceed if data was recieved
            if data:
                for r in data['reviews']:
                    # Create a temporary df to store gathered data
                    t_df = {"product_title":data['product_title'],
                            "asin":asin,
                            "author":r['author'], 
                            "review":r['content'], 
                            "rating":float(r['rating'].split(' out of')[0]), 
                            "date":dateparser.parse(r['date'].split('on ')[-1]).strftime('%Y-%m-%d'), 
                            "url":url}

                    # Append temporary df to reviews df
                    reviews = reviews.append(t_df, ignore_index = True)
            # If no data was recieved, flag output
            else:
                print(f"Page flag thrown at page {page_no}")
                page_flag = False

            # Increment page number
            page_no = page_no + 1
        # Catch any exceptions and continue to next ASIN no.
        except:
            page_flag = False

# Stop timer            
tock = dt.datetime.now() - tick

# Print success message with information
print(f"Done gathering {reviews.shape[0]} reviews in {tock.seconds} seconds.")

Scraping...
Done gathering 10396 reviews in 550 seconds.


In [395]:
# Preview data
reviews.head(1)

Unnamed: 0,product_title,asin,author,review,rating,date,url
0,Yankee Candle Large Jar Candle Home Sweet Home,B000WUFVR0,David C.,I usually have good experience with Yankee Can...,2.0,2019-01-18,https://www.amazon.com/s?k=yankee+candles&crid...


In [371]:
# Output reviews data to the "raw" folder
reviews.to_csv("./data/raw/reviews.csv")

## Filter review data

In [396]:
# Filter to only include data within the given data range
date_filter = (reviews['date'] >= "2020-03-01") & (reviews['date'] <= "2022-01-16")
reviews = reviews.loc[date_filter]

## Output Data

In [400]:
# Output filtered reviews data to the "interim" folder
reviews.to_csv("./data/interim/reviews-cleaned.csv")

In [401]:
# Gather the number of reviews on each day and output to CSV
reviews_by_day = reviews['date'].value_counts()
reviews_by_day = reviews_by_day.to_frame().reset_index()
reviews_by_day = reviews_by_day.rename(columns={"index":"date", "date":"freq"})
reviews_by_day.to_csv("./data/interim/reviews_by_day.csv")

# 2. Import and set up COVID-19 Data

## Load in COVID-19 Data from JHU

In [383]:
# Download all-time COVID-19 data from OWID GitHub
covid_data = pd.read_csv('https://github.com/owid/covid-19-data/blob/master/public/data/owid-covid-data.csv?raw=true')

In [384]:
# Output raw data to csv in the "raw" folder
covid_data.to_csv("./data/raw/owid_covid_19_data.csv")

In [385]:
# Preview data
covid_data.head(1)
# Date in format: YYYYMMDD

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-02-24,5.0,5.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,


## Filter COVID-19 Data

In [386]:
# Reduce columns to only those we need
covid_data = covid_data[['date', 'iso_code', 'location', 'total_cases', 'new_cases']]

In [387]:
# Filter to only include data from the USA
covid_data = covid_data[covid_data['iso_code'] == "USA"]

# Filter to only include data within the given data range
date_filter = (covid_data['date'] >= "2020-03-01") & (covid_data['date'] <= "2022-01-16")
covid_data = covid_data.loc[date_filter]

In [388]:
# Preview Data
covid_data.head(1)

Unnamed: 0,date,iso_code,location,total_cases,new_cases
148027,2020-03-01,USA,United States,32.0,7.0


## Output COVID-19 data to data folder

In [389]:
# Output reviews df as a CSV
covid_data.to_csv("./data/interim/covid_data.csv")