# Covid Candles – Gather Data
----

**Author:** Simon Aytes

**[GitHub](https://github.com/SimonAytes)**

**[Website](https://www.saytes.io)**

## Import libraries

In [3]:
from selectorlib import Extractor
import requests
import json
import csv
from dateutil import parser as dateparser
import pandas as pd
import datetime as dt
import os

In [4]:
# Get path to the repo directory: the parent of the current working directory.
# NOTE(review): assumes the notebook is launched from a direct sub-folder of the repo.
# os.path.dirname respects the platform's path separator, whereas splitting the
# working directory on "/" yields an empty string on Windows.
dir_path = os.path.dirname(os.getcwd())

# 1. Configure scraping environment
----

## 1.1 Gather product page ASIN numbers from TXT file

ASINs (Amazon Standard Identification Numbers) are unique 10-character identifiers that Amazon uses to identify products.

In this project, we will be using a pre-gathered set of ASIN numbers that correspond to Yankee Candles listed on Amazon.

In [6]:
# Read the pre-gathered ASIN list; the context manager closes the file even
# if reading raises.
with open(dir_path + "/data/asins.txt", "r") as asins_file:
    content = asins_file.read()

# Strip surrounding whitespace and drop blank lines so that a trailing newline
# in the file does not produce an empty ASIN (and, later, a bogus review URL).
asins = [line.strip() for line in content.splitlines() if line.strip()]
print(f"Imported {len(asins)} ASINS")

Imported 106 ASINS


## 1.2 Import extractor settings

In [7]:
# Build the selectorlib Extractor from the selector definitions kept in YAML
selectors_path = dir_path + '/src/selectors.yml'
extractor = Extractor.from_yaml_file(selectors_path)

## 1.3 Define utility functions for scraping

In [8]:
# Our main function for scraping the contents of a page
def scrape_page(url, timeout=30):
    """Download a page and return the fields extracted from its HTML.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server, passed to ``requests.get``.
            Without a timeout a stalled connection would block the notebook
            indefinitely.

    Returns:
        The result of ``extractor.extract`` applied to the response body, or
        ``None`` when the request itself failed (connection error, timeout, ...).
    """
    # Browser-like headers sent with every request
    headers = {
        'authority': 'www.amazon.com',
        'pragma': 'no-cache',
        'cache-control': 'no-cache',
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'none',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-dest': 'document',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }

    # Download the page
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as err:
        # Report the failure and return None as a "no data" signal instead of raising
        print(f"Request failed for {url}: {err}")
        return None

    # Run the configured selectors over the HTML and return the extracted data
    return extractor.extract(response.text)

In [9]:
# Create a review URL from the ASIN number
def get_page_url(asin, page_no, limit):
    """Build the review-page URL for a product, or a flag string when out of range.

    Args:
        asin: Amazon product identifier inserted into the URL.
        page_no: Zero-based page index; ``page_no`` 0 maps to ``pageNumber=1``.
        limit: Maximum number of pages to request per product.

    Returns:
        The URL of review page ``page_no + 1`` when ``0 <= page_no < limit``,
        otherwise the string ``"FAILS CONDITION CHECK"``.
    """
    # page_no is zero-based, so the valid indices are 0 .. limit - 1. The former
    # `page_no <= limit` test let through limit + 1 pages, accepted negative
    # indices, and made the separate `page_no == 0` branch unreachable.
    if 0 <= page_no < limit:
        return f"https://www.amazon.com/product-reviews/{asin}/?pageNumber={page_no + 1}"

    # Out of range: return the flag string
    return "FAILS CONDITION CHECK"

# 2. Gather reviews
----

## 2.1 Define scraping parameters

In [13]:
# Column layout of the reviews table, in output order
review_columns = ["product_title", "asin", "author", "review", "rating", "date", "url"]

# Start from an empty frame that carries only the column names
reviews = pd.DataFrame(columns=review_columns)

# Max. number of pages to scrape (assume 10 reviews per page)
limit = 30

## 2.2 Scrape product reviews

*Note: This process may take upwards of 10 minutes to complete. When it is finished, it will print a success message.*

In [14]:
# Start timer
tick = dt.datetime.now()

# Print start message
print("Scraping...")

# Rows are collected in a plain list and converted to a DataFrame once at the
# end: DataFrame.append copied the whole frame for every row and was removed
# in pandas 2.0.
records = []

try:
    # enumerate supplies the progress counter directly; asins.index(asin)
    # rescanned the list on every iteration and reported the wrong position
    # for duplicated ASINs.
    for position, asin in enumerate(asins, start=1):
        print(f"Downloading {asin} ({position} / {len(asins)})")
        page_no = 0

        # Walk through the product's review pages until one yields nothing
        while True:
            curr_url = get_page_url(asin, page_no, limit)

            # Stop once get_page_url reports the page is out of range
            if "FAILS CONDITION CHECK" in curr_url:
                break

            try:
                # Gather data from scraper
                data = scrape_page(curr_url)

                # If no data was received, flag output and move to the next ASIN
                if not data:
                    print(f"Page flag thrown at page {page_no}")
                    break

                # No reviews extracted from this page: treat it as the end of
                # this product's reviews
                page_reviews = data['reviews']
                if not page_reviews:
                    break

                for entry in page_reviews:
                    records.append({
                        "product_title": data['product_title'],
                        "asin": asin,
                        "author": entry['author'],
                        "review": entry['content'],
                        # Rating text is split on ' out of'; the leading part is the score
                        "rating": float(entry['rating'].split(' out of')[0]),
                        # Date text is split on 'on '; the trailing part is parsed as the date
                        "date": dateparser.parse(entry['date'].split('on ')[-1]).strftime('%Y-%m-%d'),
                        "url": curr_url,
                    })
            except Exception as e:
                # Best effort: report the problem instead of hiding it, then
                # continue with the next ASIN
                print(f"Skipping rest of {asin} after error on page {page_no}: {e!r}")
                break

            # Increment page number
            page_no = page_no + 1
finally:
    # Build the frame even if the loop was interrupted, so rows gathered so
    # far are not lost
    if records:
        scraped = pd.DataFrame(records, columns=reviews.columns)
        reviews = scraped if reviews.empty else pd.concat([reviews, scraped], ignore_index=True)

# Stop timer
tock = dt.datetime.now() - tick

# Print success message with information (total_seconds also counts whole days,
# which timedelta.seconds drops)
print(f"Done gathering {reviews.shape[0]} reviews in {int(tock.total_seconds())} seconds.")

Scraping...
Downloading B000WUFVR0 (1 / 106)
Downloading B001D6HB0M (2 / 106)
Downloading B000W3V8S8 (3 / 106)
Downloading B007FSDIJA (4 / 106)
Downloading B01MTS599T (5 / 106)
Downloading B000X457HO (6 / 106)
Downloading B000ORX6WI (7 / 106)
Downloading B000JDGC78 (8 / 106)
Downloading B0032JHSP6 (9 / 106)
Downloading B078VJPW4Z (10 / 106)
Downloading B07WDYMGZN (11 / 106)
Downloading B004USM1A0 (12 / 106)
Downloading B000C2TB6U (13 / 106)
Downloading B004G9DV66 (14 / 106)
Downloading B0057I5TIS (15 / 106)
Downloading B001PAPPKY (16 / 106)
Downloading B004G9C0SQ (17 / 106)
Downloading B002UE6YQ8 (18 / 106)
Downloading B001U40C6W (19 / 106)
Downloading B000P6THK8 (20 / 106)
Downloading B07PD1GJDN (21 / 106)
Downloading B00J6CH1VO (22 / 106)
Downloading B08TZT1N2C (23 / 106)
Downloading B08TZ7VTNX (24 / 106)
Downloading B07D37WXHH (25 / 106)
Downloading B00069ZDJS (26 / 106)
Downloading B07F82KQPZ (27 / 106)
Downloading B07T94X7GB (28 / 106)
Downloading B08TZRRBMZ (29 / 106)
Downloading

In [5]:
# Load previously scraped reviews from the raw CSV. Passing the path lets pandas
# open and close the file itself (the former bare open() handle was never closed)
# and read it with pandas' default UTF-8 decoding rather than the locale encoding.
# NOTE: this replaces whatever is currently held in `reviews`.
reviews = pd.read_csv(dir_path + "/data/raw/reviews.csv")

In [6]:
# Preview the first rows of the reviews table
reviews.head()

Unnamed: 0,product_title,asin,author,review,rating,date,url
0,Yankee Candle Large Jar Candle Home Sweet Home,B000WUFVR0,David C.,I usually have good experience with Yankee Can...,2.0,2019-01-18,https://www.amazon.com/product-reviews/B000WUF...
1,Yankee Candle Large Jar Candle Home Sweet Home,B000WUFVR0,Vanessa,I don’t think this is a real yankee candle. Th...,1.0,2018-12-16,https://www.amazon.com/product-reviews/B000WUF...
2,Yankee Candle Large Jar Candle Home Sweet Home,B000WUFVR0,keith e.,If you like a cinnamon scent with a hint of ap...,5.0,2019-08-25,https://www.amazon.com/product-reviews/B000WUF...
3,Yankee Candle Large Jar Candle Home Sweet Home,B000WUFVR0,Concerned Citizen,I read reviews of this and another version whe...,1.0,2018-11-27,https://www.amazon.com/product-reviews/B000WUF...
4,Yankee Candle Large Jar Candle Home Sweet Home,B000WUFVR0,Wayne Connell,"I purchase Yankee Candles regularly, an array ...",1.0,2016-09-18,https://www.amazon.com/product-reviews/B000WUF...


## 2.3 Output raw review data

In [16]:
# Persist the reviews table to the "raw" folder, leaving out the index column
raw_reviews_path = dir_path + "/data/raw/reviews.csv"
reviews.to_csv(raw_reviews_path, index=False)

# 3. Pre-process review data
----

In [17]:
# Keep only reviews dated inside the study window (both bounds inclusive).
# The comparison is done on the raw column values, so it relies on `date`
# holding ISO "YYYY-MM-DD" strings whose lexical order matches date order.
window_start, window_end = "2021-10-01", "2022-01-01"
review_dates = reviews['date']
date_filter = (review_dates >= window_start) & (review_dates <= window_end)
reviews = reviews[date_filter]

## 3.1 Calculate daily review counts

In [18]:
# Count the reviews posted on each day and shape the result as a two-column
# frame ("date", "freq"), in the descending-count order value_counts returns.
# The index and the counts are named explicitly instead of renaming the default
# labels from value_counts/reset_index: those defaults changed in pandas 2.0,
# where the previous rename produced the wrong column names.
reviews_by_day = (
    reviews['date']
    .value_counts()
    .rename_axis("date")
    .reset_index(name="freq")
)

## 3.2 Output pre-processed data

In [19]:
# DEPRECATED
# Write the date-filtered reviews to the "interim" folder
cleaned_reviews_path = dir_path + "/data/interim/reviews-cleaned.csv"
reviews.to_csv(cleaned_reviews_path, index=False)

In [20]:
# Write the per-day review counts to the "interim" folder
daily_counts_path = dir_path + "/data/interim/reviews_by_day.csv"
reviews_by_day.to_csv(daily_counts_path, index=False)

# 4. Gather COVID-19 data
----

## 4.1 Download data

The COVID-19 data for this project is sourced from Our World In Data (OWID) and is updated daily. View their documentation [here](https://ourworldindata.org/coronavirus-source-data).

In [31]:
# Fetch the all-time COVID-19 dataset directly from the OWID GitHub repository
owid_csv_url = 'https://github.com/owid/covid-19-data/blob/master/public/data/owid-covid-data.csv?raw=true'
covid_data = pd.read_csv(owid_csv_url)

In [32]:
# Preview the first row of the downloaded COVID-19 data
covid_data.head(1)

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-02-24,5.0,5.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,


## 4.2 Output raw data

In [33]:
# Save the unmodified download as CSV in the "raw" folder
raw_covid_path = dir_path + "/data/raw/owid_covid_19_data.csv"
covid_data.to_csv(raw_covid_path, index=False)

# 5. Pre-process COVID-19 data
----

## 5.1 Select relevant features

In [34]:
# Narrow the frame to the columns this project needs
needed_columns = ['date', 'iso_code', 'location', 'total_cases', 'new_cases']
covid_data = covid_data[needed_columns]

## 5.2 Filter data by country

In [35]:
# Keep only the rows whose ISO code is "USA"
is_usa = covid_data['iso_code'] == "USA"
covid_data = covid_data.loc[is_usa]

In [36]:
# Preview the first row of the USA-only COVID-19 data
covid_data.head(1)

Unnamed: 0,date,iso_code,location,total_cases,new_cases
148428,2020-01-22,USA,United States,1.0,


In [None]:
# Keep only rows dated inside the given date range (both bounds inclusive).
# The comparison is done on the raw column values, so it relies on `date`
# holding ISO "YYYY-MM-DD" strings whose lexical order matches date order.
window_start, window_end = "2021-10-01", "2022-01-16"
covid_dates = covid_data['date']
date_filter = (covid_dates >= window_start) & (covid_dates <= window_end)
covid_data = covid_data[date_filter]

## 5.3 Output pre-processed COVID-19 data

In [37]:
# Write the pre-processed COVID-19 data as a CSV in the "interim" folder
interim_covid_path = dir_path + "/data/interim/covid_data.csv"
covid_data.to_csv(interim_covid_path, index=False)