# Use the requests library to download web pages

In [147]:
import csv 
import requests
from bs4 import BeautifulSoup

In [163]:
# Search-results page to download (Amazon.ca, "books" query in the books department).
url = "https://www.amazon.ca/s?k=books&i=stripbooks-intl-ship&crid=1TON6NWKD5EWP&sprefix=book%2Cstripbooks-intl-ship%2C372&ref=nb_sb_noss_1"

# Request headers sent with every download; the User-Agent is a desktop Chrome string.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
    "Accept-Encoding": "gzip, deflate",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "DNT": "1",
    "Connection": "close",
    "Upgrade-Insecure-Requests": "1",
}

In [164]:
# Download the search page with the browser-like headers defined above.
# requests never times out on its own, so set an explicit limit: a stalled
# connection then raises instead of hanging the kernel.
response = requests.get(url, headers=headers, timeout=30)

In [165]:
# Check the HTTP status before trying to parse the body.
response.status_code

200

In [166]:
# Keep the complete HTML for parsing, but display only the beginning of it:
# echoing the whole page floods the notebook output.
page_contents = response.text
page_contents[:500]

'<!doctype html><html lang="en-ca" class="a-no-js" data-19ax5a9jf="dingo"><!-- sp:feature:head-start -->\n<head><script>var aPageStart = (new Date()).getTime();</script><meta charset="utf-8"/>\n<!-- sp:end-feature:head-start -->\n<!-- sp:feature:csm:head-open-part1 -->\n\n<script type=\'text/javascript\'>var ue_t0=ue_t0||+new Date();</script>\n<!-- sp:end-feature:csm:head-open-part1 -->\n<!-- sp:feature:cs-optimization -->\n<meta http-equiv=\'x-dns-prefetch-control\' content=\'on\'>\n<link rel="dns-prefetch" href="https://images-na.ssl-images-amazon.com">\n<link rel="dns-prefetch" href="https://m.media-amazon.com">\n<link rel="dns-prefetch" href="https://completion.amazon.com">\n<!-- sp:end-feature:cs-optimization -->\n<!-- sp:feature:csm:head-open-part2 -->\n<script type=\'text/javascript\'>\nwindow.ue_ihb = (window.ue_ihb || window.ueinit || 0) + 1;\nif (window.ue_ihb === 1) {\n\nvar ue_csm = window,\n    ue_hob = +new Date();\n(function(d){var e=d.ue=d.ue||{},f=Date.now||function(){

# Extract the collection 

In [167]:
# Parse the downloaded HTML into a searchable tree using the 'html.parser' backend.
soup = BeautifulSoup(page_contents, 'html.parser')

In [181]:
# Class attribute value of the <div> that wraps one search result.
# NOTE(review): this value was read off the page markup at scrape time; it
# presumably needs updating whenever Amazon changes its class names.
selected_class = "a-section a-spacing-base"

# Collect every <div> on the page carrying that class attribute.
results = soup.find_all('div', {'class': selected_class})

In [182]:
# Number of result containers matched on this page.
len(results)

64

In [183]:
# Peek at the first match only; displaying the whole list dumps the HTML of
# every result card into the notebook.
results[:1]

[<div class="a-section a-spacing-base"><div class="s-product-image-container aok-relative s-text-center s-image-overlay-grey puis-image-overlay-grey s-padding-left-small s-padding-right-small puis-spacing-small s-height-equalized puis puis-v132n5e4faosf42v0eo3rf7vw9m"><span class="rush-component" data-component-type="s-product-image" data-render-id="r2vffwe2tm8r432n0evmpi5csgr" data-version-id="v132n5e4faosf42v0eo3rf7vw9m"><a class="a-link-normal s-no-outline" href="/sspa/click?ie=UTF8&amp;spc=MTo4NzYwMTg3NjQ4MzUyMDgwOjE2ODg4Nzc2NzM6c3BfYXRmOjIwMDE1Mzc3MjUwNzA5ODo6MDo6&amp;url=%2FGone-Connor-Callahan-Book-1-ebook%2Fdp%2FB08PTGHC8Q%2Fref%3Dsr_1_1_sspa%3Fcrid%3D1TON6NWKD5EWP%26keywords%3Dbooks%26qid%3D1688877673%26sprefix%3Dbook%252Cstripbooks-intl-ship%252C372%26sr%3D8-1-spons%26sp_csd%3Dd2lkZ2V0TmFtZT1zcF9hdGY%26psc%3D1"><div class="a-section aok-relative s-image-square-aspect"><img alt="Sponsored Ad – Gone (A Connor Callahan Mystery Thriller Book 1)" class="s-image" data-image-index="

# Prototype the record

In [184]:
# Use the first result as a sample to prototype the field extraction against.
item = results[0]

In [203]:
# The link nested in the result's <h2>; the title text and the product href are
# both read from it in the following cells.
atag = item.h2.a

In [204]:
# Link text with surrounding whitespace removed.
title = atag.text.strip()

In [205]:
# The href is site-relative (it starts with '/'), so prepend the site origin.
# NOTE: this rebinds the notebook-level `url`, which until now held the search
# page URL; re-running the download cell after this one fetches the product page.
url = 'https://www.amazon.ca' + atag.get('href')

In [206]:
# Text of the first <span class="a-price-fraction"> in the result.
# NOTE(review): the class name suggests this is only the fractional (cents)
# part of the price — confirm whether the whole amount is wanted.
price = item.find('span', 'a-price-fraction').text

In [207]:
# Rating text from the first <span class="a-icon-alt">, e.g. '4.2 out of 5 stars'.
stars = item.find('span', 'a-icon-alt').text

In [208]:
# Review count as displayed on the page (a string such as '18,637', not an int).
review_count = item.find('span', 'a-size-base s-underline-text').text

# Generalize the pattern 

In [215]:
def extract_record(item):
    """Extract the fields of one search result.

    Args:
        item: the parsed result container; it must support ``item.h2`` and
            ``item.find(tag_name, css_class)`` (a BeautifulSoup tag does).

    Returns:
        A tuple ``(title, price, stars, review_count, url)`` of strings, or
        ``None`` when the result has no title link or no price. ``stars`` and
        ``review_count`` are ``''`` when the result does not show them.
    """
    # description and url
    heading = item.h2
    atag = heading.a if heading is not None else None
    href = atag.get('href') if atag is not None else None
    if href is None:
        # Without a title link there is nothing to identify the product by.
        return None
    title = atag.text.strip()
    url = 'https://www.amazon.ca' + href

    # price -- results without one are skipped entirely.
    # NOTE(review): the class name suggests this span holds only the fractional
    # (cents) part of the price -- confirm whether the whole amount is wanted.
    price_tag = item.find('span', 'a-price-fraction')
    if price_tag is None:
        return None
    price = price_tag.text

    # rating and review count are optional and looked up independently, so a
    # missing one never discards (or leaves unbound) the other.
    stars_tag = item.find('span', 'a-icon-alt')
    stars = stars_tag.text if stars_tag is not None else ''
    count_tag = item.find('span', 'a-size-base s-underline-text')
    review_count = count_tag.text if count_tag is not None else ''

    return (title, price, stars, review_count, url)

In [216]:
# Build one record per result. extract_record() returns None for results it
# cannot use (for example those without a price), so skip those instead of
# storing None placeholders in the list.
records = []

for item in results:
    record = extract_record(item)
    if record is not None:
        records.append(record)

In [220]:
# Spot-check one extracted record.
records[5]

('The Family Across the Street: A totally unputdownable psychological thriller with a shocking twist',
 '99',
 '4.2 out of 5 stars',
 '18,637',
 'https://www.amazon.ca/Family-Across-Street-unputdownable-psychological/dp/1800198272/ref=sr_1_6?crid=1TON6NWKD5EWP&keywords=books&qid=1688877673&sprefix=book%2Cstripbooks-intl-ship%2C372&sr=8-6')

# Getting to the next page

In [221]:
def get_url(search_term):
    """Build the Amazon.ca book-search URL template for ``search_term``.

    Args:
        search_term: free-text query, e.g. ``'harry potter'``.

    Returns:
        The search URL with the encoded term in its ``k=`` parameter and a
        trailing ``&page={}`` placeholder; call ``url.format(page_number)`` to
        get the URL of a specific results page.
    """
    from urllib.parse import quote_plus

    template = "https://www.amazon.ca/s?k={}&i=stripbooks-intl-ship&crid=1TON6NWKD5EWP&sprefix=book%2Cstripbooks-intl-ship%2C372&ref=nb_sb_noss_1"

    # add term query to url. quote_plus turns spaces into '+' and also escapes
    # characters such as '&', '#', '{' and '}' that would otherwise corrupt the
    # query string or the later str.format() call on the returned URL.
    url = template.format(quote_plus(search_term))

    # add page query placeholder ('=' is needed to form a key=value parameter)
    url += '&page={}'

    return url

# Putting it all together 

In [1]:
import csv 
import requests
from bs4 import BeautifulSoup

def get_url(search_term):
    """Build the Amazon.ca book-search URL template for ``search_term``.

    Args:
        search_term: free-text query, e.g. ``'harry potter'``.

    Returns:
        The search URL with the encoded term in its ``k=`` parameter and a
        trailing ``&page={}`` placeholder; call ``url.format(page_number)`` to
        get the URL of a specific results page.
    """
    from urllib.parse import quote_plus

    template = "https://www.amazon.ca/s?k={}&i=stripbooks-intl-ship&crid=1TON6NWKD5EWP&sprefix=book%2Cstripbooks-intl-ship%2C372&ref=nb_sb_noss_1"

    # add term query to url. quote_plus turns spaces into '+' and also escapes
    # characters such as '&', '#', '{' and '}' that would otherwise corrupt the
    # query string or the later str.format() call on the returned URL.
    url = template.format(quote_plus(search_term))

    # add page query placeholder ('=' is needed to form a key=value parameter)
    url += '&page={}'

    return url

def extract_record(item):
    """Extract the fields of one search result.

    Args:
        item: the parsed result container; it must support ``item.h2`` and
            ``item.find(tag_name, css_class)`` (a BeautifulSoup tag does).

    Returns:
        A tuple ``(title, price, stars, review_count, url)`` of strings, or
        ``None`` when the result has no title link or no price. ``stars`` and
        ``review_count`` are ``''`` when the result does not show them.
    """
    # description and url
    heading = item.h2
    atag = heading.a if heading is not None else None
    href = atag.get('href') if atag is not None else None
    if href is None:
        # Without a title link there is nothing to identify the product by.
        return None
    title = atag.text.strip()
    url = 'https://www.amazon.ca' + href

    # price -- results without one are skipped entirely.
    # NOTE(review): the class name suggests this span holds only the fractional
    # (cents) part of the price -- confirm whether the whole amount is wanted.
    price_tag = item.find('span', 'a-price-fraction')
    if price_tag is None:
        return None
    price = price_tag.text

    # rating and review count are optional and looked up independently, so a
    # result that shows a rating but no review count keeps its rating.
    stars_tag = item.find('span', 'a-icon-alt')
    stars = stars_tag.text if stars_tag is not None else ''
    count_tag = item.find('span', 'a-size-base s-underline-text')
    review_count = count_tag.text if count_tag is not None else ''

    return (title, price, stars, review_count, url)

def main(search_term, max_pages=20, output_path='Amazon_book_scraping.csv'):
    """Scrape Amazon.ca search results for ``search_term`` into a CSV file.

    Args:
        search_term: free-text query passed to ``get_url``.
        max_pages: number of result pages to request, starting at page 1.
        output_path: CSV file to write; an existing file is overwritten.

    Returns:
        None. The records are written to ``output_path`` with the header row
        ``title, price, stars, review_count, url``.
    """
    records = []
    url = get_url(search_term)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
        "Accept-Encoding": "gzip, deflate",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "DNT": "1",
        "Connection": "close",
        "Upgrade-Insecure-Requests": "1",
    }
    selected_class = "a-section a-spacing-base"

    for page in range(1, max_pages + 1):
        # get_url() leaves a '{}' placeholder for the page number. Fill it in,
        # otherwise every iteration downloads the very same URL.
        page_url = url.format(page)

        # Explicit timeout so a stalled connection raises instead of hanging.
        response = requests.get(page_url, headers=headers, timeout=30)

        soup = BeautifulSoup(response.text, 'html.parser')
        results = soup.find_all('div', {'class': selected_class})

        for item in results:
            record = extract_record(item)
            # extract_record() returns None for results it cannot use.
            if record:
                records.append(record)

    # Save data to csv file
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['title', 'price', 'stars', 'review_count', 'url'])
        writer.writerows(records)

In [2]:
# Run the scraper for the search term 'books'; the results are written to
# Amazon_book_scraping.csv (a relative path).
main('books')