In [2]:
'''
Project : web scraper that extracts a product's search results from the Amazon.com website

input --> search_text (the product name to search for on Amazon.com)

output --> a csv file with the desc, price, review_rating, review_count and url of each matching product

by Morteza Azh

version 001

2023-01-25

'''

# required libraries 

from bs4 import BeautifulSoup
import csv
from selenium import webdriver
import pandas as pd
import numpy as np


def url_generator(search_text):
    '''
    Build the Amazon search URL template for a search term.

    The term is URL-encoded and inserted into the search URL. The returned string
    still contains a single '{}' placeholder for the page number, so the caller
    fills it in with ``url.format(page)``.

    search_text --> the product name to search for (str)

    returns --> the search URL, ending in '&page={}' (str)
    '''

    # Local import: only this function needs it, and it keeps the function self-contained.
    from urllib.parse import quote_plus

    url_format = 'https://www.amazon.com/s?k={}&crid=3W08ZKWQWWA88&sprefix=%2Caps%2C796&ref=nb_sb_ss_recent_1_0_recent'

    # quote_plus turns spaces into '+' (the URL convention for query strings) and
    # percent-escapes characters such as '&', '#', '+', '{' and '}'. Without the
    # escaping, '&' or '#' would cut the query short and a brace in the search
    # term would break the later .format(page) call.
    url = url_format.format(quote_plus(search_text))

    # The page number is a regular query parameter, so it needs the '=' sign:
    # '&page=3', not '&page3'.
    return url + '&page={}'


def content_extraction(row):
    '''
    Extract one product record from a single search-result element.

    row --> a BeautifulSoup tag for one search result (main passes every 'div'
            whose data-component-type is 's-search-result')

    returns --> a tuple (desc, price, review_rating, review_count, url), or None
                when the result has no title link or no price, so the caller can
                skip it. review_rating and review_count are both '' when either
                of them cannot be found.
    '''

    # Local import: only this function needs it.
    from urllib.parse import urljoin

    # The product title is the link inside the result's <h2>. Results without
    # that link are skipped instead of raising AttributeError and aborting the
    # whole scrape.
    heading = row.h2
    a_tag = heading.a if heading is not None else None
    if a_tag is None:
        return None

    # The link text is the description; strip the surrounding whitespace.
    desc = a_tag.text.strip()

    # The href is normally relative to the Amazon domain. urljoin prepends the
    # domain to relative links and leaves links that are already absolute
    # untouched (plain string concatenation would mangle those).
    href = a_tag.get('href')
    url = urljoin('https://www.amazon.com', href) if href else ''

    # Extracting the price; a result without a price is skipped.
    try:
        price_parent = row.find('span', 'a-price')
        price = price_parent.find('span', 'a-offscreen').text
    except AttributeError:
        return None

    # Extracting the rating (text of the first <i> tag) and the number of reviews.
    # Both are blanked together when either element is missing.
    try:
        review_rating = row.i.text
        review_count = row.find('span', {'class': 'a-size-base', 'dir': 'auto'}).text
    except AttributeError:
        review_rating = ''
        review_count = ''

    return (desc, price, review_rating, review_count, url)

def main(search_text, max_pages=20, output_path='extracted_data.csv',
         driver_path=r"C:\\Program Files (x86)\\chromedriver.exe"):
    '''
    Run an Amazon search for search_text and save the results as a csv file.

    search_text --> the product name to search for
    max_pages   --> number of result pages to visit (default 20)
    output_path --> where the csv file is written (default 'extracted_data.csv')
    driver_path --> location of the chromedriver executable

    returns --> None; the csv file has the columns desc, price, review_rating,
                review_count and url, one row per extracted product.
    '''

    # Selenium 4 takes the chromedriver location through a Service object;
    # passing the path as the first positional argument is deprecated in
    # Selenium 4 and no longer accepted by later releases.
    from selenium.webdriver.chrome.service import Service

    # begin an instance of the web driver (Chrome)
    driver = webdriver.Chrome(service=Service(executable_path=driver_path))

    records = []

    # generating a URL template based on the search text; it still holds a '{}'
    # placeholder that is filled with the page number below
    url = url_generator(search_text)

    try:
        # Request one result page after another through the page query parameter.
        for page in range(1, max_pages + 1):
            driver.get(url.format(page))

            # Parse the rendered page source so the records can be read from the HTML.
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # 'data-component-type' identifies product records more specifically
            # than a class name does.
            result = soup.find_all('div', {'data-component-type': 's-search-result'})

            for row in result:
                record = content_extraction(row)
                # content_extraction returns None for results it cannot use
                if record:
                    records.append(record)
    finally:
        # Always end the browser session, even when a page load or the parsing
        # fails. quit() also shuts down the chromedriver process, which close()
        # (closing only the current window) does not.
        driver.quit()

    # saving the extracted data to a csv file
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["desc", "price", "review_rating", "review_count", "url"])
        writer.writerows(records)
        

In [15]:
# Run the scraper for the search term 'laptop' (this launches Chrome).
main(search_text='laptop')