# Amazon Web Scraper Requirements: 

* Selenium
* BeautifulSoup 

In [86]:
import csv
from bs4 import BeautifulSoup
# Selenium WebDriver (works with Firefox and Chrome)
from selenium import webdriver

In [87]:
# Start a Chrome browser session that Selenium will drive for the rest of the notebook.
# NOTE(review): presumably requires a Chrome/chromedriver installation on this machine - confirm.
driver = webdriver.Chrome()

In [88]:
# Open the Amazon India home page in the browser window controlled by the driver.
url = 'https://www.amazon.in/'
driver.get(url)

In [89]:
def get_url(search_term):
    """Build the Amazon India search URL for a free-text query.

    The query is URL-encoded before it is inserted into the link: spaces
    become ``+`` and reserved characters such as ``&``, ``#`` or ``%`` are
    percent-escaped, so they cannot corrupt the query string.

    Args:
        search_term: Text to search for, e.g. ``'video games ps4'``.

    Returns:
        The search-results URL as a string.
    """
    # Imported here so this cell keeps working when it is run on its own.
    from urllib.parse import quote_plus

    # NOTE(review): crid/sprefix/ref look like values captured from one
    # browser session; they are kept so the generated links stay unchanged.
    base_link = 'https://www.amazon.in/s?k={}&crid=13LMAMNW5PMHP&sprefix=video+games%2Caps%2C229&ref=nb_sb_noss_1'

    return base_link.format(quote_plus(search_term))

In [91]:
# Sanity-check get_url: build the search URL for a sample query and print it
# (the printed link is the cell output shown below).
url = get_url('video games ps4')
print(url)

https://www.amazon.in/s?k=video+games+ps4&crid=13LMAMNW5PMHP&sprefix=video+games%2Caps%2C229&ref=nb_sb_noss_1


In [92]:
# Load the search-results URL built in the previous cell in the browser.
driver.get(url)

# Extracting Content

In [93]:
# Parse the HTML currently loaded in the browser (driver.page_source) with
# BeautifulSoup's built-in 'html.parser' so it can be searched by tag and attribute.
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [94]:
# Collect every <div> whose data-component-type attribute is 's-search-result';
# each of these is treated as one product entry of the search page.
results = soup.find_all('div', {'data-component-type':'s-search-result'})

In [95]:
# Number of search-result entries found on this page.
len(results)

17

# Prototype the results

In [96]:
# Take the first search result as the sample the extraction steps below are prototyped on.
item = results[0]

In [97]:
# item.h2 is the first <h2> heading inside the result and .a the first link
# inside that heading; this link holds the product title and its href.
atag = item.h2.a

In [98]:
# The link text is the product description; strip() removes surrounding whitespace.
description = atag.text.strip()
print(description)

FIFA 23 | Standard Edition | PS4 (PlayStation 4)


In [99]:
# The href of the title link is prefixed with the site address to get a full product URL.
# NOTE(review): assumes href is site-relative (starts with '/') - confirm.
url = 'http://amazon.in'+ atag.get('href')

In [100]:
# Show the full product URL built in the previous cell.
print(url)

http://amazon.in/sspa/click?ie=UTF8&spc=MToxNTY0MjEyOTQ2MTAwMDcyOjE2NzE1MzA0MjM6c3BfYXRmOjIwMTAzMjMyNzMzNDk4OjowOjo&url=%2FFIFA-23-Standard-PS4-PlayStation%2Fdp%2FB0B7BD84V6%2Fref%3Dsr_1_1_sspa%3Fcrid%3D13LMAMNW5PMHP%26keywords%3Dvideo%2Bgames%2Bps4%26qid%3D1671530423%26sprefix%3Dvideo%2Bgames%252Caps%252C229%26sr%3D8-1-spons%26sp_csd%3Dd2lkZ2V0TmFtZT1zcF9hdGY%26psc%3D1


In [101]:
# The first <span> with class 'a-price' in the result is taken as the sale-price block.
sale_price_parent = item.find('span','a-price')
# The price text itself sits in the nested <span class="a-offscreen">.
sale_price = sale_price_parent.find('span','a-offscreen').text
print(sale_price)

₹3,299


In [102]:
# Same approach for the original (pre-discount) price: look up the span whose
# class attribute is 'a-price a-text-price' and read its nested 'a-offscreen' text.
# NOTE(review): find() returns None when a result has no such span, in which
# case the next line raises AttributeError.
original_price_parent = item.find('span','a-price a-text-price')
original_price = original_price_parent.find('span','a-offscreen').text
print(original_price)

₹4,299


In [103]:
# The text of the first <i> tag in the result is used as the rating text.
# NOTE(review): assumes the first <i> is always the star-rating icon - confirm.
rating = item.i.text
# Only the first three characters are printed (e.g. '4.7').
print(rating[:3])

4.7


In [104]:
# The review count is the text of the <span> with class 'a-size-base s-underline-text'.
reviews = item.find('span', {'class':'a-size-base s-underline-text'}).text
print(reviews)

84


# Generalizing the pattern

In [106]:
def extract_records(item):
    """Extract the data of a single search result.

    Args:
        item: One search-result element (a BeautifulSoup tag) taken from the
            parsed results page.

    Returns:
        A tuple ``(description, original_price, sale_price, rating, reviews,
        url)``, or ``None`` when the result has no title link or lacks either
        the sale price or the original price. ``rating`` is cut to its first
        three characters; ``rating`` and ``reviews`` are ``''`` when absent.
    """
    # description and url
    heading = item.h2
    atag = heading.a if heading is not None else None
    href = atag.get('href') if atag is not None else None
    if not href:
        # A result without a usable title link is skipped like any other
        # incomplete entry instead of aborting the whole extraction.
        return None
    description = atag.text.strip()
    url = 'http://amazon.in' + href

    # sale and original price: find() returns None for a missing tag, so the
    # chained lookup raises AttributeError and the incomplete result is skipped
    try:
        sale_price = item.find('span', 'a-price').find('span', 'a-offscreen').text
        original_price = (
            item.find('span', 'a-price a-text-price').find('span', 'a-offscreen').text
        )
    except AttributeError:
        return None

    # rating and reviews are looked up independently so that a missing review
    # count does not also discard a rating that is present (and vice versa)
    rating_tag = item.i
    rating = rating_tag.text if rating_tag is not None else ''
    reviews_tag = item.find('span', {'class': 'a-size-base s-underline-text'})
    reviews = reviews_tag.text if reviews_tag is not None else ''

    # one record per search result
    return (description, original_price, sale_price, rating[:3], reviews, url)

In [116]:
# Run extract_records over every search result of the parsed page and keep
# only the results for which it returned a record.

results = soup.find_all('div', attrs={'data-component-type': 's-search-result'})
records = []
for item in results:
    record = extract_records(item)
    if not record:
        # incomplete result: extract_records returned nothing to store
        continue
    records.append(record)

In [117]:
# Show the first extracted record as a quick check of the tuple layout.
print(records[0])

('FIFA 23 | Standard Edition | PS4 (PlayStation 4)', '₹4,299', '₹3,299', '4.7', '84', 'http://amazon.in/sspa/click?ie=UTF8&spc=MToxNTY0MjEyOTQ2MTAwMDcyOjE2NzE1MzA0MjM6c3BfYXRmOjIwMTAzMjMyNzMzNDk4OjowOjo&url=%2FFIFA-23-Standard-PS4-PlayStation%2Fdp%2FB0B7BD84V6%2Fref%3Dsr_1_1_sspa%3Fcrid%3D13LMAMNW5PMHP%26keywords%3Dvideo%2Bgames%2Bps4%26qid%3D1671530423%26sprefix%3Dvideo%2Bgames%252Caps%252C229%26sr%3D8-1-spons%26sp_csd%3Dd2lkZ2V0TmFtZT1zcF9hdGY%26psc%3D1')


# Putting Everything Together in One Master Function

In [143]:
import csv
from bs4 import BeautifulSoup
# Selenium WebDriver (works with Firefox and Chrome)
from selenium import webdriver

#----------------------------------------------------------------------------------------------------
def get_url(search_term):
    """Build a paged Amazon India search URL template for a free-text query.

    The query is URL-encoded before it is inserted into the link, so reserved
    characters (``&``, ``#``, ``%``) cannot corrupt the query string and
    literal braces in the query cannot clash with the page placeholder.

    Args:
        search_term: Text to search for, e.g. ``'games ps4'``.

    Returns:
        The search URL ending in ``&page={}``; call ``.format(page_number)``
        on it to get the link of a specific results page.
    """
    # Imported here so the function does not depend on the file's import block.
    from urllib.parse import quote_plus

    # NOTE(review): crid/sprefix/ref look like values captured from one
    # browser session; they are kept so the generated links stay unchanged.
    base_link = 'https://www.amazon.in/s?k={}&crid=13LMAMNW5PMHP&sprefix=video+games%2Caps%2C229&ref=nb_sb_noss_1'

    # add query to url
    url = base_link.format(quote_plus(search_term))
    # add page query placeholder
    url += '&page={}'

    return url
#-----------------------------------------------------------------------------------------------------
def extract_records(item):
    """Extract the data of a single search result.

    Args:
        item: One search-result element (a BeautifulSoup tag) taken from the
            parsed results page.

    Returns:
        A tuple ``(description, original_price, sale_price, rating, reviews,
        url)``, or ``None`` when the result has no title link or lacks either
        the sale price or the original price. ``rating`` is cut to its first
        three characters; ``rating`` and ``reviews`` are ``''`` when absent.
    """
    # description and url
    heading = item.h2
    atag = heading.a if heading is not None else None
    href = atag.get('href') if atag is not None else None
    if not href:
        # A result without a usable title link is skipped like any other
        # incomplete entry instead of aborting the whole scrape.
        return None
    description = atag.text.strip()
    url = 'http://amazon.in' + href

    # sale and original price: find() returns None for a missing tag, so the
    # chained lookup raises AttributeError and the incomplete result is skipped
    try:
        sale_price = item.find('span', 'a-price').find('span', 'a-offscreen').text
        original_price = (
            item.find('span', 'a-price a-text-price').find('span', 'a-offscreen').text
        )
    except AttributeError:
        return None

    # rating and reviews are looked up independently so that a missing review
    # count does not also discard a rating that is present (and vice versa)
    rating_tag = item.i
    rating = rating_tag.text if rating_tag is not None else ''
    reviews_tag = item.find('span', {'class': 'a-size-base s-underline-text'})
    reviews = reviews_tag.text if reviews_tag is not None else ''

    # one record per search result
    return (description, original_price, sale_price, rating[:3], reviews, url)
#-----------------------------------------------------------------------------------------------------
def main(search_term, max_pages=20, output_file='AMZ_WebScrap_data.csv'):
    """Scrape Amazon India search results for a query and save them as CSV.

    Opens a Chrome session, loads result pages 1..max_pages of the search,
    extracts one record per complete search result and writes all records,
    preceded by a header row, to ``output_file``.

    Args:
        search_term: Text to search for, e.g. ``'games ps4'``.
        max_pages: Number of result pages to visit (default 20).
        output_file: Path of the CSV file to write; an existing file is
            overwritten (default ``'AMZ_WebScrap_data.csv'``).
    """
    url = get_url(search_term)
    records = []

    # starting up the driver
    driver = webdriver.Chrome()
    try:
        for page in range(1, max_pages + 1):
            driver.get(url.format(page))
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            results = soup.find_all('div', {'data-component-type': 's-search-result'})

            for item in results:
                record = extract_records(item)
                if record:
                    records.append(record)
    finally:
        # quit() ends the whole WebDriver session (close() only closes the
        # current window); the finally block releases the browser even when
        # a page load or the parsing fails part-way through.
        driver.quit()

    # save data to a csv file
    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Description', 'Original_Price', 'Sale_Price', 'Rating', 'Review_count', 'URL'])
        writer.writerows(records)

In [144]:
# Run the complete scrape for the query 'games ps4'; main() writes the results to a CSV file.
main('games ps4')