In [1]:
import csv
from bs4 import BeautifulSoup
from selenium import webdriver

In [2]:
driver = webdriver.Chrome()

In [3]:
url='https://www.amazon.in/'
driver.get(url)

In [4]:
def get_url(search_term):
    """Generate a url from search term"""
    template = 'https://www.amazon.in/s?k={}&ref=nb_sb_noss_1'
    search_term = search_term.replace(' ','+')
    return template.format(search_term)

In [5]:
url = get_url('hydraulic pump')
print(url)

https://www.amazon.in/s?k=hydraulic+pump&ref=nb_sb_noss_1


In [6]:
driver.get(url)

Extract the collection

In [7]:
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [8]:
results = soup.find_all('div',{'data-component-type': 's-search-result'})

In [9]:
len(results)

48

Prototype the record

In [10]:
item = results[0]

In [11]:
atag = item.h2.a

In [12]:
description = atag.text.strip()

In [13]:
url = 'https://www.amazon.in/' + atag.get('href')

In [14]:
price_parent = item.find('span', 'a-price')

In [15]:
price = price_parent.find('span', 'a-offscreen').text

In [16]:
rating = item.i.text

In [17]:
review_count = item.find('span', {'class' : 'a-size-base', 'dir' : 'auto'}).text

Generalize the pattern

In [18]:
def extract_record(item):
    """Extract and return data from a single record"""
    
    # description and url
    atag = item.h2.a
    description = atag.text.strip()
    url = 'https://www.amazon.in/' + atag.get('href')
    
    # price
    price_parent = item.find('span', 'a-price')
    price = price_parent.find('span', 'a-offscreen').text
    
    # rank and rating
    rating = item.i.text
    review_count = item.find('span', {'class' : 'a-size-base', 'dir' : 'auto'}).text
    
    result = (description, price, rating, review_count, url)
    
    return result

In [19]:
records = []
results = soup.find_all('div',{'data-component-type': 's-search-result'})

for item in results:
    records.append(extract_record(item))

AttributeError: 'NoneType' object has no attribute 'text'

Error Handling

In [20]:
def extract_record(item):
    """Extract and return data from a single record"""
    
    # description and url
    atag = item.h2.a
    description = atag.text.strip()
    url = 'https://www.amazon.in/' + atag.get('href')
    
    try:
        # price
        price_parent = item.find('span', 'a-price')
        price = price_parent.find('span', 'a-offscreen').text
    except AttributeError:
        return
    
    try:
        # rank and rating
        rating = item.i.text
        review_count = item.find('span', {'class' : 'a-size-base', 'dir' : 'auto'}).text
    except AttributeError:
        rating = ''
        review_count = ''
        
    result = (description, price, rating, review_count, url)
    
    return result

In [21]:
records = []
results = soup.find_all('div',{'data-component-type': 's-search-result'})

for item in results:
    record = extract_record(item)
    if record:
        records.append(record)

In [22]:
records[0]

('Volo 18" / 450mm Bed Shocker for Smooth Soft Closing Bed Box Lift Mechanism, Weight Capacity 50kg- Pack of 2 Pcs',
 '₹850',
 '3.2 out of 5 stars',
 '20',
 'https://www.amazon.in//Bed-Shocker-Closing-Mechanism-Capacity/dp/B07Z3FG1SB/ref=sr_1_1?dchild=1&keywords=hydraulic+pump&qid=1605693082&sr=8-1')

In [23]:
for row in records:
    print(row[1])

₹850
₹4,625
₹19,373
₹6,799
₹21,523
₹22,104
₹1,600
₹20,137
1442.00
₹27,880
₹479
₹349
₹11,980
₹399
₹265
₹1,000
₹296
₹280
₹7,429
₹4,095
₹1,415
₹350
₹25,925
₹22,990
₹899
₹6,950
₹12,990
₹214
₹1,000
₹3,811
₹999
₹630
₹330
₹600
₹3,099
₹9,280
₹7,404
₹38,184
₹12,299
₹34,990
1707.00
₹13,840
₹3,499
₹14,910
₹299
₹1,748
₹218


Getting the next page

In [24]:
def get_url(search_term):
    """Generate a url from search term"""
    template = 'https://www.amazon.in/s?k={}&ref=nb_sb_noss_1'
    search_term = search_term.replace(' ','+')
    
    # add term query to url
    url = template.format(search_term)
    
    # add page query placeholder
    url += '&page{}' 
    
    return url

Putting it all together

In [25]:
import csv
from bs4 import BeautifulSoup
from selenium import webdriver

def get_url(search_term):
    """Generate a url from search term"""
    template = 'https://www.amazon.in/s?k={}&ref=nb_sb_noss_1'
    search_term = search_term.replace(' ','+')
    
    # add term query to url
    url = template.format(search_term)
    
    # add page query placeholder
    url += '&page{}' 
    
    return url

def extract_record(item):
    """Extract and return data from a single record"""
    
    # description and url
    atag = item.h2.a
    description = atag.text.strip()
    url = 'https://www.amazon.in/' + atag.get('href')
    
    try:
        # price
        price_parent = item.find('span', 'a-price')
        price = price_parent.find('span', 'a-offscreen').text
    except AttributeError:
        return
    
    try:
        # rank and rating
        rating = item.i.text
        review_count = item.find('span', {'class' : 'a-size-base', 'dir' : 'auto'}).text
    except AttributeError:
        rating = ''
        review_count = ''
        
    result = (description, price, rating, review_count, url)
    
    return result

def main(search_term):
    """Run main program routine"""
    # startup the webdriver
    driver = webdriver.Chrome()
    
    records = []
    url = get_url(search_term)
    
    for page in range(1,8):
        driver.get(url.format(page))
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        results = soup.find_all('div',{'data-component-type': 's-search-result'})
        
        for item in results:
            record = extract_record(item)
            if record:
                records.append(record)
                
    driver.close()
    
    #save data to csv file
    with open('hydraulic pump.csv', 'w', newline = '', encoding = 'utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Description', 'Price', 'Rating', 'ReviewCount', 'URL'])
        writer.writerows(records)

In [26]:
main('hydraulic pump')