## Importing required packages

In [1]:
import csv
from bs4 import BeautifulSoup
from selenium import webdriver

## Starting up the webdriver

In [8]:
!pip install webdriver-manager
from webdriver_manager.chrome import ChromeDriverManager
driver = webdriver.Chrome(ChromeDriverManager().install())



Current google-chrome version is 97.0.4692
Get LATEST chromedriver version for 97.0.4692 google-chrome


Collecting webdriver-manager
  Downloading webdriver_manager-3.5.2-py2.py3-none-any.whl (17 kB)
Collecting configparser
  Downloading configparser-5.2.0-py3-none-any.whl (19 kB)
Collecting crayons
  Downloading crayons-0.4.0-py2.py3-none-any.whl (4.6 kB)
Installing collected packages: configparser, crayons, webdriver-manager
Successfully installed configparser-5.2.0 crayons-0.4.0 webdriver-manager-3.5.2


There is no [win32] chromedriver for browser  in cache
Trying to download new driver from https://chromedriver.storage.googleapis.com/97.0.4692.71/chromedriver_win32.zip
Driver has been saved in cache [C:\Users\Viniciu\.wdm\drivers\chromedriver\win32\97.0.4692.71]


## Creating functions to work with our web driver

In [9]:
#Builds the Amazon search URL for a given search term
def get_url(search_term):
    """Return the Amazon search-results URL for ``search_term``.

    Parameters
    ----------
    search_term : str
        Free-text query, e.g. ``"manga"`` or ``"one piece"``.

    Returns
    -------
    str
        The search URL with the query URL-encoded (spaces become ``+``).
    """
    #Local import keeps this cell self-contained
    from urllib.parse import quote_plus

    template = "https://www.amazon.com/s?k={}&ref=nb_sb_noss"
    #quote_plus escapes every reserved character (&, #, +, ...), not only spaces,
    #so the term cannot break out of the ``k=`` parameter
    return template.format(quote_plus(search_term))

In [11]:
#Let's test the function: build the search URL for "manga" and print it

url = get_url("manga")
print(url)

https://www.amazon.com/s?k=manga&ref=nb_sb_noss


In [13]:
#Load the search URL in the browser controlled by the web driver
driver.get(url)

## Extracting the data

In [14]:
#Parse the HTML of the page currently loaded in the browser
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [15]:
#Every <div> whose data-component-type attribute is 's-search-result' (presumably one per product listing)
results = soup.find_all('div', {'data-component-type':'s-search-result'})

In [16]:
#Number of result containers found on this page
len(results)

48

In [18]:
#Use the first result to work out the extraction steps in the cells below
item = results[0]

In [19]:
#The <a> tag inside the item's <h2>; the title text and the href are read from it below
atag = item.h2.a

## Getting the product's title and its URL on the Amazon website

In [21]:
#Link text with surrounding whitespace removed
title = atag.text.strip()

In [22]:
#Prefix the href with the site root to get the product URL
#NOTE: this rebinds the notebook-level `url`, which previously held the search URL from get_url;
#re-running the earlier driver.get(url) cell after this one would load the product page instead
url = 'https://www.amazon.com'+atag.get('href')

## Getting the price of the item

In [24]:
#First <span> with class 'a-price' inside the item (find() gives None when nothing matches)
price_parent = item.find('span', 'a-price')

In [26]:
#The price text is read from the nested <span> with class 'a-offscreen'
price =price_parent.find('span', 'a-offscreen').text

## Getting the ratings of the items

In [29]:
#Text of the first <i> tag in the item; for the sample item this is the star rating (e.g. '4.8 out of 5 stars')
rating = item.i.text

## Creating a new function to join all the things we've done so far

In [32]:
def extract_data(item):
    """Pull the title, price, rating and product URL out of one search result.

    Parameters
    ----------
    item : bs4.element.Tag
        One search-result container, as returned by ``soup.find_all``.

    Returns
    -------
    tuple or None
        ``(title, price, rating, url)``. ``None`` when the result has no
        linked heading or no price, so the caller can skip it. A missing
        rating is reported as ``' '``.
    """
    #Title and Url
    heading = item.h2
    atag = heading.a if heading is not None else None
    href = atag.get('href') if atag is not None else None
    if not href:
        #A result without a linked heading would otherwise raise and abort the whole scrape
        return None
    title = atag.text.strip()
    url = 'https://www.amazon.com' + href

    #Price
    price_parent = item.find('span', 'a-price')
    price_tag = price_parent.find('span', 'a-offscreen') if price_parent is not None else None
    if price_tag is None:
        #Results without a price are skipped
        return None
    price = price_tag.text

    #Ratings
    rating_tag = item.i
    rating = rating_tag.text if rating_tag is not None else ' '

    #Result in tuple format
    return (title, price, rating, url)

In [33]:
#Run the extractor over every result container on the parsed page
results = soup.find_all('div', {'data-component-type':'s-search-result'})
records = []

for item in results:
    record = extract_data(item)
    if not record:
        #A falsy record means extract_data found nothing usable for this result
        continue
    records.append(record)
    

In [34]:
#Inspect the first extracted record: (title, price, rating, url)
records[0]

('Jujutsu Kaisen 0',
 '$8.99',
 '4.8 out of 5 stars',
 'https://www.amazon.com/Jujutsu-Kaisen-0-Gege-Akutami/dp/1974720144/ref=sr_1_1?keywords=manga&qid=1642166138&sr=8-1')

## Navigating to the next page

In [35]:
def get_url(search_term):
    """Return a paginated Amazon search URL template for ``search_term``.

    Parameters
    ----------
    search_term : str
        Free-text query, e.g. ``"manga"``.

    Returns
    -------
    str
        Search URL ending in ``&page={}``; fill in the page number with
        ``url.format(page)``.
    """
    #Local import keeps this cell self-contained
    from urllib.parse import quote_plus

    template = "https://www.amazon.com/s?k={}&ref=nb_sb_noss"
    #quote_plus turns spaces into '+' and escapes reserved characters,
    #including braces that would otherwise confuse the later .format(page)
    url = template.format(quote_plus(search_term))
    #The '=' matters: '&page{}' would yield a parameter named e.g. 'page2'
    #and leave the 'page' query parameter unset
    url += '&page={}'
    return url

## Putting the whole scraper together in a single cell

In [40]:
import csv
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

def get_url(search_term):
    """Return a paginated Amazon search URL template for ``search_term``.

    Parameters
    ----------
    search_term : str
        Free-text query, e.g. ``"manga"``.

    Returns
    -------
    str
        Search URL ending in ``&page={}``; fill in the page number with
        ``url.format(page)``.
    """
    #Local import keeps the function usable on its own
    from urllib.parse import quote_plus

    template = "https://www.amazon.com/s?k={}&ref=nb_sb_noss"
    #quote_plus turns spaces into '+' and escapes reserved characters,
    #including braces that would otherwise confuse the later .format(page)
    url = template.format(quote_plus(search_term))
    #The '=' matters: '&page{}' would yield a parameter named e.g. 'page2'
    #and leave the 'page' query parameter unset
    url += '&page={}'
    return url

def extract_data(item):
    """Extract ``(title, price, rating, url)`` from one search-result container.

    Parameters
    ----------
    item : bs4.element.Tag
        One search-result container, as returned by ``soup.find_all``.

    Returns
    -------
    tuple or None
        ``(title, price, rating, url)``. ``None`` when the result has no
        linked heading or no price, so the caller can skip it. A missing
        rating is reported as ``' '``.
    """
    #Title and Url
    heading = item.h2
    atag = heading.a if heading is not None else None
    href = atag.get('href') if atag is not None else None
    if not href:
        #A result without a linked heading would otherwise raise and abort the whole scrape
        return None
    title = atag.text.strip()
    url = 'https://www.amazon.com' + href

    #Price
    price_parent = item.find('span', 'a-price')
    price_tag = price_parent.find('span', 'a-offscreen') if price_parent is not None else None
    if price_tag is None:
        #Results without a price are skipped
        return None
    price = price_tag.text

    #Ratings
    rating_tag = item.i
    rating = rating_tag.text if rating_tag is not None else ' '

    #Result in tuple format
    return (title, price, rating, url)

def main(search_term, pages=7, output_path='prices.csv'):
    """Scrape Amazon search results for ``search_term`` and save them as CSV.

    Parameters
    ----------
    search_term : str
        Query passed to ``get_url``.
    pages : int, optional
        Number of result pages to visit, starting at page 1 (default 7).
    output_path : str, optional
        Destination CSV file (default ``'prices.csv'``); overwritten if it exists.

    Returns
    -------
    None
        The records are written to ``output_path`` with the header
        ``Title, Price, Rating, URL``.
    """
    #Setting the web driver
    #NOTE(review): passing the driver path positionally depends on the installed Selenium
    #version accepting it; newer releases expect a Service object - confirm before upgrading
    driver = webdriver.Chrome(ChromeDriverManager().install())
    records = []
    #get_url is expected to return a template with one '{}' placeholder for the page number
    url = get_url(search_term)

    try:
        for page in range(1, pages + 1):
            driver.get(url.format(page))
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            results = soup.find_all('div', {'data-component-type':'s-search-result'})
            for item in results:
                record = extract_data(item)
                if record:
                    records.append(record)
    finally:
        #Always release the browser, even when a page load or parse fails;
        #quit() ends the whole session, whereas close() only closes the current window
        driver.quit()

    with open(output_path, 'w', newline = '', encoding = 'utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Title', 'Price','Rating','URL'])
        writer.writerows(records)

## Generating the CSV of Amazon products

In [41]:
#Run the full scrape for "manga"; the results are written to prices.csv
main('manga')



Current google-chrome version is 97.0.4692
Get LATEST chromedriver version for 97.0.4692 google-chrome
Driver [C:\Users\Viniciu\.wdm\drivers\chromedriver\win32\97.0.4692.71\chromedriver.exe] found in cache
