### Web Scraping Example - Book information from a selected category

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
#from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import re

# Location of the chromedriver executable; adjust to match the local machine.
# Raw string so the Windows backslashes are never read as escape sequences.
chromedriver_path = r"C:\HOWScraping2E\driver\chromedriver.exe"

#### Prepare a 'driver' to use with Chrome

In [2]:
# Build the Chrome driver service from the configured chromedriver binary.
# The path has to be passed as `executable_path`; `Service` has no `service`
# parameter, so the previous keyword never delivered the path to the service.
service = Service(executable_path=chromedriver_path)
driver = webdriver.Chrome(service=service)

#### URL : https://toscrape.com/

In [3]:
# Entry page from which the bookstore link is looked up.
mainUrl = "https://toscrape.com/"

In [4]:
# Open the entry page in the Selenium-controlled browser.
driver.get(mainUrl)

#### Choose the link with text 'bookstore' or 'books.toscrape.com'

In [5]:
# Find the anchor whose visible text contains "bookstore", then read its target URL.
bookstoreAnchor = driver.find_element(By.PARTIAL_LINK_TEXT, "bookstore")
link = bookstoreAnchor.get_attribute('href')

In [6]:
# Show the href read from the bookstore anchor as the cell output.
link

'http://books.toscrape.com/'

#### Verify that the link points to books.toscrape.com

In [7]:
# Defaults so that later cells can still run (and simply find nothing to
# scrape) when the bookstore link is not the expected one.
categoryName = ""
categoryResults = ""

if 'books.toscrape.com' in link:
    # Follow the bookstore link. click() returns None, so the landing URL is
    # read from the driver after the navigation instead.
    driver.find_element(By.PARTIAL_LINK_TEXT, "bookstore").click()
    bookUrl = driver.current_url

    # Choose the Category with text 'Childrens'; locate the link once and use
    # it both for reading the href and for the click.
    categoryLink = driver.find_element(By.LINK_TEXT, "Childrens")
    categoryURL = categoryLink.get_attribute('href')
    print(f"Category URL: {categoryURL}")

    # Load the category page.
    categoryLink.click()

    # Read the category name and the result summary from the category page.
    categoryName = driver.find_element(By.CLASS_NAME, "page-header").find_element(By.TAG_NAME,"h1").text
    categoryResults = driver.find_element(By.CSS_SELECTOR, "form.form-horizontal").text

    print(f"Category Name: {categoryName}")
    print(f"\tCategory Results: {categoryResults}")
else:
    print(f"Unexpected bookstore link, nothing to scrape: {link}")

Category URL: http://books.toscrape.com/catalogue/category/books/childrens_11/index.html
Category Name: Childrens
	Category Results: 29 results - showing 1 to 20.


#### Default values like book-count, column-name, pagination

In [8]:
# Shared scraping state used by the cells that follow.
column = [
    'Upc', 'Title', 'Price', 'Rating',
    'Stock', 'Stock_Qty', 'Url', 'Image',
]                   # header names, in the order each row is assembled
dataSet = list()    # collected rows, one list per book
pagination, count = False, 0   # loop flag (off by default) and number of rows collected

In [9]:
# Walk every listing page of the selected category, open each book's detail
# page for its UPC and availability, and collect one row per book in dataSet.
# Only result summaries that contain "showing" (e.g. "29 results - showing 1 to 20.")
# enter the loop; a summary without that word skips this cell entirely.
if re.search(r'\d+\s*result.*showing', categoryResults):
    pagination = True
    page = 1
    while pagination:
        try:
            print(f"Processing Page {page} -- {(count+1)}")
            # NOTE(review): `listings` is located once per page and reused after every
            # detail-page round trip; this assumes the element references stay valid
            # after driver.back() -- confirm with the browser/driver version in use.
            listings = driver.find_elements(By.CSS_SELECTOR, "ol.row li")  # multiple elements

            # Iterate the listings available in the page
            for listing in listings:
                article = listing.find_element(By.TAG_NAME, 'article')

                image = article.find_element(By.CSS_SELECTOR, "a")
                articleLink = image.get_attribute('href')
                thumbnail = image.find_element(By.TAG_NAME, 'img')
                imageSrc = thumbnail.get_attribute('src')
                imageAlt = thumbnail.get_attribute('alt')
                rating = article.find_element(By.CSS_SELECTOR, "p[class*='star']").get_attribute('class')
                title = article.find_element(By.CSS_SELECTOR, "h3 a").get_attribute('title')
                price = article.find_element(By.CLASS_NAME, "price_color").text
                print(f"Data..{articleLink} -- {imageAlt} -- {rating} -- {price}")

                if not articleLink:
                    # No detail page was opened, so there is no navigation to undo.
                    continue

                listing.find_element(By.TAG_NAME, 'img').click()  # loads the detail page
                upc = driver.find_element(By.XPATH, "//th[contains(text(),'UPC')]/following-sibling::td").text

                if upc:  # only record books whose UPC is available
                    stockQty = driver.find_element(By.XPATH, "//th[contains(text(),'Availability')]/following-sibling::td").text

                    # "In stock (19 available)" -> ("In stock ", "(", "19 available)").
                    # partition() leaves the quantity part empty when there is no
                    # parenthesis, where indexing a split() result would raise.
                    stockParts = stockQty.partition('(')

                    # Row values follow the order of `column`.
                    temp = [upc, title, price, rating.replace('star-rating', '').strip(),
                            stockParts[0].strip(),
                            stockParts[2].replace('available', '').replace(')', '').strip(),
                            articleLink,
                            imageSrc]
                    # Count and store only rows built for this book, so a missing
                    # UPC can never re-append the previous book's row.
                    count = count + 1
                    dataSet.append(temp)

                # Go back in history (from the product page to the listing page)
                driver.back()
            try:
                # Check for pagination with text 'next' in the listing page
                driver.find_element(By.LINK_TEXT, 'next').click()
                page = page + 1
            except NoSuchElementException:
                pagination = False
                print(f"Further Pagination is not possible, currently at {page}")
        except Exception as e:
            # Any other failure stops the crawl; rows gathered so far stay in dataSet.
            print(f"Exception Occured: {str(e)}")
            pagination = False
# print(dataSet)

Processing Page 1 -- 1
Data..http://books.toscrape.com/catalogue/birdsong-a-story-in-pictures_975/index.html -- Birdsong: A Story in Pictures -- star-rating Three -- £54.64
Data..http://books.toscrape.com/catalogue/the-bear-and-the-piano_967/index.html -- The Bear and the Piano -- star-rating One -- £36.89
Data..http://books.toscrape.com/catalogue/the-secret-of-dreadwillow-carse_944/index.html -- The Secret of Dreadwillow Carse -- star-rating One -- £56.13
Data..http://books.toscrape.com/catalogue/the-white-cat-and-the-monk-a-retelling-of-the-poem-pangur-ban_865/index.html -- The White Cat and the Monk: A Retelling of the Poem “Pangur Bán” -- star-rating Four -- £58.08
Data..http://books.toscrape.com/catalogue/little-red_817/index.html -- Little Red -- star-rating Three -- £13.47
Data..http://books.toscrape.com/catalogue/walt-disneys-alice-in-wonderland_777/index.html -- Walt Disney's Alice in Wonderland -- star-rating Five -- £12.96
Data..http://books.toscrape.com/catalogue/twenty-yaw

#### With the final dataSet in hand, let's write the data to CSV and JSON files.

In [10]:
# Report how many rows were collected in dataSet.
print(f"Total rows in dataSet : {len(dataSet)}")

Total rows in dataSet : 29


In [11]:
import csv
import json

##### Create a CSV file using `column` as the header row


In [12]:
# Write the header row followed by every collected row to book_details.csv.
# The `with` block closes the file even if a write fails part-way through;
# newline='' lets the csv module control line endings itself.
with open('book_details.csv', 'w', newline='', encoding='utf-8') as fileCsv:
    writer = csv.writer(fileCsv)
    writer.writerow(column)
    writer.writerows(dataSet)
print("CSV file Created")

CSV file Created


##### Create a JSON file.


In [13]:
# Pair every row with the column names to get one dictionary per book.
finalDataSet = [dict(zip(column, row)) for row in dataSet]

# Serialise the list of dictionaries to book_details.json.
with open('book_details.json', 'w') as jsonfile:
    json.dump(finalDataSet, jsonfile)
print("JSON file Created")

JSON file Created


#### Closing and ending the 'driver' session using quit()

In [14]:
# End the WebDriver session started earlier in the notebook.
driver.quit()