## LXML scraping

In [186]:
import lxml.html as web
from lxml.etree import XPath
import math
import csv

#### URLs and Columns

In [187]:
# Root of the site; also used later to turn relative links into absolute ones.
baseUrl = "http://books.toscrape.com/"
# Category landing page (index.html).
bookUrl = f"{baseUrl}catalogue/category/books/childrens_11/index.html"
# Prefix of the numbered pages; the page loop appends "<n>.html" (page-1, page-2 found).
pageUrl = f"{baseUrl}catalogue/category/books/childrens_11/page-"
# CSV header names.
columns = [
    'title',
    'price',
    'stock',
    'imageUrl',
    'rating',
    'url',
]

#### Empty dataSet and default page values

In [188]:
# Rows collected by the page loop; starts empty.
dataSet = []
# page: number of the next page to fetch.
# totalPages: placeholder of 1 until the real count is read from the first page.
page = totalPages = 1

#### Save dataSet to CSV file

In [189]:
def writeto_csv(data,filename,columns):
    """Write a header line followed by every row of ``data`` to a CSV file.

    Args:
        data: iterable of rows; each row is a sequence whose values are
            written in order (one CSV line per row).
        filename: path of the file to create. An existing file is overwritten.
        columns: sequence of header names written as the first line.

    The file is encoded as UTF-8.
    """
    # newline='' hands line-ending control to the csv module, as its docs require.
    with open(filename,'w',newline='',encoding="UTF-8") as file:
        # A single plain writer handles both the header and the data rows.
        writer = csv.writer(file)
        writer.writerow(columns)
        writer.writerows(data)

#### Loop through pages

In [190]:
# XPath expressions for the chosen elements. They are the same for every page,
# so they are compiled once here rather than on each loop iteration.
articles = XPath("//ol[contains(@class,'row')]/li") #one <li> block per book
titlePath = XPath(".//article[contains(@class,'product_pod')]/h3/a/@title")
linkPath = XPath(".//article[contains(@class,'product_pod')]/h3/a/@href")
pricePath = XPath(".//article/div[2]/p[contains(@class,'price_color')]/text()")
stockPath = XPath(".//article/div[2]/p[contains(@class,'availability')]/text()[normalize-space()]")
imagePath = XPath(".//article/div[1][contains(@class,'image_container')]/a/img/@src")
ratingPath = XPath(".//article/p[contains(@class,'star-rating')]/@class")


def _first(matches, default=''):
    """Return the first XPath match, or ``default`` when nothing matched."""
    return matches[0] if matches else default


while page<=totalPages:
    source = web.parse(pageUrl+str(page)+".html").getroot() #read and parse
    if page==1: #pagination
        perpageArticles = source.xpath("//form[@class=\"form-horizontal\"]/strong[3]/text()") #20
        totalArticles = source.xpath("//form[@class=\"form-horizontal\"]/strong[1]/text()")   #29
        # Only recompute the page count when both numbers were found;
        # otherwise totalPages keeps the value it had before the loop.
        if perpageArticles and totalArticles:
            totalPages = math.ceil(int(totalArticles[0])/int(perpageArticles[0])) #1.45 ceil up
        print("TotalPages found:",totalPages)
    print("Processing Page "+str(page)+" from ",totalPages)

    #iterate through all articles and individual element path
    for row in articles(source):
        title = _first(titlePath(row)).strip()
        #if title is missing, skip this article instead of failing on it
        if len(title)==0:
            continue

        # Relative hrefs/srcs are rewritten into absolute URLs under baseUrl.
        link = _first(linkPath(row)).replace('../../../',baseUrl+'catalogue/').strip()
        price = _first(pricePath(row))
        availability = _first(stockPath(row)).strip()
        image = _first(imagePath(row)).replace('../../../../',baseUrl).strip()
        # The class attribute reads e.g. "star-rating Three"; keep only the word.
        rating = _first(ratingPath(row)).replace('star-rating','').strip()

        dataSet.append([title,price,availability,image,rating,link])

    print("Rows in Dataset: ",len(dataSet))
    page+=1 #increment page for loop

TotalPages found: 2
Processing Page 1 from  2
Rows in Dataset:  20
Processing Page 2 from  2
Rows in Dataset:  29


In [191]:
len(dataSet) #number of rows collected across all processed pages

29

In [192]:
dataSet #collected rows, each [title, price, stock, imageUrl, rating, url]

[['Birdsong: A Story in Pictures',
  '£54.64',
  'In stock',
  'http://books.toscrape.com/media/cache/af/6e/af6e796160fe63e0cf19d44395c7ddf2.jpg',
  'Three',
  'http://books.toscrape.com/catalogue/birdsong-a-story-in-pictures_975/index.html'],
 ['The Bear and the Piano',
  '£36.89',
  'In stock',
  'http://books.toscrape.com/media/cache/cf/bb/cfbb5e62715c6d888fd07794c9bab5d6.jpg',
  'One',
  'http://books.toscrape.com/catalogue/the-bear-and-the-piano_967/index.html'],
 ['The Secret of Dreadwillow Carse',
  '£56.13',
  'In stock',
  'http://books.toscrape.com/media/cache/c4/a2/c4a2a1a026c67bcceb5a411c724d7d0c.jpg',
  'One',
  'http://books.toscrape.com/catalogue/the-secret-of-dreadwillow-carse_944/index.html'],
 ['The White Cat and the Monk: A Retelling of the Poem “Pangur Bán”',
  '£58.08',
  'In stock',
  'http://books.toscrape.com/media/cache/26/32/2632a1e12f2c085fabbe022ae4cd6933.jpg',
  'Four',
  'http://books.toscrape.com/catalogue/the-white-cat-and-the-monk-a-retelling-of-the-poe

#### Write the dataSet list to a CSV file

In [193]:
# Save the collected rows to books.csv, with `columns` as the header line.
writeto_csv(data=dataSet, filename='books.csv', columns=columns)