In [2]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

In [3]:
landingPage = 'https://books.toscrape.com'

# Get response from website
response = requests.get(url=landingPage)

In [4]:
# View all attributes of the response
response.__attrs__

['_content',
 'status_code',
 'headers',
 'url',
 'history',
 'encoding',
 'reason',
 'cookies',
 'elapsed',
 'request']

In [5]:
# What is the status code?
response.status_code

200

In [6]:
# Creating a soup from the content 
soup = BeautifulSoup(markup=response.content, 
              features='html.parser')

In [7]:
bookList = soup.find(name='ol', attrs={'class': 'row'})\
                .findAll(name='li', attrs={'class': "col-xs-6 col-sm-4 col-md-3 col-lg-3"})

In [8]:
test = bookList[0]

In [9]:
# Grabbing the title of the book
title = test.find(name='h3').find(name='a')['title']

# Grab the price of the book
priceInEuros = test.find(name='div', attrs={'class': 'product_price'})\
                    .find(name='p', attrs={'class': 'price_color'})\
                    .text\
                    .replace('£', '')

# Grab the availability in store
availability = test.find(name='div', attrs={'class': 'product_price'})\
                    .find(name='p', attrs={'class': 'instock availability'})\
                    .text\
                    .strip()

# Extract the star rating
rating = test.find(name='p', 
          attrs={'class': re.compile(pattern='star-rating.+')})['class'][1]

In [10]:
df = pd.DataFrame(columns=['title', 'priceInEuros', 'availability', 'rating'])


for idx, book in enumerate(bookList):
    # Grab the title
    title = book.find(name='h3').find(name='a')['title']
    # Grab the price
    priceInEuros = book.find(name='div', attrs={'class': 'product_price'})\
                    .find(name='p', attrs={'class': 'price_color'})\
                    .text\
                    .replace('£', '')
    # Grab the availability
    availability = book.find(name='div', attrs={'class': 'product_price'})\
                    .find(name='p', attrs={'class': 'instock availability'})\
                    .text\
                    .strip()
    # Grab the rating
    rating = book.find(name='p', 
          attrs={'class': re.compile(pattern='star-rating.+')})['class'][1]
    
    # Appending information to the dataframe
    df.loc[idx] = [title, float(priceInEuros), availability, rating]

## Incorporate Pagination

In [11]:
pagesScrapped = 1
df = pd.DataFrame(columns=['title', 'priceInEuros', 'availability', 'rating'])
landingPage = 'https://books.toscrape.com/'
url = landingPage
listOfDfs = []

while pagesScrapped <= 10:
    # For each page, make a request
    response = requests.get(url=url)

    # Create a soup
    soup = BeautifulSoup(markup=response.content,
                         features='html.parser')

    # Extract the list of books
    bookList = soup.find(name='ol', attrs={'class': 'row'})\
                .findAll(name='li', attrs={'class': "col-xs-6 col-sm-4 col-md-3 col-lg-3"})

    for idx, book in enumerate(bookList):
        # Grab the title
        title = book.find(name='h3').find(name='a')['title']
        # Grab the price
        priceInEuros = book.find(name='div', attrs={'class': 'product_price'})\
                        .find(name='p', attrs={'class': 'price_color'})\
                        .text\
                        .replace('£', '')
        # Grab the availability
        availability = book.find(name='div', attrs={'class': 'product_price'})\
                        .find(name='p', attrs={'class': 'instock availability'})\
                        .text\
                        .strip()
        # Grab the rating
        rating = book.find(name='p', 
            attrs={'class': re.compile(pattern='star-rating.+')})['class'][1]
        
        # Appending information to the dataframe
        df.loc[idx] = [title, float(priceInEuros), availability, rating]
    
    # Store the pages info in a list of dataframes
    listOfDfs.append(df)


    # Get the url for the next page
    nextPageUrl = soup.find(name='li', attrs={'class': 'next'})\
                        .find(name='a')['href']
    
    # Update url 
    if pagesScrapped >= 2:
        url = landingPage + 'catalogue/' + nextPageUrl
    else:
        url = landingPage + nextPageUrl

    # Update control variable
    pagesScrapped += 1

df = pd.concat(listOfDfs)

In [13]:
df

Unnamed: 0,title,priceInEuros,availability,rating
0,Modern Romance,28.26,In stock,Five
1,Miss Peregrine’s Home for Peculiar Children (M...,10.76,In stock,One
2,Louisa: The Extraordinary Life of Mrs. Adams,16.85,In stock,Two
3,Little Red,13.47,In stock,Three
4,Library of Souls (Miss Peregrine’s Peculiar Ch...,48.56,In stock,Five
...,...,...,...,...
15,Eureka Trivia 6.0,54.59,In stock,Four
16,Drive: The Surprising Truth About What Motivat...,34.95,In stock,Four
17,Done Rubbed Out (Reightman & Bailey #1),37.72,In stock,Five
18,Doing It Over (Most Likely To #1),35.61,In stock,Three


In [12]:
df.shape

(200, 4)