## 1. Imports

In [1]:
import requests
import pandas as pd
import re

from datetime import datetime
from bs4 import BeautifulSoup as bs

## 2. Functions

### 2.1 Scraping the categories and urls desired

In [2]:
def catalogue_urls(url):
    # This function intend to get the name and url of the book categories and put them in a dataframe 
    # that will be used afterwards
    
    # header to look real
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    
    # Access the root page and store it
    page = requests.get( url, headers=headers)
    
    # BeautifulSoup Object
    soup = bs( page.text, 'html.parser')
    
    #This is where I found the list including all the categories in the sidebar    
    catalogue_list = soup.find( 'ul', class_='nav nav-list').find_all('a')
    
    # Scraping the urls from all the categories
    catalogue_urls = [p.get('href') for p in catalogue_list]
    
    # Scraping the category names
    catalogue_list = [p.get_text().split() for p in catalogue_list]
    catalogue_list = [' '.join(strings) for strings in catalogue_list]
    
    # Creating a dataframe using the category names and the urls and filtering to get only the ones I need.
    catalogue_df = pd.DataFrame({'Catalogue':catalogue_list, 'urls': catalogue_urls})
    catalogue_true = ['Classics', 'Science Fiction', 'Humor', 'Business']
    catalogue = catalogue_df[catalogue_df['Catalogue'].isin(catalogue_true)]
    
    return catalogue

### 2.2 Book scraping

In [42]:
def scrapping(catalogue):
    # This function use the book categories and urls from the previous function to scrap the books informations

    # Creating an empty dataframe
    books_df_final = pd.DataFrame()
    
    # Root url
    root_url = 'https://books.toscrape.com/'
    
    # header to look real
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
        
    for index, row in catalogue.iterrows():
        
        #extracting infos from the catalogue
        cat = row['Catalogue']
        end_url = row['urls']
        
        # uniting the two parts of the url 
        url = root_url + end_url
        
        # Access and store the page
        page = requests.get( url, headers=headers)

        # BeautifulSoup Object
        soup = bs( page.text, 'html.parser')

        #This is where I found the list of all books from the category
        books = soup.find( 'ol', class_='row')

        # Creating an empty dataframe
        books_df = pd.DataFrame()

        # Book title
        books_df['book_title'] = [p.get_text( 'title' ) for p in books.find_all('h3')]
        
        # Book price
        books_df['book_price'] = [p.get_text() for p in books.find_all( 'p', class_='price_color')]
        books_df['book_price'][:] = [item.replace("Â", "") for item in books_df['book_price']]

        # Book rate
        rating_list = (soup.find('ol', class_='row')).find_all('p', class_='star-rating')
        number_list = ['One', 'Two', 'Three', 'Four', 'Five']
        number_dict = {'One':1, 'Two':2, 'Three':3, 'Four':4, 'Five':5}

        regex = re.compile('|'.join(re.escape(x) for x in number_list))
        books_df['book_rate'] = re.findall(regex,str(rating_list))

        # changing the string type rate to number using dict comprehension
        books_df['book_rate'] = books_df['book_rate'].replace(number_dict) 

        # Book Availability
        books_df['book_availability'] = [p.get_text().strip() for p in books.find_all( 'p', class_='instock availability')]

        # Category name
        books_df['book_category'] = cat
        #books_df['book_category'] = 'CHANGE HERE'

        # datetime of scrap
        scrap_time = datetime.now().strftime( '%Y-%m-%d %H:%M:%S' )
        books_df['scrap_time'] = scrap_time
        
        # Appending all the dataframes into one
        books_df_final = books_df_final.append(books_df)
        
    return books_df_final

In [43]:
# defining the root url
url = 'https://books.toscrape.com/'

# running the catalogue_urls function to get the urls of the different categories
catalogue = catalogue_urls(url)

# running the webscraping function
books_df = scrapping(catalogue)
books_df

Unnamed: 0,book_title,book_price,book_rate,book_availability,book_category,scrap_time
0,The Secret Garden,£15.08,4,In stock,Classics,2022-02-21 19:24:32
1,The Metamorphosis,£28.58,1,In stock,Classics,2022-02-21 19:24:32
2,The Pilgrim's Progress,£50.26,2,In stock,Classics,2022-02-21 19:24:32
3,The Hound of the ...,£14.82,2,In stock,Classics,2022-02-21 19:24:32
4,Little Women (Little Women ...,£28.07,4,In stock,Classics,2022-02-21 19:24:32
5,Gone with the Wind,£32.49,3,In stock,Classics,2022-02-21 19:24:32
6,Candide,£58.63,3,In stock,Classics,2022-02-21 19:24:32
7,Animal Farm,£57.22,3,In stock,Classics,2022-02-21 19:24:32
8,Wuthering Heights,£17.73,3,In stock,Classics,2022-02-21 19:24:32
9,The Picture of Dorian ...,£29.70,2,In stock,Classics,2022-02-21 19:24:32


In [44]:
# turning it into a csv file
csv_file = books_df.to_csv('TeaOClock-dataset.csv')