In [5]:
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
import time

In [2]:

# URL of the website to scrape
url = "http://books.toscrape.com/"

# Send a request to the URL
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content of the page
    soup = bs(response.text, 'html.parser')

    # Initialize a list to store book data
    books = []

    # Loop through each book entry in the HTML
    for book in soup.find_all('article', class_='product_pod'):
        # Extracting book title
        title = book.h3.a['title']
        
        # Extracting book rating
        rating = book.p['class'][1]
        
        # Extracting book price
        price = book.find('p', class_='price_color').text
        
        # Extracting book link
        link = url + book.h3.a['href']

        # Add the book data to the list
        books.append({'title': title, 'rating': rating, 'price': price, 'link': link})

        time.sleep(1)

    # Print the list of books
    for book in books:
        print(book)
else:
    print(f"Failed to retrieve data. Status code: {response.status_code}")


{'title': 'A Light in the Attic', 'rating': 'Three', 'price': 'Â£51.77', 'link': 'http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html'}
{'title': 'Tipping the Velvet', 'rating': 'One', 'price': 'Â£53.74', 'link': 'http://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html'}
{'title': 'Soumission', 'rating': 'One', 'price': 'Â£50.10', 'link': 'http://books.toscrape.com/catalogue/soumission_998/index.html'}
{'title': 'Sharp Objects', 'rating': 'Four', 'price': 'Â£47.82', 'link': 'http://books.toscrape.com/catalogue/sharp-objects_997/index.html'}
{'title': 'Sapiens: A Brief History of Humankind', 'rating': 'Five', 'price': 'Â£54.23', 'link': 'http://books.toscrape.com/catalogue/sapiens-a-brief-history-of-humankind_996/index.html'}
{'title': 'The Requiem Red', 'rating': 'One', 'price': 'Â£22.65', 'link': 'http://books.toscrape.com/catalogue/the-requiem-red_995/index.html'}
{'title': 'The Dirty Little Secrets of Getting Your Dream Job', 'rating': 'Four', 'pri

In [4]:
df = pd.DataFrame(books, columns = ['title', 'rating', 'price', 'link'])

In [5]:
df.to_csv('books_to_scrape.csv', index = False)

In [7]:
df1 = pd.read_csv("books_to_scrape.csv")

In [8]:
df.head()

Unnamed: 0,title,rating,price,link
0,A Light in the Attic,Three,Â£51.77,http://books.toscrape.com/catalogue/a-light-in...
1,Tipping the Velvet,One,Â£53.74,http://books.toscrape.com/catalogue/tipping-th...
2,Soumission,One,Â£50.10,http://books.toscrape.com/catalogue/soumission...
3,Sharp Objects,Four,Â£47.82,http://books.toscrape.com/catalogue/sharp-obje...
4,Sapiens: A Brief History of Humankind,Five,Â£54.23,http://books.toscrape.com/catalogue/sapiens-a-...


#### Function to simulate above operation for each page

In [135]:
import requests
from bs4 import BeautifulSoup
import time

def scrape_books(url):
    # Send a GET request to the website to fetch the HTML content of the page
    response = requests.get(url)
    
    # Use BeautifulSoup to parse the HTML content for data extraction
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Initialize an empty list to store data about each book
    books = []
    
    # Loop through all the 'article' tags with class 'product_pod' (each represents a book)
    for book in soup.find_all('article', class_='product_pod'):
        # Extract the title of the book
        title = book.h3.a['title'] 
        # Extract the rating (second class attribute) 
        rating = book.p['class'][1]  
        # Extract the price
        price = book.find('p', class_='price_color').text  
        # Extract the relative link to the book's detail page
        link = soup.select_one('.product_pod h3 a')['href'] 

        # Add the extracted data to the books list as a dictionary
        books.append({
            'title': title,
            'rating': rating,
            'price': price,
            'link': url[0:37] + link  # Combine base URL with relative link for the complete URL
        })
    
    return books  

# Example usage
# List to store data from all pages
all_books = []
# Base URL for the book pages  
base_url = 'https://books.toscrape.com/catalogue/page-{}.html'  

# Loop over the pages in the range 1 to 51 (50 pages total)
for page in range(1,51):
    # Format URL for the current page
    url = base_url.format(page)  
    # Scrape data from the current page
    iterative_data = scrape_books(url)
    # Add the data from the current page to the main list  
    all_books.extend(iterative_data)  
    # Sleep for 1 second between page requests to be polite to the server
    time.sleep(1)  


In [129]:
all_books

[{'title': 'A Light in the Attic',
  'rating': 'Three',
  'price': '£51.77',
  'link': 'https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html'},
 {'title': 'Tipping the Velvet',
  'rating': 'One',
  'price': '£53.74',
  'link': 'https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html'},
 {'title': 'Soumission',
  'rating': 'One',
  'price': '£50.10',
  'link': 'https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html'},
 {'title': 'Sharp Objects',
  'rating': 'Four',
  'price': '£47.82',
  'link': 'https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html'},
 {'title': 'Sapiens: A Brief History of Humankind',
  'rating': 'Five',
  'price': '£54.23',
  'link': 'https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html'},
 {'title': 'The Requiem Red',
  'rating': 'One',
  'price': '£22.65',
  'link': 'https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html'},
 {'title': 'The Dirty Lit

In [130]:
df = pd.DataFrame(all_books, columns = ['title', 'rating', 'price', 'link'])

In [131]:
df.to_csv('books_to_scrape.csv', index = False)

In [132]:
df1 = pd.read_csv("books_to_scrape.csv")

In [133]:
df1.head()

Unnamed: 0,title,rating,price,link
0,A Light in the Attic,Three,£51.77,https://books.toscrape.com/catalogue/a-light-i...
1,Tipping the Velvet,One,£53.74,https://books.toscrape.com/catalogue/a-light-i...
2,Soumission,One,£50.10,https://books.toscrape.com/catalogue/a-light-i...
3,Sharp Objects,Four,£47.82,https://books.toscrape.com/catalogue/a-light-i...
4,Sapiens: A Brief History of Humankind,Five,£54.23,https://books.toscrape.com/catalogue/a-light-i...


In [134]:
df1.shape

(400, 4)