In [13]:
import requests
from bs4 import BeautifulSoup

link = 'https://books.toscrape.com/catalogue/page-1.html'

# Sending a request to the website(link)
res = requests.get(link)
res

<Response [200]>

In [14]:
soup = BeautifulSoup(res.text, 'html.parser')


In [15]:
# Extract book information from the books website
books = []
for book in soup.find_all('article', class_='product_pod'):
    title = book.find('h3').find('a')['title']
    price = book.find('p', class_='price_color').text
    rating = book.find('p', class_='star-rating')['class'][1]  # Gets the rating class (One, Two, Three, etc.)
    
    books.append({
        'title': title,
        'price': price,
        'rating': rating
    })
    
    print(f"Title: {title}")
    print(f"Price: {price}")
    print(f"Rating: {rating}")
    print("*" * 50)

print(f"\nTotal books found: {len(books)}")

Title: A Light in the Attic
Price: Â£51.77
Rating: Three
**************************************************
Title: Tipping the Velvet
Price: Â£53.74
Rating: One
**************************************************
Title: Soumission
Price: Â£50.10
Rating: One
**************************************************
Title: Sharp Objects
Price: Â£47.82
Rating: Four
**************************************************
Title: Sapiens: A Brief History of Humankind
Price: Â£54.23
Rating: Five
**************************************************
Title: The Requiem Red
Price: Â£22.65
Rating: One
**************************************************
Title: The Dirty Little Secrets of Getting Your Dream Job
Price: Â£33.34
Rating: Four
**************************************************
Title: The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull
Price: Â£17.93
Rating: Three
**************************************************
Title: The Boys in the Boat: Nine Americans and Their E

In [17]:
# Debug: Let's see what's actually on the page
print("=== PAGE TITLE ===")
print(soup.title.text if soup.title else "No title found")
print("\n=== CHECKING FOR QUOTES ELEMENTS ===")
print(f"Found {len(soup.find_all('span', class_='text'))} spans with class 'text'")
print(f"Found {len(soup.find_all('small', class_='author'))} small elements with class 'author'")
print(f"Found {len(soup.find_all('div', class_='quote'))} divs with class 'quote'")

print("\n=== FIRST FEW DIVS ON THE PAGE ===")
for i, div in enumerate(soup.find_all('div')[:5]):
    print(f"Div {i+1}: classes = {div.get('class', 'No class')}")
    
print("\n=== LOOKING FOR BOOK RELATED ELEMENTS ===")
print(f"Found {len(soup.find_all('article', class_='product_pod'))} articles with class 'product_pod'")
print(f"Found {len(soup.find_all('h3'))} h3 elements")

=== PAGE TITLE ===

    All products | Books to Scrape - Sandbox


=== CHECKING FOR QUOTES ELEMENTS ===
Found 0 spans with class 'text'
Found 0 small elements with class 'author'
Found 0 divs with class 'quote'

=== FIRST FEW DIVS ON THE PAGE ===
Div 1: classes = ['page_inner']
Div 2: classes = ['row']
Div 3: classes = ['col-sm-8', 'h1']
Div 4: classes = ['container-fluid', 'page']
Div 5: classes = ['page_inner']

=== LOOKING FOR BOOK RELATED ELEMENTS ===
Found 20 articles with class 'product_pod'
Found 20 h3 elements


In [18]:
import pandas as pd

# Create DataFrame from the books data
df = pd.DataFrame(books)

# Display the DataFrame
print("=== BOOKS DATAFRAME ===")
print(df)
print(f"\nDataFrame shape: {df.shape}")
print(f"\nDataFrame info:")
print(df.info())
print(f"\nDataFrame columns: {list(df.columns)}")

# Save to CSV file
csv_filename = 'books_data.csv'
df.to_csv(csv_filename, index=False, encoding='utf-8')
print(f"\nDataFrame saved to '{csv_filename}' successfully!")

# Display first few rows to verify
print(f"\nFirst 5 rows of the saved data:")
print(df.head())

=== BOOKS DATAFRAME ===
                                                title    price rating
0                                A Light in the Attic  Â£51.77  Three
1                                  Tipping the Velvet  Â£53.74    One
2                                          Soumission  Â£50.10    One
3                                       Sharp Objects  Â£47.82   Four
4               Sapiens: A Brief History of Humankind  Â£54.23   Five
5                                     The Requiem Red  Â£22.65    One
6   The Dirty Little Secrets of Getting Your Dream...  Â£33.34   Four
7   The Coming Woman: A Novel Based on the Life of...  Â£17.93  Three
8   The Boys in the Boat: Nine Americans and Their...  Â£22.60   Four
9                                     The Black Maria  Â£52.15    One
10     Starving Hearts (Triangular Trade Trilogy, #1)  Â£13.99    Two
11                              Shakespeare's Sonnets  Â£20.66   Four
12                                        Set Me Free  Â£17.46   F

In [20]:
# Basic analysis of the scraped books data
print("=== BASIC ANALYSIS ===")
print(f"Total books scraped: {len(df)}")
print(f"\nUnique ratings distribution:")
print(df['rating'].value_counts())

print(f"\nPrice range:")
# Remove currency symbols and convert to float for analysis
# Handle both £ and encoded currency symbols
df['price_numeric'] = df['price'].str.replace('£', '').str.replace('Â', '').astype(float)
print(f"Min price: £{df['price_numeric'].min():.2f}")
print(f"Max price: £{df['price_numeric'].max():.2f}")
print(f"Average price: £{df['price_numeric'].mean():.2f}")

print(f"\nSample of book titles:")
for i, title in enumerate(df['title'].head(3)):
    print(f"{i+1}. {title}")

print(f"\nPrice samples:")
for i, price in enumerate(df['price'].head(3)):
    print(f"{i+1}. {price}")

print(f"\nData types:")
print(df.dtypes)

=== BASIC ANALYSIS ===
Total books scraped: 20

Unique ratings distribution:
rating
One      6
Five     4
Four     4
Three    3
Two      3
Name: count, dtype: int64

Price range:
Min price: £13.99
Max price: £57.25
Average price: £38.05

Sample of book titles:
1. A Light in the Attic
2. Tipping the Velvet
3. Soumission

Price samples:
1. Â£51.77
2. Â£53.74
3. Â£50.10

Data types:
title             object
price             object
rating            object
price_numeric    float64
dtype: object
