In [1]:
# Import libraries
import re
import time
from urllib.parse import urljoin

import matplotlib.pyplot as plt
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup


In [2]:
# Base URL: scrape_all() starts crawling from this address and resolves each
# relative "next page" link against the URL of the page it was found on.
BASE = "http://books.toscrape.com/"

# Rating word found among the CSS classes of <p class="star-rating ..."> -> numeric value.
# Built once at module level instead of on every parse_book() call.
RATING_MAP = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}


def parse_book(book):
    """Extract title, price, availability and rating from one book card.

    Args:
        book: A BeautifulSoup tag for a single ``article.product_pod`` element
            (any object exposing ``.h3.a['title']`` and ``select_one`` works).

    Returns:
        dict with the keys:
            ``title`` (str): value of the link's ``title`` attribute.
            ``price`` (float): numeric part of the ``p.price_color`` text.
            ``availability`` (str | None): stripped text of
                ``p.instock.availability``, or None when the card has no such
                element.
            ``rating`` (int | None): 1-5, or None when the card has no
                ``p.star-rating`` element or none of its classes is a known
                rating word.

    Raises:
        ValueError: if the price text contains nothing ``float`` can parse.
    """
    title = book.h3.a['title']

    # Keep only digits and the decimal point; this drops the currency symbol
    # and any stray characters that precede it.
    price_text = book.select_one('p.price_color').text.strip()
    price = float(re.sub(r"[^\d.]", "", price_text))

    # A card without the in-stock marker yields None instead of crashing the crawl.
    avail_tag = book.select_one('p.instock.availability')
    avail = avail_tag.text.strip() if avail_tag is not None else None

    # Look the rating word up among *all* classes rather than trusting that it
    # is the second one, and fall back to None when nothing matches.
    rating_tag = book.select_one('p.star-rating')
    rating_classes = rating_tag.get('class', []) if rating_tag is not None else []
    rating = next((RATING_MAP[c] for c in rating_classes if c in RATING_MAP), None)

    return {'title': title, 'price': price, 'availability': avail, 'rating': rating}

In [4]:
def scrape_all(start_url=None, delay=1.0):
    """Crawl the paginated catalogue and return one row per book.

    Fetches a listing page, runs ``parse_book`` on every
    ``article.product_pod`` card, then follows the ``li.next a`` link until a
    page has no such link.

    Args:
        start_url: First page to fetch. Defaults to ``BASE`` when omitted.
        delay: Seconds to sleep between page requests (polite crawling).

    Returns:
        pandas.DataFrame built from the dicts returned by ``parse_book``
        (empty when no cards were found).

    Raises:
        requests.HTTPError: when a page responds with an error status
            (via ``raise_for_status``). Other ``requests`` exceptions such as
            timeouts propagate unchanged.
    """
    url = BASE if start_url is None else start_url
    rows = []

    # The context manager releases the session's pooled connections even if a
    # request or a parse step raises.
    with requests.Session() as session:
        session.headers.update({'User-Agent': 'Mozilla/5.0 (learning-scraper)'})

        while True:
            r = session.get(url, timeout=10)
            r.raise_for_status()

            # Hand BeautifulSoup the raw bytes so it can honour the charset the
            # document itself declares. `r.text` falls back to ISO-8859-1 when
            # the server sends no charset header, which garbles non-ASCII
            # characters in UTF-8 pages (e.g. the pound sign).
            soup = BeautifulSoup(r.content, 'html.parser')

            # Collect every book card on this page
            rows.extend(parse_book(b) for b in soup.select('article.product_pod'))

            # Follow the "next" link; its absence marks the last page
            next_btn = soup.select_one('li.next a')
            if next_btn is None:
                break
            # hrefs are relative to the current page, not to the site root
            url = urljoin(url, next_btn['href'])
            time.sleep(delay)  # polite delay

    return pd.DataFrame(rows)

# Crawl the whole catalogue, persist the untouched rows, then preview them
df_raw = scrape_all()
df_raw.to_csv(path_or_buf="books_raw.csv", index=False)
df_raw.head(n=5)



Unnamed: 0,title,price,availability,rating
0,A Light in the Attic,51.77,In stock,3
1,Tipping the Velvet,53.74,In stock,1
2,Soumission,50.1,In stock,1
3,Sharp Objects,47.82,In stock,4
4,Sapiens: A Brief History of Humankind,54.23,In stock,5


In [None]:
# Re-read the scraped rows from disk so the cleaning steps below start from
# the saved CSV rather than from the in-memory scrape result.
df = pd.read_csv(filepath_or_buffer="books_raw.csv")

In [6]:
# Quick check: column dtypes / non-null counts, then the first rows.
# DataFrame.info() writes its report to stdout and returns None, so it is
# called bare; wrapping it in print() adds a stray "None" line to the output.
df.info()
# Bare last expression -> the notebook renders the frame as a rich table
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         1000 non-null   object 
 1   price         1000 non-null   float64
 2   availability  1000 non-null   object 
 3   rating        1000 non-null   int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 31.4+ KB
None
                                   title  price availability  rating
0                   A Light in the Attic  51.77     In stock       3
1                     Tipping the Velvet  53.74     In stock       1
2                             Soumission  50.10     In stock       1
3                          Sharp Objects  47.82     In stock       4
4  Sapiens: A Brief History of Humankind  54.23     In stock       5


In [7]:
# Clean availability: collapse every run of whitespace (newlines, tabs,
# repeated spaces) into a single space, then trim both ends. Replacing only
# '\n' would leave the indentation spaces that surround it in the HTML text.
# `regex=True` is explicit because the default differs between pandas versions.
df['availability'] = (
    df['availability']
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)


In [8]:
# Missing ratings are stored as 0 so the column can be cast to plain integers
df['rating'] = (
    df['rating']
    .fillna(value=0)
    .astype(int)
)

In [9]:
# Drop duplicate titles (if any), keeping the first occurrence of each.
# reset_index(drop=True) then gives the result a fresh, contiguous 0..n-1
# index so row labels and row positions agree again after rows are removed.
df = df.drop_duplicates(subset=['title']).reset_index(drop=True)

In [10]:
# Create price bins.
# Bands are half-open [low, high) so the labels read naturally: 0.00 falls in
# '0-10' and exactly 50.00 falls in '50+'. The last edge is infinity so the
# open-ended '50+' label really has no upper limit (a cap of 100 would turn
# any higher price into NaN).
PRICE_EDGES = [0, 10, 20, 30, 50, float('inf')]
PRICE_LABELS = ['0-10', '10-20', '20-30', '30-50', '50+']

# assign() returns a new frame instead of writing a column into a frame that
# may itself be a filtered copy (avoids SettingWithCopyWarning).
df = df.assign(
    price_bin=pd.cut(df['price'], bins=PRICE_EDGES, labels=PRICE_LABELS, right=False)
)

df.to_csv("books_clean.csv", index=False)
df.head()

Unnamed: 0,title,price,availability,rating,price_bin
0,A Light in the Attic,51.77,In stock,3,50+
1,Tipping the Velvet,53.74,In stock,1,50+
2,Soumission,50.1,In stock,1,50+
3,Sharp Objects,47.82,In stock,4,30-50
4,Sapiens: A Brief History of Humankind,54.23,In stock,5,50+
