In [5]:
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
import time

In [6]:
# Landing page of the demo bookstore; a later cell appends each book's
# relative link to this base URL.
url = "https://books.toscrape.com/"
# requests applies no timeout unless one is given, so an unresponsive server
# would block this cell indefinitely. Give up after 10 seconds instead.
response = requests.get(url, timeout=10)
# Report the outcome without raising, so the notebook keeps running either way.
if response.status_code == 200:
    print("request successful")
else:
    print("request failed")

request successful


In [7]:
# Create a soup object to parse the HTML content.
# Parse the raw bytes rather than response.text: the document declares its own
# charset (utf-8) in a <meta> tag, which BeautifulSoup can detect from bytes,
# whereas response.text depends on whatever charset requests infers from the
# HTTP headers. The later scraping cells also build their soups from .content.
soup = BeautifulSoup(response.content, "html.parser")
# Show only the beginning of the page; printing the whole document floods the
# notebook output without adding information.
print(str(soup)[:1500])

<!DOCTYPE html>

<!--[if lt IE 7]>      <html lang="en-us" class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]>         <html lang="en-us" class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]>         <html lang="en-us" class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en-us"> <!--<![endif]-->
<head>
<title>
    All products | Books to Scrape - Sandbox
</title>
<meta content="text/html; charset=utf-8" http-equiv="content-type"/>
<meta content="24th Jun 2016 09:29" name="created"/>
<meta content="" name="description"/>
<meta content="width=device-width" name="viewport"/>
<meta content="NOARCHIVE,NOCACHE" name="robots"/>
<!-- Le HTML5 shim, for IE6-8 support of HTML elements -->
<!--[if lt IE 9]>
        <script src="//html5shim.googlecode.com/svn/trunk/html5.js"></script>
        <![endif]-->
<link href="static/oscar/favicon.ico" rel="shortcut icon"/>
<link href="static/oscar/css/styles.css" rel="stylesheet" type="text/css"/>
<link href="s

In [8]:
# Find all book titles and their links on the landing page
books = soup.find_all('h3')

# Star-rating CSS class word -> numeric rating. Built once, not once per book.
rating_map = {"One": 1, "Two": 2, "Three": 3, "Four": 4, "Five": 5}

start_time = time.time()
books_extracted = 0

# Iterate through the books and extract the info for each book
for book in books:
    book_url = book.find('a')['href']
    # Without a timeout a stalled connection would hang the whole loop.
    book_response = requests.get(url + book_url, timeout=10)
    if book_response.status_code != 200:
        # Do not try to parse an error page as if it were a book.
        print(f"skipping {book_url}: HTTP {book_response.status_code}")
        continue
    book_soup = BeautifulSoup(book_response.content, "html.parser")

    title = book_soup.find('h1').text.strip()

    # The category is the third link of the breadcrumb trail; leave it empty
    # when the trail is missing or shorter than expected.
    category = ""
    breadcrumb = book_soup.find('ul', class_="breadcrumb")
    if breadcrumb:
        crumb_links = breadcrumb.find_all('a')
        if len(crumb_links) > 2:
            category = crumb_links[2].text.strip()

    # The description is the <p> that follows the "product_description" div.
    description = ""
    desc_div = book_soup.find("div", id="product_description")
    if desc_div:
        desc_p = desc_div.find_next_sibling("p")
        if desc_p:
            description = desc_p.text.strip()

    # The second CSS class on the star-rating tag is looked up in rating_map.
    rating_tag = book_soup.find("p", class_="star-rating")
    rating_class = rating_tag.get("class", []) if rating_tag else []
    average_rating = rating_map.get(rating_class[1], "") if len(rating_class) > 1 else ""

    # The review count is read from the "Number of reviews" row of the
    # product table. A row without a <td> is skipped instead of crashing.
    ratings_count = ""
    table = book_soup.select("table.table-striped tr")
    for row in table:
        th = row.find("th")
        td = row.find("td")
        if th is not None and td is not None and th.text.strip() == "Number of reviews":
            ratings_count = td.text.strip()
            break

    # The image src is relative ("../../..."); rebuild it as an absolute URL.
    thumbnail = ""
    img_tag = book_soup.select_one("div.item.active img")
    if img_tag and img_tag.get("src"):
        img_src = img_tag["src"].replace("../../", "")
        thumbnail = "https://books.toscrape.com/" + img_src

    books_extracted += 1

    print(f'title: {title}')
    print(f'description: {description[:80]}...')
    print(f'category: {category}')
    print(f'rating: {average_rating}')
    print(f'ratingsCount: {ratings_count}')
    print(f'thumbnail: {thumbnail}')

# Measure the elapsed time once, after the loop. The original recomputed it on
# every iteration into a misspelled variable and never reported it.
end_time = time.time()
total_time = (end_time - start_time) / 60.0
print(f"books extracted: {books_extracted} in {total_time:.2f} minutes")

title: A Light in the Attic
description: It's hard to imagine a world without A Light in the Attic. This now-classic coll...
category: Poetry
rating: 3
ratingsCount: 0
thumbnail: https://books.toscrape.com/media/cache/fe/72/fe72f0532301ec28892ae79a629a293c.jpg
title: Tipping the Velvet
description: "Erotic and absorbing...Written with starling power."--"The New York Times Book ...
category: Historical Fiction
rating: 1
ratingsCount: 0
thumbnail: https://books.toscrape.com/media/cache/08/e9/08e94f3731d7d6b760dfbfbc02ca5c62.jpg
title: Soumission
description: Dans une France assez proche de la nôtre, un homme s’engage dans la carrière uni...
category: Fiction
rating: 1
ratingsCount: 0
thumbnail: https://books.toscrape.com/media/cache/ee/cf/eecfe998905e455df12064dba399c075.jpg
title: Sharp Objects
description: WICKED above her hipbone, GIRL across her heart Words are like a road map to rep...
category: Mystery
rating: 4
ratingsCount: 0
thumbnail: https://books.toscrape.com/media/cache/c0/5

In [9]:
# Settings for the full catalogue scrape.
SITE_ROOT = "https://books.toscrape.com/"
CATALOGUE_ROOT = SITE_ROOT + "catalogue/"
# Pages 1..29 inclusive, i.e. the same pages as the original range(1, 30).
# NOTE(review): confirm that stopping at page 29 is intentional.
FIRST_PAGE, LAST_PAGE = 1, 29
# Seconds to wait per request; requests never times out unless told to.
REQUEST_TIMEOUT = 10
# Star-rating CSS class word -> numeric rating.
RATING_MAP = {"One": 1, "Two": 2, "Three": 3, "Four": 4, "Five": 5}


def _parse_book_page(book_soup):
    """Turn one parsed book detail page into a row for ``books_data``.

    Args:
        book_soup: BeautifulSoup document of a single book's detail page.

    Returns:
        list: ``[title, description, category, average_rating,
        ratings_count, thumbnail]``. Any field other than the title that
        cannot be located is returned as an empty string.

    Raises:
        AttributeError: if the page contains no ``<h1>`` element.
    """
    title = book_soup.find('h1').text.strip()

    # The category is the third link of the breadcrumb trail.
    category = ""
    breadcrumb = book_soup.find('ul', class_="breadcrumb")
    if breadcrumb:
        links = breadcrumb.find_all('a')
        if len(links) > 2:
            category = links[2].text.strip()

    # The description is the <p> that follows the "product_description" div.
    description = ""
    desc_div = book_soup.find("div", id="product_description")
    if desc_div:
        desc_p = desc_div.find_next_sibling("p")
        if desc_p:
            description = desc_p.text.strip()

    # The second CSS class on the star-rating tag is looked up in RATING_MAP.
    rating_tag = book_soup.find("p", class_="star-rating")
    rating_class = rating_tag.get("class", []) if rating_tag else []
    average_rating = RATING_MAP.get(rating_class[1], "") if len(rating_class) > 1 else ""

    # The review count is read from the "Number of reviews" row of the
    # product table. A row without a <td> is skipped instead of crashing.
    ratings_count = ""
    for row in book_soup.select("table.table-striped tr"):
        th = row.find("th")
        td = row.find("td")
        if th is not None and td is not None and th.text.strip() == "Number of reviews":
            ratings_count = td.text.strip()
            break

    # The image src is relative ("../../..."); rebuild it as an absolute URL.
    thumbnail = ""
    img_tag = book_soup.select_one("div.item.active img")
    if img_tag and img_tag.get("src"):
        thumbnail = SITE_ROOT + img_tag["src"].replace("../../", "")

    return [title, description, category, average_rating, ratings_count, thumbnail]


books_data = []
start_time = time.time()

# One Session reuses the underlying connection across the many requests below.
with requests.Session() as session:
    for page_num in range(FIRST_PAGE, LAST_PAGE + 1):
        print(f"📘 Scraping page {page_num}...")
        # Page-specific names are used on purpose: reusing `url`, `response`,
        # `soup` or `books` would overwrite the values set by earlier cells
        # and break them if they were re-run afterwards.
        page_url = f"{CATALOGUE_ROOT}page-{page_num}.html"
        page_response = session.get(page_url, timeout=REQUEST_TIMEOUT)
        if not page_response.ok:
            print(f"Skipping page {page_num}: HTTP {page_response.status_code}")
            continue
        page_soup = BeautifulSoup(page_response.content, 'html.parser')

        for heading in page_soup.find_all('h3'):
            # Build full URL
            relative_url = heading.find('a')['href'].replace('../../../', '')
            book_url = CATALOGUE_ROOT + relative_url

            # Request book detail page
            book_response = session.get(book_url, timeout=REQUEST_TIMEOUT)
            if not book_response.ok:
                # Do not parse an error page as if it were a book.
                print(f"Skipping {book_url}: HTTP {book_response.status_code}")
                continue
            book_soup = BeautifulSoup(book_response.content, "html.parser")

            # Add to data list
            books_data.append(_parse_book_page(book_soup))

        print(f"Page {page_num} done — total books so far: {len(books_data)}")

# End time after all pages
end_time = time.time()
total_time = (end_time - start_time) / 60.0

print("\n DONE!")
print(f"📚 Total books scraped: {len(books_data)}")
print(f"🕒 Total time taken: {total_time:.2f} minutes")

df = pd.DataFrame(books_data, columns=["title", "description", "category", "averageRating", "ratingsCount", "thumbnail"])

📘 Scraping page 1...
Page 1 done — total books so far: 20
📘 Scraping page 2...
Page 2 done — total books so far: 40
📘 Scraping page 3...
Page 3 done — total books so far: 60
📘 Scraping page 4...
Page 4 done — total books so far: 80
📘 Scraping page 5...
Page 5 done — total books so far: 100
📘 Scraping page 6...
Page 6 done — total books so far: 120
📘 Scraping page 7...
Page 7 done — total books so far: 140
📘 Scraping page 8...
Page 8 done — total books so far: 160
📘 Scraping page 9...
Page 9 done — total books so far: 180
📘 Scraping page 10...
Page 10 done — total books so far: 200
📘 Scraping page 11...
Page 11 done — total books so far: 220
📘 Scraping page 12...
Page 12 done — total books so far: 240
📘 Scraping page 13...
Page 13 done — total books so far: 260
📘 Scraping page 14...
Page 14 done — total books so far: 280
📘 Scraping page 15...
Page 15 done — total books so far: 300
📘 Scraping page 16...
Page 16 done — total books so far: 320
📘 Scraping page 17...
Page 17 done — total boo

In [10]:
# Preview the first ten scraped rows to sanity-check the extracted columns.
df.head(10)

Unnamed: 0,title,description,category,averageRating,ratingsCount,thumbnail
0,A Light in the Attic,It's hard to imagine a world without A Light i...,Poetry,3,0,https://books.toscrape.com/media/cache/fe/72/f...
1,Tipping the Velvet,"""Erotic and absorbing...Written with starling ...",Historical Fiction,1,0,https://books.toscrape.com/media/cache/08/e9/0...
2,Soumission,"Dans une France assez proche de la nôtre, un h...",Fiction,1,0,https://books.toscrape.com/media/cache/ee/cf/e...
3,Sharp Objects,"WICKED above her hipbone, GIRL across her hear...",Mystery,4,0,https://books.toscrape.com/media/cache/c0/59/c...
4,Sapiens: A Brief History of Humankind,From a renowned historian comes a groundbreaki...,History,5,0,https://books.toscrape.com/media/cache/ce/5f/c...
5,The Requiem Red,Patient Twenty-nine.A monster roams the halls ...,Young Adult,1,0,https://books.toscrape.com/media/cache/6b/07/6...
6,The Dirty Little Secrets of Getting Your Dream...,Drawing on his extensive experience evaluating...,Business,4,0,https://books.toscrape.com/media/cache/e1/1b/e...
7,The Coming Woman: A Novel Based on the Life of...,"""If you have a heart, if you have a soul, Kare...",Default,3,0,https://books.toscrape.com/media/cache/97/36/9...
8,The Boys in the Boat: Nine Americans and Their...,For readers of Laura Hillenbrand's Seabiscuit ...,Default,4,0,https://books.toscrape.com/media/cache/d1/2d/d...
9,The Black Maria,"Praise for Aracelis Girmay:""[Girmay's] every l...",Poetry,1,0,https://books.toscrape.com/media/cache/d1/7a/d...


In [11]:
# Count how often each ratingsCount value occurs among the scraped books.
# In the recorded run every one of the 580 rows has the value 0.
df["ratingsCount"].value_counts()

ratingsCount
0    580
Name: count, dtype: int64

In [12]:
# The scraped rows carry no author value, so add an "authors" column filled
# with a constant placeholder.
df = df.assign(authors="Unknown")

In [13]:
# Display the scraped frame (pandas truncates the middle rows of long frames).
df

Unnamed: 0,title,description,category,averageRating,ratingsCount,thumbnail,authors
0,A Light in the Attic,It's hard to imagine a world without A Light i...,Poetry,3,0,https://books.toscrape.com/media/cache/fe/72/f...,Unknown
1,Tipping the Velvet,"""Erotic and absorbing...Written with starling ...",Historical Fiction,1,0,https://books.toscrape.com/media/cache/08/e9/0...,Unknown
2,Soumission,"Dans une France assez proche de la nôtre, un h...",Fiction,1,0,https://books.toscrape.com/media/cache/ee/cf/e...,Unknown
3,Sharp Objects,"WICKED above her hipbone, GIRL across her hear...",Mystery,4,0,https://books.toscrape.com/media/cache/c0/59/c...,Unknown
4,Sapiens: A Brief History of Humankind,From a renowned historian comes a groundbreaki...,History,5,0,https://books.toscrape.com/media/cache/ce/5f/c...,Unknown
...,...,...,...,...,...,...,...
575,"Death Note, Vol. 6: Give-and-Take (Death Note #6)",Light Yagami is an ace student with great pros...,Sequential Art,3,0,https://books.toscrape.com/media/cache/c7/11/c...,Unknown
576,Catherine the Great: Portrait of a Woman,Pulitzer Prize winner Massie offers the tale o...,History,4,0,https://books.toscrape.com/media/cache/37/3a/3...,Unknown
577,Better Homes and Gardens New Cook Book,"Features: Over 900 new recipes -- 1,200 in all...",Food and Drink,3,0,https://books.toscrape.com/media/cache/71/21/7...,Unknown
578,An Unquiet Mind: A Memoir of Moods and Madness,WITH A NEW PREFACE BY THE AUTHORIn her bestsel...,Psychology,3,0,https://books.toscrape.com/media/cache/e5/65/e...,Unknown


In [46]:
# Load the Google Books dataset from a CSV in the working directory.
# NOTE(review): no visible cell creates this file; confirm where it comes from.
df_google = pd.read_csv("google_books_data.csv")

In [48]:
# Display the Google Books frame to compare its columns with the scraped frame.
df_google

Unnamed: 0,title,authors,description,category,averageRating,ratingsCount,thumbnail
0,A Game of Thrones,George R. R. Martin,Fantasy-roman.,Fiction,4.5,131.0,http://books.google.com/books/content?id=bIZiA...
1,The Invisible Man,"Len Jenkin, Herbert George Wells",,Drama,,,http://books.google.com/books/content?id=SCHqm...
2,Crime and Punishment,Fyodor Dostoyevsky,Introduction by W. J. Leatherbarrow; Translati...,Fiction,,,http://books.google.com/books/content?id=Z0jil...
3,Cards on the Table,Agatha Christie,Theatre program.,Fiction,,,http://books.google.com/books/content?id=NGoFu...
4,4.50 from Paddington,Agatha Christie,Elspeth looked out the train window into the t...,Fiction,,,http://books.google.com/books/content?id=6I9Hv...
...,...,...,...,...,...,...,...
1667,Harry Potter und der Orden des Phönix (Harry P...,J.K. Rowling,"Harry Potter ist überzeugt davon, dass Lord Vo...",Fiction,,,http://books.google.com/books/content?id=WsD0A...
1668,Harry Potter und die Heiligtümer des Todes von...,"Amy Ainsworth, derQuerleser,",Harry Potter und die Heiligtümer des Todes von...,Study Aids,,,http://books.google.com/books/content?id=eTqoD...
1669,Harry Potter und der Feuerkelch von J .K. Rowl...,"Sandrine Guihéneuf, Florence Balthasar",Harry Potter und der Feuerkelch von J. K. Rowl...,Education,,,http://books.google.com/books/content?id=9xmKD...
1670,"Always and forever, Lara Jean",Jenny Han,Lara Jeans letztes Highschool-Jahr könnte nich...,Juvenile Fiction,,,http://books.google.com/books/content?id=g_4-D...


In [50]:
# The scraper stores ratingsCount as text and uses "" for a rating it cannot
# read, while the Google Books frame holds floats/NaN in the same columns.
# Coerce the scraped columns to numbers first, so each combined column has a
# single numeric dtype instead of a mix of strings and floats. Values that
# cannot be parsed become NaN.
df_scraped = df.assign(
    averageRating=pd.to_numeric(df["averageRating"], errors="coerce"),
    ratingsCount=pd.to_numeric(df["ratingsCount"], errors="coerce"),
)
# Stack both sources; columns are aligned by name and the index is renumbered.
df_combined = pd.concat([df_scraped, df_google], ignore_index=True)

In [52]:
# Preview the first rows of the combined frame.
df_combined.head()

Unnamed: 0,title,description,category,averageRating,ratingsCount,thumbnail,authors
0,A Light in the Attic,It's hard to imagine a world without A Light i...,Poetry,3.0,0,https://books.toscrape.com/media/cache/fe/72/f...,Unknown
1,Tipping the Velvet,"""Erotic and absorbing...Written with starling ...",Historical Fiction,1.0,0,https://books.toscrape.com/media/cache/08/e9/0...,Unknown
2,Soumission,"Dans une France assez proche de la nôtre, un h...",Fiction,1.0,0,https://books.toscrape.com/media/cache/ee/cf/e...,Unknown
3,Sharp Objects,"WICKED above her hipbone, GIRL across her hear...",Mystery,4.0,0,https://books.toscrape.com/media/cache/c0/59/c...,Unknown
4,Sapiens: A Brief History of Humankind,From a renowned historian comes a groundbreaki...,History,5.0,0,https://books.toscrape.com/media/cache/ce/5f/c...,Unknown


In [54]:
# Count missing values per column in the combined frame.
df_combined.isnull().sum()

title               0
description       244
category          123
averageRating    1433
ratingsCount     1433
thumbnail         166
authors             0
dtype: int64

In [56]:
# (rows, columns) of the combined frame.
df_combined.shape

(2252, 7)

In [61]:
# Persist the combined frame as CSV, leaving out the index column.
# "utf-8-sig" is UTF-8 with a leading byte-order mark.
export_options = {"index": False, "encoding": "utf-8-sig"}
df_combined.to_csv("combined_data.csv", **export_options)
print("✅ Data saved")

✅ Data saved
