In [1]:
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:


# Maps the rating word found in a book's star-rating CSS class to its numeric value.
word_to_number = {
    word: value
    for value, word in enumerate(('Zero', 'One', 'Two', 'Three', 'Four', 'Five'))
}

def scrape_book(book_url, timeout=30):
    """Scrape one book detail page.

    Parameters
    ----------
    book_url : str
        Absolute URL of the book's detail page.
    timeout : float, optional
        Seconds to wait for the server before giving up; forwarded to
        ``requests.get``. Without it a stalled connection would block the
        calling thread indefinitely.

    Returns
    -------
    tuple of str
        ``(upc, title, genre, availability, description, image_url)``.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts, or a 4xx/5xx response.
    ValueError
        If the product information table has fewer cells than expected.
    """
    r = requests.get(book_url, timeout=timeout)
    # Fail here with a clear HTTP error rather than later with an opaque
    # 'NoneType' error while parsing an error page.
    r.raise_for_status()
    bs = BeautifulSoup(r.content, "html.parser")

    title = bs.find('h1').get_text(strip=True)

    # Query the product information table once: the first cell is the UPC,
    # the sixth (index 5) is the availability text.
    info_cells = bs.select('table.table.table-striped tr td')
    if len(info_cells) < 6:
        raise ValueError(f"Unexpected product table layout at {book_url}")
    upc = info_cells[0].get_text(strip=True)
    availability = info_cells[5].get_text(strip=True)

    # The last breadcrumb link is the book's category.
    genre = bs.select('ul.breadcrumb li a')[-1].get_text(strip=True)

    desc_tag = bs.find('meta', attrs={'name': 'description'})
    # .get() tolerates a <meta name="description"> tag without a content attribute.
    description = (desc_tag.get('content') or "").strip() if desc_tag else ""

    # The image src is relative to the page it appears on, so resolve it
    # against the URL that was actually served (after any redirects).
    image_rel = bs.select_one("div.item.active img")["src"]
    image_url = urljoin(r.url, image_rel)

    return upc, title, genre, availability, description, image_url


def scrape_books_per_page(soup, base_url="https://books.toscrape.com/catalogue/", max_workers=10):
    """Build a DataFrame of all books listed on one catalogue page.

    Price, rating and detail-page URL are read from the listing itself; the
    remaining fields come from each book's detail page via ``scrape_book``.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of a catalogue listing page.
    base_url : str, optional
        URL against which the listing's relative book links are resolved.
    max_workers : int, optional
        Number of detail pages fetched concurrently.

    Returns
    -------
    pandas.DataFrame
        One row per book with columns ``UPC``, ``Title``, ``Price (£)``,
        ``Rating``, ``Genre``, ``Availability``, ``Description`` and
        ``Image_URL``. A page without any books yields an empty frame with
        the same columns instead of raising.
    """
    columns = ['UPC', 'Title', 'Price (£)', 'Rating', 'Genre',
               'Availability', 'Description', 'Image_URL']
    prices, ratings, urls = [], [], []

    for comp in soup.find_all('article', class_='product_pod'):
        # The rating is encoded as the last CSS class, e.g. "star-rating Three".
        rating = word_to_number[comp.find('p', class_='star-rating')['class'][-1]]
        price = float(comp.select_one('p.price_color').get_text(strip=True).replace("£", ""))
        url = urljoin(base_url, comp.h3.a['href'])

        prices.append(price)
        ratings.append(rating)
        urls.append(url)

    if not urls:
        # zip(*[]) yields nothing to unpack, so handle the empty page up front.
        # Numeric dtypes are set so concatenating with non-empty pages does
        # not degrade those columns to object.
        return pd.DataFrame(columns=columns).astype(
            {'Price (£)': 'float64', 'Rating': 'int64'}
        )

    # Fetch the detail pages in parallel; ex.map preserves input order, so
    # details[i] lines up with prices[i] and ratings[i].
    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        details = list(ex.map(scrape_book, urls))

    upcs, titles_detail, genres, availabilities, descriptions, images = zip(*details)

    return pd.DataFrame({
        'UPC': upcs,
        'Title': titles_detail,
        'Price (£)': prices,
        'Rating': ratings,
        'Genre': genres,
        'Availability': availabilities,
        'Description': descriptions,
        'Image_URL': images
    }, columns=columns)


# Loop over all 50 catalogue listing pages and collect one DataFrame per page.
pages = [f"https://books.toscrape.com/catalogue/page-{p}.html" for p in range(1,51)]
dfs=[]
for url in pages:
    # A timeout keeps a single stalled connection from hanging the whole run.
    r = requests.get(url, timeout=30)
    # Stop on a 4xx/5xx response instead of trying to parse an error page.
    r.raise_for_status()
    s = BeautifulSoup(r.content, "html.parser")
    dfs.append(scrape_books_per_page(s))

result_df1 = pd.concat(dfs, ignore_index=True)
print(result_df1.head(), len(result_df1))

# Save to CSV. Create the target directory first so a missing folder does not
# throw away the whole scrape at the very last step.
output_path = Path("../data/raw/bookstoscrape.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
result_df1.to_csv(output_path, index=False, encoding="utf-8-sig")


                UPC                                  Title  Price (£)  Rating  \
0  a897fe39b1053632                   A Light in the Attic      51.77       3   
1  90fa61229261140a                     Tipping the Velvet      53.74       1   
2  6957f44c3847a760                             Soumission      50.10       1   
3  e00eb4fd7b871a48                          Sharp Objects      47.82       4   
4  4165285e1663650f  Sapiens: A Brief History of Humankind      54.23       5   

                Genre             Availability  \
0              Poetry  In stock (22 available)   
1  Historical Fiction  In stock (20 available)   
2             Fiction  In stock (20 available)   
3             Mystery  In stock (20 available)   
4             History  In stock (20 available)   

                                         Description  \
0  It's hard to imagine a world without A Light i...   
1  "Erotic and absorbing...Written with starling ...   
2  Dans une France assez proche de la nôtre,

In [3]:
# Sanity check: (rows, columns) of the combined scrape result.
result_df1.shape

(1000, 8)

In [5]:
# Count missing values per column to confirm every field was scraped.
result_df1.isna().sum()

UPC             0
Title           0
Price (£)       0
Rating          0
Genre           0
Availability    0
Description     0
Image_URL       0
dtype: int64