In [None]:
import re
import time
from urllib.parse import urljoin

import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [None]:

# Accumulator for the scraped rows; one list of field values is appended per book.
lst_for_db = list()

# HTTP headers sent with every request. The User-Agent imitates a desktop
# browser so the requests are not identified as coming from a script.
headers = dict()
headers["User-Agent"] = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/58.0.3029.110 Safari/537.3"
)

# Function to fetch page content with retries and exponential backoff
def fetch_page(url, retries=3):
    """Download ``url`` and return the response body as text.

    The request is sent with the module-level ``headers`` and a 10 second
    timeout. Any ``requests`` error (connection problem, timeout, or an HTTP
    error status raised by ``raise_for_status``) counts as a failed attempt.

    Args:
        url: Address of the page to download.
        retries: Total number of attempts to make. With ``retries <= 0`` no
            request is sent at all.

    Returns:
        The decoded response text on success, or ``None`` when every attempt
        failed.
    """
    delay = 1
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()  # Raise an exception for HTTP errors
            return response.text
        except requests.exceptions.RequestException as e:
            if attempt == retries - 1:
                # Last attempt: nothing left to retry, so do not announce a
                # retry or waste time sleeping before giving up.
                print(f"Error: {e}")
                break
            print(f"Error: {e}, retrying in {delay} seconds...")
            time.sleep(delay)
            delay *= 2  # Exponential backoff: 1s, 2s, 4s, ...
    print(f"Failed to retrieve {url} after {retries} attempts")
    return None

# Loop through pages
# Each book becomes one row: [title, page_link, img_link, rating, price, availability].
# The order matters because column names are assigned to the rows by position
# when the list is turned into a DataFrame.
BASE_URL = "https://books.toscrape.com/"
CATALOGUE_URL = BASE_URL + "catalogue/"

for i in tqdm(range(1, 51)):
    url = f"{CATALOGUE_URL}page-{i}.html"
    html_content = fetch_page(url)

    if html_content is None:
        # The download failed after all retries; skip this page and carry on.
        continue

    soup = BeautifulSoup(html_content, "html.parser")

    for sp in soup.find_all("li", class_="col-xs-6 col-sm-4 col-md-3 col-lg-3"):
        title = sp.h3.a.get("title")
        # urljoin resolves relative segments such as "../" instead of pasting
        # them verbatim into the resulting URL.
        page_link = urljoin(CATALOGUE_URL, sp.a.get("href"))
        img_link = urljoin(BASE_URL, sp.img.get("src"))
        # The last CSS class of the first <p> is used as the rating value.
        rating = sp.find("p").get("class")[-1]
        # Drop the whole non-numeric prefix rather than exactly one character:
        # if the page was decoded with the wrong charset the currency sign can
        # occupy more than one character (e.g. "Â£") -- NOTE(review): confirm
        # against the live response encoding.
        price = re.sub(r"^\D+", "", sp.find(class_="price_color").text.strip())
        availability = sp.find("p", class_="instock availability").text.strip()

        lst_for_db.append([title, page_link, img_link, rating, price, availability])

    # Delay to prevent rate limiting
    time.sleep(2)

# Show how much was collected plus a short preview instead of dumping every row.
print(f"Collected {len(lst_for_db)} books")
for entry in lst_for_db[:5]:
    print(entry)


In [None]:
# Turn the scraped rows into a DataFrame and write it to disk as JSON.
# Column names are matched to row values by position; note that the second
# value appended to each row is the book's page link, which is stored under
# the 'Description' column.
df = pd.DataFrame(
    data=lst_for_db,
    columns=['Book', 'Description', 'Image', 'Rating', 'Price', 'Availability'],
)
df.to_json(path_or_buf="Bookscrape.json", indent=4)