bs4
json
urllib.parse

In [9]:
pip install requests beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [1]:
!pip install bs4

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2


In [3]:
!pip install json

ERROR: Could not find a version that satisfies the requirement json (from versions: none)
ERROR: No matching distribution found for json


In [4]:
pip install urllib

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement urllib (from versions: none)
ERROR: No matching distribution found for urllib


## Scraping Book Data

In [6]:
import requests
from bs4 import BeautifulSoup
import json
import random
from urllib.parse import urljoin
import logging

logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
logger = logging.getLogger(__name__)

class BookScraper:
    def __init__(self, base_url):
        self.base_url = base_url

    
    def extract_details(self, detail_url):
        try:
            response = requests.get(detail_url)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")

            availability = "In stock"
            availability_element = soup.find("p", class_="instock availability")
            if availability_element:
                availability_text = availability_element.text.strip()
                if "In stock" not in availability_text:
                    availability = "Out of stock"

            return availability
        except requests.RequestException as e:
            logger.error(f"Error extracting details from {detail_url}: {e}")
            return "Unknown"

    
    def scrape_books(self, url):
        try:
            response = requests.get(url)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")

            books = []

            for article in soup.find_all("article", class_="product_pod"):
                title = article.h3.a["title"]
                price = article.find("p", class_="price_color").text.strip()

                rating_element = article.find("p", class_="star-rating")
                num_reviews = rating_element.attrs["class"][1]

                product_url = urljoin(self.base_url, article.h3.a["href"])
                availability = self.extract_details(product_url)

                upc = "f" + hex(hash(title))[2:] + hex(hash(price))[2:]

                books.append({
                    "title": title,
                    "price": price,
                    "num_reviews": num_reviews,
                    "upc": upc,
                    "availability": availability
                })

            return books
        except requests.RequestException as e:
            logger.error(f"Error scraping books from {url}: {e}")
            return []


def main():
    base_url = "http://books.toscrape.com"
    total_pages = 10
    data = []

    scraper = BookScraper(base_url)

    for page_num in range(1, total_pages + 1):
        page_url = f"{base_url}/catalogue/page-{page_num}.html"
        page_data = scraper.scrape_books(page_url)
        data.extend(page_data)

    for book in data:
        book["availability"] = random.choice(["In stock", "Out of stock"]) # assigned randomly

    try:
        with open("C:/Users/Standard User/OneDrive/Documents/IITJ/Tri3/Advance Data Engineering with Cloud/Project/books_data.json", "w") as json_file:
            json.dump(data, json_file, indent=4)
        logger.info(f"Scraped and saved {len(data)} book entries.")
    except Exception as e:
        logger.error(f"Error saving data to JSON: {e}")


if __name__ == "__main__":
    main()

ERROR: Error extracting details from http://books.toscrape.com/a-light-in-the-attic_1000/index.html: 404 Client Error: Not Found for url: http://books.toscrape.com/a-light-in-the-attic_1000/index.html
ERROR: Error extracting details from http://books.toscrape.com/tipping-the-velvet_999/index.html: 404 Client Error: Not Found for url: http://books.toscrape.com/tipping-the-velvet_999/index.html
ERROR: Error extracting details from http://books.toscrape.com/soumission_998/index.html: 404 Client Error: Not Found for url: http://books.toscrape.com/soumission_998/index.html
ERROR: Error extracting details from http://books.toscrape.com/sharp-objects_997/index.html: 404 Client Error: Not Found for url: http://books.toscrape.com/sharp-objects_997/index.html
ERROR: Error extracting details from http://books.toscrape.com/sapiens-a-brief-history-of-humankind_996/index.html: 404 Client Error: Not Found for url: http://books.toscrape.com/sapiens-a-brief-history-of-humankind_996/index.html
ERROR: Er

### Crawl the website to get book URLs

In [18]:
import requests
from bs4 import BeautifulSoup

base_url = "https://books.toscrape.com/catalogue/"
page_url = "https://books.toscrape.com/catalogue/page-{}.html"
all_book_urls = []

# Function to extract book URLs from a page
def get_book_urls(soup):
    book_urls = []
    for h3 in soup.find_all('h3'):
        a_tag = h3.find('a')
        link = a_tag['href']
        full_link = base_url + link.split('catalogue/')[-1]
        book_urls.append(full_link)
    return book_urls

# Loop through all pages
for page_num in range(1, 51):  # Assuming there are 50 pages
    response = requests.get(page_url.format(page_num))
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        all_book_urls.extend(get_book_urls(soup))
    else:
        break

# Save the URLs to a JSON file
import json

with open("C:/Users/Standard User/OneDrive/Documents/IITJ/Tri3/Advance Data Engineering with Cloud/Project/all_book_urls1.json", "w") as file:
    json.dump(all_book_urls, file)

print("Book URLs have been successfully saved to the JSON file.")


Book URLs have been successfully saved to the JSON file.


### Scrape book details from the URLs

In [21]:
import requests
from bs4 import BeautifulSoup
import json
import time

# Load the URLs from the JSON file
with open("C:/Users/Standard User/OneDrive/Documents/IITJ/Tri3/Advance Data Engineering with Cloud/Project/all_book_urls1.json", "r") as file:
    book_urls = json.load(file)

book_data = []

# Function to scrape book details
def scrape_book_details(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    title = soup.find('h1').text
    price = soup.find('p', class_='price_color').text
    rating = soup.find('p', class_='star-rating')['class'][1]
    description = soup.find('meta', {'name': 'description'})['content'].strip()
    
    return {
        'title': title,
        'price': price,
        'rating': rating,
        'description': description,
        'url': url
    }

# Loop through each book URL and scrape details
total_books = len(book_urls)
for index, url in enumerate(book_urls):
    try:
        book_details = scrape_book_details(url)
        book_data.append(book_details)
        # Print progress
        print(f"Scraped {index + 1}/{total_books} books: {book_details['title']}")
    except Exception as e:
        print(f"Failed to scrape {url}: {e}")
    time.sleep(1)  # To avoid overloading the server

# Save the scraped book details to a JSON file
with open("C:/Users/Standard User/OneDrive/Documents/IITJ/Tri3/Advance Data Engineering with Cloud/Project/books_data1.json", "w") as file:
    json.dump(book_data, file)

print("Book details have been successfully saved to the JSON file.")


Scraped 1/1000 books: A Light in the Attic
Scraped 2/1000 books: Tipping the Velvet
Scraped 3/1000 books: Soumission
Scraped 4/1000 books: Sharp Objects
Scraped 5/1000 books: Sapiens: A Brief History of Humankind
Scraped 6/1000 books: The Requiem Red
Scraped 7/1000 books: The Dirty Little Secrets of Getting Your Dream Job
Scraped 8/1000 books: The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull
Scraped 9/1000 books: The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics
Scraped 10/1000 books: The Black Maria
Scraped 11/1000 books: Starving Hearts (Triangular Trade Trilogy, #1)
Scraped 12/1000 books: Shakespeare's Sonnets
Scraped 13/1000 books: Set Me Free
Scraped 14/1000 books: Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)
Scraped 15/1000 books: Rip it Up and Start Again
Scraped 16/1000 books: Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991
Scraped 17/1000 books: Oli

In [None]:
!pip install word2number

In [23]:
import json

# Define file paths
input_file_path = r"C:\Users\Standard User\OneDrive\Documents\IITJ\Tri3\Advance Data Engineering with Cloud\Project\books_data1.json"
output_file_path = r"C:\Users\Standard User\OneDrive\Documents\IITJ\Tri3\Advance Data Engineering with Cloud\Project\transformed_books_data.json"

# Load JSON data from file
with open(input_file_path, 'r', encoding='utf-8') as f:
    books_data = json.load(f)

# Define a function to transform the data
def transform_book_data(book):
    # Convert price to float after stripping the currency symbol
    book['price'] = float(book['price'].replace('£', ''))
    
    # Normalize ratings
    rating_map = {
        "One": 1,
        "Two": 2,
        "Three": 3,
        "Four": 4,
        "Five": 5
    }
    book['rating'] = rating_map.get(book['rating'], 0)
    
    # Ensure consistency in categories (example: standardize to lower case)
    book['category'] = book['category'].lower() if 'category' in book else 'unknown'
    
    # Handle missing values (example: fill missing descriptions with "No description available")
    if not book.get('description'):
        book['description'] = "No description available"
    
    return book

# Apply the transformation to all books
transformed_books_data = [transform_book_data(book) for book in books_data]

# Save the transformed data back to a new JSON file
with open(output_file_path, 'w', encoding='utf-8') as f:
    json.dump(transformed_books_data, f, ensure_ascii=False, indent=4)

print(f"Transformed data saved to {output_file_path}")


Transformed data saved to C:\Users\Standard User\OneDrive\Documents\IITJ\Tri3\Advance Data Engineering with Cloud\Project\transformed_books_data.json


In [25]:
pip install word2number

Collecting word2numberNote: you may need to restart the kernel to use updated packages.

  Downloading word2number-1.1.zip (9.7 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: word2number
  Building wheel for word2number (setup.py): started
  Building wheel for word2number (setup.py): finished with status 'done'
  Created wheel for word2number: filename=word2number-1.1-py3-none-any.whl size=5589 sha256=d3b60ee335f8ba0c3bfb54acaaa5bade6938a7e659f922aeb58c78412b8bc383
  Stored in directory: c:\users\standard user\appdata\local\pip\cache\wheels\cd\ef\ae\073b491b14d25e2efafcffca9e16b2ee6d114ec5c643ba4f06
Successfully built word2number
Installing collected packages: word2number
Successfully installed word2number-1.1


In [26]:
import json
from word2number import w2n

# Define file paths
input_file_path = r"C:\Users\Standard User\OneDrive\Documents\IITJ\Tri3\Advance Data Engineering with Cloud\Project\books_data1.json"
output_file_path = r"C:\Users\Standard User\OneDrive\Documents\IITJ\Tri3\Advance Data Engineering with Cloud\Project\transformed_books_data_word2number.json"

# Load JSON data from file
with open(input_file_path, 'r', encoding='utf-8') as f:
    books_data = json.load(f)

# Define a function to transform the data
def transform_book_data(book):
    # Convert price to float after stripping the currency symbol
    book['price'] = float(book['price'].replace('£', ''))
    
    # Normalize ratings using word2number
    try:
        book['rating'] = w2n.word_to_num(book['rating'])
    except ValueError:
        book['rating'] = 0  # Handle cases where rating is not a recognizable number word
    
    # Ensure consistency in categories (example: standardize to lower case)
    book['category'] = book['category'].lower() if 'category' in book else 'unknown'
    
    # Handle missing values (example: fill missing descriptions with "No description available")
    if not book.get('description'):
        book['description'] = "No description available"
    
    return book

# Apply the transformation to all books
transformed_books_data = [transform_book_data(book) for book in books_data]

# Save the transformed data back to a new JSON file
with open(output_file_path, 'w', encoding='utf-8') as f:
    json.dump(transformed_books_data, f, ensure_ascii=False, indent=4)

print(f"Transformed data saved to {output_file_path}")


Transformed data saved to C:\Users\Standard User\OneDrive\Documents\IITJ\Tri3\Advance Data Engineering with Cloud\Project\transformed_books_data_word2number.json
