**CodeAlpha Data Analytics Internship TASK 1: Web Scraping**

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re # For regex operations

# --- Configuration ---
# URL template for catalogue pages; '{}' is filled with the page number via str.format.
BASE_URL = 'http://books.toscrape.com/catalogue/page-{}.html'
# Upper bound of the page loop below.
TOTAL_PAGES = 50 # The website has 50 pages of books
# Accumulator for the scraped records: the scraping loop appends one dict per book.
scraped_data = []
# --- Helper Function: Convert text rating to numeric value ---
def convert_rating(rating_text):
    """Translate a star-rating class value into its numeric star count.

    The words 'One' through 'Five' are tested, in that order, with the
    ``in`` operator against ``rating_text``; the first word found decides
    the result. ``rating_text`` may therefore be a list of class tokens
    (membership test) or a plain string (substring test).

    Returns:
        An int from 1 to 5 for the first matching word, or 0 when none
        of the words is present.
    """
    rating_words = ('One', 'Two', 'Three', 'Four', 'Five')
    # enumerate(..., start=1) pairs every word with the star count it stands for.
    matching_stars = (
        stars
        for stars, word in enumerate(rating_words, start=1)
        if word in rating_text
    )
    # Fall back to 0 when no rating word was found.
    return next(matching_stars, 0)

# --- Main Scraping Logic ---
# Patterns are compiled once here instead of being rebuilt for every book.
# The price pattern requires at least one digit, so a stray '.' can never
# reach float() and raise ValueError.
_PRICE_PATTERN = re.compile(r'\d+(?:\.\d+)?')
_RATING_CLASS_PATTERN = re.compile(r'star-rating')

# Column order of the output; also guarantees a CSV header when nothing was scraped.
_COLUMNS = ['Title', 'Price', 'Availability', 'Rating_Stars']


def _parse_book(book):
    """Extract one record from an <article class="product_pod"> element.

    Args:
        book: The BeautifulSoup tag of a single book listing.

    Returns:
        A dict with the keys 'Title', 'Price', 'Availability' and
        'Rating_Stars'. 'Price' is a float, or None when the price text
        contains no number.

    Raises:
        AttributeError, KeyError, TypeError: When an expected element or
            attribute is missing from the listing.
    """
    # Title (inside h3 > a tag, stored in the 'title' attribute)
    title = book.h3.a['title']

    # Price (inside p tag with class 'price_color'); the regex drops the
    # currency symbol (e.g., '£51.77' -> 51.77)
    price_text = book.find('p', class_='price_color').text.strip()
    price_match = _PRICE_PATTERN.search(price_text)
    price = float(price_match.group(0)) if price_match else None

    # Availability (inside p tag with class 'instock availability')
    # Clean up the text: 'In stock (22 available)' -> 'In stock'
    availability = book.find('p', class_='instock availability').text.strip()
    availability = availability.split('(')[0].strip()

    # Rating (inside p tag with a class that contains 'star-rating')
    rating_class = book.find('p', class_=_RATING_CLASS_PATTERN)['class']

    return {
        'Title': title,
        'Price': price,
        'Availability': availability,
        'Rating_Stars': convert_rating(rating_class),
    }


print("Starting web scraping...")
# A single Session reuses the underlying connection for all page requests.
with requests.Session() as session:
    for page_num in range(1, TOTAL_PAGES + 1):
        # The first page is an exception: http://books.toscrape.com/index.html
        # Subsequent pages follow the pattern: http://books.toscrape.com/catalogue/page-N.html
        if page_num == 1:
            url = 'http://books.toscrape.com/index.html'
        else:
            url = BASE_URL.format(page_num)

        print(f"Scraping Page {page_num} of {TOTAL_PAGES}: {url}")

        # 1. Send the HTTP request. Only the network call sits inside the
        #    try, so parsing problems are not mistaken for request failures.
        try:
            response = session.get(url, timeout=10)
            response.raise_for_status()  # Raise for bad status codes (4xx or 5xx)
        except requests.exceptions.RequestException as e:
            print(f"Error accessing {url}: {e}. Skipping page.")
            # Keep the polite delay on failures too, so a struggling
            # server is not hit again immediately.
            time.sleep(1)
            continue

        # 2. Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        # 3. Find all book containers on the page
        # Each book listing is wrapped in an <article class="product_pod"> tag
        books = soup.find_all('article', class_='product_pod')

        if not books:
            print("No more books found. Stopping.")
            break

        # 4. Extract data for each book. One malformed listing is reported
        #    and skipped instead of aborting the run and losing all data.
        for book in books:
            try:
                scraped_data.append(_parse_book(book))
            except (AttributeError, KeyError, TypeError) as e:
                print(f"Skipping malformed book entry on page {page_num}: {e!r}")

        # Be a good web citizen: wait before the next request (there is no
        # next request after the final page, so no wait is needed there).
        if page_num < TOTAL_PAGES:
            time.sleep(1)

# 5. Convert the list of dictionaries to a Pandas DataFrame
df = pd.DataFrame(scraped_data, columns=_COLUMNS)

# 6. Save the DataFrame to a CSV file
output_file = 'books_data.csv'
df.to_csv(output_file, index=False)

print("\n--- Task 1 Complete ---")
print(f"Successfully scraped {len(df)} books.")
print(f"Data saved to {output_file}")
print("First 5 rows of your data:")
print(df.head())

Starting web scraping...
Scraping Page 1 of 50: http://books.toscrape.com/index.html
Scraping Page 2 of 50: http://books.toscrape.com/catalogue/page-2.html
Scraping Page 3 of 50: http://books.toscrape.com/catalogue/page-3.html
Scraping Page 4 of 50: http://books.toscrape.com/catalogue/page-4.html
Scraping Page 5 of 50: http://books.toscrape.com/catalogue/page-5.html
Scraping Page 6 of 50: http://books.toscrape.com/catalogue/page-6.html
Scraping Page 7 of 50: http://books.toscrape.com/catalogue/page-7.html
Scraping Page 8 of 50: http://books.toscrape.com/catalogue/page-8.html
Scraping Page 9 of 50: http://books.toscrape.com/catalogue/page-9.html
Scraping Page 10 of 50: http://books.toscrape.com/catalogue/page-10.html
Scraping Page 11 of 50: http://books.toscrape.com/catalogue/page-11.html
Scraping Page 12 of 50: http://books.toscrape.com/catalogue/page-12.html
Scraping Page 13 of 50: http://books.toscrape.com/catalogue/page-13.html
Scraping Page 14 of 50: http://books.toscrape.com/catal