In [None]:
import requests
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
import pandas as pd
import time

# Fetch the maximum page number for the search
def get_maxPageNumber(HEADERS, timeout=30):
    """Return the total number of result pages for the hard-coded search.

    Downloads the first results page and reads the page count from the
    disabled pagination item.

    Args:
        HEADERS: HTTP headers passed to ``requests.get``.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script indefinitely.

    Returns:
        The page count as an ``int``. Falls back to ``1`` when the pagination
        element is missing or its text is not a number, so callers still
        process the first page.

    Raises:
        SystemExit: If the results page cannot be retrieved.
    """
    # Webpage URL
    URL = "https://www.amazon.co.uk/s?k=data+books&crid=3TFWSUVPVID3R&sprefix=data+books%2Caps%2C86&ref=nb_sb_noss_1"

    # HTTP Request
    try:
        webpage = requests.get(URL, headers=HEADERS, timeout=timeout)
        webpage.raise_for_status()  # Raise HTTPError for bad responses (4xx and 5xx)
    except RequestException as e:
        print(f"Failed to retrieve the main page: {e}")
        # The site-provided exit() helper is meant for interactive sessions;
        # raise SystemExit directly and report failure with a non-zero status.
        raise SystemExit(1) from e

    # Soup object containing all data extracted from Amazon
    soup = BeautifulSoup(webpage.content, "html.parser")  # Converting to HTML format from bytes

    # Total number of pages
    last_page = soup.find('span', attrs={'class': 's-pagination-item s-pagination-disabled'})
    if last_page is None:
        # No pagination element in the response: treat the result as one page.
        print("Pagination element not found; assuming a single page of results.")
        return 1

    try:
        return int(last_page.text.strip())
    except ValueError:
        print(f"Unexpected pagination text {last_page.text!r}; assuming a single page of results.")
        return 1

def get_links(URL, HEADERS, timeout=30):
    """Collect the product links found on one search results page.

    Args:
        URL: Address of the search results page to download.
        HEADERS: HTTP headers passed to ``requests.get``.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script indefinitely.

    Returns:
        A list of link strings. Site-relative links (starting with ``/``) are
        prefixed with ``https://www.amazon.co.uk``; other links are returned
        unchanged. Anchors without an ``href`` are skipped. An empty list is
        returned when the page cannot be retrieved.
    """
    try:
        webpage = requests.get(URL, headers=HEADERS, timeout=timeout)
        webpage.raise_for_status()  # Raise HTTPError for bad responses (4xx and 5xx)
    except RequestException as e:
        print(f"Failed to retrieve the main page: {e}")
        return []  # Return an empty list if there's an error

    # Soup object containing all data extracted from Amazon
    soup = BeautifulSoup(webpage.content, "html.parser")  # Converting to HTML format from bytes

    # Fetching links as List of Tag objects
    anchors = soup.find_all("a", attrs={'class': "a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal"})

    links_list = []
    for anchor in anchors:
        href = anchor.get('href')
        if not href:
            # An anchor without a target would otherwise put None (or an
            # empty string) into the result and break the later download.
            continue
        if href.startswith("/"):
            href = "https://www.amazon.co.uk" + href
        links_list.append(href)

    return links_list


# Function to extract book title
def get_bookTitle(soup):
    """Return the stripped text of the ``span`` with id ``productTitle``.

    An empty string is returned when the lookup raises ``AttributeError``,
    e.g. because ``find`` returned ``None`` for a missing element.
    """
    try:
        return soup.find("span", attrs={"id": 'productTitle'}).text.strip()
    except AttributeError:
        return ""

# Function to extract author name
def get_authorName(soup):
    """Return the stripped text of the ``span`` with class ``author notFaded``.

    An empty string is returned when the lookup raises ``AttributeError``,
    e.g. because ``find`` returned ``None`` for a missing element.
    """
    try:
        author_span = soup.find('span', attrs={'class': 'author notFaded'})
        return author_span.text.strip()
    except AttributeError:
        return ""

# Function to extract book price
def get_sellingPrice(soup):
    """Return the stripped text of the "price to pay" ``span``.

    The element is located by its exact class string. An empty string is
    returned when the lookup raises ``AttributeError``, e.g. because
    ``find`` returned ``None`` for a missing element.
    """
    price_classes = 'a-price aok-align-center reinventPricePriceToPayMargin priceToPay'
    try:
        price_span = soup.find('span', attrs={'class': price_classes})
        return price_span.text.strip()
    except AttributeError:
        return ""

# Function to get book listing price
def get_listingPrice(soup):
    """Return the stripped text of the ``span`` with class ``a-size-small aok-offscreen``.

    An empty string is returned when the lookup raises ``AttributeError``,
    e.g. because ``find`` returned ``None`` for a missing element.
    """
    try:
        price_span = soup.find('span', attrs={'class': 'a-size-small aok-offscreen'})
        raw_text = price_span.text
        return raw_text.strip()
    except AttributeError:
        return ""

# Function to get book type
def get_bookType(soup):
    """Return the stripped text of the ``span`` with id ``productSubtitle``.

    An empty string is returned when the lookup raises ``AttributeError``,
    e.g. because ``find`` returned ``None`` for a missing element.
    """
    try:
        subtitle = soup.find("span", attrs={"id": 'productSubtitle'})
        return subtitle.text.strip()
    except AttributeError:
        return ""

# Function to get book page length
def get_printLength(soup):
    """Return the stripped text of the ``div`` with id ``rpi-attribute-book_details-fiona_pages``.

    An empty string is returned when the lookup raises ``AttributeError``,
    e.g. because ``find`` returned ``None`` for a missing element.
    """
    try:
        pages_div = soup.find('div', attrs={'id': 'rpi-attribute-book_details-fiona_pages'})
        return pages_div.text.strip()
    except AttributeError:
        return ""

def get_publicationDate(soup):
    """Return the stripped text of the ``div`` with id ``rpi-attribute-book_details-publication_date``.

    An empty string is returned when the lookup raises ``AttributeError``,
    e.g. because ``find`` returned ``None`` for a missing element.
    """
    try:
        date_div = soup.find('div', attrs={'id': 'rpi-attribute-book_details-publication_date'})
        return date_div.text.strip()
    except AttributeError:
        return ""

# Function to extract book ratings
def get_bookRating(soup):
    """Return the stripped text of the ``span`` with class ``a-icon-alt``.

    An empty string is returned when the lookup raises ``AttributeError``,
    e.g. because ``find`` returned ``None`` for a missing element.
    """
    try:
        rating_span = soup.find("span", attrs={"class": 'a-icon-alt'})
        return rating_span.text.strip()
    except AttributeError:
        return ""

# Function to extract review count
def get_reviewCount(soup):
    """Return the stripped text of the ``span`` with id ``acrCustomerReviewText``.

    An empty string is returned when the lookup raises ``AttributeError``,
    e.g. because ``find`` returned ``None`` for a missing element.
    """
    try:
        review_span = soup.find('span', attrs={'id': 'acrCustomerReviewText'})
        return review_span.text.strip()
    except AttributeError:
        return ""

# Function to extract book availability - In stock or out of stock
def get_availability(soup):
    """Return the stripped text of the ``div`` with id ``availability``.

    ``"Out of stock"`` is returned when the lookup raises
    ``AttributeError``, e.g. because ``find`` returned ``None`` for a
    missing element.
    """
    try:
        availability_div = soup.find('div', attrs={'id': 'availability'})
        return availability_div.text.strip()
    except AttributeError:
        return "Out of stock"


In [None]:

if __name__ == '__main__':
    # Script entry point: gather product links from the search result pages,
    # download every product page, extract the book details and build a
    # DataFrame with one row per successfully downloaded page.

    # NOTE(review): the User-Agent value is blank here — presumably it has to
    # be filled in before running; confirm.
    HEADERS = { 'User-Agent':'',
               'Accept-Language': 'en-GB,en;q=0.9'}

    REQUEST_TIMEOUT = 30  # Seconds before a stalled product-page request is abandoned
    REQUEST_DELAY = 2     # Seconds to pause between product-page requests

    max_pageNumber =  2  # Set to a fixed number for testing
    # max_pageNumber =  get_maxPageNumber(HEADERS)

    # Links from the first results page ...
    all_links = []
    all_links.extend(get_links('https://www.amazon.co.uk/s?k=data+books&crid=3TFWSUVPVID3R&sprefix=data+books%2Caps%2C86&ref=nb_sb_noss_1', HEADERS))

    # ... followed by the links from page 2 up to max_pageNumber
    for i in range(2, max_pageNumber + 1):
        URL = f'https://www.amazon.co.uk/s?k=data+books&page={i}&crid=3TFWSUVPVID3R&qid=1721330789&sprefix=data+books%2Caps%2C86&ref=sr_pg_{i}'
        all_links.extend(get_links(URL, HEADERS))

    # Drop entries that carry no URL (an anchor without an href yields None)
    all_links = [link for link in all_links if link]

    # Output column -> function that extracts that column from a product page.
    # The insertion order defines the column order of the DataFrame.
    extractors = {
        "title": get_bookTitle,
        "name": get_authorName,
        "sellingPrice": get_sellingPrice,
        "listingPrice": get_listingPrice,
        "typeOfBook": get_bookType,
        "printLength": get_printLength,
        "publicationDate": get_publicationDate,
        "rating": get_bookRating,
        "reviews": get_reviewCount,
        "availability": get_availability,
    }
    d = {column: [] for column in extractors}

    # Loop to extract book details from each link
    for link in all_links:
        try:
            new_webpage = requests.get(link, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            new_webpage.raise_for_status()  # Raise HTTPError for bad responses
        except RequestException as e:
            # A failed page adds no row, so all columns stay the same length.
            print(f"Failed to retrieve the page: {e}")
        else:
            new_soup = BeautifulSoup(new_webpage.content, "html.parser")
            for column, extract in extractors.items():
                d[column].append(extract(new_soup))

        # Pause after every attempt, including failed ones, so a run of
        # errors does not turn into back-to-back requests.
        time.sleep(REQUEST_DELAY)

    amazon_df = pd.DataFrame.from_dict(d)
    #amazon_df.to_csv("amazon_data.csv", header=True, index=False)