<a href="https://colab.research.google.com/github/Sri200522/amazon-scraper-using-python/blob/main/amazon_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install requests
!pip install bs4
!pip install pandas

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2


In [None]:
import os
import pandas as pd
from datetime import date
from bs4 import BeautifulSoup
import requests
import concurrent.futures

class AmazonProductScraper:
    """Scrape Amazon search-result pages for a category and export them to Excel.

    Typical flow: ``get_category_url()`` asks the user for a search term and
    builds the search URL, ``navigate_to_other_pages()`` downloads and parses
    the result pages concurrently, and ``product_information_spreadsheet()``
    writes the collected rows to an ``.xlsx`` file.

    Args:
        base_url: Scheme and host of the Amazon storefront to search.
        request_timeout: Seconds to wait for each HTTP response.
    """

    # Characters that are unsafe in file names on common platforms.
    _UNSAFE_FILENAME_CHARS = '\\/:*?"<>|'

    def __init__(self, base_url="https://www.amazon.in", request_timeout=30):
        self.category_name = None
        self.formatted_category_name = None
        self.max_pages = 100  # Maximum number of pages to scrape
        self.base_url = base_url.rstrip("/")
        self.request_timeout = request_timeout

    def fetch_webpage_content(self, url):
        """Download *url* and return the response body as text.

        Raises:
            requests.RequestException: On timeout, connection failure or a
                non-2xx status, so callers do not parse an error page as if
                it were a result page.
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36"
        }
        # Without a timeout a stalled connection would block a worker forever.
        response = requests.get(url, headers=headers, timeout=self.request_timeout)
        response.raise_for_status()
        return response.text

    def get_category_url(self):
        """Prompt for a product/category and return its search URL.

        Stores the raw input in ``self.category_name`` and the URL-encoded
        form in ``self.formatted_category_name``.
        """
        self.category_name = input("\n>> Enter the product/category to be searched: ").strip()
        category_url = self._build_category_url(self.category_name)
        print(">> Category URL: ", category_url)
        return category_url

    def _build_category_url(self, category_name):
        """Return the search URL for *category_name* (no user interaction)."""
        from urllib.parse import quote_plus

        # quote_plus turns spaces into '+' and escapes characters such as
        # '&' or '#' that would otherwise corrupt the query string.
        self.formatted_category_name = quote_plus(category_name.strip())
        return f"{self.base_url}/s?k={self.formatted_category_name}"

    def truncate_title(self, title, max_words=15):
        """Return *title* cut down to its first *max_words* words."""
        words = title.split()[:max_words]
        return ' '.join(words)

    @staticmethod
    def extract_product_information(page_results):
        """Convert parsed search-result elements into record tuples.

        Args:
            page_results: Iterable of BeautifulSoup elements, one per result.

        Returns:
            A list of ``(name, price, rating, review_count, description)``
            tuples. Any field whose markup is missing is ``"N/A"``.
        """
        temp_record = []
        for item in page_results:
            try:
                description = item.h2.a.text.strip()
            except AttributeError:
                # A result without a title link must not abort the whole page.
                description = "N/A"
                name = "N/A"
            else:
                # Short name: first five words of the title, commas removed.
                name = ' '.join(description.replace(',', '').split()[:5])

            try:
                # Drop the leading currency symbol only when a price exists,
                # so the "N/A" placeholder is not mangled.
                product_price = item.find('span', 'a-offscreen').text[1:]
            except AttributeError:
                product_price = "N/A"

            try:
                product_review = item.i.text.strip()
            except AttributeError:
                product_review = "N/A"

            try:
                review_number = item.find('span', {'class': 'a-size-base'}).text
            except AttributeError:
                review_number = "N/A"

            temp_record.append(
                (name, product_price, product_review, review_number, description)
            )

        return temp_record

    def process_page(self, page_number, category_url):
        """Fetch one result page and return its extracted records."""
        # Append the page parameter correctly whether or not the URL already
        # carries a query string.
        separator = "&" if "?" in category_url else "?"
        next_page_url = f"{category_url}{separator}page={page_number}"
        page_content = self.fetch_webpage_content(next_page_url)
        soup = BeautifulSoup(page_content, 'html.parser')
        page_results = soup.find_all('div', {'data-component-type': 's-search-result'})
        records = self.extract_product_information(page_results)
        # Report only after the page has actually been fetched and parsed.
        print(f">> Page {page_number} - webpage information extracted")
        return records

    def navigate_to_other_pages(self, category_url):
        """Scrape pages 1..``max_pages`` concurrently and return all records.

        Pages that raise are reported on stdout and skipped. Records are
        returned in page order regardless of which download finished first.
        """
        records_by_page = {}
        with concurrent.futures.ThreadPoolExecutor() as executor:
            # Start at page 1: nothing else fetches the first result page.
            future_to_page = {
                executor.submit(self.process_page, page_number, category_url): page_number
                for page_number in range(1, self.max_pages + 1)
            }
            for future in concurrent.futures.as_completed(future_to_page):
                page_number = future_to_page[future]
                try:
                    records_by_page[page_number] = future.result()
                except Exception as e:
                    # Best effort: one failed page must not lose the others.
                    print(f"Exception occurred for page {page_number}: {e}")

        records = []
        for page_number in sorted(records_by_page):
            records.extend(records_by_page[page_number])

        print("\n>> Creating an Excel sheet and entering the details...")
        return records

    def product_information_spreadsheet(self, records):
        """Write *records* to ``<category>_<dd-mm-YYYY>.xlsx`` and return the file name.

        Args:
            records: List of 5-tuples as produced by
                ``extract_product_information``.
        """
        today = date.today().strftime("%d-%m-%Y")
        # The category comes from user input; neutralise characters that
        # would be read as path separators or are illegal in file names.
        safe_name = "".join(
            "_" if ch in self._UNSAFE_FILENAME_CHARS else ch
            for ch in str(self.category_name)
        )
        file_name = f"{safe_name}_{today}.xlsx"

        # Creating a DataFrame from records
        df = pd.DataFrame(records, columns=['Title', 'Price', 'Rating', 'Review Count', 'Description'])
        df.to_excel(file_name, index=False)  # Write to Excel file without the index

        message = f">> Information about the product '{self.category_name}' is stored in {file_name}\n"
        print(message)
        return file_name

# Script entry point: prompt for a category, scrape the result pages, then
# export the collected records to an Excel file.
if __name__ == "__main__":
    my_amazon_bot = AmazonProductScraper()

    # Asks the user for the search term on stdin and returns the URL to scrape.
    category_details = my_amazon_bot.get_category_url()

    # Fetches the result pages through a thread pool and returns the records.
    navigation = my_amazon_bot.navigate_to_other_pages(category_details)

    # Writes the records to an .xlsx file named after the category and today's date.
    my_amazon_bot.product_information_spreadsheet(navigation)



>> Enter the product/category to be searched: python developer
>> Category URL:  https://www.glassdoor.com/Job/software-engineer-jobs-SRCH_KO0,17.htm
>> Page 2 - webpage information extracted
>> Page 3 - webpage information extracted
>> Page 4 - webpage information extracted
>> Page 5 - webpage information extracted
>> Page 6 - webpage information extracted
>> Page 7 - webpage information extracted
>> Page 8 - webpage information extracted
>> Page 9 - webpage information extracted
>> Page 10 - webpage information extracted
>> Page 11 - webpage information extracted
>> Page 12 - webpage information extracted
>> Page 13 - webpage information extracted
>> Page 14 - webpage information extracted
>> Page 15 - webpage information extracted
>> Page 16 - webpage information extracted
>> Page 17 - webpage information extracted
>> Page 18 - webpage information extracted>> Page 19 - webpage information extracted

>> Page 20 - webpage information extracted
>> Page 21 - webpage information extract