In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
from datetime import datetime
import random


In [1]:
import os
import pandas as pd

def merge_csv_files(input_directory, output_file='merged_output.csv'):
    # Ensure the input directory exists
    if not os.path.exists(input_directory):
        print(f"Error: Directory {input_directory} does not exist.")
        return 0
    
    # List to store individual DataFrames
    dataframes = []
    
    # Counter for merged files
    merged_files_count = 0
    
    # Iterate through all files in the directory
    for filename in os.listdir(input_directory):
        # Check if file is a CSV
        if filename.endswith('.csv'):
            file_path = os.path.join(input_directory, filename)
            try:
                # Read the CSV file
                df = pd.read_csv(file_path)
                
                # Add a column to track the source file if desired
                df['source_file'] = filename
                
                # Append to list of DataFrames
                dataframes.append(df)
                
                # Increment counter
                merged_files_count += 1
                print(f"Merged: {filename}")
            
            except Exception as e:
                print(f"Error reading {filename}: {e}")
    
    # Check if any files were found
    if not dataframes:
        print("No CSV files found in the directory.")
        return 0
    
    # Concatenate all DataFrames
    merged_df = pd.concat(dataframes, ignore_index=True)
    
    # Create output directory if it doesn't exist
    output_directory = os.path.dirname(output_file) or '.'
    os.makedirs(output_directory, exist_ok=True)
    
    # Save merged DataFrame
    merged_df.to_csv(output_file, index=False, encoding='utf-8')
    
    print(f"\nMerging complete!")
    print(f"Total files merged: {merged_files_count}")
    print(f"Output file: {output_file}")
    print(f"Total rows in merged file: {len(merged_df)}")
    
    return merged_files_count

def main():
    # Example usage
    input_directory = 'D:/app/Bigdata-IS405.P11/Crawl/Airflow/amazon_scraper_output'  # Directory containing CSV files
    output_file = 'amazon_products1.csv'  # Output merged file path1
    
    merge_csv_files(input_directory, output_file)

if __name__ == "__main__":
    main()

Merged: amazon_products_20241211_120944.csv
Merged: amazon_products_20241211_135603.csv
Merged: amazon_products_20241211_145330.csv
Merged: amazon_products_20241211_172619.csv
Merged: amazon_products_20241211_204940.csv

Merging complete!
Total files merged: 5
Output file: amazon_products1.csv
Total rows in merged file: 7540


In [None]:
# Function to extract Product Title
def get_title(soup):
    try:
        title = soup.find("span", class_="a-size-base-plus a-color-base a-text-normal").get_text(strip=True)
    except AttributeError:
        title = ""
    return title

# Function to extract Product Price
def get_price(soup):
    try:
        price = soup.find('span', class_='a-price').find('span', class_='a-offscreen').text.strip()
    except AttributeError:
        price = ""
    return price

def get_old_price(soup):
    try:
        old_price = soup.find("div", class_="a-section aok-inline-block") \
                        .find("span", class_="a-offscreen").text.strip()
    except AttributeError:
        old_price = ""
    return old_price
# Function to extract Discount Percent
def get_discount_percent(soup):
    try:
        price_whole = soup.find('span', class_='a-price-whole').text.strip()
        price_decimal = soup.find('span', class_='a-price-decimal').next_sibling.strip()

        discount_percent = price_whole + '.' + price_decimal
    except AttributeError:
        discount_percent = ""
    return discount_percent

# Function to extract Brand Name
# def get_brand(soup):
#     try:
#         # Find the div with the specific class attributes
#         brand_div = soup.find("div", class_="width_common txt_color_1 space_bottom_3")
#         # Extract the brand name within the strong tag inside this div
#         brand = brand_div.find("strong").get_text(strip=True)
#     except AttributeError:
#         brand = ""
#     return brand


def get_product_url(soup):
    try:
        product_url = soup.find("a", class_="a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal")['href']
        product_url = "https://www.amazon.com" + product_url
    except (TypeError, AttributeError):
        product_url = ""
    return product_url


# Function to extract Product Rating
def get_rating(soup):
    try:
        rating = soup.find('i', class_='a-icon a-icon-star-small a-star-small-4-5').find('span', class_='a-icon-alt').text.strip()
    except (AttributeError, IndexError):
        rating = ""
    return rating

# Function to extract Number of Reviews

def get_review_count(soup):
    try:
        review_count = soup.find('span', class_='a-size-base s-underline-text').text.strip()
    except AttributeError:
        review_count = ""
    return review_count


# Function to extract Number of Purchases
def get_purchase_count(soup):
    try:
        purchase_count = soup.find("span", class_="a-size-base a-color-secondary").get_text(strip=True)
    except AttributeError:
        purchase_count = ""
    return purchase_count

In [3]:
def get_product_details(product_url):
    try:
        HEADERS = ({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 OPR/112.0.0.0',
            'Accept-Language': 'en-US, en;q=0.5'
        })
        page = requests.get(product_url, headers=HEADERS)
        soup = BeautifulSoup(page.content, "html.parser")

        # Tìm bảng trong thẻ div chứa các thông tin chi tiết
        details = {}
        info_section = soup.find("div", class_="a-section a-spacing-small a-spacing-top-small")
        
        if info_section:
            table = info_section.find("table")
            if table:
                # Duyệt qua các hàng trong bảng và lấy thông tin
                for row in table.find_all("tr"):
                    try:
                        key = row.find("td", class_="a-span3").get_text(strip=True)
                        value = row.find("td", class_="a-span9").get_text(strip=True)
                        details[key] = value
                    except AttributeError:
                        continue

    except requests.RequestException as e:
        print(f"Request failed: {e}")
        details = {}

    return details

In [4]:
def scrape_amazon(base_url, max_pages):
    HEADERS = ({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 OPR/114.0.0.0',
        'Accept-Language': 'en-US, en;q=0.5'
    })

    d = {
        "title": [], "price": [], "old_price": [], 
        "product_url": [], "rating": [], "reviews": [], "purchases": []
    }

    for page in range(1, max_pages + 1):
        url = f"{base_url}&page={page}"
        
        print(f"Scraping page {page} with URL: {url}")
        response = requests.get(url, headers=HEADERS)
        soup = BeautifulSoup(response.content, "html.parser")

        # Tìm kiếm sản phẩm trên trang
        products = soup.find_all("div", attrs={"data-asin": True})
        if not products:
            print("No products found on this page, stopping.")
            break  # Dừng nếu không có sản phẩm

        for product in products:
            d['title'].append(get_title(product))
            d['price'].append(get_price(product))
            d['old_price'].append(get_old_price(product))
            product_url = get_product_url(product)
            d['product_url'].append(product_url)
            d['rating'].append(get_rating(product))
            d['reviews'].append(get_review_count(product))
            d['purchases'].append(get_purchase_count(product))

            # Kiểm tra nếu product_url không rỗng trước khi gọi hàm get_product_details
            if product_url:
                details = get_product_details(product_url)
                for key, value in details.items():
                    # Thêm cột động vào từ điển `d` nếu chưa có
                    if key not in d:
                        d[key] = [""] * len(d['title'])
                    d[key].append(value)
                
                # Đảm bảo mỗi mục đã thêm đều có đúng số lượng phần tử
                for key in d:
                    if len(d[key]) < len(d['title']):
                        d[key].append("")
            else:
                for key in d:
                    if len(d[key]) == len(d['title']) - 1:
                        d[key].append("")

        # Thêm độ trễ để giảm khả năng bị chặn
        time.sleep(5)

    # Chuyển dữ liệu thành DataFrame và lưu vào CSV
    amazon_df = pd.DataFrame.from_dict(d)
    amazon_df['title'] = amazon_df['title'].replace('', np.nan)
    amazon_df = amazon_df.dropna(subset=['title'])
    amazon_df.to_csv("amazon_data.csv", header=True, index=False)

In [5]:
if __name__ == '__main__':
    base_url = "https://www.amazon.com/s?i=computers-intl-ship&bbn=16225007011&rh=n%3A16225007011%2Cn%3A1292110011"  # Cập nhật link cơ sở
    max_pages = 2  
    scrape_amazon(base_url, max_pages)

Scraping page 1 with URL: https://www.amazon.com/s?i=computers-intl-ship&bbn=16225007011&rh=n%3A16225007011%2Cn%3A1292110011&page=1
Scraping page 2 with URL: https://www.amazon.com/s?i=computers-intl-ship&bbn=16225007011&rh=n%3A16225007011%2Cn%3A1292110011&page=2


  amazon_df['title'] = amazon_df['title'].replace('', np.nan)
