In [1]:
import requests
from bs4 import BeautifulSoup
import random
import time
import pandas as pd

# Pool of desktop Chrome User-Agent strings to rotate between requests.
# The strings differ only in the Chrome version, so they are generated
# from the version list below.
_CHROME_VERSIONS = (
    '91.0.4472.124',
    '89.0.4389.82',
    '88.0.4324.104',
    '90.0.4430.93',
)
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    f'(KHTML, like Gecko) Chrome/{version} Safari/537.36'
    for version in _CHROME_VERSIONS
]

# Function to get the HTML content of a product page
def get_product_page(asin, timeout=10):
    """Download the Amazon product page for one ASIN.

    Args:
        asin: Product identifier interpolated into the ``/dp/`` URL.
        timeout: Seconds handed to ``requests.get`` so that a stalled
            connection cannot block the scraping loop indefinitely.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise ``None``. Failures are reported with ``print`` rather
        than raised, so one bad ASIN does not abort a batch.
    """
    url = f"https://www.amazon.com/dp/{asin}"
    headers = {
        'User-Agent': random.choice(USER_AGENTS),
        'Accept-Language': 'en-US,en;q=0.9',
        # Accept-Encoding is intentionally left to requests: forcing "br"
        # here asks for brotli even when no brotli decoder is installed,
        # which makes response.text undecodable.
        'DNT': '1',  # Do Not Track Request Header
        'Connection': 'keep-alive',
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            return response.text
        else:
            print(f"Failed to retrieve page for ASIN {asin}, Status code: {response.status_code}")
            return None
    except Exception as e:
        # Best-effort fetch: report the problem (including timeouts) and move on
        print(f"Error fetching page for ASIN {asin}: {e}")
        return None

# Function to parse product details from the HTML content
def parse_product_details(asin, html_content):
    """Extract product fields from the HTML of a product page.

    Args:
        asin: Identifier stored unchanged under the ``"ASIN"`` key.
        html_content: HTML text parsed with BeautifulSoup's ``html.parser``.

    Returns:
        A dict with the keys ``ASIN``, ``Product Name``, ``Price``,
        ``Brand``, ``Rating``, ``Product Description``, ``Dimensions`` and
        ``Weight``. A field stays ``None`` when its element is not found.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    product_details = {
        "ASIN": asin,
        "Product Name": None,
        "Price": None,
        "Brand": None,
        "Rating": None,
        "Product Description": None,
        "Dimensions": None,
        "Weight": None
    }

    # Parse product name
    product_name_tag = soup.find(id='productTitle')
    if product_name_tag:
        product_details['Product Name'] = product_name_tag.get_text(strip=True)

    # Parse price (regular price first, deal price as the fallback)
    price_tag = soup.find('span', {'id': 'priceblock_ourprice'}) or soup.find('span', {'id': 'priceblock_dealprice'})
    if price_tag:
        product_details['Price'] = price_tag.get_text(strip=True)

    # Parse brand
    brand_tag = soup.find('a', {'id': 'bylineInfo'})
    if brand_tag:
        product_details['Brand'] = brand_tag.get_text(strip=True)

    # Parse rating (first span carrying the "a-icon-alt" class)
    rating_tag = soup.find('span', {'class': 'a-icon-alt'})
    if rating_tag:
        product_details['Rating'] = rating_tag.get_text(strip=True)

    # Parse product description
    description_tag = soup.find('div', {'id': 'feature-bullets'})
    if description_tag:
        product_details['Product Description'] = description_tag.get_text(strip=True)

    # Parse dimensions and weight
    technical_details = soup.find('table', {'id': 'productDetails_techSpec_section_1'})
    if technical_details:
        for row in technical_details.find_all('tr'):
            header_cell = row.find('th')
            value_cell = row.find('td')
            # Rows without both a header and a value cell carry no usable
            # spec; skipping them avoids an AttributeError on None.
            if header_cell is None or value_cell is None:
                continue
            th = header_cell.get_text(strip=True)
            td = value_cell.get_text(strip=True)
            if 'Dimensions' in th:
                product_details['Dimensions'] = td
            if 'Weight' in th:
                product_details['Weight'] = td

    return product_details

# Function to scrape product details using ASIN
def scrape_product_details(asin):
    """Fetch and parse the product page for one ASIN.

    Prints the ASIN being processed, then returns the dict produced by
    ``parse_product_details`` when ``get_product_page`` yields non-empty
    HTML, or ``None`` otherwise.
    """
    print(f"Scraping ASIN: {asin}")
    page_html = get_product_page(asin)
    # Guard clause: a missing or empty page means there is nothing to parse
    if not page_html:
        return None
    return parse_product_details(asin, page_html)

# Read the reviews (and their ASINs) from the amazon_tv_reviews.csv file.
# ASINs are identifiers, not numbers: reading them as text keeps leading
# zeros (e.g. ISBN-style ASINs) and gives the merge key a stable dtype.
df_reviews = pd.read_csv("amazon_tv_reviews.csv", dtype={'ASIN': str})

# Unique ASINs to scrape; rows with a missing ASIN are dropped so that no
# request is made for a literal "nan" product id
asins = df_reviews['ASIN'].dropna().unique()

# Scrape product details for each ASIN and store in a list
product_data = []
for asin in asins:
    details = scrape_product_details(asin)
    if details:
        product_data.append(details)
    # Sleep to avoid being blocked
    time.sleep(random.uniform(1, 3))

# Convert the list of product details into a DataFrame. The explicit column
# list (the keys produced by parse_product_details) guarantees the 'ASIN'
# merge key exists even when every scrape failed and product_data is empty.
product_columns = [
    "ASIN", "Product Name", "Price", "Brand", "Rating",
    "Product Description", "Dimensions", "Weight",
]
df_products = pd.DataFrame(product_data, columns=product_columns)

# Merge product details with the existing reviews
df_combined = pd.merge(df_reviews, df_products, on='ASIN', how='left')

# Save the combined data to a new CSV file
df_combined.to_csv("combined_TV_reviews_details.csv", index=False)

# Print the combined DataFrame
print(df_combined)

Scraping ASIN: B0BTTVRWPR
Scraping ASIN: B0BCMRRKRX
Scraping ASIN: B09ZLTMWWH
Scraping ASIN: B0CJDSNN4T
Scraping ASIN: B0D2WGBQ5Y
Scraping ASIN: B0B3GTSQ9Q
Scraping ASIN: B0CVS183ZP
Scraping ASIN: B0C1HZ8QF4
Scraping ASIN: B094RJ41WY
Scraping ASIN: B0C1J1TWQM
Scraping ASIN: B0BTTV2P88
Scraping ASIN: B07CL4GLQW
Scraping ASIN: B0C1HYMT79
Scraping ASIN: B094RKDNMZ
Scraping ASIN: B0CW1C5S69
Scraping ASIN: B0D4PD799H
Scraping ASIN: B0C1J2SVKD
Scraping ASIN: B0CVBL2J34
Scraping ASIN: B0CV9RBKKX
Scraping ASIN: B0CLFD3NF5
Scraping ASIN: B0CZMC1YXY
Scraping ASIN: B0BZTB81QV
Scraping ASIN: B0C1HZ9HCM
Scraping ASIN: B0CVS18PH9
Scraping ASIN: B0B286BGSL
Scraping ASIN: B08T6J1HG8
Scraping ASIN: B0CJCRCQH9
Scraping ASIN: B09N6ZRH6C
Scraping ASIN: 1524878731
Scraping ASIN: B0CPTB7H6Q
Scraping ASIN: B0BYR8GQQS
Scraping ASIN: B0BHKPG2NP
Scraping ASIN: B09WNJT9X3
Scraping ASIN: B09QRM1LVN
Scraping ASIN: B0B3JMQG8Q
Scraping ASIN: B0B4FJ89VP
Scraping ASIN: B0CXDNJ9QW
Scraping ASIN: 0984157697
Scraping ASI

In [6]:
import requests
from bs4 import BeautifulSoup
import random
import time
import pandas as pd

# Pool of desktop Chrome User-Agent strings to rotate between requests.
# The strings differ only in the Chrome version, so they are generated
# from the version list below.
_CHROME_VERSIONS = (
    '91.0.4472.124',
    '89.0.4389.82',
    '88.0.4324.104',
    '90.0.4430.93',
)
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    f'(KHTML, like Gecko) Chrome/{version} Safari/537.36'
    for version in _CHROME_VERSIONS
]

# Function to get the HTML content of a product page
def get_product_page(asin, timeout=10):
    """Download the Amazon product page for one ASIN.

    Args:
        asin: Product identifier interpolated into the ``/dp/`` URL.
        timeout: Seconds handed to ``requests.get`` so that a stalled
            connection cannot block the scraping loop indefinitely.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise ``None``. Failures are reported with ``print`` rather
        than raised, so one bad ASIN does not abort a batch.
    """
    url = f"https://www.amazon.com/dp/{asin}"
    headers = {
        'User-Agent': random.choice(USER_AGENTS),
        'Accept-Language': 'en-US,en;q=0.9',
        # Accept-Encoding is intentionally left to requests: forcing "br"
        # here asks for brotli even when no brotli decoder is installed,
        # which makes response.text undecodable.
        'DNT': '1',  # Do Not Track Request Header
        'Connection': 'keep-alive',
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            return response.text
        else:
            print(f"Failed to retrieve page for ASIN {asin}, Status code: {response.status_code}")
            return None
    except Exception as e:
        # Best-effort fetch: report the problem (including timeouts) and move on
        print(f"Error fetching page for ASIN {asin}: {e}")
        return None

# Function to parse product details from the HTML content
def parse_product_details(asin, html_content):
    """Extract product fields from the HTML of a product page.

    Args:
        asin: Identifier stored unchanged under the ``"ASIN"`` key.
        html_content: HTML text parsed with BeautifulSoup's ``html.parser``.

    Returns:
        A dict with the keys ``ASIN``, ``Product Name``, ``Price``,
        ``Brand``, ``Rating``, ``Product Description``, ``Dimensions`` and
        ``Weight``. A field stays ``None`` when its element is not found.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    product_details = {
        "ASIN": asin,
        "Product Name": None,
        "Price": None,
        "Brand": None,
        "Rating": None,
        "Product Description": None,
        "Dimensions": None,
        "Weight": None
    }

    # Parse product name
    product_name_tag = soup.find(id='productTitle')
    if product_name_tag:
        product_details['Product Name'] = product_name_tag.get_text(strip=True)

    # Parse price (regular price first, deal price as the fallback)
    price_tag = soup.find('span', {'id': 'priceblock_ourprice'}) or soup.find('span', {'id': 'priceblock_dealprice'})
    if price_tag:
        product_details['Price'] = price_tag.get_text(strip=True)

    # Parse brand
    brand_tag = soup.find('a', {'id': 'bylineInfo'})
    if brand_tag:
        product_details['Brand'] = brand_tag.get_text(strip=True)

    # Parse rating (first span carrying the "a-icon-alt" class)
    rating_tag = soup.find('span', {'class': 'a-icon-alt'})
    if rating_tag:
        product_details['Rating'] = rating_tag.get_text(strip=True)

    # Parse product description
    description_tag = soup.find('div', {'id': 'feature-bullets'})
    if description_tag:
        product_details['Product Description'] = description_tag.get_text(strip=True)

    # Parse dimensions and weight
    technical_details = soup.find('table', {'id': 'productDetails_techSpec_section_1'})
    if technical_details:
        for row in technical_details.find_all('tr'):
            header_cell = row.find('th')
            value_cell = row.find('td')
            # Rows without both a header and a value cell carry no usable
            # spec; skipping them avoids an AttributeError on None.
            if header_cell is None or value_cell is None:
                continue
            th = header_cell.get_text(strip=True)
            td = value_cell.get_text(strip=True)
            if 'Dimensions' in th:
                product_details['Dimensions'] = td
            if 'Weight' in th:
                product_details['Weight'] = td

    return product_details

# Function to scrape product details using ASIN
def scrape_product_details(asin):
    """Fetch and parse the product page for one ASIN.

    Prints the ASIN being processed, then returns the dict produced by
    ``parse_product_details`` when ``get_product_page`` yields non-empty
    HTML, or ``None`` otherwise.
    """
    print(f"Scraping ASIN: {asin}")
    page_html = get_product_page(asin)
    # Guard clause: a missing or empty page means there is nothing to parse
    if not page_html:
        return None
    return parse_product_details(asin, page_html)

# Read the reviews (and their ASINs) from the earphones_reviews.csv file.
# ASINs are identifiers, not numbers: reading them as text keeps leading
# zeros and gives the merge key a stable dtype.
df_reviews = pd.read_csv("earphones_reviews.csv", dtype={'ASIN': str})

# Unique ASINs to scrape; rows with a missing ASIN are dropped so that no
# request is made for a literal "nan" product id
asins = df_reviews['ASIN'].dropna().unique()

# Scrape product details for each ASIN and store in a list
product_data = []
for asin in asins:
    details = scrape_product_details(asin)
    if details:
        product_data.append(details)
    # Sleep to avoid being blocked
    time.sleep(random.uniform(1, 3))

# Convert the list of product details into a DataFrame. The explicit column
# list (the keys produced by parse_product_details) guarantees the 'ASIN'
# merge key exists even when every scrape failed and product_data is empty.
product_columns = [
    "ASIN", "Product Name", "Price", "Brand", "Rating",
    "Product Description", "Dimensions", "Weight",
]
df_products = pd.DataFrame(product_data, columns=product_columns)

# Merge product details with the existing reviews
df_combined = pd.merge(df_reviews, df_products, on='ASIN', how='left')

# Save the combined data to a new CSV file
df_combined.to_csv("combined_earphones_reviews_details.csv", index=False)

# Print the combined DataFrame
print(df_combined)


Scraping ASIN: B07PXGQC1Q
Scraping ASIN: B0BQPNMXQV
Scraping ASIN: B0D635YLCT
Scraping ASIN: B0BTYCRJSS
Scraping ASIN: B0C1QNRGHC
Scraping ASIN: B09FT58QQP
Scraping ASIN: B0C3W4MNN1
Scraping ASIN: B096SV8SJG
Scraping ASIN: B0C1QWWZR4
Scraping ASIN: B0CZNFV5W2
Scraping ASIN: B07HH1QSLB
Scraping ASIN: B09FLNSYDZ
Scraping ASIN: B09DT48V16
Scraping ASIN: B0CX1SZPH3
Scraping ASIN: B0CH9LSX5M
Scraping ASIN: B0D9K5K5M3
Scraping ASIN: B0BM4HGSFJ
Scraping ASIN: B0B1LVC5VZ
Scraping ASIN: B0D5HG7TC3
Scraping ASIN: B09KGLRF8J
Scraping ASIN: B07R5QD598
Scraping ASIN: B09TN4MP6V
Scraping ASIN: B0CRT6HQ82
Scraping ASIN: B0DB2DSVWF
Scraping ASIN: B01M0GB8CC
Scraping ASIN: B0D17TFVTF
Scraping ASIN: B09BFFGQ5N
Scraping ASIN: B07J2Z5DBM
Scraping ASIN: B0CTD56NJ2
Scraping ASIN: B0DDKSSWTC
Scraping ASIN: B09JL41N9C
Scraping ASIN: B0CPJG173J
Scraping ASIN: B0CP8PFX7Y
Scraping ASIN: B0D28SBFK9
Scraping ASIN: B09HN594TL
Scraping ASIN: B0CF7GYNW2
Scraping ASIN: B08L6ZYW21
Scraping ASIN: B0CSXV3GK4
Scraping ASI

In [1]:
import requests
from bs4 import BeautifulSoup
import random
import time
import pandas as pd

# Pool of desktop Chrome User-Agent strings to rotate between requests.
# The strings differ only in the Chrome version, so they are generated
# from the version list below.
_CHROME_VERSIONS = (
    '91.0.4472.124',
    '89.0.4389.82',
    '88.0.4324.104',
    '90.0.4430.93',
)
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    f'(KHTML, like Gecko) Chrome/{version} Safari/537.36'
    for version in _CHROME_VERSIONS
]

# Function to get the HTML content of a product page
def get_product_page(asin, timeout=10):
    """Download the Amazon product page for one ASIN.

    Args:
        asin: Product identifier interpolated into the ``/dp/`` URL.
        timeout: Seconds handed to ``requests.get`` so that a stalled
            connection cannot block the scraping loop indefinitely.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise ``None``. Failures are reported with ``print`` rather
        than raised, so one bad ASIN does not abort a batch.
    """
    url = f"https://www.amazon.com/dp/{asin}"
    headers = {
        'User-Agent': random.choice(USER_AGENTS),
        'Accept-Language': 'en-US,en;q=0.9',
        # Accept-Encoding is intentionally left to requests: forcing "br"
        # here asks for brotli even when no brotli decoder is installed,
        # which makes response.text undecodable.
        'DNT': '1',  # Do Not Track Request Header
        'Connection': 'keep-alive',
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            return response.text
        else:
            print(f"Failed to retrieve page for ASIN {asin}, Status code: {response.status_code}")
            return None
    except Exception as e:
        # Best-effort fetch: report the problem (including timeouts) and move on
        print(f"Error fetching page for ASIN {asin}: {e}")
        return None

# Function to parse product details from the HTML content
def parse_product_details(asin, html_content):
    """Extract product fields from the HTML of a product page.

    Args:
        asin: Identifier stored unchanged under the ``"ASIN"`` key.
        html_content: HTML text parsed with BeautifulSoup's ``html.parser``.

    Returns:
        A dict with the keys ``ASIN``, ``Product Name``, ``Price``,
        ``Brand``, ``Rating``, ``Product Description``, ``Dimensions`` and
        ``Weight``. A field stays ``None`` when its element is not found.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    product_details = {
        "ASIN": asin,
        "Product Name": None,
        "Price": None,
        "Brand": None,
        "Rating": None,
        "Product Description": None,
        "Dimensions": None,
        "Weight": None
    }

    # Parse product name
    product_name_tag = soup.find(id='productTitle')
    if product_name_tag:
        product_details['Product Name'] = product_name_tag.get_text(strip=True)

    # Parse price (regular price first, deal price as the fallback)
    price_tag = soup.find('span', {'id': 'priceblock_ourprice'}) or soup.find('span', {'id': 'priceblock_dealprice'})
    if price_tag:
        product_details['Price'] = price_tag.get_text(strip=True)

    # Parse brand
    brand_tag = soup.find('a', {'id': 'bylineInfo'})
    if brand_tag:
        product_details['Brand'] = brand_tag.get_text(strip=True)

    # Parse rating (first span carrying the "a-icon-alt" class)
    rating_tag = soup.find('span', {'class': 'a-icon-alt'})
    if rating_tag:
        product_details['Rating'] = rating_tag.get_text(strip=True)

    # Parse product description
    description_tag = soup.find('div', {'id': 'feature-bullets'})
    if description_tag:
        product_details['Product Description'] = description_tag.get_text(strip=True)

    # Parse dimensions and weight
    technical_details = soup.find('table', {'id': 'productDetails_techSpec_section_1'})
    if technical_details:
        for row in technical_details.find_all('tr'):
            header_cell = row.find('th')
            value_cell = row.find('td')
            # Rows without both a header and a value cell carry no usable
            # spec; skipping them avoids an AttributeError on None.
            if header_cell is None or value_cell is None:
                continue
            th = header_cell.get_text(strip=True)
            td = value_cell.get_text(strip=True)
            if 'Dimensions' in th:
                product_details['Dimensions'] = td
            if 'Weight' in th:
                product_details['Weight'] = td

    return product_details

# Function to scrape product details using ASIN
def scrape_product_details(asin):
    """Fetch and parse the product page for one ASIN.

    Prints the ASIN being processed, then returns the dict produced by
    ``parse_product_details`` when ``get_product_page`` yields non-empty
    HTML, or ``None`` otherwise.
    """
    print(f"Scraping ASIN: {asin}")
    page_html = get_product_page(asin)
    # Guard clause: a missing or empty page means there is nothing to parse
    if not page_html:
        return None
    return parse_product_details(asin, page_html)

# Read the reviews (and their ASINs) from the amazon_mobile_reviews.csv file.
# ASINs are identifiers, not numbers: reading them as text keeps leading
# zeros and gives the merge key a stable dtype.
df_reviews = pd.read_csv("amazon_mobile_reviews.csv", dtype={'ASIN': str})

# Unique ASINs to scrape; rows with a missing ASIN are dropped so that no
# request is made for a literal "nan" product id
asins = df_reviews['ASIN'].dropna().unique()

# Scrape product details for each ASIN and store in a list
product_data = []
for asin in asins:
    details = scrape_product_details(asin)
    if details:
        product_data.append(details)
    # Sleep to avoid being blocked
    time.sleep(random.uniform(1, 3))

# Convert the list of product details into a DataFrame. The explicit column
# list (the keys produced by parse_product_details) guarantees the 'ASIN'
# merge key exists even when every scrape failed and product_data is empty.
product_columns = [
    "ASIN", "Product Name", "Price", "Brand", "Rating",
    "Product Description", "Dimensions", "Weight",
]
df_products = pd.DataFrame(product_data, columns=product_columns)

# Merge product details with the existing reviews
df_combined = pd.merge(df_reviews, df_products, on='ASIN', how='left')

# Save the combined data to a new CSV file
df_combined.to_csv("combined_mobile_reviews_details.csv", index=False)

# Print the combined DataFrame
print(df_combined)

  df_reviews = pd.read_csv("amazon_mobile_reviews.csv")


Scraping ASIN: B0CMDJ844V
Scraping ASIN: B0CMZ8ZBVN
Scraping ASIN: B0CF2PV74C
Scraping ASIN: B0C544TBQM
Scraping ASIN: B08PNB9B5Z
Scraping ASIN: B07Z3XZDT5
Scraping ASIN: B077RM9WJB
Scraping ASIN: B0C2W7YYHM
Scraping ASIN: B0BV5SV5HL
Scraping ASIN: B08G5B4PV5
Scraping ASIN: B08CFSZLQ4
Scraping ASIN: B08KRKFHGV
Scraping ASIN: B09G2BN89Q
Scraping ASIN: B0BYHC3ZMS
Scraping ASIN: B08HVXC89J
Scraping ASIN: B083B8HH98
Scraping ASIN: B0BLW47H3M
Scraping ASIN: B00R2K4LCY
Scraping ASIN: B08NWBY8YJ
Scraping ASIN: B0BYLRBG4M
Scraping ASIN: B089HNHZ17
Scraping ASIN: B0845ZVJW3
Scraping ASIN: B0BYHB4DF5
Scraping ASIN: B09FRBJZSY
Scraping ASIN: B08126B44P
Scraping ASIN: B0BYLM8RRW
Scraping ASIN: B0BVL4VRXN
Scraping ASIN: B0C5443MCX
Scraping ASIN: B07Z43ZPTT
Scraping ASIN: B08HC5GMK2
Scraping ASIN: B08Z63QRD2
Scraping ASIN: B07H8PXT7K
Scraping ASIN: B093CB2R6L
Scraping ASIN: B07FNZ85WF
Scraping ASIN: B08M5RFKWK
Scraping ASIN: B093ZCPS14
Scraping ASIN: B07X4B9G32
Scraping ASIN: B07KFN43WC
Scraping ASI