In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [2]:

def get_title(soup):

    try:

        title = soup.find("span", attrs={"id":'productTitle'})

        title_value = title.text

        title_string = title_value.strip()

    except AttributeError:
        title_string = ""

    return title_string

def get_price(soup):

    try:
        price = soup.find("span", attrs={'id':'priceblock_ourprice'}).string.strip()

    except AttributeError:

        try:

            price = soup.find("span", attrs={'id':'priceblock_dealprice'}).string.strip()

        except:
            price = ""

    return price

def get_rating(soup):

    try:
        rating = soup.find("i", attrs={'class':'a-icon a-icon-star a-star-4-5'}).string.strip()
    
    except AttributeError:
        try:
            rating = soup.find("span", attrs={'class':'a-icon-alt'}).string.strip()
        except:
            rating = ""	

    return rating

def get_review_count(soup):
    try:
        review_count = soup.find("span", attrs={'id':'acrCustomerReviewText'}).string.strip()

    except AttributeError:
        review_count = ""	

    return review_count


def get_availability(soup):
    try:
        available = soup.find("div", attrs={'id':'availability'})
        available = available.find("span").string.strip()

    except AttributeError:
        available = "Not Available"	

    return available



In [3]:

if __name__ == '__main__':


    HEADERS = ({'User-Agent':'', 'Accept-Language': 'en-US, en;q=0.5'})

    URL = "https://www.amazon.com/s?k=playstation+4&ref=nb_sb_noss_2"


    webpage = requests.get(URL, headers=HEADERS)

    soup = BeautifulSoup(webpage.content, "html.parser")

    links = soup.find_all("a", attrs={'class':'a-link-normal s-no-outline'})

    links_list = []

    for link in links:
            links_list.append(link.get('href'))

    d = {"title":[], "price":[], "rating":[], "reviews":[],"availability":[]}

    for link in links_list:
        new_webpage = requests.get("https://www.amazon.com" + link, headers=HEADERS)

        new_soup = BeautifulSoup(new_webpage.content, "html.parser")

        d['title'].append(get_title(new_soup))
        d['price'].append(get_price(new_soup))
        d['rating'].append(get_rating(new_soup))
        d['reviews'].append(get_review_count(new_soup))
        d['availability'].append(get_availability(new_soup))

    
    amazon_df = pd.DataFrame.from_dict(d)
    amazon_df['title'].replace('', np.nan, inplace=True)
    amazon_df = amazon_df.dropna(subset=['title'])
    amazon_df.to_csv("amazon_data.csv", header=True, index=False)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  amazon_df['title'].replace('', np.nan, inplace=True)


In [4]:
if __name__ == '__main__':


    HEADERS = ({'User-Agent':'', 'Accept-Language': 'en-US, en;q=0.5'})

    URL = "https://www.amazon.com/s?k=playstation+4&ref=nb_sb_noss_2"

    webpage = requests.get(URL, headers=HEADERS)

    soup = BeautifulSoup(webpage.content, "html.parser")

    links = soup.find_all("a", attrs={'class':'a-link-normal s-no-outline'})

    links_list = []


    for link in links:
        links_list.append(link.get('href'))

    d = {"title":[], "price":[], "rating":[], "reviews":[],"availability":[]}
    

    for link in links_list:
        new_webpage = requests.get("https://www.amazon.com" + link, headers=HEADERS)
        new_soup = BeautifulSoup(new_webpage.content, "html.parser")

        d['title'].append(get_title(new_soup))
        d['price'].append(get_price(new_soup))
        d['rating'].append(get_rating(new_soup))
        d['reviews'].append(get_review_count(new_soup))
        d['availability'].append(get_availability(new_soup))
    
    amazon_df = pd.DataFrame.from_dict(d)
    amazon_df['title'].replace('', np.nan, inplace=True)
    amazon_df = amazon_df.dropna(subset=['title'])
    amazon_df.to_json("amazon_data.json", orient="records", lines=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  amazon_df['title'].replace('', np.nan, inplace=True)


In [5]:
amazon_df

Unnamed: 0,title,price,rating,reviews,availability
1,Sony Playstation PS4 1TB Black Console,,4.6 out of 5 stars,"1,424 ratings",In Stock
2,"Playstation Sony 4, 500GB Slim System [CUH-221...",,4.5 out of 5 stars,342 ratings,Only 2 left in stock - order soon.
3,PlayStation 4 500GB Console [Old Model][Discon...,,4.6 out of 5 stars,13579,
4,Sony PlayStation 4 Pro 1TB Console - Black (PS...,,4.5 out of 5 stars,"4,049 ratings",Only 7 left in stock - order soon.
5,PlayStation®5 Digital Edition (slim),,4.7 out of 5 stars,"4,078 ratings",Only 2 left in stock - order soon.
6,Play-Station 4 PS4 1TB Slim Edition Jet Black ...,,4.2 out of 5 stars,50,Usually ships within 6 to 7 days
7,Flagship Newest Play Station 4 1TB HDD Only on...,,4.5 out of 5 stars,200 ratings,Not Available
8,PlayStation 4 1TB Console - Call of Duty: Blac...,,4.6 out of 5 stars,896 ratings,Only 1 left in stock - order soon.
9,PlayStation 4 Slim 500GB Console - Uncharted 4...,,4.8 out of 5 stars,"6,458 ratings",Only 1 left in stock - order soon.
10,PlayStation 4 Slim 1TB Console - Marvel's Spid...,,4.7 out of 5 stars,2817,Only 1 left in stock - order soon.


In [6]:
import pandas as pd

amazon_data = pd.read_json("amazon_data.json", lines=True)

print(amazon_data)


                                                title price  \
0              Sony Playstation PS4 1TB Black Console         
1   Playstation Sony 4, 500GB Slim System [CUH-221...         
2   PlayStation 4 500GB Console [Old Model][Discon...         
3   Sony PlayStation 4 Pro 1TB Console - Black (PS...         
4                PlayStation®5 Digital Edition (slim)         
5   Play-Station 4 PS4 1TB Slim Edition Jet Black ...         
6   Flagship Newest Play Station 4 1TB HDD Only on...         
7   PlayStation 4 1TB Console - Call of Duty: Blac...         
8   PlayStation 4 Slim 500GB Console - Uncharted 4...         
9   PlayStation 4 Slim 1TB Console - Marvel's Spid...         
10           Sony PlayStation 4 500GB Console - Black         
11  Wireless Controller Dual Vibration Game Joysti...         
12  PlayStation 4 Slim 1TB Limited Edition Console...         
13  OWC 2.0 TB External Hard Drive Upgrade for Son...         
14             Sony PlayStation 4 Pro 1TB White (PS4)  

In [7]:
pip install streamlit


Collecting streamlit
  Downloading streamlit-1.33.0-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.8.1b0-py2.py3-none-any.whl.metadata (3.9 kB)
Collecting watchdog>=2.1.5 (from streamlit)
  Downloading watchdog-4.0.0-py3-none-manylinux2014_x86_64.whl.metadata (37 kB)
Downloading streamlit-1.33.0-py2.py3-none-any.whl (8.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.1/8.1 MB[0m [31m48.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading pydeck-0.8.1b0-py2.py3-none-any.whl (4.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m59.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading watchdog-4.0.0-py3-none-manylinux2014_x86_64.whl (82 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.0/83.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: watchdog, pydeck, streamlit
Successfully install

In [8]:

import streamlit as st
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import json

# Function to extract Product Title
def get_title(soup):
    try:
        title = soup.find("span", attrs={"id": 'productTitle'})
        title_value = title.string.strip()
    except AttributeError:
        title_value = ""
    return title_value

# Function to extract Product Price
def get_price(soup):
    try:
        price = soup.find("span", attrs={'id': 'priceblock_ourprice'}).string.strip()
    except AttributeError:
        try:
            price = soup.find("span", attrs={'id': 'priceblock_dealprice'}).string.strip()
        except:
            price = ""
    return price

# Function to extract Product Rating
def get_rating(soup):
    try:
        rating = soup.find("i", attrs={'class': 'a-icon a-icon-star a-star-4-5'}).string.strip()
    except AttributeError:
        try:
            rating = soup.find("span", attrs={'class': 'a-icon-alt'}).string.strip()
        except:
            rating = ""
    return rating

# Function to extract Number of User Reviews
def get_review_count(soup):
    try:
        review_count = soup.find("span", attrs={'id': 'acrCustomerReviewText'}).string.strip()
    except AttributeError:
        review_count = ""
    return review_count

# Function to extract Availability Status
def get_availability(soup):
    try:
        available = soup.find("div", attrs={'id': 'availability'}).find("span").string.strip()
    except AttributeError:
        available = "Not Available"
    return available

# Function to scrape Amazon
def scrape_amazon(url):
    HEADERS = ({'User-Agent': '', 'Accept-Language': 'en-US, en;q=0.5'})
    webpage = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(webpage.content, "html.parser")
    links = soup.find_all("a", attrs={'class': 'a-link-normal s-no-outline'})
    links_list = [link.get('href') for link in links if link.get('href').startswith('/dp/')]

    d = {"title": [], "price": [], "rating": [], "reviews": [], "availability": []}
    for link in links_list:
        new_webpage = requests.get("https://www.amazon.com" + link, headers=HEADERS)
        new_soup = BeautifulSoup(new_webpage.content, "html.parser")
        d['title'].append(get_title(new_soup))
        d['price'].append(get_price(new_soup))
        d['rating'].append(get_rating(new_soup))
        d['reviews'].append(get_review_count(new_soup))
        d['availability'].append(get_availability(new_soup))

    amazon_df = pd.DataFrame.from_dict(d)
    amazon_df['title'].replace('', np.nan, inplace=True)
    amazon_df.dropna(subset=['title'], inplace=True)
    return amazon_df

# Streamlit UI
def main():
    st.title('Amazon Product Scraper')

    url_input = st.text_input("Enter Amazon Search URL", "")

    if st.button('Scrape'):
        if url_input:
            scraped_data = scrape_amazon(url_input)
            if scraped_data.empty:
                st.write("No data found. Please check the URL and try again.")
            else:
                st.write("Scraped Data (Table Format):")
                st.dataframe(scraped_data)

                st.write("Scraped Data (JSON Format):")
                json_data = scraped_data.to_json(orient="records", lines=False, indent=4)
                st.json(json_data)

                csv = scraped_data.to_csv(index=False)
                st.download_button(
                    label="Download data as CSV",
                    data=csv,
                    file_name='amazon_data.csv',
                    mime='text/csv',
                )
        else:
            st.write("Please enter a valid Amazon URL")

if __name__ == "__main__":
    main()


2024-04-06 18:34:27.008 
  command:

    streamlit run /opt/conda/lib/python3.10/site-packages/ipykernel_launcher.py [ARGUMENTS]
