In [2]:
# Software Development Intern at SkillCraft Technology. 
# Task 4 : Create a program that extracts product information, such as names, prices, and ratings,
# from an online e-commerce website and stores the data in a structured format like a CSV file.
# Submitted by Harinandhan
# ID: SCT/JUN24/0751 
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [3]:
# Web scraping is a data extraction method used exclusively to gather data from websites.
# It is widely used for Data mining or collecting valuable insights from large websites. 
# Web scraping comes in handy for personal use as well.
# Python contains an amazing library called BeautifulSoup to allow web scraping.
# We will be using it to scrape product information and save the details in a CSV file.

In [31]:
# Helper functions I have used for the scraper.
# Function to extract Product Title
def get_title(soup):
    """Return the product title with surrounding whitespace removed.

    Looks up the ``<span id="productTitle">`` element via ``soup.find`` and
    returns its ``.text`` stripped. If any step raises ``AttributeError``
    (for example because ``find`` returned ``None``), an empty string is
    returned instead.
    """
    try:
        # Tag object for the title element, or None when it is absent.
        title_tag = soup.find("span", attrs={"id": 'productTitle'})
        return title_tag.text.strip()
    except AttributeError:
        return ""

# Function to extract Product Price
def get_price(soup):
    """Return the product price as a stripped string, or "" if not found.

    Two ``<span>`` ids are tried in order: ``priceblock_ourprice`` (regular
    price) and then ``priceblock_dealprice`` (deal price). A lookup counts as
    a miss when it raises ``AttributeError`` -- e.g. ``find`` returned
    ``None`` or the tag's ``.string`` is ``None``.

    Only ``AttributeError`` is treated as "not found"; any other exception
    (including ``KeyboardInterrupt``) propagates to the caller instead of
    being silently turned into an empty price.
    """
    for price_id in ('priceblock_ourprice', 'priceblock_dealprice'):
        try:
            return soup.find("span", attrs={'id': price_id}).string.strip()
        except AttributeError:
            # Element missing or without a single string child: try the next id.
            continue

    return ""

# Function to extract Product Rating
def get_rating(soup):
    """Return the product rating text, stripped, or "" if not found.

    Two lookups are tried in order: an ``<i>`` tag whose class is
    ``a-icon a-icon-star a-star-4-5`` and then a ``<span>`` whose class is
    ``a-icon-alt``. A lookup counts as a miss when it raises
    ``AttributeError`` (``find`` returned ``None`` or ``.string`` is ``None``).

    Only ``AttributeError`` is treated as "not found"; any other exception
    propagates to the caller rather than being swallowed.
    """
    lookups = (
        ("i", 'a-icon a-icon-star a-star-4-5'),
        ("span", 'a-icon-alt'),
    )
    for tag_name, css_class in lookups:
        try:
            return soup.find(tag_name, attrs={'class': css_class}).string.strip()
        except AttributeError:
            # No usable element for this selector: fall through to the next one.
            continue

    return ""

# Function to extract Number of User Reviews
def get_review_count(soup):
    """Return the review-count text, stripped, or "" when unavailable.

    Reads ``.string`` from the ``<span id="acrCustomerReviewText">`` element.
    An ``AttributeError`` anywhere in that chain (e.g. ``find`` returned
    ``None``) results in an empty string.
    """
    try:
        count_tag = soup.find("span", attrs={'id': 'acrCustomerReviewText'})
        return count_tag.string.strip()
    except AttributeError:
        return ""

# Function to extract Availability Status
def get_availability(soup):
    """Return the availability text, stripped, or "Not Available".

    Finds ``<div id="availability">``, then the first ``<span>`` inside it,
    and returns that span's ``.string`` stripped. If any step raises
    ``AttributeError`` (a lookup returned ``None`` or ``.string`` is ``None``),
    the literal "Not Available" is returned.
    """
    try:
        container = soup.find("div", attrs={'id': 'availability'})
        status_span = container.find("span")
        return status_span.string.strip()
    except AttributeError:
        return "Not Available"



In [35]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

# Define the headers for HTTP requests
HEADERS = {
    # Language preference sent with each request.
    'Accept-Language': 'en-US, en;q=0.5',
    # Browser-style User-Agent string, split across lines for readability;
    # adjacent literals are concatenated into one value.
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.3'
    ),
}

def get_title(soup):
    """Return the text of the element with id ``productTitle``, or 'N/A'.

    The text is obtained with ``get_text(strip=True)``. 'N/A' is returned
    when ``soup.find`` yields a falsy result (no such element).
    """
    title_tag = soup.find(id='productTitle')
    if not title_tag:
        return 'N/A'
    return title_tag.get_text(strip=True)

def get_price(soup):
    """Return the product price text, or 'N/A' when no price element exists.

    The ``<span id="priceblock_ourprice">`` element is preferred; the
    ``<span id="priceblock_dealprice">`` element is looked up only when the
    first lookup gives a falsy result. Text comes from ``get_text(strip=True)``.
    """
    price_tag = soup.find('span', {'id': 'priceblock_ourprice'})
    if not price_tag:
        # Fall back to the deal-price element.
        price_tag = soup.find('span', {'id': 'priceblock_dealprice'})
    if not price_tag:
        return 'N/A'
    return price_tag.get_text(strip=True)

def get_rating(soup):
    """Return the text of the first ``<span class="a-icon-alt">``, or 'N/A'.

    Text comes from ``get_text(strip=True)``; 'N/A' is returned when
    ``soup.find`` yields a falsy result.
    """
    rating_tag = soup.find('span', {'class': 'a-icon-alt'})
    if not rating_tag:
        return 'N/A'
    return rating_tag.get_text(strip=True)

def get_review_count(soup):
    """Return the text of ``<span id="acrCustomerReviewText">``, or 'N/A'.

    Text comes from ``get_text(strip=True)``; 'N/A' is returned when
    ``soup.find`` yields a falsy result.
    """
    reviews_tag = soup.find('span', {'id': 'acrCustomerReviewText'})
    if not reviews_tag:
        return 'N/A'
    return reviews_tag.get_text(strip=True)

def get_availability(soup):
    """Return the text of ``<div id="availability">``, or 'N/A'.

    Text comes from ``get_text(strip=True)`` on the div itself; 'N/A' is
    returned when ``soup.find`` yields a falsy result.
    """
    availability_tag = soup.find('div', {'id': 'availability'})
    if not availability_tag:
        return 'N/A'
    return availability_tag.get_text(strip=True)

def main(output_path="amazon_data.csv", timeout=10):
    """Scrape product details from an Amazon search page into a CSV file.

    Fetches the hard-coded search results page, follows every product link
    found on it, extracts title, price, rating, review count and availability
    from each product page with the ``get_*`` helpers, drops rows that have no
    title, and writes the remaining rows to ``output_path``.

    Parameters
    ----------
    output_path : str
        Destination CSV file. Defaults to "amazon_data.csv".
    timeout : float
        Seconds to wait on each HTTP request before giving up, so a stalled
        connection cannot hang the run indefinitely.

    Raises
    ------
    requests.HTTPError
        If the search page responds with an HTTP error status; without the
        search page there is nothing valid to save.
    """
    URL = "https://www.amazon.com/s?k=playstation+4&ref=nb_sb_noss_2"
    columns = ["title", "price", "rating", "reviews", "availability"]

    # Make HTTP Request
    webpage = requests.get(URL, headers=HEADERS, timeout=timeout)
    # Fail loudly rather than silently writing an empty CSV from an error page.
    webpage.raise_for_status()

    # Parse the webpage content
    soup = BeautifulSoup(webpage.content, "html.parser")

    # Fetch product links; anchors without an href yield None, which cannot be
    # concatenated onto the site prefix below, so they are skipped.
    links = soup.find_all("a", attrs={'class': 'a-link-normal s-no-outline'})
    links_list = [href for href in (link.get('href') for link in links) if href]

    # Extract product details from each link, one record per product page.
    records = []
    for link in links_list:
        product_url = "https://www.amazon.com" + link
        new_webpage = requests.get(product_url, headers=HEADERS, timeout=timeout)
        new_soup = BeautifulSoup(new_webpage.content, "html.parser")

        records.append({
            "title": get_title(new_soup),
            "price": get_price(new_soup),
            "rating": get_rating(new_soup),
            "reviews": get_review_count(new_soup),
            "availability": get_availability(new_soup),
        })

    # Passing `columns` keeps the header row even when no product was scraped.
    amazon_df = pd.DataFrame(records, columns=columns)

    # Drop rows without a usable title. Both '' and 'N/A' are treated as
    # missing because the title helper signals "not found" with one of them.
    # A boolean mask avoids the chained `inplace=True` replace, which does not
    # reliably modify the parent frame.
    missing_title = amazon_df['title'].isna() | amazon_df['title'].isin(['', 'N/A'])
    amazon_df = amazon_df.loc[~missing_title]

    amazon_df.to_csv(output_path, header=True, index=False)



In [15]:
# Here's what our amazon_data.csv looks like.

In [44]:



    # Scrape the search results page, visit each product page, and save the
    # extracted details to amazon_data.csv, printing progress along the way.
    URL = "https://www.amazon.com/s?k=playstation+4&ref=nb_sb_noss_2"
    print("Fetching webpage...")
    # The timeout keeps a stalled connection from hanging the cell forever.
    webpage = requests.get(URL, headers=HEADERS, timeout=10)
    # Fail loudly on an HTTP error instead of silently saving an empty CSV.
    webpage.raise_for_status()
    print("Parsing content...")
    soup = BeautifulSoup(webpage.content, "html.parser")

    # Fetch product links; anchors without an href are skipped because None
    # cannot be concatenated onto the site prefix below.
    links = soup.find_all("a", attrs={'class':'a-link-normal s-no-outline'})
    links_list = [link.get('href') for link in links if link.get('href')]
    print(f"Found {len(links_list)} product links.")

    d = {"title": [], "price": [], "rating": [], "reviews": [], "availability": []}

    # Extract product details from each link
    for link in links_list:
        product_url = "https://www.amazon.com" + link
        print(f"Fetching product page: {product_url}")
        new_webpage = requests.get(product_url, headers=HEADERS, timeout=10)
        new_soup = BeautifulSoup(new_webpage.content, "html.parser")

        d['title'].append(get_title(new_soup))
        d['price'].append(get_price(new_soup))
        d['rating'].append(get_rating(new_soup))
        d['reviews'].append(get_review_count(new_soup))
        d['availability'].append(get_availability(new_soup))

    print("Creating DataFrame...")
    amazon_df = pd.DataFrame.from_dict(d)
    # Drop rows without a usable title. Both '' and 'N/A' are treated as
    # missing because the title helper signals "not found" with one of them.
    # A boolean mask replaces the chained `inplace=True` replace, which does
    # not reliably modify the parent frame.
    missing_title = amazon_df['title'].isna() | amazon_df['title'].isin(['', 'N/A'])
    amazon_df = amazon_df.loc[~missing_title]
    
    print("Saving to CSV...")
    amazon_df.to_csv("amazon_data.csv", header=True, index=False)
    print("Data saved to amazon_data.csv")



Fetching webpage...
Parsing content...
Found 0 product links.
Creating DataFrame...
Saving to CSV...
Data saved to amazon_data.csv


In [12]:
# This can be downloaded as a CSV file.