<a href="https://colab.research.google.com/github/Praneetha-NM/WebScrape/blob/main/WebScrape.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
import random

In [5]:
# Define Headers and Base URL
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 13_5) AppleWebKit/537.36 (KHTML, like Gecko) Version/17.5 Safari/537.36',
    'Accept-Language': 'en-US, en;q=0.5'
}
BASE_URL = "https://www.amazon.com"

In [6]:
# Extract Links
def get_product_links(url):
    webpage = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(webpage.content, "html.parser")
    links = soup.find_all("a", attrs={'class': 'a-link-normal s-no-outline'})
    return [link.get('href') for link in links if link.get('href')]

In [7]:
def get_product_details(link):
    time.sleep(random.uniform(2, 5))  # Avoid bot detection
    if not link.startswith('http'):
        link = BASE_URL + link
    response = requests.get(link, headers=HEADERS)
    soup = BeautifulSoup(response.content, "html.parser")

    return {
        "title": get_title(soup),
        "price": get_price(soup),
        "rating": get_rating(soup),
        "reviews": get_review_count(soup),
        "availability": get_availability(soup),
        "product_details": get_product_description(soup),
        "image_url": get_image_url(soup),
        "brand": get_brand(soup)
    }

In [8]:
# Function Definitions
def get_title(soup):
    try:
        return soup.find("span", attrs={"id": "productTitle"}).text.strip()
    except AttributeError:
        return "No Title Found"

In [9]:
def get_price(soup):
    try:
        # Attempt to find prices under different IDs or classes
        price = soup.find("span", attrs={"id": "priceblock_ourprice"})
        if not price:
            price = soup.find("span", attrs={"id": "priceblock_dealprice"})
        if not price:
            price = soup.find("span", attrs={"class": "a-price-whole"})
        if price:
            return price.text.strip()
        else:
            return "Price Not Available"
    except AttributeError:
        return "Price Not Available"

In [10]:
def get_rating(soup):
    try:
        return soup.find("span", attrs={"class": "a-icon-alt"}).text.strip()
    except AttributeError:
        return "No Rating Found"

In [11]:
def get_review_count(soup):
    try:
        return soup.find("span", attrs={"id": "acrCustomerReviewText"}).text.strip()
    except AttributeError:
        return "No Reviews Found"

In [12]:
def get_availability(soup):
    try:
        availability = soup.find("div", attrs={"id": "availability"}).find("span").text.strip()
        if "in stock" in availability.lower():
            return "In Stock"
        elif "out of stock" in availability.lower():
            return "Out of Stock"
        else:
            return "Availability Unknown"
    except AttributeError:
        return "Availability Unknown"

In [13]:
def get_product_description(soup):
    try:
        description = soup.find("div", attrs={"id": "productDescription"})
        if description:
            return description.get_text(separator=" ", strip=True)
        return "No Description Available"
    except AttributeError:
        return "No Description Available"

In [14]:
def get_image_url(soup):
    try:
        image_tag = soup.find("img", attrs={"id": "landingImage"})
        return image_tag['src'] if image_tag else "No Image Available"
    except AttributeError:
        return "No Image Available"

In [15]:
def get_brand(soup):
    try:
        brand = soup.find("a", attrs={"id": "bylineInfo"})
        return brand.text.strip() if brand else "No Brand Available"
    except AttributeError:
        return "No Brand Available"

In [16]:
# Main Script
if __name__ == "__main__":
    # Prompt for user input
    search_url = input("Enter the Amazon search URL : ")

    # Extract product links from the search URL
    links = get_product_links(search_url)

    # Collect Data
    data = {"title": [], "price": [], "rating": [], "reviews": [], "availability": [],
            "product_details": [], "image_url": [], "brand": []}

    for link in links:
        details = get_product_details(link)
        for key in data.keys():
            data[key].append(details[key])

    # Create DataFrame and Save to CSV
    amazon_df = pd.DataFrame.from_dict(data)
    amazon_df.replace('', np.nan, inplace=True)
    amazon_df.dropna(subset=['title'], inplace=True)
    amazon_df.to_csv("amazon_data.csv", index=False)

    print("Data scraping complete. The results have been saved to 'amazon_data.csv'.")

Enter the Amazon search URL : https://www.amazon.com/s?k=playstation+4
Data scraping complete. The results have been saved to 'amazon_data.csv'.


In [17]:
amazon_df

Unnamed: 0,title,price,rating,reviews,availability,product_details,image_url,brand
0,Sony PlayStation 4 500GB Console (Renewed),173.0,4.4 out of 5 stars,"2,812 ratings",In Stock,This pre-owned or refurbished product has been...,https://m.media-amazon.com/images/I/31oeRyFo94...,Visit the Amazon Renewed Store
1,Sony PlayStation 4 500GB Premium Bundle (Renewed),224.0,4.5 out of 5 stars,26 ratings,In Stock,This pre-owned or refurbished product has been...,https://m.media-amazon.com/images/I/31jy-7fYIz...,Brand: Amazon Renewed
2,Sony Playstation PS4 1TB Black Console,368.0,4.5 out of 5 stars,"1,560 ratings",In Stock,No Description Available,https://m.media-amazon.com/images/I/417m+qsw4J...,Visit the Sony Store
3,Sony PlayStation 4 Slim Limited Edition 1TB Ga...,216.0,4.5 out of 5 stars,"1,497 ratings",In Stock,Edition:Slim 1TB The all new lighter and slimm...,https://m.media-amazon.com/images/I/31NHoF8tL6...,Visit the Amazon Renewed Store
4,"Playstation SONY 4, 500GB Slim System [CUH-221...",224.0,4.5 out of 5 stars,482 ratings,In Stock,"The PS4 system focuses on the gamer, ensuring ...",https://m.media-amazon.com/images/I/51tbWVPtck...,Visit the Amazon Renewed Store
5,PlayStation 4 Slim 1TB Console,369.0,4.5 out of 5 stars,"15,737 ratings",In Stock,The all new lighter and slimmer PlayStation4 s...,https://m.media-amazon.com/images/I/3142A+G-ej...,Visit the PlayStation Store
6,"Sony Playstation 4 500GB white, 9466314 (Renewed)",184.0,4.5 out of 5 stars,44 ratings,In Stock,This pre-owned or refurbished product has been...,https://m.media-amazon.com/images/I/31PtzA5+mz...,Visit the Amazon Renewed Store
7,"Playstation Sony 4, 500GB Slim System [CUH-221...",324.0,4.5 out of 5 stars,372 ratings,In Stock,The all new lighter and slimmer PlayStation4 s...,https://m.media-amazon.com/images/I/31aelHds6g...,Visit the PlayStation Store
8,"Sony PlayStation 4 Pro w/ Accessories, 1TB HDD...",224.0,4.5 out of 5 stars,"1,038 ratings",In Stock,Enhanced games - PS4 Pro games burst into life...,https://m.media-amazon.com/images/I/31KK-7Ru8h...,Visit the Amazon Renewed Store
9,Sony PlayStation 4 Pro 1TB Premium Bundle (Ren...,274.0,4.5 out of 5 stars,1 rating,Availability Unknown,The PlayStation 4 Pro 1TB is a powerful gaming...,https://m.media-amazon.com/images/I/31Mi8SvydJ...,Brand: Amazon Renewed
