# WEB SCRAPING-ASSIGNMENT3 

### Solution sheet

1. Write a Python program that searches for all the products matching a particular product name on www.amazon.in. The 
product to be searched for will be taken as input from the user. For example, if the user input is ‘guitar’, then search for 
guitars.

In [None]:
import requests
from bs4 import BeautifulSoup

def search_amazon_products(product_name, num_pages=1, base_url="https://www.amazon.in/s?k={}"):
    """
    Searches for products under a given product name on Amazon.in and extracts basic details.

    Args:
        product_name (str): The name of the product to search for.
        num_pages (int, optional): The number of result pages to fetch (default: 1).
        base_url (str, optional): Search URL template with one ``{}`` placeholder that
            receives the URL-encoded query (default: Amazon India search).

    Returns:
        list: A list of dictionaries with the keys "url", "name", "price" and "rating".
        "price" and "rating" are None when the result card has no matching element.

    Raises:
        requests.HTTPError: If a result page responds with an HTTP error status.
        ValueError: If product_name is empty or num_pages is less than 1.
    """
    # Imported locally so this function does not depend on extra cell-level imports.
    from urllib.parse import quote_plus, urljoin

    if not product_name:
        raise ValueError("product_name cannot be empty")
    if num_pages < 1:
        raise ValueError("num_pages must be at least 1")

    # Encode the query once, up front. The search term is kept in its own variable
    # so that scraped product titles can never overwrite it between pages.
    query = quote_plus(product_name)

    # NOTE(review): a browser-like User-Agent is sent because Amazon is believed to
    # reject the default python-requests one - confirm against live behaviour.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/120.0 Safari/537.36",
        "Accept-Language": "en-IN,en;q=0.9",
    }

    def is_search_result(tag):
        # A result card is a <div> whose data-component-type mentions "s-search-result".
        return (
            tag.name == "div"
            and tag.has_attr("data-component-type")
            and "s-search-result" in tag["data-component-type"]
        )

    products = []
    for page in range(1, num_pages + 1):
        url = base_url.format(f"{query}&page={page}")
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()  # Raise an exception for HTTP errors

        soup = BeautifulSoup(response.content, "lxml")

        for product_element in soup.find_all(is_search_result):
            link_element = product_element.find("a", href=True)
            title_element = product_element.find(
                "span", class_="a-size-medium a-color-base a-text-normal"
            )
            # Cards without a link or a title are not usable product entries.
            if link_element is None or title_element is None:
                continue

            price_element = product_element.find("span", class_="a-price-whole")
            rating_element = product_element.find("span", class_="a-star-normal")

            products.append(
                {
                    # Relative hrefs are resolved against the page URL.
                    "url": urljoin(url, link_element["href"]),
                    "name": title_element.get_text(strip=True),
                    "price": price_element.get_text(strip=True) if price_element else None,
                    "rating": rating_element.get("aria-label") if rating_element else None,
                }
            )

    return products

if __name__ == "__main__":
    # Collect the search term and an optional page count from the user.
    product_name = input("Enter a product name to search for: ")
    pages_reply = input("Enter the number of pages to search (default: 1): ")
    # An empty reply falls back to a single page.
    num_pages = int(pages_reply) if pages_reply else 1

    products = search_amazon_products(product_name, num_pages)

    if not products:
        print("No products found for the given search query.")
    else:
        separator = "-" * 30
        for product in products:
            print(f"Product Name: {product['name']}")
            print(f"Product URL: {product['url']}")
            # Price and rating are optional, so they are printed only when present.
            if product["price"]:
                print(f"Price: ₹{product['price']}")
            if product["rating"]:
                print(f"Rating: {product['rating']}")
            print(separator)


2. Extending the previous question, scrape the following details of each product listed in the first 3 pages of your search 
results and save them in a data frame and a CSV file. If a search has fewer than 3 pages of results, then 
scrape all the products available under that product name. The details to be scraped are: "Brand 
Name", "Name of the Product", "Price", "Return/Exchange", "Expected Delivery", "Availability" and 
“Product URL”. If any of the details are missing for a product, replace them with “-”.

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def search_amazon_products(product_name, num_pages=3, base_url="https://www.amazon.in/"):
    """
    Scrape Amazon search results into a DataFrame and save them as CSV.

    Args:
        product_name (str): Search term.
        num_pages (int, optional): Maximum number of result pages to fetch (default: 3).
            Scraping stops early when a page fails to load or lists no products.
        base_url (str, optional): Site root, ending in "/" (default: Amazon India).

    Returns:
        pandas.DataFrame: One row per product with the columns "Brand Name",
        "Name of the Product", "Price", "Return/Exchange", "Expected Delivery",
        "Availability" and "Product URL". Missing details are stored as "-".
        The same data is written to 'amazon_products_data.csv'.
    """
    # Imported locally so this function does not depend on extra cell-level imports.
    from urllib.parse import quote_plus, urljoin

    columns = [
        'Brand Name',
        'Name of the Product',
        'Price',
        'Return/Exchange',
        'Expected Delivery',
        'Availability',
        'Product URL',
    ]

    # NOTE(review): a browser-like User-Agent is sent because Amazon is believed to
    # reject the default python-requests one - confirm against live behaviour.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/120.0 Safari/537.36',
        'Accept-Language': 'en-IN,en;q=0.9',
    }

    def text_or_dash(node):
        # Stripped text of a tag, or '-' when the tag is missing or empty.
        text = node.get_text(strip=True) if node is not None else ''
        return text or '-'

    # Encode the query once; the search term lives in its own variable so the
    # scraped product titles cannot overwrite it between pages.
    query = quote_plus(product_name)
    products_data = []

    for page in range(1, num_pages + 1):
        url = f"{base_url}s?k={query}&page={page}"

        # Make a request to the Amazon search results page
        response = requests.get(url, headers=headers, timeout=30)
        if not response.ok:
            # Keep what was collected so far instead of parsing an error page.
            print(f"Stopping at page {page}: HTTP {response.status_code}")
            break
        soup = BeautifulSoup(response.content, 'html.parser')

        rows_before = len(products_data)
        for product in soup.find_all('div', class_='s-result-item'):
            title_node = product.find('span', class_='a-size-base-plus a-color-base a-text-normal')
            link_node = product.find('a', class_='a-link-normal', href=True)
            # 's-result-item' also wraps non-product widgets; skip those.
            if title_node is None and link_node is None:
                continue

            # urljoin resolves relative links without doubling the slash.
            product_url = urljoin(base_url, link_node['href']) if link_node is not None else '-'

            products_data.append({
                'Brand Name': text_or_dash(product.find('span', class_='a-size-base-plus a-color-base')),
                'Name of the Product': text_or_dash(title_node),
                'Price': text_or_dash(product.find('span', class_='a-offscreen')),
                'Return/Exchange': text_or_dash(product.find('div', class_='a-row a-size-small')),
                'Expected Delivery': text_or_dash(product.find('span', class_='a-text-bold')),
                'Availability': text_or_dash(product.find('span', class_='a-size-medium a-color-success')),
                'Product URL': product_url,
            })

        if len(products_data) == rows_before:
            # Fewer result pages than requested: nothing more to scrape.
            break

    # Explicit columns keep the CSV header intact even when nothing was found.
    df = pd.DataFrame(products_data, columns=columns)

    # Save the DataFrame to a CSV file
    df.to_csv('amazon_products_data.csv', index=False)
    return df

# Demo run: scrape the "laptop" search results into amazon_products_data.csv
search_amazon_products(product_name="laptop")


3. Write a Python program to access the search bar and search button on images.google.com and scrape 10 
images each for the keywords ‘fruits’, ‘cars’, ‘Machine Learning’, ‘Guitar’ and ‘Cakes’.

In [None]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import os
import requests

def google_images_scraper(keywords, num_images=10, driver_path=None):
    """
    Download Google Images thumbnails for each keyword.

    For every keyword the images are saved as
    "<keyword>_images/image_<n>.jpg", numbered from 1.

    Args:
        keywords (iterable of str): Search terms to look up.
        num_images (int, optional): Maximum number of images saved per keyword (default: 10).
        driver_path (str, optional): Path to a chromedriver binary. When omitted,
            Selenium is left to locate a driver on its own.

    Returns:
        None
    """
    # Imported locally so this function does not depend on extra cell-level imports.
    import base64

    # The 'executable_path' keyword was removed in Selenium 4; a Service object
    # carries the driver path instead.
    if driver_path:
        from selenium.webdriver.chrome.service import Service
        driver = webdriver.Chrome(service=Service(driver_path))
    else:
        driver = webdriver.Chrome()

    try:
        for keyword in keywords:
            # Open Google Images
            driver.get("https://images.google.com/")

            # Find the search bar and enter the keyword
            search_bar = driver.find_element("name", "q")
            search_bar.clear()
            search_bar.send_keys(keyword)
            search_bar.send_keys(Keys.RETURN)

            # Wait for the search results to load
            time.sleep(2)

            # Scroll down to load more images
            for _ in range(3):
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2)

            # find_elements(by, value) replaces the removed find_elements_by_* helpers.
            # NOTE(review): 'img.Q4LuWd' is a generated class name - verify it still
            # matches the thumbnails on the live page.
            img_elements = driver.find_elements("css selector", "img.Q4LuWd")
            # Thumbnails that have not loaded yet may have no src; drop them.
            image_urls = [src for src in (img.get_attribute('src') for img in img_elements) if src]

            save_directory = f"{keyword}_images"
            os.makedirs(save_directory, exist_ok=True)

            saved = 0
            for img_url in image_urls:
                if saved >= num_images:
                    break

                if img_url.startswith("data:"):
                    # Inline thumbnails are data URIs; requests cannot fetch them,
                    # so the payload is decoded directly.
                    header, _, payload = img_url.partition(",")
                    if ";base64" not in header:
                        continue
                    try:
                        image_bytes = base64.b64decode(payload)
                    except ValueError:
                        continue
                else:
                    try:
                        response = requests.get(img_url, timeout=30)
                    except requests.RequestException:
                        continue
                    if not response.ok:
                        continue
                    image_bytes = response.content

                saved += 1
                with open(os.path.join(save_directory, f"image_{saved}.jpg"), 'wb') as img_file:
                    img_file.write(image_bytes)
    finally:
        # Close the browser even when a search or download fails.
        driver.quit()

# Search terms; the scraper creates one "<keyword>_images" folder per term
keywords = ['fruits', 'cars', 'Machine Learning', 'Guitar', 'Cakes']

# Upper bound on the images downloaded for each search term
num_images_per_keyword = 10

# Launch the scraper
google_images_scraper(keywords=keywords, num_images=num_images_per_keyword)


4. Write a Python program to search for a smartphone (e.g. OnePlus Nord, Pixel 4a, etc.) on www.flipkart.com
and scrape the following details for all the search results displayed on the 1st page. Details to be scraped: “Brand 
Name”, “Smartphone name”, “Colour”, “RAM”, “Storage(ROM)”, “Primary Camera”, 
“Secondary Camera”, “Display Size”, “Battery Capacity”, “Price”, “Product URL”. If any of the 
details are missing, replace them with “-”. Save your results in a dataframe and a CSV file.

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_flipkart_smartphones(keyword):
    """
    Scrape the first Flipkart search-results page for a smartphone keyword.

    Args:
        keyword (str): Search term, e.g. 'Oneplus Nord'.

    Returns:
        pandas.DataFrame: One row per product with the columns "Brand Name",
        "Smartphone Name", "Colour", "RAM", "Storage(ROM)", "Primary Camera",
        "Secondary Camera", "Display Size", "Battery Capacity", "Price" and
        "Product URL". Missing details are stored as "-". The same data is
        written to '<keyword>_smartphones_data.csv'.

    Raises:
        requests.HTTPError: If the search page responds with an HTTP error status.
    """
    # Imported locally so this function does not depend on extra cell-level imports.
    from urllib.parse import urljoin

    columns = [
        'Brand Name',
        'Smartphone Name',
        'Colour',
        'RAM',
        'Storage(ROM)',
        'Primary Camera',
        'Secondary Camera',
        'Display Size',
        'Battery Capacity',
        'Price',
        'Product URL',
    ]
    # Order in which the specification bullets are assumed to appear.
    # NOTE(review): the mapping is purely positional - verify against the live page.
    spec_columns = columns[3:9]

    def text_or_dash(node):
        # Stripped text of a tag, or '-' when the tag is missing or empty.
        text = node.get_text(strip=True) if node is not None else ''
        return text or '-'

    # Passing the query through 'params' lets requests URL-encode the keyword.
    # NOTE(review): a browser-like User-Agent is sent on the assumption that the
    # site may block the default python-requests one - confirm.
    response = requests.get(
        'https://www.flipkart.com/search',
        params={'q': keyword, 'page': 1},
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/120.0 Safari/537.36',
        },
        timeout=30,
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    smartphones_data = []

    for product in soup.find_all('div', class_='_1AtVbE'):
        brand_node = product.find('div', class_='_4rR01T')
        name_link = product.find('a', class_='IRpwTa')
        # Containers with neither a brand nor a product link are not product cards.
        if brand_node is None and name_link is None:
            continue

        row = {
            'Brand Name': text_or_dash(brand_node),
            'Smartphone Name': text_or_dash(name_link),
            'Colour': text_or_dash(product.find('div', class_='uEhfTl')),
        }

        # Fill the specification columns in order, padding with '-' when the
        # card lists fewer bullets than expected.
        details = [text_or_dash(item) for item in product.find_all('li', class_='rgWa7D')]
        details += ['-'] * (len(spec_columns) - len(details))
        row.update(zip(spec_columns, details))

        row['Price'] = text_or_dash(product.find('div', class_='_30jeq3'))

        href = name_link.get('href') if name_link is not None else None
        row['Product URL'] = urljoin('https://www.flipkart.com', href) if href else '-'

        smartphones_data.append(row)

    # Explicit columns keep the CSV header intact even when nothing was found.
    df = pd.DataFrame(smartphones_data, columns=columns)

    # Save the DataFrame to a CSV file
    df.to_csv(f'{keyword}_smartphones_data.csv', index=False)
    return df

# Demo run; swap in any other smartphone model to search for
search_keyword = 'Oneplus Nord'
scrape_flipkart_smartphones(keyword=search_keyword)


5. Write a program to scrape the geospatial coordinates (latitude, longitude) of a city searched for on Google Maps.

In [None]:
import requests

def get_coordinates(api_key, city_name):
    """
    Look up a city's latitude and longitude through the Google Geocoding API.

    Args:
        api_key (str): Google Maps API key.
        city_name (str): Name of the city to geocode.

    Returns:
        tuple | None: (latitude, longitude) of the first result, or None when
        the inputs are empty, the request fails, or the API reports a status
        other than 'OK'. A short error message is printed in the None cases.
    """
    # Reject empty input locally instead of spending an API call on it.
    if not api_key or not city_name or not city_name.strip():
        print("Error: api_key and city_name must both be non-empty")
        return None

    base_url = "https://maps.googleapis.com/maps/api/geocode/json"

    # Prepare parameters for the API request
    params = {
        'address': city_name,
        'key': api_key,
    }

    # Network failures and non-JSON bodies are reported the same way as API
    # errors, so callers only need to check for None.
    try:
        response = requests.get(base_url, params=params, timeout=10)
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        print(f"Error: {exc}")
        return None

    status = data.get('status')
    results = data.get('results') or []

    if status == 'OK' and results:
        # Extract latitude and longitude of the first match
        location = results[0]['geometry']['location']
        return location['lat'], location['lng']

    # Include the API's explanation when it supplies one.
    detail = data.get('error_message')
    print(f"Error: {status}" + (f" - {detail}" if detail else ""))
    return None

# Substitute a real Google Maps API key for the 'your_api_key' placeholder
google_maps_api_key = 'your_api_key'
city_to_search = 'New York'  # City whose coordinates are looked up

coordinates = get_coordinates(api_key=google_maps_api_key, city_name=city_to_search)

# A falsy result means the lookup failed.
if not coordinates:
    print("Failed to retrieve coordinates.")
else:
    print(f"Coordinates for {city_to_search}: Latitude = {coordinates[0]}, Longitude = {coordinates[1]}")


6. Write a program to scrape all the available details of the best gaming laptops from digit.in.

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_digit_gaming_laptops():
    """
    Scrape the digit.in "best gaming laptops" list into a DataFrame and CSV.

    Returns:
        pandas.DataFrame: One row per laptop with the columns "Laptop Name",
        "Processor", "RAM", "Storage", "Display Size" and "Price". Missing
        details are stored as "-". The same data is written to
        'digit_gaming_laptops_data.csv'.

    Raises:
        requests.HTTPError: If the page responds with an HTTP error status.
    """
    url = "https://www.digit.in/top-products/best-gaming-laptops-40.html"

    columns = ['Laptop Name', 'Processor', 'RAM', 'Storage', 'Display Size', 'Price']
    # Order in which the '|'-separated specification fields are assumed to appear.
    # NOTE(review): the mapping is purely positional - verify against the live page.
    spec_columns = columns[1:5]

    def text_or_dash(node):
        # Stripped text of a tag, or '-' when the tag is missing or empty.
        text = node.get_text(strip=True) if node is not None else ''
        return text or '-'

    # Make a request to the digit.in page
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    laptops_data = []

    for product in soup.find_all('div', class_='TopNumbeHeading'):
        row = {'Laptop Name': text_or_dash(product.find('div', class_='TopNumbeProductTitle'))}

        # Split the specification line on '|' and pad the missing fields.
        spec_node = product.find('div', class_='TopNumbeRow')
        spec_text = spec_node.get_text().strip() if spec_node is not None else ''
        specs = [part.strip() or '-' for part in spec_text.split('|')] if spec_text else []
        specs += ['-'] * (len(spec_columns) - len(specs))
        row.update(zip(spec_columns, specs))

        row['Price'] = text_or_dash(product.find('div', class_='TopNumbePrice'))

        # A block with no recognisable detail at all is not a laptop entry.
        if all(value == '-' for value in row.values()):
            continue

        laptops_data.append(row)

    # Explicit columns keep the CSV header intact even when nothing was found.
    df = pd.DataFrame(laptops_data, columns=columns)

    # Save the DataFrame to a CSV file
    df.to_csv('digit_gaming_laptops_data.csv', index=False)
    return df

# Run the scraper: requests the digit.in gaming-laptops page and saves the
# extracted rows to digit_gaming_laptops_data.csv
scrape_digit_gaming_laptops()


7. Write a Python program to scrape the details of all billionaires from www.forbes.com. Details to be scraped: 
“Rank”, “Name”, “Net worth”, “Age”, “Citizenship”, “Source”, “Industry”.

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_forbes_billionaires():
    """
    Scrape the Forbes billionaires list into a DataFrame and CSV.

    Returns:
        pandas.DataFrame: One row per billionaire with the columns "Rank",
        "Name", "Net Worth", "Age", "Citizenship", "Source" and "Industry".
        Missing details are stored as "-". The same data is written to
        'forbes_billionaires_data.csv'.

    Raises:
        requests.HTTPError: If the page responds with an HTTP error status.
    """
    url = "https://www.forbes.com/billionaires/"

    columns = ['Rank', 'Name', 'Net Worth', 'Age', 'Citizenship', 'Source', 'Industry']
    # Column name -> CSS class of the <div> holding that value inside 'personInfo'.
    detail_classes = {
        'Net Worth': 'netWorth',
        'Age': 'age',
        'Citizenship': 'countryOfCitizenship',
        'Source': 'source',
        'Industry': 'category',
    }

    def text_or_dash(node):
        # Stripped text of a tag, or '-' when the tag is missing or empty.
        text = node.get_text(strip=True) if node is not None else ''
        return text or '-'

    # Make a request to the Forbes Billionaires page
    # NOTE(review): if the list is rendered client-side, the static HTML may hold
    # no 'personName' elements and the result will be empty - confirm.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    billionaires_data = []

    for billionaire in soup.find_all('div', class_='personName'):
        row = {
            'Rank': text_or_dash(billionaire.find_previous('div', class_='rank')),
            'Name': text_or_dash(billionaire),
        }

        # The remaining details hang off the enclosing 'personInfo' container,
        # which may be absent.
        parent_div = billionaire.find_parent('div', class_='personInfo')
        for column, css_class in detail_classes.items():
            node = parent_div.find('div', class_=css_class) if parent_div is not None else None
            row[column] = text_or_dash(node)

        billionaires_data.append(row)

    # Explicit columns keep the CSV header intact even when nothing was found.
    df = pd.DataFrame(billionaires_data, columns=columns)

    # Save the DataFrame to a CSV file
    df.to_csv('forbes_billionaires_data.csv', index=False)
    return df

# Run the scraper: requests the Forbes billionaires page and saves the
# extracted rows to forbes_billionaires_data.csv
scrape_forbes_billionaires()


8. Write a program to extract at least 500 comments, along with each comment's upvote count and the time it was posted, 
from any YouTube video.

In [None]:
import google_auth_oauthlib.flow
import googleapiclient.discovery

def get_youtube_comments(api_key, video_id, max_comments=500):
    """
    Fetch top-level comments of a YouTube video through the YouTube Data API.

    The commentThreads.list endpoint returns at most 100 items per call, so
    the comments are collected page by page until max_comments is reached or
    the video has no more comments.

    Args:
        api_key (str): YouTube Data API key.
        video_id (str): ID of the video whose comments are fetched.
        max_comments (int, optional): Maximum number of comments to return (default: 500).

    Returns:
        list: Dictionaries with the keys "Comment", "Upvotes" and "Time Posted",
        at most max_comments of them. Empty when max_comments is not positive.
    """
    # Nothing to fetch; avoid building an API client for an empty request.
    if max_comments <= 0:
        return []

    page_size_limit = 100  # Largest maxResults value the endpoint accepts

    # Set up the YouTube Data API
    youtube = googleapiclient.discovery.build('youtube', 'v3', developerKey=api_key)

    comments_data = []
    page_token = None

    while len(comments_data) < max_comments:
        remaining = max_comments - len(comments_data)
        request_args = {
            'part': "snippet",
            'videoId': video_id,
            'maxResults': min(page_size_limit, remaining),
        }
        # The token is only sent when continuing from a previous page.
        if page_token:
            request_args['pageToken'] = page_token

        response = youtube.commentThreads().list(**request_args).execute()

        # Extract comments, comment upvotes, and time when comments were posted
        for item in response.get('items', []):
            snippet = item['snippet']['topLevelComment']['snippet']
            comments_data.append({
                'Comment': snippet['textDisplay'],
                'Upvotes': snippet['likeCount'],
                'Time Posted': snippet['publishedAt']
            })
            if len(comments_data) >= max_comments:
                break

        # No token means the last page has been reached.
        page_token = response.get('nextPageToken')
        if not page_token:
            break

    return comments_data

# Fill in a real API key and YouTube video ID in place of the placeholders below
api_key = 'your_api_key'
video_id = 'your_video_id'

comments_data = get_youtube_comments(api_key=api_key, video_id=video_id)

# Preview at most five of the fetched comments, each followed by blank lines
for i, comment_info in enumerate(comments_data[:5], start=1):
    print(
        f"Comment {i}:",
        f"Comment: {comment_info['Comment']}",
        f"Upvotes: {comment_info['Upvotes']}",
        f"Time Posted: {comment_info['Time Posted']}",
        "\n",
        sep="\n",
    )


In [None]:
!pip install google-auth google-auth-oauthlib google-auth-httplib2 google-api-python-client


9. Write a Python program to scrape data for all available hostels from https://www.hostelworld.com/ in the 
“London” location. You have to scrape the hostel name, distance from city centre, ratings, total reviews, overall 
reviews, privates from price, dorms from price, facilities and property description.

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_hostelworld_data(location='London', date_from='2024-02-01', date_to='2024-02-07'):
    """
    Scrape the first Hostelworld search-results page for a location.

    Args:
        location (str, optional): City to search for (default: 'London').
        date_from (str, optional): Check-in date, YYYY-MM-DD (default: '2024-02-01').
        date_to (str, optional): Check-out date, YYYY-MM-DD (default: '2024-02-07').

    Returns:
        pandas.DataFrame: One row per hostel with the columns "Name",
        "Distance from City Centre", "Ratings", "Total Reviews",
        "Overall Reviews", "Privates from Price", "Dorms from Price",
        "Facilities" and "Property Description". Missing details are stored
        as "-". The same data is written to '<location>_hostels_data.csv'.

    Raises:
        requests.HTTPError: If the page responds with an HTTP error status.
    """
    columns = [
        'Name',
        'Distance from City Centre',
        'Ratings',
        'Total Reviews',
        'Overall Reviews',
        'Privates from Price',
        'Dorms from Price',
        'Facilities',
        'Property Description',
    ]

    def text_or_dash(node):
        # Stripped text of a tag, or '-' when the tag is missing or empty.
        text = node.get_text().strip() if node is not None else ''
        return text or '-'

    def word_or_dash(text, index):
        # The index-th whitespace-separated word of text, or '-' if there is none.
        words = text.split()
        return words[index] if len(words) > index else '-'

    # Passing the query through 'params' lets requests URL-encode the values.
    # NOTE(review): the default dates are fixed and the country is always
    # 'England' - pass current dates when running this.
    response = requests.get(
        'https://www.hostelworld.com/s',
        params={
            'q': location,
            'country': 'England',
            'city': location,
            'dateFrom': date_from,
            'dateTo': date_to,
            'number_of_guests': 1,
            'page': 1,
        },
        timeout=30,
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    hostels_data = []

    for hostel in soup.find_all('div', class_='property-card'):
        # The reviews block supplies two values: its first and fourth words.
        reviews_text = text_or_dash(hostel.find('div', class_='reviews'))

        facility_names = [
            facility.get_text().strip()
            for facility in hostel.find_all('li', class_='facility-badge')
        ]

        # The description is taken from the first <span> after 'rating-factors'.
        rating_factors = hostel.find('div', class_='rating-factors')
        description_node = rating_factors.find_next('span') if rating_factors is not None else None

        hostels_data.append({
            'Name': text_or_dash(hostel.find('h2', class_='title title-6')),
            'Distance from City Centre': word_or_dash(
                text_or_dash(hostel.find('span', class_='description')), 0
            ),
            'Ratings': text_or_dash(hostel.find('div', class_='score orange big')),
            'Total Reviews': word_or_dash(reviews_text, 0),
            'Overall Reviews': word_or_dash(reviews_text, 3),
            'Privates from Price': text_or_dash(
                hostel.find('div', class_='price-col privates from-price')
            ),
            'Dorms from Price': text_or_dash(
                hostel.find('div', class_='price-col dorms from-price')
            ),
            'Facilities': ', '.join(facility_names) or '-',
            'Property Description': text_or_dash(description_node),
        })

    # Explicit columns keep the CSV header intact even when nothing was found.
    df = pd.DataFrame(hostels_data, columns=columns)

    # Save the DataFrame to a CSV file
    df.to_csv(f'{location}_hostels_data.csv', index=False)
    return df

# Run the scraper with its default location ('London'); the extracted rows
# are saved to London_hostels_data.csv
scrape_hostelworld_data()
