In [None]:
'''Write a Python program that searches www.amazon.in for all the products matching a particular product name. The
product to be searched for will be taken as input from the user. For example, if the user input is ‘guitar’, then
search for guitars.'''

import requests
from bs4 import BeautifulSoup

def search_amazon(product):
    """Search amazon.in for ``product`` and print each listing on the first results page.

    For every listing that has a title, the title, the whole-rupee price and
    the star-rating text are printed. A missing price or rating is printed
    as ``'N/A'``.

    Args:
        product: Search keywords, e.g. ``'guitar'``.

    Returns:
        None. Results are written to stdout.
    """
    # Set up the URL and query parameters
    url = 'https://www.amazon.in/s'
    params = {'k': product}

    # NOTE(review): Amazon tends to answer the default python-requests
    # User-Agent with a bot-check page, so a browser-like one is sent.
    headers = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/120.0 Safari/537.36'),
        'Accept-Language': 'en-IN,en;q=0.9',
    }

    # Send a GET request to Amazon; the timeout keeps a stalled connection
    # from hanging the program forever.
    response = requests.get(url, params=params, headers=headers, timeout=30)
    if response.status_code != 200:
        print(f'Amazon returned HTTP {response.status_code}; nothing to parse.')
        return

    # Parse the HTML content of the page using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all the product listings on the page
    listings = soup.find_all('div', {'class': 's-result-item'})

    # A distinct loop name is used so the ``product`` argument is not clobbered.
    for listing in listings:
        # Some 's-result-item' containers carry no <h2>; there is nothing to
        # report for them, and calling .text on None would raise.
        title_element = listing.find('h2')
        if title_element is None:
            continue
        title = title_element.text.strip()

        # Get the product price, if available
        price_element = listing.find('span', {'class': 'a-price-whole'})
        price = price_element.text.strip() if price_element else 'N/A'

        # Get the product rating, if available
        rating_element = listing.find('span', {'class': 'a-icon-alt'})
        rating = rating_element.text.strip() if rating_element else 'N/A'

        # Print the information for the product
        print(f'{title}\nPrice: {price}\nRating: {rating}\n')

search_amazon('guitar')

In [None]:
'''For the above question, now scrape the following details of each product listed in the first 3 pages of your search
results and save them in a data frame and a CSV file. If a product has fewer than 3 pages of search results, then
scrape all the products available under that product name. Details to be scraped are: "Brand
Name", "Name of the Product", "Price", "Return/Exchange", "Expected Delivery", "Availability" and
"Product URL". If any of the details are missing for a product, replace them with "-".'''

import pandas as pd

def scrape_products(product, pages):
    """Scrape amazon.in search results for ``product`` into a DataFrame and a CSV file.

    Result pages ``1..pages`` are requested in order. Paging stops early when
    a request does not return HTTP 200 or a page yields no product rows, so a
    search with fewer result pages than ``pages`` returns whatever exists.
    Any detail that cannot be found for a listing is recorded as ``'-'``.

    Args:
        product: Search keywords, e.g. ``'guitar'``. Also used to build the
            output file name ``'<product>_products.csv'``.
        pages: Maximum number of result pages to fetch.

    Returns:
        A pandas DataFrame with the columns "Brand Name", "Name of the
        Product", "Price", "Return/Exchange", "Expected Delivery",
        "Availability" and "Product URL". The same data is written to the
        CSV file in the current working directory.
    """
    from urllib.parse import urljoin

    base_url = 'https://www.amazon.in'
    search_url = base_url + '/s'
    missing = '-'
    columns = [
        'Brand Name',
        'Name of the Product',
        'Price',
        'Return/Exchange',
        'Expected Delivery',
        'Availability',
        'Product URL',
    ]
    # NOTE(review): Amazon tends to answer the default python-requests
    # User-Agent with a bot-check page, so a browser-like one is sent.
    headers = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/120.0 Safari/537.36'),
        'Accept-Language': 'en-IN,en;q=0.9',
    }

    def text_of(element):
        """Return the stripped text of a tag, or the placeholder if absent or empty."""
        if element is None:
            return missing
        return element.text.strip() or missing

    # Create an empty list to store the data for each product
    products_data = []

    # Iterate over the specified number of pages of search results
    for page in range(1, pages + 1):
        params = {'k': product, 'page': page}

        # The search URL lives in its own variable: reusing one name for both
        # the search URL and each product URL would send later page requests
        # to a product page.
        response = requests.get(search_url, params=params, headers=headers, timeout=30)
        if response.status_code != 200:
            print(f'Page {page}: Amazon returned HTTP {response.status_code}; stopping.')
            break

        # Parse the HTML content of the page using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all the product listings on the page
        listings = soup.find_all('div', {'class': 's-result-item'})
        rows_before = len(products_data)

        # The loop variable must not be called ``product``: the argument is
        # still needed afterwards for the CSV file name.
        for listing in listings:
            # Containers without an <h2> carry no product title; skip them
            # instead of recording an all-placeholder row.
            title_element = listing.find('h2')
            if title_element is None:
                continue
            title = text_of(title_element)

            # The link may be inside the heading or wrapped around it.
            link = title_element.find('a') or title_element.find_parent('a')
            href = link.get('href') if link is not None else None
            product_url = urljoin(base_url, href) if href else missing

            # Get the product brand and name: first word is taken as the
            # brand, the remainder as the name.
            brand_element = listing.find('span', {'class': 'a-size-base-plus'})
            brand_text = brand_element.text.strip() if brand_element else ''
            if brand_text:
                brand_name, _, remainder = brand_text.partition(' ')
                name = remainder.strip() or title
            else:
                brand_name = missing
                name = title

            # Price and availability are looked up independently so that a
            # missing availability badge cannot abort a row that has a price.
            price = text_of(listing.find('span', {'class': 'a-price-whole'}))
            availability = text_of(
                listing.find('span', {'class': 'a-size-medium a-color-success'})
            )

            # Get the product return/exchange policy
            return_section = listing.find('div', {'class': 'a-section a-spacing-none'})
            return_row = (
                return_section.find('div', {'class': 'a-row a-size-small'})
                if return_section is not None else None
            )
            return_policy = text_of(return_row)

            # Get the product expected delivery date (second <div> of the section)
            delivery_section = listing.find(
                'div', {'class': 'a-section a-spacing-base a-text-center'}
            )
            delivery_divs = delivery_section.find_all('div') if delivery_section is not None else []
            if len(delivery_divs) > 1:
                delivery_text = delivery_divs[1].text.strip()
                expected_delivery = delivery_text.replace('Delivery by ', '') or missing
            else:
                expected_delivery = missing

            # Add the data for this product to the list
            products_data.append({
                'Brand Name': brand_name,
                'Name of the Product': name,
                'Price': price,
                'Return/Exchange': return_policy,
                'Expected Delivery': expected_delivery,
                'Availability': availability,
                'Product URL': product_url,
            })

        # No rows from this page means the search has run out of results.
        if len(products_data) == rows_before:
            break

    # Explicit columns keep the header present even when nothing was scraped.
    df = pd.DataFrame(products_data, columns=columns)

    # Save the data to a CSV file
    df.to_csv(f'{product}_products.csv', index=False)

    return df

In [None]:
'''Write a Python program to access the search bar and search button on images.google.com and scrape 10
images each for the keywords ‘fruits’, ‘cars’, ‘Machine Learning’, ‘Guitar’ and ‘Cakes’.'''

from selenium import webdriver
from selenium.webdriver.common.by import By
import urllib.request
import time

# Downloads up to `images_per_term` thumbnails from Google Images for every
# entry in `search_terms`, saving them as '<term>_<n>.jpg' in the working
# directory.

# create a webdriver object and set options
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--headless')

# set the path to the chromedriver.exe file (make sure to download and place it in the working directory)
driver_path = './chromedriver.exe'
driver = webdriver.Chrome(driver_path, options=options)

# define the search terms
search_terms = ['fruits', 'cars', 'Machine Learning', 'Guitar', 'Cakes']
images_per_term = 10

try:
    # loop through the search terms and scrape the images for each term
    for term in search_terms:
        # Submitting a search loads a new page, which invalidates elements
        # located earlier. Start from the landing page and locate the search
        # bar and button again for every term.
        driver.get('https://www.google.com/imghp')
        search_bar = driver.find_element(By.NAME, 'q')
        search_button = driver.find_element(
            By.CSS_SELECTOR, 'button[jsaction="click:trigger.search"]'
        )

        # enter the search term in the search bar and click the search button
        search_bar.clear()
        search_bar.send_keys(term)
        search_button.click()
        time.sleep(2)

        # Scroll only until enough thumbnails are present (or the page stops
        # growing); loading the whole feed is unnecessary for a handful.
        thumbnails = driver.find_elements(By.CSS_SELECTOR, 'img.rg_i')
        last_height = driver.execute_script("return document.body.scrollHeight")
        while len(thumbnails) < images_per_term:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)
            thumbnails = driver.find_elements(By.CSS_SELECTOR, 'img.rg_i')
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # scrape the image urls and download the images
        for count, image in enumerate(thumbnails[:images_per_term], start=1):
            try:
                # NOTE(review): thumbnails not yet rendered may keep their
                # address in 'data-src' rather than 'src' - confirm on the live page.
                image_url = image.get_attribute('src') or image.get_attribute('data-src')
                if not image_url:
                    raise ValueError('thumbnail has no image source')
                urllib.request.urlretrieve(image_url, f'{term}_{count}.jpg')
                print(f'{term} image {count} downloaded')
            except Exception as exc:
                # One failed download must not abort the rest; the cause is
                # reported instead of being swallowed.
                print(f'Error downloading {term} image {count}: {exc}')
finally:
    # close the webdriver even when a lookup or navigation step fails
    driver.quit()

In [None]:
'''Write a Python program to search for a smartphone (e.g. OnePlus Nord, Pixel 4a, etc.) on www.flipkart.com
and scrape the following details for all the search results displayed on the 1st page. Details to be scraped: “Brand
Name”, “Smartphone name”, “Colour”, “RAM”, “Storage(ROM)”, “Primary Camera”,
“Secondary Camera”, “Display Size”, “Battery Capacity”, “Price”, “Product URL”.'''

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.common.by import By
import time

# Searches flipkart.com for `search_term` and prints the details of every
# result on the first page. A detail that cannot be found is printed as '-'.

MISSING = '-'


def element_text(parent, css_selector):
    """Return the text of the first match under ``parent``, or '-' if absent or empty."""
    try:
        return parent.find_element(By.CSS_SELECTOR, css_selector).text or MISSING
    except NoSuchElementException:
        return MISSING


# create a webdriver object and set options
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--headless')

# set the path to the chromedriver.exe file (make sure to download and place it in the working directory)
driver_path = './chromedriver.exe'
driver = webdriver.Chrome(driver_path, options=options)

try:
    # navigate to flipkart.com
    driver.get('https://www.flipkart.com/')

    # close the login popup if it appears; its absence is not an error
    try:
        driver.find_element(By.XPATH, '/html/body/div[2]/div/div/button').click()
    except WebDriverException:
        pass

    # locate the search bar and search button
    search_bar = driver.find_element(By.NAME, 'q')
    search_button = driver.find_element(
        By.XPATH, '/html/body/div/div/div[1]/div[1]/div[2]/div[2]/form/div/button'
    )

    # enter the search term and click the search button
    search_term = 'Oneplus Nord' # change this to the desired smartphone name
    search_bar.send_keys(search_term)
    search_button.click()
    time.sleep(2)

    # loop through the search results and scrape the required details
    results = driver.find_elements(By.CSS_SELECTOR, 'div._2kHMtA')
    for result in results:
        brand = element_text(result, 'div._2WkVRV')

        # name and URL come from the same anchor, so look it up once
        try:
            link = result.find_element(By.CSS_SELECTOR, 'a._1fQZEK')
            name = link.text or MISSING
            url = link.get_attribute('href') or MISSING
        except NoSuchElementException:
            name = url = MISSING

        # Read the specification bullets once and pad the list, so a result
        # with fewer than seven bullets is still reported rather than dropped.
        try:
            details = result.find_element(By.CSS_SELECTOR, 'ul._1xgFaf')
            spec_items = [item.text or MISSING
                          for item in details.find_elements(By.CSS_SELECTOR, 'li')]
        except NoSuchElementException:
            spec_items = []
        spec_items += [MISSING] * (7 - len(spec_items))
        # NOTE(review): the bullet order is assumed to be colour, RAM, ROM,
        # cameras, display, battery - confirm against the live page.
        color, ram, rom, camera, front_camera, display, battery = spec_items[:7]

        price = element_text(result, 'div._30jeq3._1_WHN1')
        print(f'Brand: {brand}\nName: {name}\nColor: {color}\nRAM: {ram}\nROM: {rom}\nPrimary Camera: {camera}\nSecondary Camera: {front_camera}\nDisplay Size: {display}\nBattery Capacity: {battery}\nPrice: {price}\nProduct URL: {url}\n')
finally:
    # close the webdriver even when a lookup or navigation step fails
    driver.quit()

In [None]:
# Write a program to scrape the geospatial coordinates (latitude, longitude) of a city searched on Google Maps

import requests
from bs4 import BeautifulSoup
from urllib.parse import quote

# Looks up a city on Google Maps and prints the latitude/longitude found in
# the returned HTML, or a notice when the page holds no coordinates element.

# ask user to enter the city name
city = input('Enter the name of the city: ').strip()

# send a GET request to Google Maps with the city name as the search query;
# the name is percent-encoded so characters such as '/', '?' or '#' in the
# input cannot change the structure of the URL
url = f"https://www.google.com/maps/search/{quote(city, safe='')}"
response = requests.get(url, timeout=30)

# parse the HTML response using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# locate the div containing the coordinates and extract the latitude and longitude
coordinates_div = soup.find('div', class_='ugiz4pqJLAG__primary-text gm2-body-2')
coordinates = coordinates_div.text.split(',') if coordinates_div is not None else []

if len(coordinates) >= 2:
    latitude = coordinates[0].strip()
    longitude = coordinates[1].strip()

    # print the latitude and longitude
    print(f'Latitude: {latitude}\nLongitude: {longitude}')
else:
    # find() returns None when the element is absent; report it rather than
    # failing with an AttributeError/IndexError
    print(f'Could not find coordinates for {city!r} in the Google Maps response.')


In [None]:
# Write a program to scrape all the available details of the best gaming laptops from digit.in

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Scrapes the digit.in "best gaming laptops" list: name, price and rating
# from the list page, plus the specification table from each laptop's own
# page. Anything that cannot be found is reported as '-'.


def text_or_dash(element):
    """Return the stripped text of a tag, or '-' when the tag is absent or empty."""
    if element is None:
        return '-'
    return element.text.strip() or '-'


# send a GET request to the URL of the webpage we want to scrape
url = "https://www.digit.in/top-products/best-gaming-laptops-40.html"
response = requests.get(url)

# create soup object to parse HTML
soup = BeautifulSoup(response.content, 'html.parser')

# find the section containing all the laptops
laptops = soup.find_all('div', class_='TopNumbeHeading sticky-footer')

# create lists to store the laptop details
names = []
prices = []
ratings = []
specs = []

# iterate over each laptop and extract details
for laptop in laptops:
    # extract name, price, and rating
    link = laptop.a
    name = text_or_dash(link)
    price_element = laptop.find('div', class_='smprice')
    # the first character of the price text is dropped, as before
    price = (price_element.text.strip()[1:] if price_element is not None else '') or '-'
    rating = text_or_dash(laptop.find('div', class_='rating'))

    # extract specs from the laptop's own page; separate names are used so
    # the list page's url/response/soup are not overwritten
    specs_dict = {}
    href = link.get('href') if link is not None else None
    if href:
        detail_response = requests.get(urljoin(url, href))
        detail_soup = BeautifulSoup(detail_response.content, 'html.parser')
        specs_table = detail_soup.find('div', class_='specifications_table')
        spec_rows = specs_table.find_all('tr') if specs_table is not None else []
        for row in spec_rows:
            key_cell = row.find('td', class_='smkey')
            value_cell = row.find('td', class_='smvalue')
            # rows without both cells carry no key/value pair
            if key_cell is None or value_cell is None:
                continue
            specs_dict[key_cell.text.strip()] = value_cell.text.strip()

    # add details to lists
    names.append(name)
    prices.append(price)
    ratings.append(rating)
    specs.append(specs_dict)

# print the details of each laptop
for number, (name, price, rating, spec) in enumerate(zip(names, prices, ratings, specs), start=1):
    print(f"Laptop {number}:")
    print(f"Name: {name}")
    print(f"Price: {price}")
    print(f"Rating: {rating}")
    print("Specifications:")
    for key, value in spec.items():
        print(f"\t{key}: {value}")
    print()

In [None]:
'''Write a Python program to scrape the details of all billionaires from www.forbes.com. Details to be scraped:
“Rank”, “Name”, “Net worth”, “Age”, “Citizenship”, “Source”, “Industry”.'''

import requests
from bs4 import BeautifulSoup

# Scrapes the billionaires table from forbes.com and prints rank, name, net
# worth, age, citizenship, source and industry for every row. Any cell that
# cannot be found is reported as '-'.


def cell_text(cell):
    """Return the stripped text of a table cell, or '-' when it is absent or empty."""
    if cell is None:
        return '-'
    return cell.text.strip() or '-'


# send a GET request to the URL of the webpage we want to scrape
url = "https://www.forbes.com/billionaires/"
response = requests.get(url)

# create soup object to parse HTML
soup = BeautifulSoup(response.content, 'html.parser')

# find the table containing all the billionaires
table = soup.find('table', class_='table')

# create lists to store the billionaire details
ranks = []
names = []
net_worths = []
ages = []
citizenships = []
sources = []
industries = []

# find() returns None when the table is absent; report it instead of failing
# with an AttributeError
if table is None:
    # NOTE(review): the list may be built by JavaScript and so be missing from
    # the raw HTML - confirm against the live page.
    print("No billionaires table found in the Forbes response.")
    rows = []
else:
    rows = table.find_all('tr')

# iterate over each row in the table (skipping the header) and extract details
for row in rows[1:]:
    # extract rank, name, and net worth
    rank = cell_text(row.find('td', class_='rank'))
    name = cell_text(row.find('td', class_='name'))
    net_worth_cell = row.find('td', class_='netWorth')
    # the first character of the net worth text is dropped, as before
    net_worth = (net_worth_cell.text.strip()[1:] if net_worth_cell is not None else '') or '-'

    # extract age, citizenship, source, and industry; a short row yields '-'
    # for the cells it lacks
    columns = row.find_all('td', class_='')
    age, citizenship, source, industry = (
        cell_text(columns[index]) if index < len(columns) else '-'
        for index in range(2, 6)
    )

    # add details to lists
    ranks.append(rank)
    names.append(name)
    net_worths.append(net_worth)
    ages.append(age)
    citizenships.append(citizenship)
    sources.append(source)
    industries.append(industry)

# print the details of each billionaire
details = zip(ranks, names, net_worths, ages, citizenships, sources, industries)
for number, (rank, name, net_worth, age, citizenship, source, industry) in enumerate(details, start=1):
    print(f"Billionaire {number}:")
    print(f"Rank: {rank}")
    print(f"Name: {name}")
    print(f"Net worth: {net_worth}")
    print(f"Age: {age}")
    print(f"Citizenship: {citizenship}")
    print(f"Source: {source}")
    print(f"Industry: {industry}")
    print()

In [None]:
# Write a program to extract at least 500 comments, each comment's upvote count and the time it was posted from any YouTube video.

# Requires the Google API client, which provides the `googleapiclient` package
# imported below. Install it once from a terminal (or with `%pip` in a notebook cell):
#     pip install google-api-python-client google-auth google-auth-oauthlib

from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from datetime import datetime, timedelta
import time

# Replace with your own API key
API_KEY = "YOUR_API_KEY_HERE"

# Set up the YouTube Data API client
youtube = build("youtube", "v3", developerKey=API_KEY)

# ID of the video whose comments are fetched; replace with a real video ID
video_id = "VIDEO_ID_HERE"

# Get the comment thread iterator
def get_comment_threads(video_id):
    """Yield pages of comment threads for ``video_id``, one API response at a time.

    Each yielded value is the response of one ``commentThreads().list`` call
    made through the module-level ``youtube`` client (snippet part, plain
    text, up to 100 results per call). Further pages are requested for as
    long as a response contains ``'nextPageToken'``, with a one-second pause
    before every follow-up request.

    An ``HttpError`` is not propagated: its status and content are printed
    and the generator simply ends.
    """
    # Extra keyword arguments for the request; empty for the first page and
    # holding the page token for every later one.
    paging = {}
    try:
        while True:
            request = youtube.commentThreads().list(
                part="snippet",
                videoId=video_id,
                textFormat="plainText",
                maxResults=100,
                **paging
            )
            response = request.execute()
            yield response

            if 'nextPageToken' not in response:
                return

            time.sleep(1)
            paging = {'pageToken': response['nextPageToken']}
    except HttpError as e:
        print(f"An HTTP error {e.resp.status} occurred:\n{e.content}")

# Collects up to MAX_COMMENTS top-level comments (text, like count and
# publication time) from the pages produced by get_comment_threads, then
# prints them.
MAX_COMMENTS = 500


def parse_published_at(value):
    """Parse a ``publishedAt`` timestamp with or without fractional seconds.

    Raises:
        ValueError: if ``value`` matches neither accepted layout.
    """
    for layout in ('%Y-%m-%dT%H:%M:%S%z', '%Y-%m-%dT%H:%M:%S.%f%z'):
        try:
            return datetime.strptime(value, layout)
        except ValueError:
            continue
    raise ValueError(f"Unrecognised publishedAt timestamp: {value!r}")


# Get the comments and their metadata
comments = []
for thread in get_comment_threads(video_id):
    for item in thread['items']:
        snippet = item['snippet']['topLevelComment']['snippet']
        comments.append({
            "text": snippet['textDisplay'],
            "upvotes": snippet['likeCount'],
            "timestamp": parse_published_at(snippet['publishedAt']).strftime('%Y-%m-%d %H:%M:%S')
        })
        if len(comments) >= MAX_COMMENTS:
            break
    # leave the outer loop too, so no further pages are requested
    if len(comments) >= MAX_COMMENTS:
        break

# Print the comments
for comment in comments:
    print(f"{comment['timestamp']} - {comment['upvotes']} upvotes\n{comment['text']}\n")


In [None]:
'''Write a Python program to scrape data for all available hostels from https://www.hostelworld.com/ in the
“London” location. You have to scrape the hostel name, distance from city centre, ratings, total reviews, overall
reviews, privates from price, dorms from price, facilities and property description.'''

import requests
from bs4 import BeautifulSoup

# Scrapes the Hostelworld search results page for London and prints the
# details of every hostel card found. A detail that cannot be found is
# printed as '-'.


def text_or_dash(element):
    """Return the stripped text of a tag, or '-' when the tag is absent or empty."""
    if element is None:
        return "-"
    return element.text.strip() or "-"


# Define the URL for the search results page
url = "https://www.hostelworld.com/search?search_keywords=London,%20England&country=England&city=London&date_from=2023-03-10&date_to=2023-03-13&number_of_guests=1"

# Send a GET request to the URL and get the response
response = requests.get(url)

# Parse the HTML content of the response using BeautifulSoup
soup = BeautifulSoup(response.content, "html.parser")

# Find all the hostel listings on the search results page
hostel_listings = soup.find_all("div", class_="property-card")

if not hostel_listings:
    # NOTE(review): the cards may be rendered by JavaScript and so be missing
    # from the raw HTML - confirm against the live page.
    print("No hostel listings found in the Hostelworld response.")

# Loop over the hostel listings and extract the data we want
for listing in hostel_listings:
    # Get the hostel name
    hostel_name = text_or_dash(listing.find("h2", class_="title"))

    # Get the distance from the city center
    distance = text_or_dash(listing.find("span", class_="description"))

    # Get the overall rating and total reviews
    rating = text_or_dash(listing.find("div", class_="score orange big"))
    total_reviews = text_or_dash(listing.find("div", class_="reviews"))

    # Get the prices for privates and dorms: the first price span and the
    # element that follows it. Each step is guarded because find() and
    # find_next_sibling() return None when nothing matches.
    prices = listing.find("div", class_="price-col")
    private_tag = prices.find("span", class_="price") if prices is not None else None
    dorm_tag = private_tag.find_next_sibling() if private_tag is not None else None
    private_price = text_or_dash(private_tag)
    dorm_price = text_or_dash(dorm_tag)

    # Get the hostel facilities (the element following the facilities label)
    facilities_label = listing.find("div", class_="facilities-label")
    facilities_tag = facilities_label.find_next_sibling() if facilities_label is not None else None
    facilities = text_or_dash(facilities_tag)

    # Get the property description
    description = text_or_dash(listing.find("div", class_="property-description"))

    # Print the hostel data
    print("Hostel Name:", hostel_name)
    print("Distance from City Center:", distance)
    print("Overall Rating:", rating)
    print("Total Reviews:", total_reviews)
    print("Private Room Prices:", private_price)
    print("Dorm Room Prices:", dorm_price)
    print("Facilities:", facilities)
    print("Description:", description)
    print("\n")