<a href="https://colab.research.google.com/github/SammyGbabs/APIs-and-Web-scrapping-/blob/main/PLD5_Web_Scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hockey Teams Website Scraping

In [None]:
# Importing libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [None]:
# Define a list to collect all rows across all pages
all_row_data = []

# Upper bound (in seconds) on how long a single request may take; without it
# requests.get() can wait indefinitely on an unresponsive server
REQUEST_TIMEOUT_SECONDS = 10

# in our case taking only the first 3 pages
for page in range(1, 4):
    url = f'https://www.scrapethissite.com/pages/forms/?page_num={page}'

    # Sending get request to the pages; raise on HTTP error statuses so an
    # error page is never parsed as if it were the results table
    res = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    res.raise_for_status()

    # Creating soup object
    soup = BeautifulSoup(res.text, 'html.parser')

    # Finding the first table in the page
    table = soup.find('table')
    if table is None:
        # soup.find() returns None when nothing matches; fail with a clear
        # message instead of an AttributeError on the next line
        raise RuntimeError(f'No <table> element found at {url}')

    # Extracting row data (ignore the first header row)
    rows = table.find_all('tr')[1:]

    # Looping through each row and extracting the columns
    for row in rows:
        row_data = row.find_all('td')
        individual_row_data = [data.text.strip() for data in row_data]
        if individual_row_data:  # Avoid appending empty lists
            all_row_data.append(individual_row_data)

print(all_row_data)

[['Boston Bruins', '1990', '44', '24', '', '0.55', '299', '264', '35'], ['Buffalo Sabres', '1990', '31', '30', '', '0.388', '292', '278', '14'], ['Calgary Flames', '1990', '46', '26', '', '0.575', '344', '263', '81'], ['Chicago Blackhawks', '1990', '49', '23', '', '0.613', '284', '211', '73'], ['Detroit Red Wings', '1990', '34', '38', '', '0.425', '273', '298', '-25'], ['Edmonton Oilers', '1990', '37', '37', '', '0.463', '272', '272', '0'], ['Hartford Whalers', '1990', '31', '38', '', '0.388', '238', '276', '-38'], ['Los Angeles Kings', '1990', '46', '24', '', '0.575', '340', '254', '86'], ['Minnesota North Stars', '1990', '27', '39', '', '0.338', '256', '266', '-10'], ['Montreal Canadiens', '1990', '39', '30', '', '0.487', '273', '249', '24'], ['New Jersey Devils', '1990', '32', '33', '', '0.4', '272', '264', '8'], ['New York Islanders', '1990', '25', '45', '', '0.312', '223', '290', '-67'], ['New York Rangers', '1990', '36', '31', '', '0.45', '297', '265', '32'], ['Philadelphia Flyer

In [None]:
# Column titles: take every <th> cell of `table` (the table object left over
# from the scraping loop) and keep only its whitespace-stripped text
table_titles = table.find_all('th')

text_table_titles = []
for header_cell in table_titles:
    text_table_titles.append(header_cell.text.strip())

print(text_table_titles)

['Team Name', 'Year', 'Wins', 'Losses', 'OT Losses', 'Win %', 'Goals For (GF)', 'Goals Against (GA)', '+ / -']


In [None]:
# Assemble the scraped rows into a DataFrame, using the header texts as
# column labels, and display it as the cell output
df = pd.DataFrame(data=all_row_data, columns=text_table_titles)
df

Unnamed: 0,Team Name,Year,Wins,Losses,OT Losses,Win %,Goals For (GF),Goals Against (GA),+ / -
0,Boston Bruins,1990,44,24,,0.55,299,264,35
1,Buffalo Sabres,1990,31,30,,0.388,292,278,14
2,Calgary Flames,1990,46,26,,0.575,344,263,81
3,Chicago Blackhawks,1990,49,23,,0.613,284,211,73
4,Detroit Red Wings,1990,34,38,,0.425,273,298,-25
...,...,...,...,...,...,...,...,...,...
70,Calgary Flames,1993,42,29,,0.5,302,256,46
71,Chicago Blackhawks,1993,39,36,,0.464,254,240,14
72,Dallas Stars,1993,42,29,,0.5,286,265,21
73,Detroit Red Wings,1993,46,30,,0.548,356,275,81


In [None]:
# Write the DataFrame to a CSV file, leaving out the index column
output_csv = 'hockey_teams_data.csv'
df.to_csv(output_csv, index=False)

# Amazon Website Scraping

In [20]:
import requests
from bs4 import BeautifulSoup
import os
import urllib

# Request headers sent with every page fetch so the request resembles one
# coming from a regular browser
headers = {
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'
    ),
    'Accept-Language': 'en-US, en;q=0.5',
}

# One Amazon search-results URL per category keyword
urls = [
    f'https://www.amazon.com/s?k={keyword}'
    for keyword in ('electronics', 'watches', 'toys', 'clothing', 'kitchen')
]

# Create a directory to save images. exist_ok=True makes the cell safe to
# re-run and removes the gap between checking for the directory and creating it
os.makedirs('amazon_images', exist_ok=True)

# Function to download images
def download_image(url, file_path):
    """Fetch the resource at ``url`` and store it at ``file_path``.

    Args:
        url: Address of the image to download.
        file_path: Path of the local file to write.

    Raises:
        Any exception raised by ``urllib.request.urlretrieve`` is propagated
        to the caller unchanged.
    """
    # A bare `import urllib` does not load the `request` submodule; the
    # attribute only exists if some other module has already imported it.
    # Import it explicitly so this function does not rely on that accident.
    import urllib.request

    urllib.request.urlretrieve(url, file_path)

# Helper: build a file-name stem from a product title
def _safe_image_name(product_name, max_length=50):
    """Return ``product_name`` reduced to a short, filesystem-friendly stem.

    Whitespace and the characters ``\\ / : * ? " < > |`` are removed, the
    result is cut to ``max_length`` characters, and ``'product'`` is used when
    nothing is left.
    """
    forbidden = set('\\/:*?"<>|')
    cleaned = ''.join(
        ch for ch in product_name
        if not ch.isspace() and ch not in forbidden
    )
    return cleaned[:max_length] or 'product'


# Scraping function
def scrape_amazon_product_data(url, max_items=1, timeout=10):
    """Scrape one Amazon search-results page and save product images.

    Fetches ``url`` with the module-level ``headers``, locates the search
    results container, and for up to ``max_items`` results prints the product
    name and downloads its image into ``amazon_images/`` via
    ``download_image``. Problems are reported with ``print`` rather than
    raised, so one failing page or item does not stop the caller's loop.

    Args:
        url: Search-results page to scrape.
        max_items: Maximum number of results to process (``None`` means no
            limit). Defaults to 1, the previously hard-coded value.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        None.
    """
    try:
        res = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        print(f"Request failed for {url}: {e}")
        return

    # Report HTTP errors as such instead of letting them surface as the
    # misleading "No products found" message below
    if not res.ok:
        print(f"Request for {url} returned HTTP {res.status_code}; skipping.")
        return

    soup = BeautifulSoup(res.content, 'html.parser')

    # A CSS selector matches the four classes in any order; searching for the
    # full class string only matches that exact attribute value
    products_container = soup.select_one(
        'div.s-main-slot.s-result-list.s-search-results.sg-row'
    )

    if not products_container:
        print("No products found on the page or the page structure has changed.")
        return

    items = products_container.find_all(
        'div', {'data-component-type': 's-search-result'}, limit=max_items
    )

    for item in items:
        # Extract the title heading and the image tag, skipping results that
        # lack either one
        heading = item.h2
        image = item.find('img', {'class': 's-image'})
        if heading is None or image is None or not image.get('src'):
            print("Error processing item: missing product title or image")
            continue

        product_name = heading.text.strip()
        image_url = image['src']
        image_path = f'amazon_images/{_safe_image_name(product_name)}.jpg'

        try:
            # Save the image
            download_image(image_url, image_path)
        except Exception as e:
            # Deliberately broad: scraping is best-effort, so a failed
            # download is reported and the remaining items are still tried
            print(f"Error processing item: {e}")
            continue

        # Print product name and the image path
        print(f'Product: {product_name}')
        print(f'Image saved at: {image_path}')
        print('-' * 40)


# Run the scraper once per category search URL, announcing each one first
for category_url in urls:
    print(f"Scraping category: {category_url}")
    scrape_amazon_product_data(category_url)

Scraping category: https://www.amazon.com/s?k=electronics
No products found on the page or the page structure has changed.
Scraping category: https://www.amazon.com/s?k=watches
No products found on the page or the page structure has changed.
Scraping category: https://www.amazon.com/s?k=toys
No products found on the page or the page structure has changed.
Scraping category: https://www.amazon.com/s?k=clothing
Product: AUTOMET
Image saved at: amazon_images/AUTOMET.jpg
----------------------------------------
Scraping category: https://www.amazon.com/s?k=kitchen
No products found on the page or the page structure has changed.
