## Nederland

### Aldi

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
from datetime import datetime
from selenium.webdriver.chrome.service import Service



# Scrape Aldi NL product tiles (title, price, previous price, unit) from a fixed
# list of pages and write them to Berrie.csv, replacing any existing file.

# Setup Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Initialize the Chrome driver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# List of URLs to scrape
urls = [
    "https://www.aldi.nl/zoeken.html?query=noten&searchCategory=Submitted%20Search&indices%5Bprod_nl_nl_assortment%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_nl_nl_assortment%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_nl_nl_offers%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_nl_nl_recipes%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_nl_nl_content%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&configure%5BclickAnalytics%5D=true",
    "https://www.aldi.nl/producten/chips-noten/noten-zaden-en-pitten.html",
    "https://www.aldi.nl/producten/chips-noten/zoutjes.html",
]

# Collected (title, current_price, promo_price, weight) tuples from every URL
all_products = []

# try/finally guarantees the browser is shut down even when a page times out
# or parsing raises, so no orphaned Chrome process is left behind.
try:
    for url in urls:
        driver.get(url)

        # Block until the product tiles are present (waits at most 10 seconds)
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "mod-article-tile--default"))
        )

        # Extract details for each article tile on the page
        for article in driver.find_elements(By.CLASS_NAME, "mod-article-tile--default"):
            # Parse only this tile's HTML so lookups cannot match another tile
            soup = BeautifulSoup(article.get_attribute('outerHTML'), "html.parser")

            title_element = soup.find('span', class_='mod-article-tile__title')
            title = title_element.get_text(strip=True) if title_element else 'Title not found'

            # NOTE(review): this value comes from the struck-through
            # `price__previous` element, which looks like the old price rather
            # than the promotional one - confirm the column meaning.
            promo_price_element = soup.find('s', class_='price__previous')
            promo_price = promo_price_element.get_text(strip=True) if promo_price_element else 'Promo price not found'

            current_price_element = soup.find('span', class_='price__wrapper')
            current_price = current_price_element.get_text(strip=True) if current_price_element else 'Price not found'

            weight_element = soup.find('span', class_='price__unit')
            weight = weight_element.get_text(strip=True) if weight_element else 'Weight not found'

            all_products.append((title, current_price, promo_price, weight))
finally:
    # Close the driver
    driver.quit()

# Date stamp added to every row
timestamp = datetime.now().strftime('%Y-%m-%d')  # Format: YYYY-MM-DD

# Write the data to a CSV file. Mode 'w' truncates the file, so the header is
# always the first row and no emptiness check is needed.
with open('Berrie.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Title', 'Price', 'Promo Price', 'Weight', 'Timestamp'])  # CSV header
    for product in all_products:
        writer.writerow((*product, timestamp))  # Write product data with timestamp

print("Data has been successfully saved")


Data has been successfully saved


### Albert Heijn

In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import csv
import re  # Importing the regular expression module
from datetime import datetime  # Importing datetime for timestamp
from selenium.webdriver.chrome.service import Service

# Scrape the Albert Heijn own-brand nuts listing (title, price, promo price,
# unit size) and write the rows to AH_Berrie.csv, replacing any existing file.

# Initialize Chrome driver with Service
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=Options())

url = "https://www.ah.nl/producten/chips-noten-toast-popcorn/noten?merk=AH&page=6"

# Matches a decimal amount written with either separator, e.g. "1.99" or "1,99"
price_pattern = re.compile(r'[\d]+[.,][\d]+')

# try/finally guarantees the browser is closed even if the cookie button never
# appears or another step raises.
try:
    driver.get(url)
    time.sleep(5)

    # Dismiss the cookie banner via its "decline" button (element id "decline-cookies")
    decline_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "decline-cookies")))
    decline_button.click()

    # Give the page time to settle after the banner closes, THEN snapshot the
    # HTML; capturing it before the wait would make the wait pointless.
    time.sleep(5)
    html = driver.page_source
finally:
    # Close the driver
    driver.quit()

soup = BeautifulSoup(html, "html.parser")

# List to store the extracted product information
products = []

# Loop through all product articles
for article in soup.find_all('article', class_='product-card-portrait_root__ZiRpZ'):
    # Extract the price from the aria-label of the first sr-only span.
    # The attribute may be absent, in which case .get() returns None and must
    # not be handed to the regex.
    price_span = article.find('span', class_='sr-only')
    price_label = price_span.get('aria-label') if price_span else None
    match = price_pattern.search(price_label) if price_label else None
    price = match.group() if match else 'Price not found'

    # Extract the promo price (if available) from the highlighted price div,
    # again reading the aria-label of its nested sr-only span.
    promo_price_div = article.find('div', class_='price-amount_highlight__ekL92')
    promo_price_span_inner = promo_price_div.find('span', class_='sr-only') if promo_price_div else None
    promo_label = promo_price_span_inner.get('aria-label') if promo_price_span_inner else None
    match_promo_price = price_pattern.search(promo_label) if promo_label else None
    promo_price = match_promo_price.group() if match_promo_price else 'Promo price not found'

    # Extract the product title from the title attribute of the anchor tag
    title_tag = article.find('a', class_='link_root__EqRHd')
    title = title_tag.get('title') if title_tag else 'Title not found'

    # Extract the weight from the unit-size span
    weight_span = article.find('span', class_='price_unitSize__Hk6E4')
    weight = weight_span.get_text(strip=True) if weight_span else 'Weight not found'

    # Store the extracted information as a tuple, including promo price
    products.append((title, price, promo_price, weight))

# Date stamp added to every row
timestamp = datetime.now().strftime('%Y-%m-%d')  # Format: YYYY-MM-DD

# Write the data to a CSV file (no header row is written for this file)
with open('AH_Berrie.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    for product in products:
        writer.writerow((*product, timestamp))  # Write product data with timestamp

print("Data has been successfully saved")

Data has been successfully saved


## Duitsland

### Aldi

In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
from datetime import datetime

from selenium.webdriver.chrome.service import Service

# Scrape Aldi Nord (DE) search-result tiles (title, price, previous price,
# unit) and append them to Berrie.csv.

# Setup Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Initialize the Chrome driver AFTER the options are configured and pass them
# in; building the driver with a blank Options() would ignore every flag above.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# List of URLs to scrape
urls = [
    "https://www.aldi-nord.de/suchergebnisse.html?query=asiatisce%20snack&searchCategory=Submitted%20Search&indices%5Bprod_de_de_assortment%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_de_de_assortment%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_de_de_offers%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_de_de_offers%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_de_de_recipes%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_de_de_recipes%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&configure%5BclickAnalytics%5D=true",
    "https://www.aldi-nord.de/suchergebnisse.html?query=kerne&searchCategory=Submitted%20Search&indices%5Bprod_de_de_assortment%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_de_de_assortment%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_de_de_offers%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_de_de_offers%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_de_de_recipes%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_de_de_recipes%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&configure%5BclickAnalytics%5D=true",
]

# Collected (title, current_price, promo_price, weight) tuples from every URL
all_products = []

# try/finally guarantees the browser is shut down even when a page times out
# or parsing raises.
try:
    for url in urls:
        driver.get(url)

        # Block until the product tiles are present (waits at most 10 seconds)
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "mod-article-tile--default"))
        )

        # Extract details for each article tile on the page
        for article in driver.find_elements(By.CLASS_NAME, "mod-article-tile--default"):
            # Parse only this tile's HTML so lookups cannot match another tile
            soup = BeautifulSoup(article.get_attribute('outerHTML'), "html.parser")

            title_element = soup.find('span', class_='mod-article-tile__title')
            title = title_element.get_text(strip=True) if title_element else 'Title not found'

            # NOTE(review): read from the struck-through `price__previous`
            # element - confirm this is what the 'Promo Price' column should hold.
            promo_price_element = soup.find('s', class_='price__previous')
            promo_price = promo_price_element.get_text(strip=True) if promo_price_element else 'Promo price not found'

            current_price_element = soup.find('span', class_='price__wrapper')
            current_price = current_price_element.get_text(strip=True) if current_price_element else 'Price not found'

            weight_element = soup.find('span', class_='price__unit')
            weight = weight_element.get_text(strip=True) if weight_element else 'Weight not found'

            all_products.append((title, current_price, promo_price, weight))
finally:
    # Close the driver
    driver.quit()

# Date stamp added to every row
timestamp = datetime.now().strftime('%Y-%m-%d')  # Format: YYYY-MM-DD

# Append the data to the shared CSV file
with open('Berrie.csv', mode='a', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write header only if the file is empty
    if file.tell() == 0:
        writer.writerow(['Title', 'Price', 'Promo Price', 'Weight', 'Timestamp'])  # CSV header
    for product in all_products:
        writer.writerow((*product, timestamp))  # Write product data with timestamp

print("Data has been successfully saved")


Data has been successfully saved


### Globus

In [4]:
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import time
from datetime import datetime  # Importing datetime module

# Scrape every page of the Globus (Bedburg) nuts & dried-fruit listing and
# append one row per product to Berrie.csv.

# Initialize Chrome driver with Service
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=Options())

# Base URL for pagination; the page number is appended to the trailing "p="
base_url = "https://produkte.globus.de/bedburg/suesses-salziges/nuesse-fruechte/?max-price=6.89&min-price=1.19&p="

# try/finally guarantees the browser is closed even if a page load or the CSV
# write raises.
try:
    # Open the first page to discover how many pages exist
    driver.get(base_url + "1")
    time.sleep(5)

    soup = BeautifulSoup(driver.page_source, "html.parser")
    pagination_element = soup.find("li", class_="page-item page-last")
    # The "last page" item carries the page count in its <input value="...">
    total_pages = int(pagination_element.find("input")["value"]) if pagination_element else 1  # Default to 1 page if not found

    print(f"Total Pages: {total_pages}")

    # Date stamp added to every row (format: YYYY-MM-DD)
    timestamp = datetime.now().strftime('%Y-%m-%d')

    # Append to the shared CSV file
    with open('Berrie.csv', mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        # Write the header only when the file is empty, and in the same column
        # order as the data rows below (title, price, promo price, weight,
        # timestamp). An unconditional header here would land in the middle of
        # an existing file, and listing Weight before Promo Price would
        # mislabel both columns.
        if file.tell() == 0:
            writer.writerow(['Title', 'Price', 'Promo Price', 'Weight', 'Timestamp'])

        # Loop over each page
        for page_number in range(1, total_pages + 1):
            print(f"Scraping page {page_number}...")

            # Load the current page and wait a fixed time for it to render
            driver.get(base_url + str(page_number))
            time.sleep(5)

            # Get the page source and parse it with BeautifulSoup
            soup = BeautifulSoup(driver.page_source, "html.parser")

            # Loop through all product cards and extract data
            for product_card in soup.find_all("div", class_="product-info"):
                # Extract product title from the link's title attribute
                title_tag = product_card.find("a", class_="product-image-link product-name")
                title = title_tag.get("title").strip() if title_tag else "Title not found"

                # Extract price from the data-value attribute of the price div
                price_div = product_card.find("div", class_="unit-price js-unit-price")
                price = price_div.get("data-value") if price_div and price_div.has_attr("data-value") else "Price not found"

                # Extract weight
                weight_div = product_card.find("div", class_="price-unit-content")
                weight = weight_div.text.strip() if weight_div else "Weight not found"

                # Extract promo price; keep the default when no discount block exists
                promo_price = "Promo price not found"
                promo_price_div = product_card.find("div", class_="product-price-globus-discount")
                if promo_price_div:
                    promo_price_element = promo_price_div.find("div", class_="unit-price js-unit-price discount-price")
                    if promo_price_element:
                        promo_price = promo_price_element.text.strip()

                # Write the product data to the CSV file
                writer.writerow([title, price, promo_price, weight, timestamp])
finally:
    # Close the driver after extracting data
    driver.quit()

print("Data has been written to berrie.csv")


Total Pages: 4
Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Data has been written to berrie.csv


## Frankrijk

### Aldi

In [5]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
from datetime import datetime
from selenium.webdriver.chrome.service import Service

# Scrape Aldi FR product tiles (title, price, previous price, unit) from a
# fixed list of pages and append them to Berrie.csv.

# Setup Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Initialize the Chrome driver AFTER the options are configured and pass them
# in; building the driver with a blank Options() would ignore every flag above.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# List of URLs to scrape
urls = [
    "https://www.aldi.fr/produits/epicerie-salee/biscuit-aperitif-chips.html",
    "https://www.aldi.fr/recherche.html?query=trader%20joe&searchCategory=Submitted%20Search",
]

# Collected (title, current_price, promo_price, weight) tuples from every URL
all_products = []

# try/finally guarantees the browser is shut down even when a page times out
# or parsing raises.
try:
    for url in urls:
        driver.get(url)

        # Block until the product tiles are present (waits at most 10 seconds)
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "mod-article-tile--default"))
        )

        # Extract details for each article tile on the page
        for article in driver.find_elements(By.CLASS_NAME, "mod-article-tile--default"):
            # Parse only this tile's HTML so lookups cannot match another tile
            soup = BeautifulSoup(article.get_attribute('outerHTML'), "html.parser")

            title_element = soup.find('span', class_='mod-article-tile__title')
            title = title_element.get_text(strip=True) if title_element else 'Title not found'

            # NOTE(review): read from the struck-through `price__previous`
            # element - confirm this is what the 'Promo Price' column should hold.
            promo_price_element = soup.find('s', class_='price__previous')
            promo_price = promo_price_element.get_text(strip=True) if promo_price_element else 'Promo price not found'

            current_price_element = soup.find('span', class_='price__wrapper')
            current_price = current_price_element.get_text(strip=True) if current_price_element else 'Price not found'

            weight_element = soup.find('span', class_='price__unit')
            weight = weight_element.get_text(strip=True) if weight_element else 'Weight not found'

            all_products.append((title, current_price, promo_price, weight))
finally:
    # Close the driver
    driver.quit()

# Date stamp added to every row
timestamp = datetime.now().strftime('%Y-%m-%d')  # Format: YYYY-MM-DD

# Append the data to the shared CSV file
with open('Berrie.csv', mode='a', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write header only if the file is empty, so running this cell on its own
    # still yields a labelled file
    if file.tell() == 0:
        writer.writerow(['Title', 'Price', 'Promo Price', 'Weight', 'Timestamp'])  # CSV header
    for product in all_products:
        writer.writerow((*product, timestamp))  # Write product data with timestamp

print("Data has been successfully saved")


Data has been successfully saved


### Carrefour

In [6]:
import csv
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.service import Service
import time

# Scrape the first Carrefour FR search-result page for own-brand "melange"
# products (title, price, old price, pack size) and append them to Berrie.csv.

# Imported here because this cell's import list does not provide it; it is
# needed to tolerate a cookie banner that never shows up.
from selenium.common.exceptions import TimeoutException

# Initialize Chrome driver with Service
options = Options()
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

url = "https://www.carrefour.fr/s?filters%5Bfacet_marque%5D%5B0%5D=CARREFOUR&q=melange&noRedirect=1&userIsPro=0&page=1"

# try/finally guarantees the browser is closed even if a step raises.
try:
    # Open the URL
    driver.get(url)
    time.sleep(5)  # Wait for the page to load

    # Refuse cookies via the OneTrust preference dialog. A missing banner is
    # not fatal: the page can still be parsed, so only report it.
    try:
        # Open the cookie settings dialog
        param_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "onetrust-pc-btn-handler")))
        param_button.click()

        # Wait for and click the "refuse all" button
        confirm_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CLASS_NAME, "ot-pc-refuse-all-handler")))
        confirm_button.click()
    except TimeoutException:
        print(f"Cookie consent not found for URL: {url} or took too long to load")

    # Snapshot the rendered page for parsing
    html = driver.page_source
finally:
    # Close the driver
    driver.quit()

# Parse page source with BeautifulSoup
soup = BeautifulSoup(html, "html.parser")

# List to store product information
products = []

# Extract product details
for product_pod in soup.find_all("div", class_="main-layout__info-zone"):
    # Extract title
    title_tag = product_pod.find("a", class_="product-card-title")
    title = title_tag.text.strip() if title_tag else "Title not found"

    # Extract weight
    weight_tag = product_pod.find("p", class_="pl-text--size-m")
    weight = weight_tag.text.strip() if weight_tag else "Weight not found"

    # Extract current price (main price); it is split over two <p> parts
    current_price = "Price not found"
    price_main_tag = product_pod.find("div", class_="product-price__amount--main")
    if price_main_tag:
        price_main_parts = price_main_tag.find_all("p", class_="product-price__content")
        if len(price_main_parts) >= 2:
            # NOTE(review): the two parts are joined WITHOUT a separator here,
            # while the old price below inserts a comma - confirm which form
            # matches the site's markup.
            current_price = f"{price_main_parts[0].text.strip()}{price_main_parts[1].text.strip()} €"

    # Extract promotional price (read from the "--old" price block)
    promo_price = "Promo price not found"
    promo_price_tag = product_pod.find("div", class_="product-price__amount--old")
    if promo_price_tag:
        promo_price_parts = promo_price_tag.find_all("p", class_="product-price__content")
        if len(promo_price_parts) >= 2:
            promo_price = f"{promo_price_parts[0].text.strip()},{promo_price_parts[1].text.strip()} €"

    # Append extracted information to the list
    products.append((title, current_price, promo_price, weight))

# Date stamp added to every row
timestamp = datetime.now().strftime('%Y-%m-%d')  # Format: YYYY-MM-DD

# Append the data to the shared CSV file
with open('Berrie.csv', mode='a', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write header only if the file is empty, so running this cell on its own
    # still yields a labelled file
    if file.tell() == 0:
        writer.writerow(['Title', 'Price', 'Promo Price', 'Weight', 'Timestamp'])  # CSV header
    for product in products:
        writer.writerow((*product, timestamp))  # Write product data with timestamp

print("Data has been successfully saved")


Data has been successfully saved


## Polen

### Aldi

In [7]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
from datetime import datetime
from selenium.webdriver.chrome.service import Service

# Scrape Aldi PL search-result tiles (title, price, previous price, unit) and
# append them to Berrie.csv.

# Setup Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Initialize the Chrome driver AFTER the options are configured and pass them
# in; building the driver with a blank Options() would ignore every flag above.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# List of URLs to scrape
urls = [
    "https://www.aldi.pl/szukaj.html?query=orzechy%20trader&searchCategory=Suggested%20Search&configure%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_offers%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_offers%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_pl_pl_assortment%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_assortment%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_pl_pl_recipes%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_recipes%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_pl_pl_content%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_content%5D%5Bconfigure%5D%5BhitsPerPage%5D=12",
    "https://www.aldi.pl/szukaj.html?query=asia&searchCategory=Suggested%20Search&configure%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_offers%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_offers%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_pl_pl_assortment%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_assortment%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_pl_pl_recipes%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_recipes%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_pl_pl_content%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_content%5D%5Bconfigure%5D%5BhitsPerPage%5D=12",
]

# Collected (title, current_price, promo_price, weight) tuples from every URL
all_products = []

# try/finally guarantees the browser is shut down even when a page times out
# or parsing raises.
try:
    for url in urls:
        driver.get(url)

        # Block until the product tiles are present (waits at most 10 seconds)
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "mod-article-tile--default"))
        )

        # Extract details for each article tile on the page
        for article in driver.find_elements(By.CLASS_NAME, "mod-article-tile--default"):
            # Parse only this tile's HTML so lookups cannot match another tile
            soup = BeautifulSoup(article.get_attribute('outerHTML'), "html.parser")

            title_element = soup.find('span', class_='mod-article-tile__title')
            title = title_element.get_text(strip=True) if title_element else 'Title not found'

            # NOTE(review): read from the struck-through `price__previous`
            # element - confirm this is what the 'Promo Price' column should hold.
            promo_price_element = soup.find('s', class_='price__previous')
            promo_price = promo_price_element.get_text(strip=True) if promo_price_element else 'Promo price not found'

            current_price_element = soup.find('span', class_='price__wrapper')
            current_price = current_price_element.get_text(strip=True) if current_price_element else 'Price not found'

            weight_element = soup.find('span', class_='price__unit')
            weight = weight_element.get_text(strip=True) if weight_element else 'Weight not found'

            all_products.append((title, current_price, promo_price, weight))
finally:
    # Close the driver
    driver.quit()

# Date stamp added to every row
timestamp = datetime.now().strftime('%Y-%m-%d')  # Format: YYYY-MM-DD

# Append the data to the shared CSV file
with open('Berrie.csv', mode='a', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write header only if the file is empty
    if file.tell() == 0:
        writer.writerow(['Title', 'Price', 'Promo Price', 'Weight', 'Timestamp'])  # CSV header
    for product in all_products:
        writer.writerow((*product, timestamp))  # Write product data with timestamp

print("Data has been successfully saved")


Data has been successfully saved


### Biedronka

In [8]:
# Importing necessary libraries
import csv
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.service import Service
import time
from selenium.common.exceptions import TimeoutException

# Scrape two Biedronka category pages (title, price, regular price, pack size)
# and append the rows to Berrie.csv.

# Initialize Chrome driver with Service
options = Options()
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# List of URLs to scrape
urls = [
    "https://zakupy.biedronka.pl/artykuly-spozywcze/przekaski/orzeszki/",
    "https://zakupy.biedronka.pl/artykuly-spozywcze/przekaski/bakalie/"
]

# List to store all product information across multiple pages
all_products = []

# try/finally guarantees the browser is closed even if a page load or parse raises.
try:
    # Loop over each URL
    for url in urls:
        driver.get(url)
        time.sleep(5)  # Wait for the page to load

        try:
            # Open the cookie settings dialog (waits at most 5 seconds)
            param_button = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.ID, "onetrust-pc-btn-handler")))
            param_button.click()

            # Wait for and click the "refuse all" button
            confirm_button = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CLASS_NAME, "ot-pc-refuse-all-handler")))
            confirm_button.click()
        except TimeoutException:
            # Not fatal: the banner is typically gone after the first URL
            print(f"Cookie consent not found for URL: {url} or took too long to load")

        # Parse page source with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # Extract product information for the current page
        for product_pod in soup.find_all("div", class_="product-tile js-product-tile"):
            # Extract title
            title_tag = product_pod.find("div", class_="product-tile__name product-tile__name--overflow")
            title = title_tag.text.strip() if title_tag else "Title not found"

            # Extract weight: the text node that precedes the <span> inside the
            # packaging div. Guard against an empty div or a leading tag, where
            # calling .strip() on the first child would raise.
            weight_tag = product_pod.find("div", class_="packaging-details")
            first_node = weight_tag.contents[0] if weight_tag and weight_tag.contents else None
            weight = first_node.strip() if isinstance(first_node, str) else "Weight not found"

            # Extract current price (main price)
            current_price = "Price not found"
            price_main_tag = product_pod.find("div", class_="price-tile__sales")
            if price_main_tag:
                # Integer part is the div's own direct text; `string=` is the
                # current spelling of the deprecated `text=` argument. The
                # lookup returns None when the div has no direct text node.
                integer_node = price_main_tag.find(string=True, recursive=False)
                integer_part = integer_node.strip() if integer_node else ""
                decimal_part = price_main_tag.find("span", class_="price-tile__decimal")
                if integer_part and decimal_part:
                    # Concatenate the digits, then put the decimal point two
                    # digits from the end
                    raw_price = f"{integer_part}{decimal_part.text.strip()}"
                    if len(raw_price) > 2:
                        formatted_price = f"{raw_price[:-2]}.{raw_price[-2:]} zł"
                    else:
                        formatted_price = f"0.{raw_price} zł"  # Handle cases where price is less than 1 zł
                    # Strip spaces from real prices only; doing this on the
                    # fallback text would turn it into "Pricenotfound".
                    current_price = formatted_price.replace(" ", "").strip()

            # Extract promo price if available (read from the "regular" price block)
            promo_price = "Promo Price not found"
            promo_price_tag = product_pod.find("div", class_="product-tile-prices__regular")
            if promo_price_tag:
                promo_amount = promo_price_tag.find("span", class_="product-tile-prices__amount")
                if promo_amount:
                    promo_price = promo_amount.text.strip()

            # Append extracted information to the list
            all_products.append((title, current_price, promo_price, weight))
finally:
    # Quit the driver
    driver.quit()

# Get current timestamp for the data
timestamp = datetime.now().strftime('%Y-%m-%d')  # Format: YYYY-MM-DD

# Append the data to the shared CSV file
with open('Berrie.csv', mode='a', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write header only if the file is empty, so running this cell on its own
    # still yields a labelled file
    if file.tell() == 0:
        writer.writerow(['Title', 'Price', 'Promo Price', 'Weight', 'Timestamp'])  # CSV header
    for product in all_products:
        writer.writerow((*product, timestamp))  # Write product data with timestamp

print("Data has been successfully saved")


Cookie consent not found for URL: https://zakupy.biedronka.pl/artykuly-spozywcze/przekaski/bakalie/ or took too long to load
Data has been successfully saved
