In [14]:
import requests
from bs4 import BeautifulSoup
import csv
import time
import re

def _parse_euro_amount(raw):
    """Convert a scraped number such as "1.299,00", "1 299,00" or "0.99" to a float.

    Returns None when the text cannot be interpreted as a number.
    """
    # Remove every kind of whitespace (regular and non-breaking spaces alike).
    compact = re.sub(r"\s+", "", raw)
    if "," in compact and "." in compact:
        # Both separators present: the right-most one is the decimal mark,
        # the other one only groups thousands.
        if compact.rfind(",") > compact.rfind("."):
            compact = compact.replace(".", "").replace(",", ".")
        else:
            compact = compact.replace(",", "")
    else:
        compact = compact.replace(",", ".")
    try:
        return float(compact)
    except ValueError:
        return None


def extract_products_and_prices(url, timeout=15):
    """Extract (lowest_price, product_name) pairs from an offer page.

    The content of <div class="postcontent"> is split on <br> tags and every
    resulting line that contains at least one euro price yields one tuple.

    Args:
        url: Address of the offer page.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of (lowest_price, product_name) tuples; an empty list when the
        page has no postcontent div or when fetching/parsing fails (the error
        is printed, not raised).
    """
    # The optional group lets single-digit prices such as "5 €" match as well.
    price_pattern = re.compile(r'(\d(?:[\d\s,.]*\d)?)\s*€')
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        postcontent = soup.find("div", class_="postcontent")
        results = []
        if not postcontent:
            return results

        # Split on <br>, <br/>, <br />, etc.
        html = postcontent.decode_contents()
        for line in re.split(r'<br\s*/?>', html, flags=re.IGNORECASE):
            text = BeautifulSoup(line, "html.parser").get_text().strip()
            if not text:
                continue
            matches = list(price_pattern.finditer(text))
            # An unparsable number is dropped on its own instead of aborting
            # the whole page.
            prices = [amount
                      for amount in (_parse_euro_amount(m.group(1)) for m in matches)
                      if amount is not None]
            if not prices:
                continue
            # Everything before the first price is treated as the product name.
            product_name = text[:matches[0].start()].strip("–- ,")
            product_name = product_name[:1].upper() + product_name[1:]
            results.append((min(prices), product_name))
        return results
    except Exception as e:
        # Best-effort scraping: report the failure and let the caller continue.
        print(f"Error scraping products/prices from {url}: {e}")
        return []

def main(base_url="https://vsikatalogi.si/akcije/page/",
         output_file="vsikatalogi_scraped.csv",
         first_page=1, last_page=38, timeout=15):
    """Crawl the paginated offer listing and write the scraped products to a CSV file.

    For every listing entry the linked offer page is scraped with
    extract_products_and_prices; one row is written per product, or a single
    row with empty price/product columns when nothing was found.

    Args:
        base_url: Listing URL prefix; the page number is appended to it.
        output_file: Path of the semicolon-delimited CSV file to (over)write.
        first_page: First listing page number to fetch (inclusive).
        last_page: Last listing page number to fetch (inclusive).
        timeout: Seconds to wait for each listing-page response.
    """
    with open(output_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f, delimiter=";")
        writer.writerow(["Title", "Date", "Price (€)", "Product Name"])
        for page_num in range(first_page, last_page + 1):
            url = f"{base_url}{page_num}"
            print(f"Scraping {url}")
            try:
                r = requests.get(url, timeout=timeout)
                r.raise_for_status()
                soup = BeautifulSoup(r.text, "html.parser")
                article_list = soup.find("ul", class_="article_list")
                # A page without a listing contributes no rows, but still
                # falls through to the per-page pause below.
                items = article_list.find_all("li") if article_list else []
                for li in items:
                    details = li.find("div", class_="details")
                    if not details:
                        continue
                    a_tag = details.find("a", href=True, title=True)
                    if not a_tag:
                        continue
                    link = a_tag["href"]
                    title = a_tag["title"].strip()
                    # Find date in offer_time
                    date = ""
                    offer_time = li.find("div", class_="offer_time")
                    if offer_time:
                        strong = offer_time.find("strong", class_="dtime")
                        if strong:
                            date = strong.get_text(strip=True)
                    # Scrape products and prices from the linked page
                    products = extract_products_and_prices(link)
                    if not products:
                        writer.writerow([title, date, "", ""])
                    else:
                        for price, product_name in products:
                            writer.writerow([title, date, price, product_name])
                    time.sleep(0.2)  # Be polite to the server
            except Exception as e:
                # One failing page must not stop the whole crawl.
                print(f"Error scraping {url}: {e}")
            time.sleep(1)  # Be polite to the server

# Start the scrape only when this code runs as the main module, not when it is imported.
if __name__ == "__main__":
    main()

Scraping https://vsikatalogi.si/akcije/page/1
Scraping https://vsikatalogi.si/akcije/page/2
Scraping https://vsikatalogi.si/akcije/page/3
Scraping https://vsikatalogi.si/akcije/page/4
Scraping https://vsikatalogi.si/akcije/page/5
Scraping https://vsikatalogi.si/akcije/page/6
Scraping https://vsikatalogi.si/akcije/page/7
Scraping https://vsikatalogi.si/akcije/page/8
Scraping https://vsikatalogi.si/akcije/page/9
Scraping https://vsikatalogi.si/akcije/page/10
Scraping https://vsikatalogi.si/akcije/page/11
Scraping https://vsikatalogi.si/akcije/page/12
Scraping https://vsikatalogi.si/akcije/page/13
Scraping https://vsikatalogi.si/akcije/page/14
Scraping https://vsikatalogi.si/akcije/page/15
Scraping https://vsikatalogi.si/akcije/page/16
Scraping https://vsikatalogi.si/akcije/page/17
Scraping https://vsikatalogi.si/akcije/page/18
Scraping https://vsikatalogi.si/akcije/page/19
Scraping https://vsikatalogi.si/akcije/page/20
Scraping https://vsikatalogi.si/akcije/page/21
Scraping https://vsika

In [1]:
import csv

input_file = "vsikatalogi_scraped.csv"
output_file = "vsikatalogi_scraped_clean.csv"

# Store names recognised inside catalogue titles (extend as needed).
# The order is significant: clean_store returns the first entry it finds.
store_names = [
    # groceries
    "E Leclerc",
    "Perutnina Ptuj",
    "Jager",
    "Mass",
    "Mercator",
    "Tuš",
    "Spar",
    "Lidl",
    "Hofer",
    "Eurospin",
    # drugstores, electronics, furniture, sport
    "DM",
    "Müller",
    "Big Bang",
    "Harvey Norman",
    "Baby Center",
    "Intersport",
    "Hervis",
    "Sport Vision",
    # clothing and discount stores
    "OVS",
    "C&A",
    "NKD",
    "Pepco",
    "Kik",
    "Mana",
    "Tedi",
]

def clean_store(title, stores=None):
    """Reduce a catalogue title to a store name.

    Args:
        title: Raw title text.
        stores: Optional list of known store names; defaults to the
            module-level ``store_names`` list.

    Returns:
        The first known store whose name occurs (case-insensitively) anywhere
        in the title; otherwise the first whitespace-separated word of the
        title; or an empty string when the title has no words at all.
    """
    if stores is None:
        stores = store_names
    title_lower = title.lower()
    for store in stores:
        if store.lower() in title_lower:
            return store
    # fallback: first word (an empty or whitespace-only title has none)
    words = title.split()
    return words[0] if words else ""

# Copy the scraped CSV, replacing each title (first column) with a store name.
with open(input_file, newline='', encoding="utf-8") as infile, \
        open(output_file, "w", newline='', encoding="utf-8") as outfile:
    reader = csv.reader(infile, delimiter=";")
    writer = csv.writer(outfile, delimiter=";")
    # next() with a default avoids StopIteration on a completely empty file.
    header = next(reader, None)
    if header is not None:
        writer.writerow(header)  # Write header

    for row in reader:
        # Skip blank rows and rows whose title is empty or whitespace-only;
        # the latter would otherwise have no first word to fall back on.
        if not row or not row[0].strip():
            continue
        row[0] = clean_store(row[0])
        writer.writerow(row)

print(f"Cleaned file written to {output_file}")

Cleaned file written to vsikatalogi_scraped_clean.csv


In [2]:
import csv
import re
from datetime import datetime

def clean_date(date_str):
    """Normalise a scraped date string to ISO format (YYYY-MM-DD).

    The placeholder text "Do daljnjeg" is removed first. If the remaining text
    matches one of the known day-month-year formats it is returned as
    YYYY-MM-DD; if nothing is left an empty string is returned; otherwise the
    remaining text is returned unchanged.
    """
    # Strip after removing the placeholder so leftover whitespace around it
    # does not prevent the date from parsing.
    date_str = date_str.replace("Do daljnjeg", "").strip()
    if not date_str:
        return ""
    # Try common formats
    for fmt in ("%d.%m.%Y.", "%d.%m.%Y", "%d-%m-%Y", "%d.%m.%y", "%d-%m-%y"):
        try:
            return datetime.strptime(date_str, fmt).strftime("%Y-%m-%d")
        except ValueError:
            # Format mismatch; try the next one.
            continue
    # If nothing matches, return the text as-is
    return date_str

def clean_product_name(name):
    """Tidy a scraped product name.

    Removes the listed filler words ("redna cena", "namesto", "ceneje",
    "gratis"), percent signs and punctuation commas, collapses runs of
    whitespace and returns the result with only its first character
    upper-cased (the rest is lower-cased by str.capitalize).
    """
    # A comma with a digit on both sides is a decimal separator ("1,5 l") and
    # is kept; removing it would turn "1,5 l" into "15 l".
    name = re.sub(
        r"\bredna cena\b|\bnamesto\b|\bceneje\b|%|\bgratis\b|(?<!\d),|,(?!\d)",
        "", name, flags=re.IGNORECASE)
    name = re.sub(r"\s+", " ", name)
    return name.strip().capitalize()

def clean_price(price):
    """Parse a price string (decimal comma or point) into a float.

    Returns an empty string when the value cannot be parsed, which callers
    use as the "no price" marker.
    """
    try:
        return float(price.replace(",", "."))
    except (ValueError, AttributeError, TypeError):
        # Unparsable text or a non-string value; unlike a bare except this
        # lets KeyboardInterrupt and SystemExit propagate.
        return ""

input_file = "vsikatalogi_scraped_clean.csv"
output_file = "vsikatalogi_ready.csv"

# Normalise every row of the cleaned CSV, drop unusable rows and number the rest.
with open(input_file, newline='', encoding="utf-8") as infile, \
        open(output_file, "w", newline='', encoding="utf-8") as outfile:
    reader = csv.reader(infile, delimiter=";")
    writer = csv.writer(outfile, delimiter=";")
    # Discard the input header; the default avoids StopIteration on an empty file.
    header = next(reader, None)
    writer.writerow(["ID", "Store", "Date", "Price (€)", "Product Name"])
    row_id = 1
    for row in reader:
        if len(row) < 4:
            continue
        # NOTE(review): capitalize() lower-cases everything after the first
        # character, so "DM" becomes "Dm" — confirm this is intended.
        store = row[0].strip().capitalize()
        date = clean_date(row[1])
        price = clean_price(row[2])
        product = clean_product_name(row[3])
        # Rows without a product name or a parsable price are dropped.
        if not product or price == "":
            continue
        # Filter out too short or too long product names.
        if not 3 <= len(product) <= 80:
            continue
        pname_lower = product.lower()
        # Match "nad" only as a whole word so that names merely containing
        # those letters (e.g. "Limonada") are kept.
        if re.search(r"\bnad\b", pname_lower) or "ob nakupu" in pname_lower:
            continue
        writer.writerow([row_id, store, date, price, product])
        row_id += 1

print(f"Cleaned and enriched file written to {output_file}")

Cleaned and enriched file written to vsikatalogi_ready.csv


In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import time
import re

def _parse_euro_amount(raw):
    """Convert a scraped number such as "1.299,00", "1 299,00" or "0.99" to a float.

    Returns None when the text cannot be interpreted as a number.
    """
    # Remove every kind of whitespace (regular and non-breaking spaces alike).
    compact = re.sub(r"\s+", "", raw)
    if "," in compact and "." in compact:
        # Both separators present: the right-most one is the decimal mark,
        # the other one only groups thousands.
        if compact.rfind(",") > compact.rfind("."):
            compact = compact.replace(".", "").replace(",", ".")
        else:
            compact = compact.replace(",", "")
    else:
        compact = compact.replace(",", ".")
    try:
        return float(compact)
    except ValueError:
        return None


def _split_br_lines(html):
    """Split an HTML fragment on <br> tags and return the non-empty text lines."""
    pieces = re.split(r'<br\s*/?>', html, flags=re.IGNORECASE)
    texts = (BeautifulSoup(piece, "html.parser").get_text().strip() for piece in pieces)
    return [text for text in texts if text]


def extract_products_and_prices(url, timeout=15):
    """Extract (lowest_price, product_name) pairs from an offer page.

    Lines are taken from the <p> tags inside <div class='postcontent'> when
    there is more than one of them (each further split on <br>), otherwise
    from the whole div split on <br>. Only lines containing a price followed
    by "€" or "eur" (any case) produce a result.

    Args:
        url: Address of the offer page.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of (lowest_price, product_name) tuples; an empty list when the
        page has no postcontent div or when fetching/parsing fails (the error
        is printed, not raised).
    """
    # The optional group lets single-digit prices such as "5 €" match as well.
    price_pattern = re.compile(r'(\d(?:[\d\s,.]*\d)?)\s*(?:€|eur)', flags=re.IGNORECASE)
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        postcontent = soup.find("div", class_="postcontent")
        results = []
        if not postcontent:
            return results

        # Prefer <p> tags if there are more than 1 (real product list)
        p_tags = postcontent.find_all("p")
        if len(p_tags) > 1:
            lines = []
            for p in p_tags:
                lines.extend(_split_br_lines(p.decode_contents()))
        else:
            # Fallback: split by <br> in the whole div
            lines = _split_br_lines(postcontent.decode_contents())

        for text in lines:
            # Lines without a currency marker simply produce no matches.
            matches = list(price_pattern.finditer(text))
            # An unparsable number is dropped on its own instead of aborting
            # the whole page.
            prices = [amount
                      for amount in (_parse_euro_amount(m.group(1)) for m in matches)
                      if amount is not None]
            if not prices:
                continue
            # Everything before the first price is treated as the product name.
            product_name = text[:matches[0].start()].strip("–- ,")
            if product_name:
                product_name = product_name[:1].upper() + product_name[1:]
            results.append((min(prices), product_name))
        return results
    except Exception as e:
        # Best-effort scraping: report the failure and let the caller continue.
        print(f"Error scraping products/prices from {url}: {e}")
        return []

def main(base_url="https://vsikatalogi.si/vikend-akcije/page/",
         output_file="vikend_akcije_scraped.csv",
         first_page=1, last_page=53, timeout=15):
    """Crawl the paginated weekend-offer listing and write the scraped products to a CSV file.

    For every listing entry the linked offer page is scraped with
    extract_products_and_prices; one row is written per product, or a single
    row with empty price/product columns when nothing was found.

    Args:
        base_url: Listing URL prefix; the page number is appended to it.
        output_file: Path of the semicolon-delimited CSV file to (over)write.
        first_page: First listing page number to fetch (inclusive).
        last_page: Last listing page number to fetch (inclusive); lower it
            for a quick debugging run.
        timeout: Seconds to wait for each listing-page response.
    """
    with open(output_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f, delimiter=";")
        writer.writerow(["Title", "Date", "Price (€)", "Product Name"])
        for page_num in range(first_page, last_page + 1):
            url = f"{base_url}{page_num}"
            print(f"Scraping {url}")
            try:
                r = requests.get(url, timeout=timeout)
                r.raise_for_status()
                soup = BeautifulSoup(r.text, "html.parser")
                article_list = soup.find("ul", class_="article_list")
                if article_list:
                    items = article_list.find_all("li")
                else:
                    # Report the missing listing and move on to the next page.
                    print(f"Error scraping {url}: article_list not found")
                    items = []
                for li in items:
                    details = li.find("div", class_="details")
                    if not details:
                        print("details not found in li, skipping")
                        continue
                    a_tag = details.find("a", href=True, title=True)
                    if not a_tag:
                        print("a_tag not found in details, skipping")
                        continue
                    link = a_tag["href"]
                    title = a_tag["title"].strip()
                    # The offer date is optional; default to an empty string.
                    date = ""
                    offer_time = li.find("div", class_="offer_time")
                    if offer_time:
                        strong = offer_time.find("strong", class_="dtime")
                        if strong:
                            date = strong.get_text(strip=True)
                    products = extract_products_and_prices(link)
                    if not products:
                        writer.writerow([title, date, "", ""])
                    else:
                        for price, product_name in products:
                            writer.writerow([title, date, price, product_name])
                    time.sleep(0.2)  # short pause between offer pages
            except Exception as e:
                # One failing page must not stop the whole crawl.
                print(f"Error scraping {url}: {e}")
            time.sleep(1)  # longer pause between listing pages

# Start the scrape only when this code runs as the main module, not when it is imported.
if __name__ == "__main__":
    main()

Scraping https://vsikatalogi.si/vikend-akcije/page/1


KeyboardInterrupt: 

In [14]:
import csv

input_file = "vikend_akcije_scraped.csv"
output_file = "vikend_akcije_scraped_clean.csv"

# Store names recognised inside catalogue titles (extend as needed).
# The order is significant: clean_store returns the first entry it finds.
store_names = [
    # groceries
    "E Leclerc",
    "Perutnina Ptuj",
    "Jager",
    "Mass",
    "Mercator",
    "Tuš",
    "Spar",
    "Lidl",
    "Hofer",
    "Eurospin",
    # drugstores, electronics, furniture, sport
    "DM",
    "Müller",
    "Big Bang",
    "Harvey Norman",
    "Baby Center",
    "Intersport",
    "Hervis",
    "Sport Vision",
    # clothing and discount stores
    "OVS",
    "C&A",
    "NKD",
    "Pepco",
    "Kik",
    "Mana",
    "Tedi",
]

def clean_store(title, stores=None):
    """Reduce a catalogue title to a store name.

    Args:
        title: Raw title text.
        stores: Optional list of known store names; defaults to the
            module-level ``store_names`` list.

    Returns:
        The first known store whose name occurs (case-insensitively) anywhere
        in the title; otherwise the first whitespace-separated word of the
        title; or an empty string when the title has no words at all.
    """
    if stores is None:
        stores = store_names
    title_lower = title.lower()
    for store in stores:
        if store.lower() in title_lower:
            return store
    # fallback: first word (an empty or whitespace-only title has none)
    words = title.split()
    return words[0] if words else ""

# Copy the scraped CSV, replacing each title (first column) with a store name.
with open(input_file, newline='', encoding="utf-8") as infile, \
        open(output_file, "w", newline='', encoding="utf-8") as outfile:
    reader = csv.reader(infile, delimiter=";")
    writer = csv.writer(outfile, delimiter=";")
    # next() with a default avoids StopIteration on a completely empty file.
    header = next(reader, None)
    if header is not None:
        writer.writerow(header)  # Write header

    for row in reader:
        # Skip blank rows and rows whose title is empty or whitespace-only;
        # the latter would otherwise have no first word to fall back on.
        if not row or not row[0].strip():
            continue
        row[0] = clean_store(row[0])
        writer.writerow(row)

print(f"Cleaned file written to {output_file}")

Cleaned file written to vikend_akcije_scraped_clean.csv


In [15]:
import csv
import re
from datetime import datetime

def clean_date(date_str):
    """Normalise a scraped date string to ISO format (YYYY-MM-DD).

    The placeholder text "Do daljnjeg" is removed first. If the remaining text
    matches one of the known day-month-year formats it is returned as
    YYYY-MM-DD; if nothing is left an empty string is returned; otherwise the
    remaining text is returned unchanged.
    """
    # Strip after removing the placeholder so leftover whitespace around it
    # does not prevent the date from parsing.
    date_str = date_str.replace("Do daljnjeg", "").strip()
    if not date_str:
        return ""
    # Try common formats
    for fmt in ("%d.%m.%Y.", "%d.%m.%Y", "%d-%m-%Y", "%d.%m.%y", "%d-%m-%y"):
        try:
            return datetime.strptime(date_str, fmt).strftime("%Y-%m-%d")
        except ValueError:
            # Format mismatch; try the next one.
            continue
    # If nothing matches, return the text as-is
    return date_str

def clean_product_name(name):
    """Tidy a scraped product name.

    Removes the listed filler words ("redna cena", "namesto", "ceneje",
    "gratis"), percent signs and punctuation commas, collapses runs of
    whitespace and returns the result with only its first character
    upper-cased (the rest is lower-cased by str.capitalize).
    """
    # A comma with a digit on both sides is a decimal separator ("1,5 l") and
    # is kept; removing it would turn "1,5 l" into "15 l".
    name = re.sub(
        r"\bredna cena\b|\bnamesto\b|\bceneje\b|%|\bgratis\b|(?<!\d),|,(?!\d)",
        "", name, flags=re.IGNORECASE)
    name = re.sub(r"\s+", " ", name)
    return name.strip().capitalize()

def clean_price(price):
    """Parse a price string (decimal comma or point) into a float.

    Returns an empty string when the value cannot be parsed, which callers
    use as the "no price" marker.
    """
    try:
        return float(price.replace(",", "."))
    except (ValueError, AttributeError, TypeError):
        # Unparsable text or a non-string value; unlike a bare except this
        # lets KeyboardInterrupt and SystemExit propagate.
        return ""

input_file = "vikend_akcije_scraped_clean.csv"
output_file = "vikend_akcije_ready.csv"

# Normalise every row of the cleaned CSV, drop unusable rows and number the rest.
with open(input_file, newline='', encoding="utf-8") as infile, \
        open(output_file, "w", newline='', encoding="utf-8") as outfile:
    reader = csv.reader(infile, delimiter=";")
    writer = csv.writer(outfile, delimiter=";")
    # Discard the input header; the default avoids StopIteration on an empty file.
    header = next(reader, None)
    writer.writerow(["ID", "Store", "Date", "Price (€)", "Product Name"])
    row_id = 1
    for row in reader:
        if len(row) < 4:
            continue
        # NOTE(review): capitalize() lower-cases everything after the first
        # character, so "DM" becomes "Dm" — confirm this is intended.
        store = row[0].strip().capitalize()
        date = clean_date(row[1])
        price = clean_price(row[2])
        product = clean_product_name(row[3])
        # Rows without a product name or a parsable price are dropped.
        if not product or price == "":
            continue
        # Filter out too short or too long product names.
        if not 3 <= len(product) <= 80:
            continue
        pname_lower = product.lower()
        # Match "nad" only as a whole word so that names merely containing
        # those letters (e.g. "Limonada") are kept.
        if re.search(r"\bnad\b", pname_lower) or "ob nakupu" in pname_lower:
            continue
        writer.writerow([row_id, store, date, price, product])
        row_id += 1

print(f"Cleaned and enriched file written to {output_file}")

Cleaned and enriched file written to vikend_akcije_ready.csv
