In [3]:
# Install dependencies:
# pip install requests beautifulsoup4

import time
import requests
import csv
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

def scrape_page(url):
    """Scrape data from an individual page."""
    try:
        r = requests.get(url)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')
        
        # Extract data from <div class="postcontent"> and its <p> tags
        post_content = soup.select_one('div.postcontent')
        if post_content:
            data = [p.get_text(strip=True) for p in post_content.find_all('p')]
            print(f"Scraped data from {url}: {data}")
            
            # Write data to CSV
            with open('scraped_data.csv', mode='a', newline='', encoding='utf-8') as file:
                writer = csv.writer(file)
                writer.writerow([url, " ".join(data)])
        else:
            print(f"No data found on {url}")
    except Exception as e:
        print(f"Error scraping {url}: {e}")

    time.sleep(1)  # Sleep to avoid overwhelming the server

def main():
    """Main function to scrape all pages."""
    base_url = "https://vsikatalogi.si/akcije/page/"
    try:
        # Create or clear the CSV file and write the header
        with open('scraped_data.csv', mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(["URL", "Content"])
        
        # Loop through all 38 pages
        for page_num in range(1, 39):
            page_url = f"{base_url}{page_num}"
            print(f"Fetching links from {page_url}...")
            
            r = requests.get(page_url)
            r.raise_for_status()
            soup = BeautifulSoup(r.text, 'html.parser')
            
            # Grab all item links from <ul class="article_list">
            links = []
            for link_tag in soup.select('ul.article_list a'):
                links.append(link_tag['href'])
            
            print(f"Found {len(links)} links to scrape on page {page_num}.")
            
            # Process each page concurrently
            with ThreadPoolExecutor(max_workers=10) as executor:
                executor.map(scrape_page, links)
    except Exception as e:
        print(f"Error fetching base URL: {e}")

if __name__ == "__main__":
    main()

Fetching links from https://vsikatalogi.si/akcije/page/1...
Found 120 links to scrape on page 1.
Scraped data from https://vsikatalogi.si/spar-akcija/spar-in-interspar-akcija-za-zacetek-tedna-do-25-3: ['Spar in Interspar akcija za začetek tedna', 'Spar in Interspar akcija za začetek tedna velja in ponedeljek 24.03. in torek 25.03.2025.Od ponedeljeka 03.03 in torek 04.03. Spar in Interspar akcija naslednjih izdelkov:– slovenska jabolka cripps pink 1kg, namesto 1,79€ zdaj 1,19€– bio limone, pakirano 500g, namesto 1,49€ zdaj 0,89€– bio brokoli 500g, namesto 2,49€ zdaj 1,79€– šopki od 2,49€', 'V ponedeljak 24.03., torek 25.03., sreda 26.3. ali četrtek 27.03.2025. 25% kupon za popust na en izdelek po vaši izbiri. Velja s Spar plus kartico ob predložitvi kupona. Ne velja v spletni trgovini.', 'Popust se obračunava pri blagajni. Ne velja za izdelke vključene u promocijo Trajno znižano. Slika je simbolična. Popusti veljaju s Spar plus kartico.', 'Za celotno akcijsko ponudbo spremljajte aktualn

In [5]:
import pandas as pd
import re
from datetime import datetime
from urllib.parse import urlparse
import csv
import json

class DataProcessor:
    def __init__(self):
        self.stats = {
            'total_rows': 0,
            'no_store': 0,
            'no_date': 0,
            'no_products': 0,
            'no_prices': 0,
            'successful': 0,
            'merkur_skipped': 0
        }
        self.problematic_cases = {
            'store_name': [],
            'dates': [],
            'products': [],
            'prices': []
        }

    def extract_store_name(self, url):
        """Extract store name from URL."""
        path = urlparse(url).path
        # Try multiple patterns for store name
        patterns = [
            r'/([^/]+)-akcija/',
            r'/([^/]+)/',
            r'/([^/]+)$'
        ]
        
        for pattern in patterns:
            store_match = re.search(pattern, path)
            if store_match:
                return store_match.group(1)
        
        # Log problematic case
        self.problematic_cases['store_name'].append({
            'url': url,
            'path': path
        })
        return None

    def extract_dates(self, text):
        """Extract dates and calculate midpoint for date ranges."""
        # Try multiple date patterns
        patterns = [
            # DD. MM. YYYY - DD. MM. YYYY (with spaces)
            (r'(\d{1,2})\.\s*(\d{1,2})\.\s*(\d{4})[^\d]*(\d{1,2})\.\s*(\d{1,2})\.\s*(\d{4})', True),
            # DD. MM. YYYY (with spaces)
            (r'(\d{1,2})\.\s*(\d{1,2})\.\s*(\d{4})', False),
            # DD. MM. (with spaces)
            (r'(\d{1,2})\.\s*(\d{1,2})\.', False),
            # DD. MM (with spaces)
            (r'(\d{1,2})\.\s*(\d{1,2})', False),
            # DD.MM.YYYY - DD.MM.YYYY (without spaces)
            (r'(\d{1,2})\.(\d{1,2})\.(\d{4})[^\d]*(\d{1,2})\.(\d{1,2})\.(\d{4})', True),
            # DD.MM.YYYY (without spaces)
            (r'(\d{1,2})\.(\d{1,2})\.(\d{4})', False),
            # DD.MM. (without spaces)
            (r'(\d{1,2})\.(\d{1,2})\.', False),
            # DD.MM (without spaces)
            (r'(\d{1,2})\.(\d{1,2})', False)
        ]
        
        for pattern, is_range in patterns:
            matches = re.finditer(pattern, text)
            for match in matches:
                try:
                    if is_range:
                        start_day, start_month, start_year, end_day, end_month, end_year = map(int, match.groups())
                        # Validate month values
                        if not (1 <= start_month <= 12) or not (1 <= end_month <= 12):
                            continue
                        start_date = datetime(start_year, start_month, start_day)
                        end_date = datetime(end_year, end_month, end_day)
                        return start_date + (end_date - start_date) / 2
                    else:
                        groups = match.groups()
                        if len(groups) == 3:
                            day, month, year = map(int, groups)
                            # Validate month value
                            if not (1 <= month <= 12):
                                continue
                        else:
                            day, month = map(int, groups)
                            # Validate month value
                            if not (1 <= month <= 12):
                                continue
                            year = datetime.now().year
                        return datetime(year, month, day)
                except (ValueError, TypeError) as e:
                    # Log problematic case
                    self.problematic_cases['dates'].append({
                        'text': text,
                        'pattern': pattern,
                        'match': match.group(),
                        'error': str(e)
                    })
                    continue
        
        # Log case where no date was found
        self.problematic_cases['dates'].append({
            'text': text,
            'error': 'No date pattern matched'
        })
        return None

    def extract_products(self, text):
        """Extract product information from text."""
        products = []
        # Split text at product markers
        product_sections = text.split('–')
        
        for section in product_sections:
            section = section.strip()
            if not section:
                continue
                
            # Initialize product info
            product_info = {
                'name': None,
                'regular_price': None,
                'discounted_price': None
            }
            
            # Extract product name (everything before the first price or comma)
            name_match = re.search(r'^([^0-9€,]+)', section)
            if name_match:
                # Take only the part before the first comma
                product_name = name_match.group(1).strip()
                if ',' in product_name:
                    product_name = product_name.split(',')[0].strip()
                product_info['name'] = product_name
            else:
                # Log problematic case
                self.problematic_cases['products'].append({
                    'section': section,
                    'error': 'Could not extract product name'
                })
                continue
            
            # Extract prices with more flexible patterns
            price_patterns = [
                r'(\d+[,\.]\d+)\s*€',  # Standard price
                r'(\d+)\s*€',          # Whole number price
                r'(\d+[,\.]\d+)\s*EUR', # Alternative currency format
                r'(\d+)\s*EUR'         # Whole number alternative format
            ]
            
            prices = []
            for pattern in price_patterns:
                prices.extend(re.findall(pattern, section))
            
            # Handle different price formats
            if 'namesto' in section and 'zdaj' in section:
                # Format: "namesto X€ zdaj Y€"
                if len(prices) >= 2:
                    product_info['regular_price'] = float(prices[0].replace(',', '.'))
                    product_info['discounted_price'] = float(prices[1].replace(',', '.'))
            elif '% ceneje' in section:
                # Format: "X% ceneje"
                discount_match = re.search(r'(\d+)%\s*ceneje', section)
                if discount_match and prices:
                    regular_price = float(prices[0].replace(',', '.'))
                    discount = int(discount_match.group(1))
                    product_info['regular_price'] = regular_price
                    product_info['discounted_price'] = regular_price * (1 - discount/100)
            elif prices:
                # Single price format
                product_info['discounted_price'] = float(prices[0].replace(',', '.'))
            
            # Only add if we found a product name and at least one price
            if product_info['name'] and (product_info['regular_price'] is not None or product_info['discounted_price'] is not None):
                products.append(product_info)
            else:
                # Log problematic case
                self.problematic_cases['prices'].append({
                    'section': section,
                    'prices_found': prices,
                    'product_info': product_info
                })
        
        return products

    def save_problematic_cases(self, output_file='problematic_cases.json'):
        """Save problematic cases to a JSON file for analysis."""
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(self.problematic_cases, f, ensure_ascii=False, indent=2)

    def process_data(self, input_file='scraped_data.csv', output_file='processed_products.csv'):
        """Process the scraped data and create a structured dataset."""
        processed_data = []
        
        # Read the CSV file
        with open(input_file, 'r', encoding='utf-8') as f:
            # Skip the header
            next(f)
            
            for line in f:
                self.stats['total_rows'] += 1
                try:
                    # Split the line into URL and content
                    parts = line.strip().split(',', 1)
                    if len(parts) != 2:
                        continue
                        
                    url, content = parts
                    
                    # Clean the URL and content
                    url = url.strip()
                    content = content.strip('"')
                    
                    store_name = self.extract_store_name(url)
                    if not store_name:
                        self.stats['no_store'] += 1
                        continue
                        
                    # Skip Merkur entries
                    if store_name.lower() == 'merkur':
                        self.stats['merkur_skipped'] += 1
                        continue
                        
                    date = self.extract_dates(content)
                    if not date:
                        self.stats['no_date'] += 1
                        continue
                        
                    products = self.extract_products(content)
                    if not products:
                        self.stats['no_products'] += 1
                        continue
                    
                    for product in products:
                        # Only add products that have at least one price defined
                        if product['regular_price'] is not None or product['discounted_price'] is not None:
                            processed_data.append({
                                'store': store_name,
                                'date': date,
                                'product_name': product['name'],
                                'regular_price': product['regular_price'],
                                'discounted_price': product['discounted_price']
                            })
                            self.stats['successful'] += 1
                        else:
                            self.stats['no_prices'] += 1
                        
                except Exception as e:
                    print(f"Error processing line: {str(e)}")
                    continue
        
        if not processed_data:
            print("No data was processed successfully. Please check the input file format.")
            return None
        
        # Convert to DataFrame
        df = pd.DataFrame(processed_data)
        
        # Format dates to show only date without time
        df['date'] = df['date'].dt.strftime('%Y-%m-%d')
        
        # Sort by date and store
        df = df.sort_values(['date', 'store'])
        
        # Save to CSV
        df.to_csv(output_file, index=False, encoding='utf-8')
        
        # Save problematic cases
        self.save_problematic_cases()
        
        # Print statistics
        print("\nProcessing Statistics:")
        print(f"Total rows processed: {self.stats['total_rows']}")
        print(f"Rows without store name: {self.stats['no_store']}")
        print(f"Rows without date: {self.stats['no_date']}")
        print(f"Rows without products: {self.stats['no_products']}")
        print(f"Products without prices: {self.stats['no_prices']}")
        print(f"Merkur entries skipped: {self.stats['merkur_skipped']}")
        print(f"Successfully processed products: {self.stats['successful']}")
        print(f"\nSuccess rate: {(self.stats['successful']/self.stats['total_rows'])*100:.2f}%")
        
        print("\nProblematic cases have been saved to problematic_cases.json")
        print("\nSample of processed data:")
        print(df.head())
        
        return df

if __name__ == "__main__":
    processor = DataProcessor()
    processor.process_data() 


Processing Statistics:
Total rows processed: 4532
Rows without store name: 0
Rows without date: 48
Rows without products: 1640
Products without prices: 0
Merkur entries skipped: 260
Successfully processed products: 11658

Success rate: 257.24%

Problematic cases have been saved to problematic_cases.json

Sample of processed data:
          store        date     product_name  regular_price  discounted_price
11642  eurospin  2014-03-29           jagode            NaN              85.0
11643  eurospin  2014-03-29  korneti smetana            NaN              19.0
11644  eurospin  2014-03-29  posebna klobasa            NaN              99.0
11645  eurospin  2014-03-29           jagode            NaN              85.0
11646  eurospin  2014-03-29  korneti smetana            NaN              19.0
