In [2]:
import requests
from bs4 import BeautifulSoup


In [None]:
from seleniumbase import Driver

driver = Driver(uc=True)

: 

In [12]:
# Improved browse method with cursor-based pagination
def fetch_all_aldi_products_browse():
    """
    Fetch ALL Aldi products using Algolia's browse method with cursor pagination
    This should get all 2336+ products by following cursor-based pagination
    """
    all_products = []
    cursor = None
    page_count = 0
    
    browse_url = "https://2hu29pf6bh-2.algolianet.com/1/indexes/an_prd_nl_nl_products/browse"
    
    browse_headers = headers.copy()
    browse_headers['X-Algolia-API-Key'] = '686cf0c8ddcf740223d420d1115c94c1'
    browse_headers['X-Algolia-Application-Id'] = '2HU29PF6BH'
    
    while True:
        print(f"Fetching browse page {page_count} (products so far: {len(all_products)})...")
        
        params = {
            'hitsPerPage': 1000,  # Max hits per page for browse
            'filters': 'isAvailable:true'
        }
        
        # Add cursor if we have one (for pagination)
        if cursor:
            params['cursor'] = cursor
            
        try:
            response = requests.get(browse_url, headers=browse_headers, params=params)
            
            if response.status_code == 200:
                data = response.json()
                hits = data.get('hits', [])
                cursor = data.get('cursor')  # Get cursor for next page
                
                print(f"  Page {page_count}: Found {len(hits)} products")
                
                if hits:
                    all_products.extend(hits)
                    page_count += 1
                    
                    # If no cursor, we've reached the end
                    if not cursor:
                        print("  No more cursor - reached end of results")
                        break
                else:
                    print("  No hits found - reached end")
                    break
                    
            else:
                print(f"Error: HTTP {response.status_code}")
                print(f"Response: {response.text}")
                break
                
        except Exception as e:
            print(f"Error in browse method: {e}")
            break
    
    return all_products

# Execute the improved browse method
print("Starting improved browse method to get ALL products...")
all_browse_products = fetch_all_aldi_products_browse()
print(f"\nFinal browse result: Fetched {len(all_browse_products)} total products")

Starting improved browse method to get ALL products...
Fetching browse page 0 (products so far: 0)...
  Page 0: Found 1000 products
Fetching browse page 1 (products so far: 1000)...
  Page 1: Found 1000 products
Fetching browse page 2 (products so far: 2000)...
  Page 2: Found 336 products
  No more cursor - reached end of results

Final browse result: Fetched 2336 total products


In [14]:
all_browse_products[:5]  # Display first 5 products for verification

[{'longDescription': '',
  'permanentLowPrice': False,
  'shortDescriptionFormatted': [{'type': 'text', 'text': '3x33 cl.'}],
  'isAvailable': True,
  'advertisingOnlineDateUntil': 1754870399000,
  'energyScale': None,
  'drainedWeightValue': None,
  'drainedWeightUnit': None,
  'hierarchicalCategories': {'lvl0': ['ALDI merken'],
   'lvl1': ['ALDI merken > Offers']},
  'isDrainedWeight': False,
  'recommendationLabel': '',
  'categories': ['offer'],
  'pledgeValue': None,
  'energyClass': None,
  'brandName': 'Aperito ',
  'images': [{'type': 'gallery',
    'url': 'https://s7g10.scene7.com/is/image/aldinord/variant_1235864_1_main_cms_1_nl_nl_3_pack_aperitivo_spritz_cw01_2024'},
   {'type': 'gallery',
    'url': 'https://s7g10.scene7.com/is/image/aldinord/variant_1235865_1_main_cms_1_nl_nl_3_pack_limoncello_spritz_cw01_2024'},
   {'type': 'primary',
    'url': 'https://s7g10.scene7.com/is/image/aldinord/98765432_wk32'},
   {'type': 'seal',
    'url': 'https://s7g10.scene7.com/is/image/a

In [15]:
# Examine the structure of a single product
if all_browse_products:
    sample_product = all_browse_products[0]
    print("Sample Aldi Product Structure:")
    print("="*50)
    
    # Print all keys and their types/values
    for key, value in sample_product.items():
        if isinstance(value, dict):
            print(f"{key}: dict with keys: {list(value.keys())}")
        elif isinstance(value, list):
            print(f"{key}: list with {len(value)} items")
            if value and len(value) > 0:
                print(f"  First item type: {type(value[0])}")
                if isinstance(value[0], dict):
                    print(f"  First item keys: {list(value[0].keys())}")
        else:
            print(f"{key}: {type(value).__name__} = {repr(value)}")
    
    print("\n" + "="*50)
    print("Key product fields:")
    print(f"Name: {sample_product.get('name', 'N/A')}")
    print(f"Price: {sample_product.get('price', 'N/A')}")
    print(f"Description: {sample_product.get('description', 'N/A')}")
    print(f"Category: {sample_product.get('category', 'N/A')}")
    print(f"Brand: {sample_product.get('brand', 'N/A')}")
    print(f"Unit/Size: {sample_product.get('size', 'N/A')}")
    print(f"Available: {sample_product.get('isAvailable', 'N/A')}")
else:
    print("No products available to examine")

Sample Aldi Product Structure:
longDescription: str = ''
permanentLowPrice: bool = False
shortDescriptionFormatted: list with 1 items
  First item type: <class 'dict'>
  First item keys: ['type', 'text']
isAvailable: bool = True
advertisingOnlineDateUntil: int = 1754870399000
energyScale: NoneType = None
drainedWeightValue: NoneType = None
drainedWeightUnit: NoneType = None
hierarchicalCategories: dict with keys: ['lvl0', 'lvl1']
isDrainedWeight: bool = False
recommendationLabel: str = ''
categories: list with 1 items
  First item type: <class 'str'>
pledgeValue: NoneType = None
energyClass: NoneType = None
brandName: str = 'Aperito '
images: list with 4 items
  First item type: <class 'dict'>
  First item keys: ['type', 'url']
isRecall: bool = False
currentPrice: dict with keys: ['priceValue', 'strikePriceValue', 'reduction', 'basePriceValue', 'basePriceScale']
isBiocidalProduct: bool = False
shortDescription: str = '3x33 cl.'
productSlug: str = '3-pack-aperitivo-of-limoncello-spritz'

In [16]:
# Debug the problematic products that are causing errors
import json

# Find products that might have issues
problematic_products = []
for i, product in enumerate(all_browse_products[:20]):
    # Check for lists in key fields
    issues = []
    
    if isinstance(product.get('salesUnit'), list):
        issues.append(f"salesUnit is list: {product.get('salesUnit')}")
    if isinstance(product.get('shortDescription'), list):
        issues.append(f"shortDescription is list: {product.get('shortDescription')}")
    if isinstance(product.get('variantName'), list):
        issues.append(f"variantName is list: {product.get('variantName')}")
    if isinstance(product.get('brandName'), list):
        issues.append(f"brandName is list: {product.get('brandName')}")
    
    if issues:
        print(f"\nProduct {i} (ID: {product.get('objectID')}):")
        for issue in issues:
            print(f"  - {issue}")
        problematic_products.append(product)

print(f"\nFound {len(problematic_products)} problematic products out of first 20")

# Show a detailed view of the first problematic product
if problematic_products:
    print("\nDetailed view of first problematic product:")
    problem_product = problematic_products[0]
    print(json.dumps(problem_product, indent=2, default=str)[:1000] + "...")


Found 0 problematic products out of first 20


In [17]:
# Check the categories field structure
sample_product = all_browse_products[0]
print("Category-related fields:")
print(f"hierarchicalCategories: {sample_product.get('hierarchicalCategories')}")
print(f"categories: {sample_product.get('categories')}")
print(f"categories type: {type(sample_product.get('categories'))}")

# Show a few more examples
for i in range(5):
    if i < len(all_browse_products):
        product = all_browse_products[i]
        print(f"\nProduct {i}:")
        print(f"  hierarchicalCategories: {product.get('hierarchicalCategories')}")
        print(f"  categories: {product.get('categories')}")
        print(f"  categories type: {type(product.get('categories'))}")

Category-related fields:
hierarchicalCategories: {'lvl0': ['ALDI merken'], 'lvl1': ['ALDI merken > Offers']}
categories: ['offer']
categories type: <class 'list'>

Product 0:
  hierarchicalCategories: {'lvl0': ['ALDI merken'], 'lvl1': ['ALDI merken > Offers']}
  categories: ['offer']
  categories type: <class 'list'>

Product 1:
  hierarchicalCategories: {'lvl0': ['ALDI merken'], 'lvl1': ['ALDI merken > Offers']}
  categories: ['offer']
  categories type: <class 'list'>

Product 2:
  hierarchicalCategories: {'lvl0': ['ALDI merken'], 'lvl1': ['ALDI merken > Offers']}
  categories: ['offer']
  categories type: <class 'list'>

Product 3:
  hierarchicalCategories: {'lvl0': ['ALDI merken'], 'lvl1': ['ALDI merken > Offers']}
  categories: ['offer']
  categories type: <class 'list'>

Product 4:
  hierarchicalCategories: {'lvl0': ['ALDI merken'], 'lvl1': ['ALDI merken > Offers']}
  categories: ['offer']
  categories type: <class 'list'>


In [18]:
# Test the product processing directly
import sys
sys.path.append('d:/Google Drive/PycharmProjects/Current/Netherland Supermarket')

from Supermarkets.aldi import AldiScraper
from database import DatabaseManager, get_db_config

# Set up scraper
config = get_db_config()
with DatabaseManager(config) as db:
    scraper = AldiScraper(db)
    
    # Test processing the first product
    test_product = all_browse_products[0]
    
    print("Testing product:")
    print(f"  ID: {test_product.get('objectID')}")
    print(f"  variantName: {test_product.get('variantName')}")
    print(f"  currentPrice: {test_product.get('currentPrice')}")
    print(f"  salesUnit: {test_product.get('salesUnit')}")
    
    try:
        result = scraper._process_api_product(test_product)
        if result:
            print(f"\n✓ SUCCESS: {result.name}")
            print(f"  Price: €{result.price}")
            print(f"  Category: {result.category_name}")
            print(f"  Unit: {result.unit_amount}")
        else:
            print("\n✗ FAILED: Returned None")
    except Exception as e:
        print(f"\n✗ ERROR: {e}")
        import traceback
        traceback.print_exc()

Testing product:
  ID: 98765432
  variantName: 3-pack aperitivo of limoncello spritz
  currentPrice: {'priceValue': None, 'strikePriceValue': None, 'reduction': None, 'basePriceValue': None, 'basePriceScale': None}
  salesUnit: 3x33 cl

✗ FAILED: Returned None


In [19]:
# Find products with valid prices
valid_price_products = []
for i, product in enumerate(all_browse_products[:50]):
    current_price = product.get('currentPrice', {})
    price_value = current_price.get('priceValue')
    if price_value is not None and price_value > 0:
        valid_price_products.append((i, product))
        if len(valid_price_products) >= 5:
            break

print(f"Found {len(valid_price_products)} products with valid prices:")
for i, (idx, product) in enumerate(valid_price_products):
    print(f"{i+1}. Product {idx}: {product.get('variantName')} - €{product.get('currentPrice', {}).get('priceValue')}")

# Test with a valid price product
if valid_price_products:
    test_idx, test_product = valid_price_products[0]
    print(f"\nTesting valid price product (index {test_idx}):")
    
    try:
        result = scraper._process_api_product(test_product)
        if result:
            print(f"✓ SUCCESS: {result.name}")
            print(f"  Price: €{result.price}")
            print(f"  Category: {result.category_name}")
            print(f"  Unit: {result.unit_amount}")
            print(f"  Price per unit: €{result.price_per_unit}/{result.unit_type.value}")
        else:
            print("✗ FAILED: Returned None")
    except Exception as e:
        print(f"✗ ERROR: {e}")
        import traceback
        traceback.print_exc()

Found 5 products with valid prices:
1. Product 1: 2-pack minuutje rijst - €2.5
2. Product 2: Platinum all-in-one vaatwastabletten - €8.99
3. Product 3: Wasmiddel - €2.99
4. Product 4: Dolce Gusto compatible koffiecups - €9.99
5. Product 5: Gepaneerde gehaktschnitzels - €1.49

Testing valid price product (index 1):
✓ SUCCESS: 2-pack minuutje rijst
  Price: €2.5
  Category: Offers
  Unit: 2x125 g
  Price per unit: €2.5/piece


In [20]:
# Test processing more products to ensure robustness
print("Testing batch processing of 100 products...")

successful_products = []
failed_products = []

for i, product in enumerate(all_browse_products[:100]):
    try:
        result = scraper._process_api_product(product)
        if result:
            successful_products.append(result)
        else:
            failed_products.append(f"Product {i}: Returned None (likely no price)")
    except Exception as e:
        failed_products.append(f"Product {i}: ERROR - {str(e)}")

print(f"\nResults:")
print(f"✓ Successfully processed: {len(successful_products)} products")
print(f"✗ Failed: {len(failed_products)} products")

print(f"\nSample successful products:")
for i, product in enumerate(successful_products[:5]):
    print(f"{i+1}. {product.name}")
    print(f"   Price: €{product.price}")
    print(f"   Category: {product.category_name}")
    print(f"   Unit: {product.unit_amount}")
    if product.brand:
        print(f"   Brand: {product.brand}")
    print()

if len(failed_products) < 10:
    print(f"Failed products:")
    for failure in failed_products:
        print(f"  - {failure}")
else:
    print(f"Sample failed products (showing first 5):")
    for failure in failed_products[:5]:
        print(f"  - {failure}")

Testing batch processing of 100 products...

Results:
✓ Successfully processed: 82 products
✗ Failed: 18 products

Sample successful products:
1. 2-pack minuutje rijst
   Price: €2.5
   Category: Offers
   Unit: 2x125 g


AttributeError: 'Product' object has no attribute 'brand'