In [9]:
"""
Enhanced Property24 Scraper with Image Extraction
This version extracts property images along with other data
"""

import requests
from bs4 import BeautifulSoup
import re
import time
import json
from typing import List, Dict, Optional
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class EnhancedProperty24Scraper:
    """
    Enhanced scraper that extracts images and more details
    """
    
    def __init__(self, delay_between_requests: float = 1.0):
        self.delay = delay_between_requests
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        })
    
    def extract_property_images(self, element) -> List[str]:
        """
        Extract image URLs from property listing element
        """
        images = []
        
        try:
            # Method 1: Look for img tags with data-src or src
            img_tags = element.find_all('img')
            for img in img_tags:
                # Skip icons and small images
                if 'icon' in str(img.get('class', [])):
                    continue
                
                # Try data-src first (lazy loading), then src
                img_url = img.get('data-src') or img.get('src')
                
                if img_url:
                    # Skip base64 images and placeholders
                    if 'data:image' in img_url or 'placeholder' in img_url:
                        continue
                    
                    # Ensure full URL
                    if not img_url.startswith('http'):
                        img_url = 'https:' + img_url if img_url.startswith('//') else 'https://www.property24.com' + img_url
                    
                    # Check if it's a property image (usually contains certain patterns)
                    if any(pattern in img_url for pattern in ['property24', 'listing', 'property', 'p24']):
                        images.append(img_url)
            
            # Method 2: Look for background images in style attributes
            elements_with_bg = element.find_all(style=re.compile('background-image'))
            for el in elements_with_bg:
                style = el.get('style', '')
                bg_match = re.search(r'url\(["\']?([^"\']+)["\']?\)', style)
                if bg_match:
                    img_url = bg_match.group(1)
                    if not img_url.startswith('http'):
                        img_url = 'https:' + img_url if img_url.startswith('//') else 'https://www.property24.com' + img_url
                    if 'property' in img_url or 'listing' in img_url:
                        images.append(img_url)
            
            # Method 3: Look for gallery data in JSON
            scripts = element.find_all('script', type='application/json')
            for script in scripts:
                try:
                    data = json.loads(script.string)
                    # Property24 often stores images in JSON
                    if isinstance(data, dict):
                        self._extract_images_from_json(data, images)
                except:
                    pass
            
            # Remove duplicates while preserving order
            seen = set()
            unique_images = []
            for img in images:
                if img not in seen:
                    seen.add(img)
                    unique_images.append(img)
            
            return unique_images[:5]  # Limit to 5 images per property
            
        except Exception as e:
            logger.debug(f"Error extracting images: {e}")
            return []
    
    def _extract_images_from_json(self, data: dict, images: list, depth: int = 0):
        """Recursively extract image URLs from JSON data"""
        if depth > 5:  # Prevent too deep recursion
            return
        
        if isinstance(data, dict):
            for key, value in data.items():
                if key in ['images', 'gallery', 'photos', 'imageUrl', 'image']:
                    if isinstance(value, list):
                        for item in value:
                            if isinstance(item, str) and item.startswith('http'):
                                images.append(item)
                            elif isinstance(item, dict) and 'url' in item:
                                images.append(item['url'])
                elif isinstance(value, (dict, list)):
                    self._extract_images_from_json(value, images, depth + 1)
        elif isinstance(data, list):
            for item in data:
                if isinstance(item, dict):
                    self._extract_images_from_json(item, images, depth + 1)
    
    def extract_property_details(self, element) -> Optional[Dict]:
        """
        Enhanced extraction with images and more details
        """
        text = element.get_text(separator=' ', strip=True)
        
        # Skip if too short or too long
        if len(text) < 30 or len(text) > 2000:
            return None
        
        property_data = {}
        
        # Extract price
        price_match = re.search(r'R\s*(\d{1,3}(?:[\s,]*\d{3})+)', text)
        if price_match:
            try:
                price_str = price_match.group(1).replace(',', '').replace(' ', '')
                property_data['price'] = int(price_str)
            except:
                pass
        elif 'development' in text.lower():
            property_data['price'] = None
            property_data['type'] = 'Development'
        else:
            return None  # No price, skip
        
        # Extract bedrooms
        bed_match = re.search(r'(\d+)\s*[Bb]ed', text)
        if bed_match:
            property_data['bedrooms'] = int(bed_match.group(1))
        
        # Extract bathrooms
        bath_match = re.search(r'(\d+)\s*[Bb]ath', text)
        if bath_match:
            property_data['bathrooms'] = int(bath_match.group(1))
        
        # Extract size
        size_match = re.search(r'(\d+)\s*m[²2]', text, re.IGNORECASE)
        if size_match:
            property_data['size_sqm'] = int(size_match.group(1))
        
        # Property type
        text_lower = text.lower()
        if 'apartment' in text_lower or 'flat' in text_lower:
            property_data['type'] = 'Apartment'
        elif 'house' in text_lower:
            property_data['type'] = 'House'
        elif 'townhouse' in text_lower:
            property_data['type'] = 'Townhouse'
        else:
            property_data['type'] = property_data.get('type', 'Property')
        
        # Extract URL
        link = element.find('a', href=True)
        if link and '/for-sale/' in link['href']:
            href = link['href']
            property_data['url'] = href if href.startswith('http') else 'https://www.property24.com' + href
        
        # Extract images - NEW!
        images = self.extract_property_images(element)
        if images:
            property_data['images'] = images
            logger.debug(f"Found {len(images)} images for property")
        
        # Extract additional features from text
        features = []
        feature_patterns = [
            (r'pool', 'Pool'),
            (r'garage|parking', 'Parking'),
            (r'garden', 'Garden'),
            (r'security', 'Security'),
            (r'balcony', 'Balcony'),
            (r'pet[\s-]?friendly', 'Pet Friendly'),
            (r'furnished', 'Furnished'),
            (r'sea[\s-]?view|ocean[\s-]?view', 'Sea Views'),
            (r'mountain[\s-]?view', 'Mountain Views')
        ]
        
        for pattern, feature in feature_patterns:
            if re.search(pattern, text_lower):
                features.append(feature)
        
        if features:
            property_data['highlights'] = features
        
        # Create better title
        if property_data.get('bedrooms'):
            property_data['title'] = f"{property_data['bedrooms']} Bedroom {property_data['type']}"
        else:
            property_data['title'] = property_data['type']
        
        # Add area description if found
        if 'walking distance' in text_lower:
            property_data['neighborhood_vibe'] = "Walking distance to amenities"
        
        return property_data
    
    def scrape_property_details_page(self, url: str) -> Dict:
        """
        Scrape detailed information from individual property page
        This gets much more data including all images
        """
        try:
            logger.info(f"Fetching details from: {url}")
            response = self.session.get(url, timeout=15)
            
            if response.status_code != 200:
                logger.error(f"Failed to fetch property page: {response.status_code}")
                return {}
            
            soup = BeautifulSoup(response.text, 'html.parser')
            details = {}
            
            # Extract all images from gallery
            gallery_images = []
            
            # Method 1: Look for image gallery container
            gallery_container = soup.find('div', class_=re.compile('gallery|carousel|slider|images'))
            if gallery_container:
                images = self.extract_property_images(gallery_container)
                gallery_images.extend(images)
            
            # Method 2: Look for all property images on page
            all_images = soup.find_all('img', src=re.compile('property|listing|p24'))
            for img in all_images:
                src = img.get('src') or img.get('data-src')
                if src and src not in gallery_images:
                    if not src.startswith('http'):
                        src = 'https:' + src if src.startswith('//') else 'https://www.property24.com' + src
                    gallery_images.append(src)
            
            details['images'] = gallery_images[:10]  # Limit to 10 images
            
            # Extract description
            description_el = soup.find('div', class_=re.compile('description|content|details'))
            if description_el:
                details['description'] = description_el.get_text(strip=True)[:500]
            
            return details
            
        except Exception as e:
            logger.error(f"Error scraping property details: {e}")
            return {}
    
    def scrape_area_with_images(self, area: str, max_pages: int = 2, get_full_details: bool = False) -> List[Dict]:
        """
        Scrape area with enhanced image extraction
        
        Args:
            area: Area to scrape
            max_pages: Maximum pages to scrape
            get_full_details: If True, visits each property page for more images (slower)
        """
        # Property24 Area Codes
        PROPERTY24_AREA_CODES = {
            "sea-point": 11021,
            "green-point": 11017,
            "camps-bay": 11014,
            "clifton": 11015,
            "fresnaye": 11016,
            "mouille-point": 11018,
            "de-waterkant": 9141,
            "gardens": 9145,
            "oranjezicht": 9155,
            "tamboerskloof": 9163,
            "vredehoek": 9166,
        }
        
        area_normalized = area.lower().replace(" ", "-").replace("_", "-")
        area_code = PROPERTY24_AREA_CODES.get(area_normalized)
        
        if not area_code:
            logger.error(f"Unknown area: {area}")
            return []
        
        all_properties = []
        page = 1
        
        while page <= max_pages:
            url = f"https://www.property24.com/for-sale/{area_normalized}/cape-town/western-cape/{area_code}"
            if page > 1:
                url += f"?Page={page}"
            
            logger.info(f"Scraping page {page}: {url}")
            
            try:
                response = self.session.get(url, timeout=15)
                if response.status_code != 200:
                    break
                
                soup = BeautifulSoup(response.text, 'html.parser')
                
                # Find all property containers
                selectors = [
                    'div[class*="listing"]',
                    'div[class*="p24_"]',
                    'div[class*="tile"]',
                    'article[class*="listing"]',
                    'div[class*="property"]'
                ]
                
                properties_found = 0
                for selector in selectors:
                    elements = soup.select(selector)
                    
                    for element in elements:
                        prop = self.extract_property_details(element)
                        if prop:
                            prop['area'] = area
                            
                            # Optionally get more details from individual page
                            if get_full_details and prop.get('url'):
                                time.sleep(self.delay)
                                extra_details = self.scrape_property_details_page(prop['url'])
                                if extra_details.get('images'):
                                    prop['images'] = extra_details['images']
                            
                            all_properties.append(prop)
                            properties_found += 1
                
                logger.info(f"Found {properties_found} properties on page {page}")
                
                if properties_found == 0:
                    break
                
                page += 1
                time.sleep(self.delay)
                
            except Exception as e:
                logger.error(f"Error on page {page}: {e}")
                break
        
        logger.info(f"Total properties scraped from {area}: {len(all_properties)}")
        return all_properties


# Example usage
if __name__ == "__main__":
    scraper = EnhancedProperty24Scraper()
    
    # Quick scrape with basic image extraction
    properties = scraper.scrape_area_with_images("sea-point", max_pages=1, get_full_details=False)
    
    print(f"Total properties found: {len(properties)}")
    
    for prop in properties[15:18]:
        # Handle price formatting safely
        price = prop.get('price')
        price_str = f"R{price:,}" if price else "Price on Application"
        
        print(f"\n{prop.get('title', 'Unknown')} - {price_str}")
        print(f"Images found: {len(prop.get('images', []))}")
        print(prop.get("images"))


2025-07-11 22:48:07,962 - INFO - Scraping page 1: https://www.property24.com/for-sale/sea-point/cape-town/western-cape/11021
2025-07-11 22:48:08,163 - INFO - Found 128 properties on page 1
2025-07-11 22:48:09,168 - INFO - Total properties scraped from sea-point: 128


Total properties found: 128

2 Bedroom Apartment - R7,500,000
Images found: 5
['https://images.prop24.com/216111620/Ensure528x153', 'https://images.prop24.com/357248079/Crop526x328', 'https://images.prop24.com/359421958/Crop526x328', 'https://images.prop24.com/359421959/Crop526x328', 'https://images.prop24.com/357248035/Crop526x328']

2 Bedroom Apartment - R7,500,000
Images found: 1
['https://www.property24.com/Content/images/Optimized/Icons/icon_floor_new.svg?z=6469c3498bc6c7f3625f']

2 Bedroom Apartment - R7,500,000
Images found: 0
None
