In [1]:
"""
South African Real Estate Data Generator
Generates realistic synthetic property data for SQL analysis project
"""

import csv
import random
from datetime import datetime, timedelta
from decimal import Decimal

# South African cities and their neighborhoods
# Each city maps to:
#   "neighborhoods":    names sampled when building a property address and
#                       enumerated when building the neighborhoods table.
#   "price_multiplier": factor applied to a property type's base price for
#                       listings in that city (1.0 = no adjustment).
SA_LOCATIONS = {
    "Johannesburg": {
        "neighborhoods": [
            "Sandton", "Rosebank", "Melrose", "Houghton", "Parktown", "Bryanston",
            "Fourways", "Randburg", "Roodepoort", "Soweto", "Alexandra", "Midrand"
        ],
        "price_multiplier": 1.2
    },
    "Cape Town": {
        "neighborhoods": [
            "Camps Bay", "Clifton", "Sea Point", "Green Point", "Waterfront", 
            "Constantia", "Bishopscourt", "Claremont", "Observatory", "Woodstock",
            "Bellville", "Mitchell's Plain"
        ],
        "price_multiplier": 1.3
    },
    "Pretoria": {
        "neighborhoods": [
            "Waterkloof", "Brooklyn", "Menlyn", "Hatfield", "Centurion", 
            "Silverton", "Arcadia", "Sunnyside", "Lynnwood", "Montana"
        ],
        "price_multiplier": 0.9
    },
    "Durban": {
        "neighborhoods": [
            "Umhlanga", "Ballito", "La Lucia", "Morningside", "Berea", 
            "Glenwood", "Durban North", "Westville", "Umlazi", "Phoenix"
        ],
        "price_multiplier": 0.85
    },
    "Port Elizabeth": {
        "neighborhoods": [
            "Summerstrand", "Humewood", "Walmer", "Lovemore Heights", 
            "Greenbushes", "Framesby", "Kabega Park", "Newton Park"
        ],
        "price_multiplier": 0.7
    }
}

# City -> province lookup. Keys must stay in sync with SA_LOCATIONS, because
# the generators index this dict with every city drawn from SA_LOCATIONS.
PROVINCES = {
    "Johannesburg": "Gauteng",
    "Cape Town": "Western Cape",
    "Pretoria": "Gauteng",
    "Durban": "KwaZulu-Natal",
    "Port Elizabeth": "Eastern Cape"
}

# Property type -> pricing/sampling parameters.
#   "base_price": starting price before the city multiplier and random
#                 variation are applied (currency unit is not stated in this
#                 file; presumably South African rand - confirm).
#   "weight":     relative sampling weight passed to random.choices
#                 (the weights here sum to 100).
PROPERTY_TYPES = {
    "House": {"base_price": 2500000, "weight": 40},
    "Apartment": {"base_price": 1500000, "weight": 30},
    "Townhouse": {"base_price": 1800000, "weight": 20},
    "Farm": {"base_price": 5000000, "weight": 5},
    "Commercial": {"base_price": 8000000, "weight": 5}
}

# Listing statuses. NOTE: the order matters - generate_properties pairs this
# list positionally with a hard-coded weights list, so reordering or adding
# entries here requires updating those weights as well.
STATUSES = ["Listed", "Under Offer", "Sold", "Withdrawn"]

# Agency names
# Sampled uniformly when assigning each generated agent to an agency.
AGENCIES = [
    "Pam Golding Properties", "Seeff Properties", "RE/MAX", "Chas Everitt",
    "Rawson Properties", "Harcourts", "Engel & Völkers", "Leapfrog Property",
    "Tyson Properties", "Lew Geffen Sotheby's"
]

def random_date(start_date, end_date):
    """Pick a random point between two datetimes, in whole-day steps.

    The result is ``start_date`` shifted forward by a uniformly drawn number
    of whole days, from 0 up to and including the number of full days
    separating the two arguments. The time-of-day of ``start_date`` is
    therefore preserved.
    """
    span_in_days = (end_date - start_date).days
    offset = timedelta(days=random.randint(0, span_in_days))
    return start_date + offset

def _random_characteristics(prop_type):
    """Draw (bedrooms, bathrooms, parking, size_sqm, erf_size) for a property type.

    Any type other than Apartment, House, Townhouse or Farm is treated as
    Commercial.
    """
    if prop_type == "Apartment":
        bedrooms = random.choices([1, 2, 3, 4], weights=[20, 40, 30, 10])[0]
        bathrooms = random.choices([1, 1.5, 2, 2.5], weights=[30, 30, 30, 10])[0]
        parking = random.randint(1, 2)
        size_sqm = random.randint(50, 200)
        erf_size = 0  # apartments are generated without their own plot
    elif prop_type == "House":
        bedrooms = random.choices([2, 3, 4, 5, 6], weights=[10, 30, 35, 20, 5])[0]
        bathrooms = random.choices([1, 2, 2.5, 3, 4], weights=[10, 30, 30, 20, 10])[0]
        parking = random.randint(1, 4)
        size_sqm = random.randint(120, 450)
        erf_size = random.randint(300, 1500)
    elif prop_type == "Townhouse":
        bedrooms = random.choices([2, 3, 4], weights=[20, 50, 30])[0]
        bathrooms = random.choices([1.5, 2, 2.5, 3], weights=[20, 40, 30, 10])[0]
        parking = random.randint(1, 2)
        size_sqm = random.randint(100, 250)
        erf_size = random.randint(150, 400)
    elif prop_type == "Farm":
        bedrooms = random.choices([3, 4, 5, 6], weights=[20, 40, 30, 10])[0]
        bathrooms = random.choices([2, 3, 4, 5], weights=[30, 40, 20, 10])[0]
        parking = random.randint(3, 8)
        size_sqm = random.randint(200, 600)
        erf_size = random.randint(10000, 500000)
    else:  # Commercial
        bedrooms = 0
        bathrooms = random.randint(2, 6)
        parking = random.randint(5, 50)
        size_sqm = random.randint(200, 2000)
        erf_size = random.randint(500, 5000)
    return bedrooms, bathrooms, parking, size_sqm, erf_size


def generate_properties(num_properties=2000, num_agents=50):
    """Generate synthetic property listings.

    Args:
        num_properties: Number of listing records to create.
        num_agents: Highest agent id that may be assigned. Ids are drawn
            uniformly from 1..num_agents, so this should match the number of
            agents produced by generate_agents.

    Returns:
        A list of dicts with the keys property_id, address, city, province,
        property_type, bedrooms, bathrooms, parking_spaces, size_sqm,
        erf_size_sqm, listing_price, listing_date, sale_date, sale_price,
        agent_id and status. Dates are "YYYY-MM-DD" strings; sale_date and
        sale_price are None unless status is "Sold".

    Raises:
        ValueError: If num_agents is smaller than 1.
    """
    if num_agents < 1:
        raise ValueError("num_agents must be at least 1")

    # Read the clock once so every record is generated against the same
    # "today" and no date can drift past it.
    now = datetime.now()
    start_date = now - timedelta(days=730)  # 2 years ago

    # Loop-invariant lookup tables.
    cities = list(SA_LOCATIONS)
    type_names = list(PROPERTY_TYPES)
    type_weights = [PROPERTY_TYPES[name]["weight"] for name in type_names]
    street_names = ["Main", "Church", "Park", "Oak", "High", "Station", "Market",
                    "Victoria", "Jan Smuts", "Nelson Mandela", "Long", "Beach"]
    street_types = ["Street", "Road", "Avenue", "Drive", "Lane"]

    properties = []
    for property_id in range(1, num_properties + 1):
        city = random.choice(cities)
        neighborhood = random.choice(SA_LOCATIONS[city]["neighborhoods"])
        province = PROVINCES[city]

        # Weighted random property type
        prop_type = random.choices(type_names, weights=type_weights)[0]
        base_price = PROPERTY_TYPES[prop_type]["base_price"]
        city_multiplier = SA_LOCATIONS[city]["price_multiplier"]

        bedrooms, bathrooms, parking, size_sqm, erf_size = \
            _random_characteristics(prop_type)

        # Listing price: base price scaled by city and a 0.6x-1.8x variation
        price_variation = random.uniform(0.6, 1.8)
        listing_price = int(base_price * city_multiplier * price_variation)

        listing_date = random_date(start_date, now)

        # Weights are positional and must line up with STATUSES.
        status = random.choices(STATUSES, weights=[40, 15, 35, 10])[0]

        sale_date = None
        sale_price = None
        if status == "Sold":
            days_on_market = (now - listing_date).days
            if days_on_market < 7:
                # A sale takes at least 7 days here; a listing younger than
                # that cannot have closed yet, so treat it as pending rather
                # than emit a sale dated in the future.
                status = "Under Offer"
            else:
                # Cap the time-to-sell so the sale never lands after today.
                days_to_sell = random.randint(7, min(180, days_on_market))
                sale_date = listing_date + timedelta(days=days_to_sell)
                # Sale price typically 90-105% of listing
                sale_percentage = random.uniform(0.90, 1.05)
                sale_price = int(listing_price * sale_percentage)

        agent_id = random.randint(1, num_agents)

        # Street number and name
        street_num = random.randint(1, 999)
        address = f"{street_num} {random.choice(street_names)} {random.choice(street_types)}, {neighborhood}"

        properties.append({
            "property_id": property_id,
            "address": address,
            "city": city,
            "province": province,
            "property_type": prop_type,
            "bedrooms": bedrooms,
            "bathrooms": bathrooms,
            "parking_spaces": parking,
            "size_sqm": size_sqm,
            "erf_size_sqm": erf_size,
            "listing_price": listing_price,
            "listing_date": listing_date.strftime("%Y-%m-%d"),
            "sale_date": sale_date.strftime("%Y-%m-%d") if sale_date else None,
            "sale_price": sale_price,
            "agent_id": agent_id,
            "status": status
        })

    return properties

def _email_slug(text):
    """Reduce *text* to lowercase ASCII letters and digits for use in an e-mail address."""
    import unicodedata

    # Decompose accented characters (e.g. "ö" -> "o" + combining mark) so the
    # base letter survives the ASCII filter instead of being dropped.
    folded = unicodedata.normalize("NFKD", text)
    ascii_only = folded.encode("ascii", "ignore").decode("ascii")
    return "".join(ch for ch in ascii_only.lower() if ch.isalnum())


def generate_agents(num_agents=50):
    """Generate synthetic real estate agents.

    Args:
        num_agents: Number of agent records to create; ids run 1..num_agents.

    Returns:
        A list of dicts with the keys agent_id, agent_name, agency_name,
        contact_number, email and city. Names are drawn independently, so
        duplicate names (and therefore duplicate e-mail addresses) can occur.
    """
    agents = []
    first_names = ["John", "Sarah", "Michael", "Emma", "David", "Lisa", "James",
                   "Jennifer", "Robert", "Michelle", "Thabo", "Nomsa", "Sipho",
                   "Zanele", "Pieter", "Annelie", "Ahmed", "Fatima"]
    last_names = ["Smith", "Johnson", "Williams", "Brown", "Jones", "Miller",
                  "Davis", "Van der Merwe", "Naidoo", "Mokoena", "Dlamini",
                  "Pretorius", "Khumalo", "Patel", "Van Zyl"]
    cities = list(SA_LOCATIONS)

    for agent_id in range(1, num_agents + 1):
        first_name = random.choice(first_names)
        last_name = random.choice(last_names)
        agent_name = f"{first_name} {last_name}"
        agency_name = random.choice(AGENCIES)
        city = random.choice(cities)

        # South African phone format
        phone = f"0{random.randint(71, 87)}-{random.randint(100, 999)}-{random.randint(1000, 9999)}"

        # Surnames such as "Van der Merwe" contain spaces and agency names
        # contain "/", "&", "'" and non-ASCII letters; none of these are valid
        # in a plain e-mail address, so both parts are slugged first.
        email = (f"{_email_slug(first_name)}.{_email_slug(last_name)}"
                 f"@{_email_slug(agency_name)}.co.za")

        agents.append({
            "agent_id": agent_id,
            "agent_name": agent_name,
            "agency_name": agency_name,
            "contact_number": phone,
            "email": email,
            "city": city
        })

    return agents

def generate_neighborhoods():
    """Generate one profile record per neighborhood in SA_LOCATIONS.

    Neighborhoods are split into three tiers by name: a fixed set of affluent
    areas, a fixed set of low-income areas, and everything else. The tier
    decides the income bracket and the ranges from which the crime rating,
    school rating and distance to the CBD are drawn.

    Returns:
        A list of dicts with the keys neighborhood_id, neighborhood_name,
        city, province, avg_income_bracket, crime_rating, school_rating and
        distance_to_cbd_km (rounded to 2 decimals). Ids are sequential from 1
        in SA_LOCATIONS iteration order.
    """
    # Tier membership is loop-invariant, so build the lookup sets once.
    affluent = frozenset({"Sandton", "Camps Bay", "Clifton", "Constantia",
                          "Waterkloof", "Umhlanga", "Bishopscourt"})
    low_income = frozenset({"Soweto", "Alexandra", "Mitchell's Plain",
                            "Umlazi", "Phoenix"})

    neighborhoods = []
    neighborhood_id = 0

    for city, data in SA_LOCATIONS.items():
        province = PROVINCES[city]

        for neighborhood in data["neighborhoods"]:
            neighborhood_id += 1

            if neighborhood in affluent:
                income = "High"
                crime = random.randint(1, 4)
                school = random.randint(7, 10)
                cbd_distance = random.uniform(5, 20)
            elif neighborhood in low_income:
                income = random.choice(["Low", "Lower-Middle"])
                crime = random.randint(6, 9)
                school = random.randint(3, 6)
                cbd_distance = random.uniform(15, 40)
            else:
                income = random.choice(["Middle", "Upper-Middle"])
                crime = random.randint(3, 7)
                school = random.randint(5, 8)
                cbd_distance = random.uniform(5, 25)

            neighborhoods.append({
                "neighborhood_id": neighborhood_id,
                "neighborhood_name": neighborhood,
                "city": city,
                "province": province,
                "avg_income_bracket": income,
                "crime_rating": crime,
                "school_rating": school,
                "distance_to_cbd_km": round(cbd_distance, 2)
            })

    return neighborhoods

def generate_property_features(properties):
    """Build one amenities record per property.

    Each input dict must provide "property_id", "property_type" and
    "listing_price". Commercial properties never get a pool, garden, pets or
    furnishing. For every other type, pool/security/solar odds are higher
    when the listing price exceeds 3,000,000; gardens are limited to houses,
    townhouses and farms; boreholes to farms and houses; and furnishing to
    apartments.

    Returns a list of dicts keyed feature_id, property_id, has_pool,
    has_garden, has_security, has_solar_panels, has_borehole, pet_friendly
    and furnished, where feature_id equals property_id.
    """
    def weighted_flag(true_weight):
        # True with probability true_weight / 100.
        return random.choices([True, False],
                              weights=[true_weight, 100 - true_weight])[0]

    garden_types = ("House", "Townhouse", "Farm")
    borehole_odds = {"Farm": 60, "House": 20}

    rows = []
    for listing in properties:
        pid = listing["property_id"]
        kind = listing["property_type"]
        premium = listing["listing_price"] > 3000000

        row = {"feature_id": pid, "property_id": pid}

        if kind == "Commercial":
            row["has_pool"] = False
            row["has_garden"] = False
            row["has_security"] = random.choice([True, False])
            row["has_solar_panels"] = weighted_flag(30)
            row["has_borehole"] = weighted_flag(20)
            row["pet_friendly"] = False
            row["furnished"] = False
            rows.append(row)
            continue

        # Draws happen in the same order as the output columns so a seeded
        # run stays reproducible; ineligible types skip their draw entirely.
        row["has_pool"] = weighted_flag(40 if premium else 10)
        row["has_garden"] = weighted_flag(70) if kind in garden_types else False
        row["has_security"] = weighted_flag(80 if premium else 40)
        row["has_solar_panels"] = weighted_flag(35 if premium else 15)

        odds = borehole_odds.get(kind)
        row["has_borehole"] = weighted_flag(odds) if odds is not None else False

        row["pet_friendly"] = weighted_flag(60)
        row["furnished"] = weighted_flag(30) if kind == "Apartment" else False
        rows.append(row)

    return rows

def generate_price_history(properties):
    """Generate a chronological price-change history for ~30% of properties.

    Args:
        properties: List of property dicts. Each must provide "property_id",
            "listing_date" ("YYYY-MM-DD") and "listing_price"; an optional
            "sale_date" ("YYYY-MM-DD" or None) bounds the history.

    Returns:
        A list of dicts with the keys history_id, property_id,
        price_change_date, old_price, new_price and reason. For each
        property the records are in date order, each old_price equals the
        previous record's new_price (or the listing price for the first),
        and no change is dated after today or after the property's sale date.
    """
    price_history = []
    history_id = 1

    # Read the clock once so all records share the same cut-off.
    now = datetime.now()

    # About 30% of properties have price changes
    properties_with_changes = random.sample(properties, int(len(properties) * 0.3))

    reasons = ["Market adjustment", "Buyer feedback", "Comparative market analysis",
               "Property improvements", "Seasonal adjustment", "Time on market"]

    for prop in properties_with_changes:
        property_id = prop["property_id"]
        listing_date = datetime.strptime(prop["listing_date"], "%Y-%m-%d")

        # A sold property cannot be repriced after its sale.
        cutoff = now
        sale_date = prop.get("sale_date")
        if sale_date:
            cutoff = min(cutoff, datetime.strptime(sale_date, "%Y-%m-%d"))

        current_price = prop["listing_price"]
        change_date = listing_date

        # 1-3 price changes
        for _ in range(random.randint(1, 3)):
            # Each change is between a 15% cut and a 10% increase
            change_pct = random.uniform(-0.15, 0.10)
            new_price = int(current_price * (1 + change_pct))

            # Step forward from the previous change (not from the listing
            # date) so dates stay in the same order as the price chain.
            change_date = change_date + timedelta(days=random.randint(14, 120))

            if change_date > cutoff:
                break

            price_history.append({
                "history_id": history_id,
                "property_id": property_id,
                "price_change_date": change_date.strftime("%Y-%m-%d"),
                "old_price": current_price,
                "new_price": new_price,
                "reason": random.choice(reasons)
            })

            current_price = new_price
            history_id += 1

    return price_history

def write_csv(filename, data, fieldnames):
    """Dump a list of row dicts to *filename* as UTF-8 CSV with a header row.

    *fieldnames* fixes the column order. A one-line confirmation with the
    record count is printed once the file has been written.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as handle:
        out = csv.DictWriter(handle, fieldnames=fieldnames)
        out.writeheader()
        for row in data:
            out.writerow(row)
    record_count = len(data)
    print(f"✓ Generated {filename} with {record_count} records")

def main():
    """Run every generator, write the five CSV files and print a summary.

    Generates 2000 properties and 50 agents, plus the neighborhood table and
    the feature and price-history tables derived from the properties. The
    files are written to the current working directory.
    """
    print("Generating South African Real Estate Data...\n")

    # Generation phase: progress line first, then the matching generator.
    print("Generating properties...")
    properties = generate_properties(2000)

    print("Generating agents...")
    agents = generate_agents(50)

    print("Generating neighborhoods...")
    neighborhoods = generate_neighborhoods()

    print("Generating property features...")
    features = generate_property_features(properties)

    print("Generating price history...")
    price_history = generate_price_history(properties)

    # One entry per output file: (filename, rows, summary label, columns).
    outputs = [
        ('properties.csv', properties, "properties",
         ['property_id', 'address', 'city', 'province', 'property_type',
          'bedrooms', 'bathrooms', 'parking_spaces', 'size_sqm', 'erf_size_sqm',
          'listing_price', 'listing_date', 'sale_date', 'sale_price',
          'agent_id', 'status']),
        ('agents.csv', agents, "agents",
         ['agent_id', 'agent_name', 'agency_name', 'contact_number',
          'email', 'city']),
        ('neighborhoods.csv', neighborhoods, "neighborhoods",
         ['neighborhood_id', 'neighborhood_name', 'city', 'province',
          'avg_income_bracket', 'crime_rating', 'school_rating',
          'distance_to_cbd_km']),
        ('property_features.csv', features, "feature records",
         ['feature_id', 'property_id', 'has_pool', 'has_garden',
          'has_security', 'has_solar_panels', 'has_borehole',
          'pet_friendly', 'furnished']),
        ('price_history.csv', price_history, "price changes",
         ['history_id', 'property_id', 'price_change_date',
          'old_price', 'new_price', 'reason']),
    ]

    print("\nWriting CSV files...\n")
    for filename, rows, _label, columns in outputs:
        write_csv(filename, rows, columns)

    banner = "=" * 50
    print("\n" + banner)
    print("DATA GENERATION COMPLETE!")
    print(banner)
    print("\nGenerated files:")
    for filename, rows, label, _columns in outputs:
        print(f"  • {filename} - {len(rows)} {label}")
    print("\nYou can now import these CSV files into your database!")

# Run the generator only when this file is executed as a script, so that
# importing it (e.g. to reuse a single generator) has no side effects.
if __name__ == "__main__":
    main()

Generating South African Real Estate Data...

Generating properties...
Generating agents...
Generating neighborhoods...
Generating property features...
Generating price history...

Writing CSV files...

✓ Generated properties.csv with 2000 records
✓ Generated agents.csv with 50 records
✓ Generated neighborhoods.csv with 52 records
✓ Generated property_features.csv with 2000 records
✓ Generated price_history.csv with 995 records

DATA GENERATION COMPLETE!

Generated files:
  • properties.csv - 2000 properties
  • agents.csv - 50 agents
  • neighborhoods.csv - 52 neighborhoods
  • property_features.csv - 2000 feature records
  • price_history.csv - 995 price changes

You can now import these CSV files into your database!
