# In [3]:
import pandas as pd
import numpy as np

# Build a synthetic land-price dataset and write it to CSV.
#
# Seed NumPy's global RNG so every run draws the same values. The draws below
# happen in a fixed order (Location, Area, Road width, Distance, price noise,
# then one missing-value mask per column); reordering them changes the data.
np.random.seed(42)

# Parameters
n_samples = 1000
locations = ['Downtown', 'Suburb', 'Urban', 'Rural']

# Generate synthetic features
data = {
    'Land_ID': range(1, n_samples + 1),
    # Sampling weights are positional, i.e. aligned with `locations`
    'Location': np.random.choice(locations, n_samples, p=[0.3, 0.3, 0.2, 0.2]),
    # randint excludes the upper bound: values are 2000..14999
    'Area_SqFt': np.random.randint(2000, 15000, n_samples),
    # values are 10..49
    'Road_Width_Ft': np.random.randint(10, 50, n_samples),
    'Distance_to_Landmark_Km': np.random.uniform(0.5, 15, n_samples),
}

# Create DataFrame
df = pd.DataFrame(data)

# Price model: a per-location base price, plus a premium for every foot of
# road width above 10, minus a penalty per km from the landmark, plus
# Gaussian noise with mean 0 and standard deviation 20.
location_multipliers = {'Downtown': 2.0, 'Urban': 1.5, 'Suburb': 1.0, 'Rural': 0.5}
df['Price_per_SqFt_USD'] = (
    100 * df['Location'].map(location_multipliers)
    + 2 * (df['Road_Width_Ft'] - 10)
    - 5 * df['Distance_to_Landmark_Km']
    + np.random.normal(0, 20, n_samples)
)

# Floor prices at 30 (the noise can push low-end rows below that) and round
# to cents.
df['Price_per_SqFt_USD'] = df['Price_per_SqFt_USD'].clip(lower=30).round(2)

# Introduce missing values: each listed column gets its own random mask, so
# roughly 7% of rows are blanked per column.
missing_rate = 0.07
for col in ['Road_Width_Ft', 'Distance_to_Landmark_Km', 'Price_per_SqFt_USD']:
    mask = np.random.rand(n_samples) < missing_rate
    # Road_Width_Ft is an integer column and cannot hold NaN. Assigning NaN
    # through .loc relied on silent upcasting, which pandas has deprecated,
    # so cast to float explicitly and blank the masked rows with Series.mask.
    df[col] = df[col].astype('float64').mask(mask)

# Save to CSV (written to the current working directory, without the index)
df.to_csv('realistic_land_price_dataset.csv', index=False)
