In [1]:

import numpy as np
import pandas as pd
from pathlib import Path

In [5]:
# Generating a realistic synthetic Ibadan house-prices dataset
# This code will:
# - create N synthetic house records
# - include realistic numeric/categorical features
# - assign Ibadan neighbourhoods with lat/lon and desirability scores
# - compute base price from fundamentals (area, beds, baths, amenities) and location multiplier
# - add market noise, then convert the price to Naira
# - compute the head and the correlation between desirability and price
# - save a CSV to data/ibadan_synthetic_housing.csv for download
#
# Run in the notebook/Colab environment to inspect and download the CSV.

import numpy as np
import pandas as pd
from pathlib import Path

# Legacy global seeding is kept on purpose: every np.random.* call below
# consumes this single stream in order, so reordering or inserting draws
# changes the generated dataset.
np.random.seed(42)
N = 1000

# Neighborhoods with coordinates and desirability (1-5 scale)
locations = {
    'Bodija': {'lat': 7.4352, 'lon': 3.9133, 'score': 5.0},
    'Jericho': {'lat': 7.4030, 'lon': 3.8850, 'score': 4.8},
    'Agodi GRA': {'lat': 7.4069, 'lon': 3.8993, 'score': 4.7},
    'Iyaganku GRA': {'lat': 7.3925, 'lon': 3.8681, 'score': 4.6},
    'Alalubosa': {'lat': 7.3839, 'lon': 3.8617, 'score': 4.3},
    'Oluyole Estate': {'lat': 7.3628, 'lon': 3.8562, 'score': 4.2},
    'Challenge': {'lat': 7.3383, 'lon': 3.8773, 'score': 3.8},
    'Akobo': {'lat': 7.3964, 'lon': 3.9167, 'score': 3.5},
    'Samonda': {'lat': 7.4306, 'lon': 3.9081, 'score': 3.2},
    'Apete': {'lat': 7.4492, 'lon': 3.8722, 'score': 2.8},
}

loc_names = list(locations.keys())

# Sampling weights, listed in the same order as `locations`:
# more records are drawn from the mid-tier areas.
loc_probs = np.array([0.08, 0.10, 0.07, 0.06, 0.06, 0.08, 0.14, 0.15, 0.16, 0.10])
loc_probs = loc_probs / loc_probs.sum()  # re-normalise so the weights sum to 1

chosen_locs = np.random.choice(loc_names, size=N, p=loc_probs)

# Core numeric features
area = np.round(np.random.normal(loc=2500, scale=900, size=N)).clip(400, 10000)  # sqft
bedrooms = np.random.choice([1,2,3,4,5,6], size=N, p=[0.05,0.1,0.35,0.3,0.15,0.05])
# A house never gets more bathrooms than bedrooms.
bathrooms = np.minimum(bedrooms, np.random.choice([1,2,3,4], size=N, p=[0.4,0.4,0.15,0.05]))
stories = np.random.choice([1,2,3,4], size=N, p=[0.5,0.35,0.12,0.03])
parking = np.random.poisson(lam=1.6, size=N).clip(0,6)

# Binary features (yes/no); the first probability in each pair is P('yes')
mainroad = np.random.choice(['yes','no'], size=N, p=[0.35, 0.65])
guestroom = np.random.choice(['yes','no'], size=N, p=[0.12, 0.88])
basement = np.random.choice(['yes','no'], size=N, p=[0.08, 0.92])
hotwaterheating = np.random.choice(['yes','no'], size=N, p=[0.05, 0.95])
airconditioning = np.random.choice(['yes','no'], size=N, p=[0.22, 0.78])
prefarea = np.random.choice(['yes','no'], size=N, p=[0.18, 0.82])
furnishingstatus = np.random.choice(['furnished','semi-furnished','unfurnished'], size=N, p=[0.25,0.4,0.35])

# Build a base price (in USD) using an interpretable formula
# price_base = base_rate_per_sqft * area + bedroom_premium + bathroom_premium + story_premium + parking_premium + amenity_bonus
base_rate_per_sqft = np.random.normal(loc=70, scale=12, size=N).clip(30, 120)  # USD per sqft baseline before location
price_base = base_rate_per_sqft * area

# Add linear contributions for features; each per-unit premium is itself
# drawn per house, so two houses with the same counts still differ.
price_base += bedrooms * np.random.normal(5000, 800, size=N)        # per bedroom contribution
price_base += bathrooms * np.random.normal(3000, 500, size=N)       # per bathroom contribution
price_base += stories * np.random.normal(2500, 400, size=N)         # per story contribution
price_base += parking * np.random.normal(2000, 500, size=N)         # per parking-space contribution

# Amenity bonuses (in USD); boolean masks multiply to 0 or the fixed bonus
amenity_bonus = (
    (mainroad == 'yes') * 4000 +
    (guestroom == 'yes') * 3000 +
    (basement == 'yes') * 2500 +
    (hotwaterheating == 'yes') * 1500 +
    (airconditioning == 'yes') * 5000 +
    (prefarea == 'yes') * 3500 +
    (furnishingstatus == 'furnished') * 6000 +
    (furnishingstatus == 'semi-furnished') * 3000
)
price_base += amenity_bonus

# Apply location multiplier driven by desirability score.
# Scores are min-max normalised against the full `locations` table (not the
# sampled rows), so the least desirable neighbourhood always maps to
# min_multiplier and the most desirable to max_multiplier, regardless of
# which neighbourhoods happened to be drawn.
desirability_scores = np.array([locations[l]['score'] for l in chosen_locs])
all_scores = np.array([info['score'] for info in locations.values()])
score_min = all_scores.min()
score_span = all_scores.max() - score_min
if score_span > 0:
    des_norm = (desirability_scores - score_min) / score_span
else:
    # Every neighbourhood has the same score: avoid 0/0 and use the lowest multiplier.
    des_norm = np.zeros_like(desirability_scores)
min_multiplier, max_multiplier = 0.85, 1.35
loc_multiplier = min_multiplier + des_norm * (max_multiplier - min_multiplier)
price_loc_adjusted = price_base * loc_multiplier

# Add multiplicative market noise: zero-mean Gaussian, 8% standard deviation
noise = np.random.normal(0, 0.08, size=N)
price_noisy = price_loc_adjusted * (1 + noise)

# Convert to Naira (assumed 1 USD = 1600 NGN)
usd_to_ngn = 1600
price_ngn = np.round(price_noisy * usd_to_ngn)

# Assemble the dataframe
df = pd.DataFrame({
    'area': area.astype(int),
    'bedrooms': bedrooms,
    'bathrooms': bathrooms,
    'stories': stories,
    'mainroad': mainroad,
    'guestroom': guestroom,
    'basement': basement,
    'hotwaterheating': hotwaterheating,
    'airconditioning': airconditioning,
    'parking': parking,
    'prefarea': prefarea,
    'furnishingstatus': furnishingstatus,
    'location': chosen_locs,
    'desirability_score': desirability_scores,
    'loc_multiplier': loc_multiplier,
    'price_usd': np.round(price_noisy),
    'price_naira': price_ngn
})

# Add lat/lon from locations mapping
latitudes = [locations[l]['lat'] for l in chosen_locs]
longitudes = [locations[l]['lon'] for l in chosen_locs]
df['latitude'] = latitudes
df['longitude'] = longitudes

# Derived sanity-check column: price per square foot in Naira
df['price_per_sqft_naira'] = (df['price_naira'] / df['area']).round(0)

# Keep a 10-row preview and the desirability/price correlation for inspection
head = df.head(10)
corr = df[['price_naira', 'desirability_score']].corr().iloc[0,1]

# Save CSV for download; create the target directory first because
# DataFrame.to_csv does not create missing parent directories.
out_path = Path('data/ibadan_synthetic_housing.csv')
out_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_path, index=False)


In [6]:
# Preview the first five generated records (rich display as the cell's last expression)
df.head(n=5)

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,location,desirability_score,loc_multiplier,price_usd,price_naira,latitude,longitude,price_per_sqft_naira
0,2660,6,1,3,no,no,no,no,no,1,no,unfurnished,Oluyole Estate,4.2,1.168182,273063.0,436900393.0,7.3628,3.8562,164248.0
1,1298,3,3,2,yes,no,no,no,no,0,yes,semi-furnished,Apete,2.8,0.85,104470.0,167151706.0,7.4492,3.8722,128776.0
2,2842,3,3,1,yes,no,no,no,no,2,no,unfurnished,Akobo,3.5,1.009091,267692.0,428306508.0,7.3964,3.9167,150706.0
3,3050,3,1,2,yes,no,no,no,yes,0,no,furnished,Akobo,3.5,1.009091,235036.0,376058347.0,7.3964,3.9167,123298.0
4,3004,4,1,2,no,no,no,no,no,0,no,semi-furnished,Jericho,4.8,1.304545,313965.0,502343537.0,7.403,3.885,167225.0


In [7]:
# Pearson correlation between neighbourhood desirability and the Naira price
df["desirability_score"].corr(df["price_naira"], method="pearson")

0.39782006233679873