In [1]:
# Import libraries
import pandas as pd
import numpy as np
import joblib
import re
import ast

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [2]:
# Read the raw property listings from the CSV export
csv_path = "properties.csv"
main_df = pd.read_csv(csv_path)

# Report the (rows, columns) shape before any filtering is applied
print("Original dataset size:", main_df.shape)

Original dataset size: (203874, 26)


In [3]:
# Keep only the listings whose category is exactly 'Houses For Sale'
is_house_for_sale = main_df['category'] == 'Houses For Sale'
main_df = main_df[is_house_for_sale].copy()

# Columns the rest of the notebook relies on; a row missing any of them is discarded
required_cols = ['details', 'price', 'location', 'properties', 'geo_region']
df = main_df.dropna(subset=required_cols)[required_cols].copy()


def _parse_properties(value):
    """Turn a dict literal stored as text into a dict; pass non-strings through unchanged."""
    return ast.literal_eval(value) if isinstance(value, str) else value


# Convert the 'properties' column from text to Python objects
df['properties'] = df['properties'].apply(_parse_properties)


In [4]:
# Extract land size and house size
def extract_land_size(prop):
    """Return the land size in perches parsed from a listing's property dict.

    Looks up the ``'Land size'`` entry and reads the number that immediately
    precedes the word "perch" (case-insensitive), e.g. ``'27.5 perches'``.

    Args:
        prop: Mapping of listing attributes. Anything that is not a ``dict``
            (``None``, ``NaN``, ...) is treated as having no land size.

    Returns:
        The size as a ``float``, or ``None`` when the entry is missing or does
        not contain a parsable number followed by "perch".
    """
    if not isinstance(prop, dict):
        return None

    # The number must contain at least one digit, so a stray "." can no longer
    # reach float() and raise. Commas are accepted only as well-formed
    # thousands separators ("1,200"), matching how house sizes are parsed.
    match = re.search(
        r'((?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d*)?|\.\d+)\s*perch',
        str(prop.get('Land size', '')),
        re.IGNORECASE,
    )
    if not match:
        return None
    return float(match.group(1).replace(',', ''))

def extract_house_size(prop):
    """Return the house size parsed from a listing's property dict.

    Uses the ``'House size'`` entry, falling back to ``'Size'`` when the first
    is missing or empty, and reads the first number found in it. Commas are
    stripped before conversion, so ``'3,000.0 sqft'`` yields ``3000.0``.

    Args:
        prop: Mapping of listing attributes. Anything that is not a ``dict``
            (``None``, ``NaN``, ...) is treated as having no size.

    Returns:
        The size as a ``float``, or ``None`` when no number is present.
    """
    if not isinstance(prop, dict):
        return None

    size = prop.get('House size') or prop.get('Size')
    # Require the match to start at a digit (or a ".digit" fraction). The old
    # character-class pattern could capture bare punctuation such as "." or ","
    # that appears before the number, which then crashed float().
    match = re.search(r'(\d[\d,]*(?:\.\d*)?|\.\d+)', str(size))
    if not match:
        return None
    return float(match.group(1).replace(',', ''))

# Derive one numeric size column per extractor from the parsed 'properties' dicts
for column, extractor in (
    ('land_size_perch', extract_land_size),
    ('house_size_sqft', extract_house_size),
):
    df[column] = df['properties'].apply(extractor)

df


Unnamed: 0,details,price,location,properties,geo_region,land_size_perch,house_size_sqft
4,"Bedrooms: 5, Bathrooms: 4","Rs 55,000,000",Kadawatha,"{'Address': 'Pinthaliya Road, Kadawatha', 'Bed...",LK-12,27.50,3000.0
9,"Bedrooms: 4, Bathrooms: 4","Rs 80,000,000",Dehiwala,"{'Address': 'kadawatha Road', 'Bedrooms': '4',...",LK-11,8.75,3000.0
21,"Bedrooms: 4, Bathrooms: 2","Rs 125,000,000",Mount Lavinia,"{'Address': 'Off Temple Road, Mount Lavinia', ...",LK-11,20.00,5600.0
22,"Bedrooms: 4, Bathrooms: 3","Rs 58,000,000",Malabe,"{'Bedrooms': '4', 'Bathrooms': '3', 'House siz...",LK-11,16.00,4000.0
24,"Bedrooms: 4, Bathrooms: 3","Rs 22,000,000",Kotte,"{'Address': 'Kotte', 'Bedrooms': '4', 'Bathroo...",LK-11,6.50,2200.0
...,...,...,...,...,...,...,...
203848,"Bedrooms: 4, Bathrooms: 2","Rs 9,000,000",Ragama,"{'Address': 'horpe ragama', 'Bedrooms': '4', '...",LK-12,10.00,1750.0
203855,"Bedrooms: 4, Bathrooms: 3","Rs 27,500,000",Piliyandala,{'Address': 'à¶à·à¶­à·à¶©à·à· à¶à¶°à·à·...,LK-11,7.00,2300.0
203856,"Bedrooms: 3, Bathrooms: 2","Rs 23,000,000",Nugegoda,"{'Address': 'Pasal Mawatha , Gangodavila , Nug...",LK-11,9.00,2100.0
203861,"Bedrooms: 4, Bathrooms: 4","Rs 3,250,000",Kottawa,"{'Address': 'Homagama', 'Bedrooms': '4', 'Bath...",LK-11,8.00,2800.0


In [5]:

def extract_bed_bath(details):
    """Parse bedroom and bathroom counts from a listing's ``details`` text.

    For each count, a labelled form ("Bedrooms: 4") is tried first, then a
    looser "<number> Bed" / "<number> Bath" form. Matching is case-insensitive
    and non-breaking spaces are treated as ordinary spaces.

    Args:
        details: Free-text summary of the listing; may be null.

    Returns:
        A ``pd.Series`` with keys ``'bedrooms'`` and ``'bathrooms'``. Each
        value is an ``int``, or ``None`` when that count was not found. Any
        exception raised while parsing is printed and whatever was parsed up
        to that point is returned.
    """
    counts = {'bedrooms': None, 'bathrooms': None}

    # Per field: (labelled pattern, looser fallback), tried in that order
    patterns = {
        'bedrooms': (r'Bedrooms?:\s*(\d+)', r'(\d+)\s*(Bed|Bedroom)'),
        'bathrooms': (r'Bathrooms?:\s*(\d+)', r'(\d+)\s*(Bath|Bathroom)'),
    }

    try:
        if pd.isnull(details):
            return pd.Series(counts)

        # Normalise non-breaking spaces and trim surrounding whitespace
        details = str(details).replace('\xa0', ' ').strip()

        for field, (labelled, fallback) in patterns.items():
            found = (
                re.search(labelled, details, re.IGNORECASE)
                or re.search(fallback, details, re.IGNORECASE)
            )
            if found:
                counts[field] = int(found.group(1))

    except Exception as e:
        print(f"Error processing: {details} -> {e}")

    return pd.Series(counts)

# Parse every 'details' value; the per-row Series come back as a two-column frame
bed_bath = df['details'].apply(extract_bed_bath)
df[['bedrooms', 'bathrooms']] = bed_bath

# Drop the 'properties' and 'details' columns as they are no longer needed
df = df.drop(columns=['properties', 'details'])

df


Unnamed: 0,price,location,geo_region,land_size_perch,house_size_sqft,bedrooms,bathrooms
4,"Rs 55,000,000",Kadawatha,LK-12,27.50,3000.0,5,4
9,"Rs 80,000,000",Dehiwala,LK-11,8.75,3000.0,4,4
21,"Rs 125,000,000",Mount Lavinia,LK-11,20.00,5600.0,4,2
22,"Rs 58,000,000",Malabe,LK-11,16.00,4000.0,4,3
24,"Rs 22,000,000",Kotte,LK-11,6.50,2200.0,4,3
...,...,...,...,...,...,...,...
203848,"Rs 9,000,000",Ragama,LK-12,10.00,1750.0,4,2
203855,"Rs 27,500,000",Piliyandala,LK-11,7.00,2300.0,4,3
203856,"Rs 23,000,000",Nugegoda,LK-11,9.00,2100.0,3,2
203861,"Rs 3,250,000",Kottawa,LK-11,8.00,2800.0,4,4


In [6]:
# Count the listings whose bedroom/bathroom values could not be parsed
null_counts = df[['bedrooms', 'bathrooms']].isna().sum()
print("Nulls after parsing:")
print(null_counts)

# Discard every row that has a missing value in any remaining column
df = df.dropna(axis=0, how='any')

df

Nulls after parsing:
bedrooms     0
bathrooms    0
dtype: int64


Unnamed: 0,price,location,geo_region,land_size_perch,house_size_sqft,bedrooms,bathrooms
4,"Rs 55,000,000",Kadawatha,LK-12,27.50,3000.0,5,4
9,"Rs 80,000,000",Dehiwala,LK-11,8.75,3000.0,4,4
21,"Rs 125,000,000",Mount Lavinia,LK-11,20.00,5600.0,4,2
22,"Rs 58,000,000",Malabe,LK-11,16.00,4000.0,4,3
24,"Rs 22,000,000",Kotte,LK-11,6.50,2200.0,4,3
...,...,...,...,...,...,...,...
203848,"Rs 9,000,000",Ragama,LK-12,10.00,1750.0,4,2
203855,"Rs 27,500,000",Piliyandala,LK-11,7.00,2300.0,4,3
203856,"Rs 23,000,000",Nugegoda,LK-11,9.00,2100.0,3,2
203861,"Rs 3,250,000",Kottawa,LK-11,8.00,2800.0,4,4


In [7]:
# Final cleaning and filtering

# Exclusive upper bounds used to discard extreme listings
MAX_LAND_SIZE_PERCH = 100
MAX_PRICE_RS = 100_000_000

# Clean the price column: strip the "Rs" prefix and the commas, then take the
# first number that remains (e.g. "Rs 55,000,000" -> 55000000.0)
price_clean = (
    df['price']
    .astype(str)
    .str.replace('Rs', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.extract(r'(\d+\.?\d*)')[0]
    .astype(float)
)

# assign() builds a new frame instead of writing into `df` in place; the
# in-place `df.loc[:, 'price_clean'] = ...` form triggered pandas'
# SettingWithCopyWarning because `df` was itself derived from another frame.
df = df.assign(price_clean=price_clean)

# Remove extreme outliers
is_reasonable = (
    (df['land_size_perch'] < MAX_LAND_SIZE_PERCH)
    & (df['price_clean'] < MAX_PRICE_RS)
)
df = df[is_reasonable]

# Keep only necessary columns
df = df[[
    'bedrooms',
    'bathrooms',
    'house_size_sqft',
    'land_size_perch',
    'location',
    'price_clean'
]]

print("Cleaned and trimmed dataset size:", df.shape)

Cleaned and trimmed dataset size: (48120, 6)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'price_clean'] = (


In [8]:
# Summarise column dtypes and non-null counts
df.info()

# Only the last expression of a cell is rendered automatically, so a bare
# `df.head()` here was evaluated and thrown away. display() is the helper that
# IPython/Jupyter kernels provide for rendering an object mid-cell.
display(df.head())

# Display unique values in 'location'
print("Unique locations:", df['location'].unique())

<class 'pandas.core.frame.DataFrame'>
Index: 48120 entries, 4 to 203872
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   bedrooms         48120 non-null  int64  
 1   bathrooms        48120 non-null  int64  
 2   house_size_sqft  48120 non-null  float64
 3   land_size_perch  48120 non-null  float64
 4   location         48120 non-null  object 
 5   price_clean      48120 non-null  float64
dtypes: float64(3), int64(2), object(1)
memory usage: 2.6+ MB
Unique locations: ['Kadawatha' 'Dehiwala' 'Malabe' 'Kotte' 'Talawatugoda' 'Horana'
 'Nugegoda' 'Athurugiriya' 'Battaramulla' 'Piliyandala' 'Boralesgamuwa'
 'Kolonnawa' 'Rajagiriya' 'Mount Lavinia' 'Gampaha City' 'Kelaniya'
 'Nawala' 'Matara City' 'Ja-Ela' 'Maharagama' 'Kurunegala City'
 'Kandy City' 'Wattala' 'Ragama' 'Kiribathgoda' 'Moratuwa' 'Homagama'
 'Pannipitiya' 'Panadura' 'Kottawa' 'Kandana' 'Bandaragama' 'Matugama'
 'Colombo 5' 'Jaffna City' 'Kohuwala' 

In [9]:
# Tally how many listings fall under each distinct 'location' value
location_counts = df['location'].value_counts()
print("Location value counts:")
print(location_counts)


Location value counts:
location
Piliyandala        8920
Negombo            4077
Talawatugoda       3618
Malabe             3127
Athurugiriya       2613
                   ... 
Sigiriya              1
Nallur                1
Yatawatta             1
Ella                  1
Madawala Bazaar       1
Name: count, Length: 201, dtype: int64


In [10]:
# Identify the 10 locations with the most listings
listings_per_location = df['location'].value_counts()
top_locations = listings_per_location.nlargest(10).index

# Keep only the rows that belong to one of those locations
in_top_location = df['location'].isin(top_locations)
df = df[in_top_location]


In [11]:
# Show the frame as it stands after the rows were limited to the most frequent locations
df

Unnamed: 0,bedrooms,bathrooms,house_size_sqft,land_size_perch,location,price_clean
9,4,4,3000.0,8.75,Dehiwala,80000000.0
22,4,3,4000.0,16.00,Malabe,58000000.0
31,4,3,3000.0,16.00,Talawatugoda,58000000.0
36,4,3,3800.0,10.00,Talawatugoda,95000000.0
38,5,4,4778.0,7.50,Talawatugoda,65000000.0
...,...,...,...,...,...,...
203835,5,3,2500.0,10.00,Piliyandala,35000000.0
203836,3,2,1441.0,10.00,Negombo,22500000.0
203855,4,3,2300.0,7.00,Piliyandala,27500000.0
203856,3,2,2100.0,9.00,Nugegoda,23000000.0


In [12]:
# Model inputs: four numeric attributes plus the categorical location
input_columns = ['bedrooms', 'bathrooms', 'house_size_sqft', 'land_size_perch', 'location']
features = df[input_columns]

# One-hot encode 'location'; drop_first=True omits the first category's column
X = pd.get_dummies(features, columns=['location'], drop_first=True)

# Target: the numeric price
y = df['price_clean']

# Remember the encoded column names and their order for use at prediction time
feature_columns = list(X.columns)


In [13]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
# Tune and evaluate an XGBoost regressor
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

# Estimator handed to the search
xgb = XGBRegressor(random_state=42)

# Distributions the search samples from.
# Note: scipy's uniform(loc, scale) covers [loc, loc + scale].
param_dist = dict(
    n_estimators=randint(100, 1000),
    max_depth=randint(3, 10),
    learning_rate=uniform(0.01, 0.2),
    subsample=uniform(0.5, 0.5),
    colsample_bytree=uniform(0.3, 0.7),
    reg_alpha=uniform(0, 1),
    reg_lambda=uniform(0, 10),
)

# Randomised search: 50 sampled candidates, 3-fold cross-validation, scored by R²
search_settings = dict(
    param_distributions=param_dist,
    n_iter=50,  # can increase for more thorough search
    scoring='r2',
    n_jobs=-1,
    cv=3,
    verbose=2,
    random_state=42,
)
random_search = RandomizedSearchCV(xgb, **search_settings)

# Run the search on the scaled training data
random_search.fit(X_train_scaled, y_train)

# Estimator the search reports as best
best_xgb = random_search.best_estimator_

# Score that estimator on the held-out test split
y_pred = best_xgb.predict(X_test_scaled)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Best Params: {random_search.best_params_}")
print(f"R² Score: {r2:.4f} ({r2 * 100:.2f}%)")
print(f"MAE: Rs {mae:,.2f}")
print(f"RMSE: Rs {rmse:,.2f}")

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best Params: {'colsample_bytree': 0.6584651408094966, 'learning_rate': 0.05529915503958759, 'max_depth': 9, 'n_estimators': 919, 'reg_alpha': 0.940523264489604, 'reg_lambda': 3.975720210875223, 'subsample': 0.75887567526374}
R² Score: 0.9154 (91.54%)
MAE: Rs 2,660,612.60
RMSE: Rs 5,314,174.07


In [15]:
# Persist everything needed to make predictions later

# Tuned model, fitted scaler and encoded column order, each written to its own file
artifacts = (
    (best_xgb, 'trained_model.pkl'),
    (scaler, 'scaler.pkl'),
    (feature_columns, 'features.pkl'),
)
for obj, filename in artifacts:
    joblib.dump(obj, filename)

print("✅ Model and components saved.")


✅ Model and components saved.


In [16]:
# Price one example listing with the saved XGBoost model

# Reload the saved model, scaler and encoded column order from disk
model = joblib.load('trained_model.pkl')
scaler = joblib.load('scaler.pkl')
feature_columns = joblib.load('features.pkl')

# Example listing
bedrooms, bathrooms = 4, 3
house_size_sqft, land_size_perch = 4000, 16
location = 'Malabe'

# Start from the numeric inputs
input_data = {
    'bedrooms': bedrooms,
    'bathrooms': bathrooms,
    'house_size_sqft': house_size_sqft,
    'land_size_perch': land_size_perch,
}

# Set the one-hot flag matching the chosen location to 1 and every other
# location flag to 0; any remaining expected column not supplied above gets 0.
# NOTE(review): a location with no matching column leaves all flags at 0.
chosen_flag = f'location_{location}'
for col in feature_columns:
    if col.startswith('location_'):
        input_data[col] = int(col == chosen_flag)
    elif col not in input_data:
        input_data[col] = 0

# Single-row frame with columns in the saved order
input_row = pd.DataFrame([input_data])
input_df = input_row[feature_columns]

# Apply the reloaded scaler to the input row
input_scaled = scaler.transform(input_df)

# Predict
predicted_price = model.predict(input_scaled)[0]
print(f"🏠 Predicted House Price: Rs {predicted_price:,.2f}") 

🏠 Predicted House Price: Rs 56,771,032.00
