In [1]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from glob import glob
import warnings;warnings.filterwarnings(action="ignore")
import missingno
#warnings.simplefilter(action="ignore", category='all')

#feature selection
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from category_encoders import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder as sklearn_ord


#feature extraction
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import mutual_info_classif

#models to use
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost.sklearn import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
#feature metrics
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
import time

In [3]:
# Import the data
df = pd.read_csv("model_data.csv")

In [4]:
#drop duplicates if any
df.drop_duplicates(inplace=True)

In [5]:
# check for data shape after dropping duplicates
df.shape

(70850, 7)

In [6]:
df_copy = df.copy()

In [7]:
# reduce the price range to between 300000 and 10000000 
df_copy = df_copy[(df_copy['PRICE'] >= 300000) & (df_copy['PRICE'] <= 10000000)]

In [12]:
df_copy["LOCATION"] = df_copy["LOCATION"].str.capitalize()

In [13]:
df_copy["LOCATION"].nunique()

91

In [14]:
df_copy.shape

(64821, 7)

In [15]:
df_copy["LOCATION"].unique()

array(['Ikoyi', 'Yaba', 'Lekki', 'Ajah', 'Victoria island', 'Ikeja',
       'Ilupeju', 'Isolo', 'Shomolu', 'Ketu', 'Surulere', 'Amuwo odofin',
       'Abule egba', 'Oshodi', 'Apapa', 'Ikorodu', 'Ojodu', 'Ipaja',
       'Egbeda', 'Ikotun', 'Idimu', 'Ogba', 'Igando', 'Akowonjo',
       'Ikate-lekki', 'Ikota-lekki', 'Chevron-lekki', 'Phase1-lekki',
       'Phase2-lekki', 'Vgc-lekki', 'Ibeju-lekki', 'Osapa london lekki',
       'Agungi lekki', 'Opebi-ikeja', 'Allen avenue-ikeja', 'Gra-ikeja',
       'Oregun-ikeja', 'Akoka-yaba', 'Alagomeji-yaba', 'Adekunle-yaba',
       'Abule oja-yaba', 'Abule ijesha-yaba', 'Onike-yaba', 'Jibowu-yaba',
       'Sabo-yaba', 'Iwaya-yaba', 'Ebute metta-yaba', 'Fola agoro-yaba',
       'Ago palace-okota', 'Okota', 'Phase1-gbagada', 'Phase2-gbagada',
       'Ifako-gbagada', 'Oworonshoki-gbagada', 'Soluyi-gbagada',
       'Medina-gbagada', 'Gbagada', 'Anthony village-maryland',
       'Mende-maryland', 'Maryland', 'Ikotun-igando', 'Ojo', 'Ayobo',
       'Akesan'

In [16]:
my_dict = {}

In [17]:
for i, y in enumerate(list(df_copy["LOCATION"].unique())):
    my_dict[y] = i

In [18]:
my_dict

{'Ikoyi': 0,
 'Yaba': 1,
 'Lekki': 2,
 'Ajah': 3,
 'Victoria island': 4,
 'Ikeja': 5,
 'Ilupeju': 6,
 'Isolo': 7,
 'Shomolu': 8,
 'Ketu': 9,
 'Surulere': 10,
 'Amuwo odofin': 11,
 'Abule egba': 12,
 'Oshodi': 13,
 'Apapa': 14,
 'Ikorodu': 15,
 'Ojodu': 16,
 'Ipaja': 17,
 'Egbeda': 18,
 'Ikotun': 19,
 'Idimu': 20,
 'Ogba': 21,
 'Igando': 22,
 'Akowonjo': 23,
 'Ikate-lekki': 24,
 'Ikota-lekki': 25,
 'Chevron-lekki': 26,
 'Phase1-lekki': 27,
 'Phase2-lekki': 28,
 'Vgc-lekki': 29,
 'Ibeju-lekki': 30,
 'Osapa london lekki': 31,
 'Agungi lekki': 32,
 'Opebi-ikeja': 33,
 'Allen avenue-ikeja': 34,
 'Gra-ikeja': 35,
 'Oregun-ikeja': 36,
 'Akoka-yaba': 37,
 'Alagomeji-yaba': 38,
 'Adekunle-yaba': 39,
 'Abule oja-yaba': 40,
 'Abule ijesha-yaba': 41,
 'Onike-yaba': 42,
 'Jibowu-yaba': 43,
 'Sabo-yaba': 44,
 'Iwaya-yaba': 45,
 'Ebute metta-yaba': 46,
 'Fola agoro-yaba': 47,
 'Ago palace-okota': 48,
 'Okota': 49,
 'Phase1-gbagada': 50,
 'Phase2-gbagada': 51,
 'Ifako-gbagada': 52,
 'Oworonshoki-gbaga

In [19]:
location_dict = my_dict

In [20]:
new_dict = df_copy["HOUSE_TYPE"].value_counts().to_dict()

In [21]:
new_dict

{'FLAT': 47432, 'HOUSE': 11932, 'DUPLEX': 5081, 'BUNGALOW': 376}

In [22]:
house_dict = {'FLAT': 1, 'HOUSE': 2, 'DUPLEX': 3, 'BUNGALOW': 4}

In [23]:
# map the House type and location using the dictionary created 
df_copy["HOUSE_TYPE"] = df_copy.HOUSE_TYPE.map(house_dict)
df_copy["LOCATION"] = df_copy.LOCATION.map(location_dict)

In [24]:
df_copy.head()

Unnamed: 0,LOCATION,PRICE,DATE ADDED,BEDROOMS,BATHROOMS,TOILETS,HOUSE_TYPE
1,0,10000000.0,2022-08-12,3,3,4,1
2,0,10000000.0,2022-08-10,3,3,4,1
10,0,8500000.0,2022-08-10,3,2,3,1
11,0,6000000.0,2022-08-10,3,3,4,1
15,0,6000000.0,2022-08-10,3,3,4,2


In [1]:
# drop the target label and assign it to x. assign the target label as y

In [25]:
x = df_copy[['BEDROOMS', 'BATHROOMS', 'TOILETS', 'LOCATION', 'HOUSE_TYPE']]

In [26]:
y = df_copy['PRICE']

In [27]:
from xgboost import XGBRegressor

In [2]:
# split data into trainibg and testing data set

In [28]:
X_train_val, X_test, y_train_val, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=0)

In [29]:
from sklearn.model_selection import GridSearchCV

In [30]:
xgb1 = XGBRegressor()
parameters = {
              'objective':['reg:squarederror'],
              'learning_rate': [.0001, 0.001, .01],
              'max_depth': [3, 5, 7],
              'min_child_weight': [3,5,7],
              'subsample': [0.1,0.5,1.0],
              'colsample_bytree': [0.1, 0.5, 1.0],
              'n_estimators': [500]}

xgb_grid = GridSearchCV(xgb1,
                        parameters,
                        cv = 3,
                        n_jobs = -1,
                        verbose=0)

xgb_grid.fit(X_train, y_train)

xgb_cv = (xgb_grid.best_estimator_)

eval_set = [(X_train, y_train),
            (X_val, y_val)]

fit_model = xgb_cv.fit(
    X_train,
    y_train,
    eval_set=eval_set,
    eval_metric='mae',
    early_stopping_rounds=50,
    verbose=False)

print("MAE:", mean_absolute_error(y_val, fit_model.predict(X_val)))
print("MSE:", mean_squared_error(y_val, fit_model.predict(X_val)))
print("R2:", r2_score(y_val, fit_model.predict(X_val)))

MAE: 656168.5238352361
MSE: 1181808856004.9792
R2: 0.7253669451718985


In [32]:
import joblib

In [33]:
# save the model as a pickle file
joblib.dump(fit_model, 'final_xgboost_model.pkl')

['final_xgboost_model.pkl']