### ML MODEL THAT PREDICTS THE RENT PRICE IN IBADAN

In [2]:
# Data Wrangling
import pandas as pd
import numpy as np

# Model building and evaluation
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [4]:
#Load the dataset
data = pd.read_csv('ibadan_pro.csv')

In [5]:
data.head(5)

Unnamed: 0.1,Unnamed: 0,page,title,description,condition,rooms,address,price
0,0,1,Newly Built 3 Bedroom Flat,FOR RENT: DIRECT LETTING Beautiful Two numbers...,Serviced\nNewly Built,3 beds\n3 baths\n3 Toilets,"['Second Avian, Tipper Garage Akala Express ']","₦ 700,000/year"
1,1,1,4 Bedroom Terrace Duplex,FOR RENT: Discover comfort and convenience at ...,,4 beds\n4 baths\n5 Toilets,['Green Gate Oluyole Estate '],"₦ 2,000,000/year"
2,2,1,Warehouse,FOR RENT: Direct brief Warehouse at iyana libe...,,0 beds\n0 baths\n0 Toilets,['Oke Ado '],"₦ 2,500,000/year"
3,3,1,Commercial Shop,"FOR RENT: Commercial shop to let at akobo, Gen...",,0 beds\n0 baths\n0 Toilets,['General Gas Akobo '],"₦ 1,000,000/year"
4,4,1,3 Bedroom Flat,FOR RENT: Twin duplex of 3 bedroom at yanbule ...,,2 beds\n0 baths\n0 Toilets,[''],"₦ 2,000,000/year"


In [6]:
data['condition'].unique()

array(['Serviced\nNewly Built', nan, 'Newly Built', 'Furnished',
       'Furnished\nNewly Built', 'Furnished\nServiced\nNewly Built',
       'Furnished\nServiced', 'Serviced'], dtype=object)

In [7]:
data['condition'] = ['newly built' if x in ['Serviced\nNewly Built',
       'Newly Built'] else x for x in data['condition']]

In [8]:
data['condition'] = ['furnished' if x in ['Furnished\nNewly Built',
       'Furnished', 'Furnished\nServiced\nNewly Built', 'Furnished\nServiced'] else x for x in data['condition']]

In [9]:
data['condition'].unique()

array(['newly built', nan, 'furnished', 'Serviced'], dtype=object)

In [10]:
data.head(5)

Unnamed: 0.1,Unnamed: 0,page,title,description,condition,rooms,address,price
0,0,1,Newly Built 3 Bedroom Flat,FOR RENT: DIRECT LETTING Beautiful Two numbers...,newly built,3 beds\n3 baths\n3 Toilets,"['Second Avian, Tipper Garage Akala Express ']","₦ 700,000/year"
1,1,1,4 Bedroom Terrace Duplex,FOR RENT: Discover comfort and convenience at ...,,4 beds\n4 baths\n5 Toilets,['Green Gate Oluyole Estate '],"₦ 2,000,000/year"
2,2,1,Warehouse,FOR RENT: Direct brief Warehouse at iyana libe...,,0 beds\n0 baths\n0 Toilets,['Oke Ado '],"₦ 2,500,000/year"
3,3,1,Commercial Shop,"FOR RENT: Commercial shop to let at akobo, Gen...",,0 beds\n0 baths\n0 Toilets,['General Gas Akobo '],"₦ 1,000,000/year"
4,4,1,3 Bedroom Flat,FOR RENT: Twin duplex of 3 bedroom at yanbule ...,,2 beds\n0 baths\n0 Toilets,[''],"₦ 2,000,000/year"


In [11]:
# check for missing values
data.isnull().sum()

Unnamed: 0       0
page             0
title            0
description      0
condition      286
rooms            0
address          0
price            0
dtype: int64

In [12]:
# Handling missing values with dropna
data.dropna(inplace = True)
data.reset_index(drop = True, inplace = True)

In [13]:
data.isnull().sum()

Unnamed: 0     0
page           0
title          0
description    0
condition      0
rooms          0
address        0
price          0
dtype: int64

In [14]:
import re

# Define a function to extract the number from the string using regex
def extract_number_correctly(room_info, room_type):
    match = re.search(rf"(\d+) {room_type}", room_info)
    return int(match.group(1)) if match else 0

# Extract beds, baths, and toilets information correctly
data['beds'] = data['rooms'].apply(lambda x: extract_number_correctly(x, 'beds'))
data['baths'] = data['rooms'].apply(lambda x: extract_number_correctly(x, 'baths'))
data['toilets'] = data['rooms'].apply(lambda x: extract_number_correctly(x, 'Toilets'))

# Display the updated dataframe with new columns
data[['title', 'rooms', 'beds', 'baths', 'toilets']].head()

Unnamed: 0,title,rooms,beds,baths,toilets
0,Newly Built 3 Bedroom Flat,3 beds\n3 baths\n3 Toilets,3,3,3
1,Room And Parlor Self Contained,1 beds\n1 baths\n1 Toilets,1,1,1
2,Newly Built 2 Bedroom Flat,2 beds\n2 baths\n2 Toilets,2,2,2
3,Shop Space,0 beds\n0 baths\n0 Toilets,0,0,0
4,Nicely Built 4 Bedroom Duplex,4 beds\n4 baths\n4 Toilets,4,4,4


In [15]:
import re

# Function to separate the price into integer and string components
def separate_price(price):
    # Remove the currency symbol and any commas
    numeric_part = re.sub(r'[^\d]', '', price)
    # Extract the string component
    string_part = re.sub(r'[\d,]', '', price).strip()
    
    # Convert numeric part to integer if possible
    if numeric_part:
        numeric_part = int(numeric_part)
    else:
        numeric_part = None
    
    return numeric_part, string_part

# Apply the function to the price column
data['PRICE'], data['price_string'] = zip(*data['price'].apply(separate_price))

# Display the updated dataframe with new columns
data[['price', 'PRICE', 'price_string']].head()

Unnamed: 0,price,PRICE,price_string
0,"₦ 700,000/year",700000,₦ /year
1,"₦ 500,000/year",500000,₦ /year
2,"₦ 250,000/year",250000,₦ /year
3,"₦ 120,000/year",120000,₦ /year
4,"₦ 2,200,000/year",2200000,₦ /year


In [16]:
data.head(5)

Unnamed: 0.1,Unnamed: 0,page,title,description,condition,rooms,address,price,beds,baths,toilets,PRICE,price_string
0,0,1,Newly Built 3 Bedroom Flat,FOR RENT: DIRECT LETTING Beautiful Two numbers...,newly built,3 beds\n3 baths\n3 Toilets,"['Second Avian, Tipper Garage Akala Express ']","₦ 700,000/year",3,3,3,700000,₦ /year
1,7,1,Room And Parlor Self Contained,FOR RENT: A room and parlour self contain at y...,newly built,1 beds\n1 baths\n1 Toilets,"['Elebu,off Akala Express ']","₦ 500,000/year",1,1,1,500000,₦ /year
2,12,1,Newly Built 2 Bedroom Flat,FOR RENT: ```|OPEN LETTING BRIEF|``` *_DIRECT ...,newly built,2 beds\n2 baths\n2 Toilets,"['Cele Alapata Estate, Apata ']","₦ 250,000/year",2,2,2,250000,₦ /year
3,22,1,Shop Space,FOR RENT: Shop to let at Ajibode maternity hea...,furnished,0 beds\n0 baths\n0 Toilets,['Estate Ajibode '],"₦ 120,000/year",0,0,0,120000,₦ /year
4,24,1,Nicely Built 4 Bedroom Duplex,FOR RENT: 4 bedroom duplex with modern facilit...,furnished,4 beds\n4 baths\n4 Toilets,"['Ire Akari Estate, Ajinde Akala Express ']","₦ 2,200,000/year",4,4,4,2200000,₦ /year


In [17]:
data.rename(columns = {'Unnamed: 0':'Unnamed'}, inplace = True)

In [18]:
data.drop(columns = ['price', 'rooms', 'price_string'], inplace = True)

In [19]:
data.head(3)

Unnamed: 0,Unnamed,page,title,description,condition,address,beds,baths,toilets,PRICE
0,0,1,Newly Built 3 Bedroom Flat,FOR RENT: DIRECT LETTING Beautiful Two numbers...,newly built,"['Second Avian, Tipper Garage Akala Express ']",3,3,3,700000
1,7,1,Room And Parlor Self Contained,FOR RENT: A room and parlour self contain at y...,newly built,"['Elebu,off Akala Express ']",1,1,1,500000
2,12,1,Newly Built 2 Bedroom Flat,FOR RENT: ```|OPEN LETTING BRIEF|``` *_DIRECT ...,newly built,"['Cele Alapata Estate, Apata ']",2,2,2,250000


In [20]:
# Encoding the categorical variables

cat_cols = ['title', 'description', 'condition', 'address']
num_cols = ['Unnamed', 'page', 'beds', 'baths', 'toilets']

cat_cols = pd.get_dummies(data[cat_cols])

# scale the data in the X axis
scaler = StandardScaler()
num_cols = scaler.fit_transform(data[num_cols])
num_cols = pd.DataFrame(num_cols, columns = ['Unnamed', 'page', 'beds', 'baths', 'toilets'])

cat_cols[['Unnamed', 'page', 'beds', 'baths', 'toilets']] = num_cols

In [25]:
# Model building and evaluation
X = cat_cols
y = data['PRICE']

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 23)

model = RandomForestRegressor(random_state = 23)
model.fit(X_train, y_train)
preds = model.predict(X_test)

rmse = mean_squared_error(y_test, preds, squared = False)
print(f'RMSE: {rmse}')


RMSE: 4817124.069085649


In [26]:
estimators = list(range(10,200,10))

for n_estimator in estimators:
    model = RandomForestRegressor(random_state = 23, n_estimators = n_estimator)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    rmse = mean_squared_error(y_test, preds, squared = False)
    print(f'RMSE: {rmse}, n_estimator: {n_estimator}')
    print(f'===============================================')

RMSE: 4869046.287518738, n_estimator: 10
RMSE: 4854240.606528629, n_estimator: 20
RMSE: 4842798.207152391, n_estimator: 30
RMSE: 4838649.354682251, n_estimator: 40
RMSE: 4830693.296848216, n_estimator: 50
RMSE: 4817899.680884757, n_estimator: 60
RMSE: 4827032.942036529, n_estimator: 70
RMSE: 4828253.105342206, n_estimator: 80
RMSE: 4822095.962683085, n_estimator: 90
RMSE: 4817124.069085649, n_estimator: 100
RMSE: 4818555.601205274, n_estimator: 110
RMSE: 4815815.354088815, n_estimator: 120
RMSE: 4819844.380910226, n_estimator: 130
RMSE: 4821275.00232566, n_estimator: 140
RMSE: 4821150.0356728425, n_estimator: 150
RMSE: 4821804.60374731, n_estimator: 160
RMSE: 4824178.541610442, n_estimator: 170
RMSE: 4819389.493757221, n_estimator: 180
RMSE: 4817455.080944224, n_estimator: 190


In [27]:
depth = list(range(2,20))

for max_depth in depth:
    model = RandomForestRegressor(random_state = 23, n_estimators = 10, max_depth = max_depth)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    rmse = mean_squared_error(y_test, preds, squared = False)
    print(f'RMSE: {rmse}, max_depth: {max_depth}')
    print(f'===============================================')

RMSE: 4746428.191355008, max_depth: 2
RMSE: 4712764.605652795, max_depth: 3
RMSE: 4747154.480131622, max_depth: 4
RMSE: 4772426.975664909, max_depth: 5
RMSE: 4792845.324077272, max_depth: 6
RMSE: 4802911.661160051, max_depth: 7
RMSE: 4814077.429083379, max_depth: 8
RMSE: 4829443.72026551, max_depth: 9
RMSE: 4833829.1803059615, max_depth: 10
RMSE: 4844320.379603796, max_depth: 11
RMSE: 4850075.197696443, max_depth: 12
RMSE: 4848520.392890176, max_depth: 13
RMSE: 4847867.260488297, max_depth: 14
RMSE: 4853590.544345498, max_depth: 15
RMSE: 4851981.511213406, max_depth: 16
RMSE: 4851624.6467696205, max_depth: 17
RMSE: 4857219.3386017615, max_depth: 18
RMSE: 4867816.492641071, max_depth: 19


In [28]:
criterion = ['squared_error', 'friedman_mse', 'absolute_error', 'poisson']

for i in criterion:
    model = RandomForestRegressor(random_state = 23, n_estimators = 10, max_depth = 5, criterion = i)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

# evaluate the model performance

    rmse = mean_squared_error(y_test, preds, squared = False)
    print(f'RMSE: {rmse}, Criterion: {i}')
    print('====================================')

RMSE: 4772426.975664909, Criterion: squared_error
RMSE: 4772426.975664909, Criterion: friedman_mse
RMSE: 4835603.702707952, Criterion: absolute_error
RMSE: 4752049.49871931, Criterion: poisson


In [29]:
min_sample_leaf = list(range(1,10))
for leaf in min_sample_leaf:
    model = RandomForestRegressor(random_state = 23, n_estimators = 10, max_depth = 5, criterion = 'absolute_error',
                                 min_samples_leaf = leaf)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

# evaluate the model performance

    rmse = mean_squared_error(y_test, preds, squared = False)
    print(f'RMSE: {rmse}, min_sample_leaf: {leaf}')
    print('====================================')

RMSE: 4835603.702707952, min_sample_leaf: 1
RMSE: 4794103.66082277, min_sample_leaf: 2
RMSE: 4809886.929729816, min_sample_leaf: 3
RMSE: 4601660.511629005, min_sample_leaf: 4
RMSE: 4653266.517803841, min_sample_leaf: 5
RMSE: 4639242.909870717, min_sample_leaf: 6
RMSE: 4604008.128599936, min_sample_leaf: 7
RMSE: 4599049.2515146155, min_sample_leaf: 8
RMSE: 4582701.850437141, min_sample_leaf: 9


In [30]:
model = RandomForestRegressor(random_state = 23, n_estimators = 120, max_depth = 3, criterion = 'poisson', 
                              min_samples_leaf = 9)
model.fit(X_train, y_train)
preds = model.predict(X_test)
rmse = mean_squared_error(y_test, preds, squared = False)
print(f'RMSE: {rmse}')

RMSE: 4221340.002548195
