In [24]:
# import necessary packages
import numpy as np
import pandas as pd

In [25]:
# read the raw rental data from the Excel workbook and preview the first five rows
df = pd.read_excel(io='Condo_rental_raw.xlsx')
df.head(n=5)

Unnamed: 0,No.,Rental price (USD/month),Area (m2),No. bedroom,Khan,Sangkat,Data posted
0,1,1200,130.0,3.0,Mean Chey,Chakto Mukh,2022-03-02
1,2,1100,135.0,3.0,Chamkar Mon,Tonle Basak,2022-10-26
2,3,900,63.0,2.0,Chamkar Mon,Tonle Basak,2023-01-07
3,4,800,107.0,2.0,Tuol Kouk,Boeng Kak Ti Muoy,2023-02-03
4,5,870,,2.0,Chamkar Mon,Boeng Keng Kang Ti Bei,2023-02-03


In [26]:
# for this study, we are interested in predicting the rental price from three features:
# Area (m2), No. bedroom and Khan -- keep only the target and those three columns
df = df[[
    'Rental price (USD/month)',  # target
    'Area (m2)',
    'No. bedroom',
    'Khan',
]]
df.head()

Unnamed: 0,Rental price (USD/month),Area (m2),No. bedroom,Khan
0,1200,130.0,3.0,Mean Chey
1,1100,135.0,3.0,Chamkar Mon
2,900,63.0,2.0,Chamkar Mon
3,800,107.0,2.0,Tuol Kouk
4,870,,2.0,Chamkar Mon


In [27]:
# check the size of the data as (number of rows, number of columns)
df.shape

(400, 4)

In [28]:
# count the null values in each column
df.isna().sum()

Rental price (USD/month)     0
Area (m2)                   23
No. bedroom                  3
Khan                         1
dtype: int64

In [29]:
# each column is missing less than 6% of its values (26 of the 400 rows are affected
# overall in this run), so we remove every row containing a null and renumber the index from 0
df = df.dropna().reset_index(drop=True)
df.head()

Unnamed: 0,Rental price (USD/month),Area (m2),No. bedroom,Khan
0,1200,130.0,3.0,Mean Chey
1,1100,135.0,3.0,Chamkar Mon
2,900,63.0,2.0,Chamkar Mon
3,800,107.0,2.0,Tuol Kouk
4,2500,91.0,2.0,Chamkar Mon


In [30]:
# confirm that no null values remain in any column
df.isna().sum()

Rental price (USD/month)    0
Area (m2)                   0
No. bedroom                 0
Khan                        0
dtype: int64

In [31]:
# check the size again after removing the rows with null values
df.shape

(374, 4)

In [32]:
# show the column names, non-null counts, dtypes and memory usage of the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 4 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Rental price (USD/month)  374 non-null    int64  
 1   Area (m2)                 374 non-null    float64
 2   No. bedroom               374 non-null    float64
 3   Khan                      374 non-null    object 
dtypes: float64(2), int64(1), object(1)
memory usage: 11.8+ KB


In [33]:
# summary statistics of the numerical columns (count, mean, std, min, quartiles, max)
df.describe()

Unnamed: 0,Rental price (USD/month),Area (m2),No. bedroom
count,374.0,374.0,374.0
mean,591.296791,71.451872,1.449198
std,510.487587,30.674484,0.65194
min,100.0,25.0,1.0
25%,305.0,50.0,1.0
50%,450.0,65.5,1.0
75%,650.0,85.0,2.0
max,3800.0,250.0,4.0


In [34]:
# summary statistics of the categorical (object-dtype) columns
df.describe(include=['object'])

Unnamed: 0,Khan
count,374
unique,8
top,Chamkar Mon
freq,232


In [35]:
# list the distinct Khan values in alphabetical order
sorted(df['Khan'].unique())

['Chamkar Mon',
 'Chbar Ampov',
 'Doun Penh',
 'Mean Chey',
 'Prampir Meakkakra',
 'Russey Keo',
 'Saensokh',
 'Tuol Kouk']

In [36]:
# one-hot encode Khan: add one 0/1 indicator column per distinct Khan value,
# in alphabetical order of the Khan names
for khan in sorted(df['Khan'].unique()):
    df[khan] = df['Khan'].eq(khan).astype(int)

df.head()

Unnamed: 0,Rental price (USD/month),Area (m2),No. bedroom,Khan,Chamkar Mon,Chbar Ampov,Doun Penh,Mean Chey,Prampir Meakkakra,Russey Keo,Saensokh,Tuol Kouk
0,1200,130.0,3.0,Mean Chey,0,0,0,1,0,0,0,0
1,1100,135.0,3.0,Chamkar Mon,1,0,0,0,0,0,0,0
2,900,63.0,2.0,Chamkar Mon,1,0,0,0,0,0,0,0
3,800,107.0,2.0,Tuol Kouk,0,0,0,0,0,0,0,1
4,2500,91.0,2.0,Chamkar Mon,1,0,0,0,0,0,0,0


In [37]:
# remove the original 'Khan' text column; its information now lives in the indicator columns.
# Reassign instead of mutating with inplace=True, and tolerate an already-missing column,
# so that re-running this cell does not raise a KeyError.
df = df.drop(columns='Khan', errors='ignore')
df.head()

Unnamed: 0,Rental price (USD/month),Area (m2),No. bedroom,Chamkar Mon,Chbar Ampov,Doun Penh,Mean Chey,Prampir Meakkakra,Russey Keo,Saensokh,Tuol Kouk
0,1200,130.0,3.0,0,0,0,1,0,0,0,0
1,1100,135.0,3.0,1,0,0,0,0,0,0,0
2,900,63.0,2.0,1,0,0,0,0,0,0,0
3,800,107.0,2.0,0,0,0,0,0,0,0,1
4,2500,91.0,2.0,1,0,0,0,0,0,0,0


In [39]:
# convert the dataframe to a 2-D numpy array and preview its first five rows

data = df.to_numpy(copy=False)
data[:5, :]

array([[1.20e+03, 1.30e+02, 3.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        1.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00],
       [1.10e+03, 1.35e+02, 3.00e+00, 1.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00],
       [9.00e+02, 6.30e+01, 2.00e+00, 1.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00],
       [8.00e+02, 1.07e+02, 2.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 1.00e+00],
       [2.50e+03, 9.10e+01, 2.00e+00, 1.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00]])

In [40]:
# Separate the target y (first column, the rental price) from the features X (remaining columns)

y, X = data[:, 0], data[:, 1:]
print(X.shape, y.shape, sep='\n')

(374, 10)
(374,)


In [41]:
# hold out 30% of the rows as the test set; the remaining 70% form the training set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=2023
)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

(261, 10)
(261,)
(113, 10)
(113,)


In [52]:
# Random forest regression model using scikit-learn's RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor

# Seed the forest (same value as the train/test split) so that the fitted model and its
# score are reproducible from run to run; an unseeded forest gives a different score each time.
model = RandomForestRegressor(random_state=2023)
model.fit(X_train, y_train)
# score() on a scikit-learn regressor reports R^2, here measured on the held-out test set
score = model.score(X_test, y_test)
print(score)

0.6379721831621408


In [53]:
import pickle

In [55]:
# Persist the fitted model to disk with pickle.
# The `with` block guarantees the file handle is flushed and closed even if dumping fails.
filename = 'Random_Forest_Regressor.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)