In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
import warnings
# NOTE(review): 'ignore' silences every warning category for the rest of the session,
# so pandas / scikit-learn diagnostics raised by later cells will not be shown.
# Consider narrowing the filter to the specific categories that are known noise.
warnings.filterwarnings('ignore')

In [3]:
# Read the raw listings CSV from an absolute path under /content.
data = pd.read_csv("/content/RealEstate_Dataset.csv")
# Bare last expression: the frame is rendered as this cell's output.
data

Unnamed: 0.1,Unnamed: 0,City,Price,Bedroom,Bathroom,Floors,Parking,Face,Year,Area,Road Width,Road Type,City_Place,Property_Type,Listing_Type,Garden
0,12,Kathmandu,15000000,2,2,1.0,1,South,2074.0,5 Aana,12 Feet,Gravelled,Budhanilkantha,House,Sell,No
1,13,Kathmandu,52500000,5,5,3.0,3,South East,2075.0,8 Aana,20 Feet,Blacktopped,Budhanilkantha,House,Sell,Yes
2,14,Kathmandu,26500000,6,6,3.0,1,East,2076.0,4 Aana,13 Feet,Paved,Budhanilkantha,House,Sell,No
3,15,Kathmandu,25500000,6,6,3.0,1,East,2076.0,4 Aana,13 Feet,Gravelled,Sitapaila,House,Sell,No
4,17,Kathmandu,18500000,4,4,,3,South,,6 Aana,13 Feet,,Thankot,House,Sell,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,1061,Kathmandu,25500000,6,4,3.0,2,East,,4.5 Aana,13 Feet,Blacktopped,Budhanilkantha,House,Sell,Yes
296,1062,Kathmandu,15000000,0,0,,0,East,2073.0,3 Aana,10 Feet,Blacktopped,Kapan,House,Sell,No
297,1073,Kathmandu,5400000,0,0,,0,South,,4 Aana,6 Meter,Blacktopped,Boudha,Land,Sell,No
298,1077,Lalitpur,49500000,0,0,,0,East,,9 Aana,16 Feet,Blacktopped,Hattiban,House,Sell,No


In [4]:
# Drop the 'Unnamed: 0' column; everything else is kept.
data = data.drop(columns=['Unnamed: 0'])

In [5]:
data.isnull().sum()

Unnamed: 0,0
City,0
Price,0
Bedroom,0
Bathroom,0
Floors,115
Parking,0
Face,0
Year,131
Area,0
Road Width,0


In [6]:
data['City'].unique()

array(['Kathmandu', 'Bhaktapur', 'Lalitpur'], dtype=object)

In [7]:
data['Price'].unique()

array([ 15000000,  52500000,  26500000,  25500000,  18500000,  62000000,
       145000000,  55000000,  37000000, 102500000,  58500000, 130000000,
       105000000,  22500000,  60000000, 250000000,  40000000,  30000000,
        33000000,  30500000,  28000000,  41000000,  27000000,  22000000,
        29000000,  37500000,  45000000,   3000000,   2350000,    200000,
           30000,   2800000,   3500000,   2600000,   4200000,     20000,
           75000,   5700000,   2500000,     35000,    250000,     60000,
           70000,    900000,   1300000,    300000,  31500000,  57500000,
          100000,  25000000,     78000,  11000000,   2400000,   2700000,
         2300000,  21000000,   1650000,  32500000,  24000000,   2200000,
        16800000,  34000000,  42500000,    790000,     55000,  23500000,
          750000,  81000000,    273258,   1450000,   9500000,   2900000,
        20000000,  65000000,     95000,     16000,     25000,     65000,
          150000,  85000000, 160000000,  61000000, 

In [8]:
data['Bedroom'].unique()

array([ 2,  5,  6,  4,  7, 95,  8,  9, 10,  3,  0,  1, 17, 12, 13, 16, 18,
       24])

In [9]:
data['Bathroom'].unique()

array([ 2,  5,  6,  4, 40,  8,  3,  0,  1,  7, 19, 10])

In [10]:
data['Floors'].unique()

array([ 1.,  3., nan,  5.,  4.,  2., 10.,  0.])

In [11]:
fill_floors = data['Floors'].median()
fill_floors

3.0

In [12]:
data['Floors'] = data['Floors'].fillna(fill_floors)

In [13]:
data['Floors'].unique()

array([ 1.,  3.,  5.,  4.,  2., 10.,  0.])

In [14]:
data['Parking'].unique()

array([ 1,  3,  7,  4,  2, 10,  0,  6,  5, 15])

In [15]:
data['Face'].unique()

array(['South', 'South East', 'East', 'West', 'North East', 'North',
       'South West', 'North West'], dtype=object)

In [16]:
data['Year'].unique().astype(int)

array([                2074,                 2075,                 2076,
       -9223372036854775808,                 2073,                 2071,
                       2070,                 2069,                 2013,
                       2019,                 2012,                 2001,
                       2000,                 2072,                 2020,
                       2018,                 2065,                 2016,
                       2014,                 2050,                 2010,
                       2009,                 2068,                 2064,
                       2060,                 2063])

In [17]:
fill_year = data['Year'].median()
fill_year

2075.0

In [18]:
data['Year'] = data['Year'].fillna(fill_year).astype(int)

In [19]:
data['Area'].unique()

array(['5 Aana', '8 Aana', '4 Aana', '6 Aana', '10 Aana', '21 Aana',
       '19 Aana', '6.1 Aana', '6.5 Aana', '3.1 Aana', '11 Aana',
       '30 Aana', '9 Aana', '3.5 Aana', '4.5 Aana', '12 Aana', '18 Aana',
       '41 Aana', '5.5 Aana', '16 Aana', '1 Aana', '25 Aana', '14 Aana',
       '4.3 Aana', '2.5 Aana', '13 Aana', '4.75 Aana', '22 Aana',
       '12.5 Aana', '3 Aana', '80 Aana', '7.2 Aana', '8.2 Aana',
       '3.2 Aana', '23 Aana', '7 Aana', '12.02 Aana', '17 Aana',
       '3.75 Aana', '3.3 Aana', '20 Aana', '8.5 Aana', '5.2 Aana',
       '5.1 Aana', '7.5 Aana', '15 Aana', '8.1 Aana', '48 Aana',
       '39 Aana'], dtype=object)

In [20]:
# Remove the "Aana" unit word from each Area label and store the remaining number as a float.
data = data.assign(
    Area_in_Aana=data['Area'].str.replace(r'\bAana\b', '', regex=True).astype(float)
)

In [21]:
data['Road Width'].unique()

array(['12 Feet ', '20 Feet ', '13 Feet ', '13 Feet', '0 Feet', '32 Feet',
       '16 Feet ', '15 Feet ', '14 Feet ', '18 Feet ', '15 Feet',
       '26 Feet ', '22 Feet', '20 Feet', '11 Feet', '30 Feet ', '18 Feet',
       '7 Meter ', '14 Feet', '72 Feet ', '16 Feet', '10 Feet', '3 Feet',
       '28 Feet', '10 Feet ', '24 Feet ', '4 Feet', '0 Feet ', '22 Feet ',
       '7 Feet ', '21 Feet', '12 Feet', '1 Meter ', '17 Meter ',
       '5 Meter ', '40 Feet ', '5 Feet ', '33 Feet', '17 Feet ',
       '6 Meter '], dtype=object)

In [22]:
def convert_to_meters(value):
    """Convert a road-width label such as '12 Feet ' or '7 Meter' to meters.

    Parameters
    ----------
    value : str or missing value
        Text consisting of a number followed by a unit word containing
        "Feet" or "Meter" (matched case-insensitively). Leading and trailing
        whitespace is ignored.

    Returns
    -------
    float or None
        The width in meters. None is returned when `value` is not a string
        (for example None or NaN from a missing cell) or when it names
        neither unit.

    Raises
    ------
    ValueError
        If a unit is recognised but the first whitespace-separated token is
        not a valid number.
    """
    # Missing cells arrive as None/NaN, which have no .strip(); treat them as "unknown".
    if not isinstance(value, str):
        return None

    text = value.strip()
    unit_text = text.lower()  # so 'Feet', 'feet' and 'FEET' are all recognised

    if 'feet' in unit_text:
        feet_value = float(text.split()[0])  # the number precedes the unit word
        return feet_value * 0.3048  # 1 foot = 0.3048 m
    if 'meter' in unit_text:
        return float(text.split()[0])
    return None

In [23]:
# Derive a numeric road width (in meters) from each 'Road Width' label.
data['Road_width_in_m'] = data['Road Width'].map(convert_to_meters)

In [24]:
data['Road Type'].unique()

array([' Gravelled', ' Blacktopped', ' Paved', nan, ' Soil Stabilized',
       ' Alley', ' Concrete'], dtype=object)

In [25]:
fill_road_type = data['Road Type'].mode()[0]

In [26]:
data['Road Type'] = data['Road Type'].fillna(fill_road_type)
data['Road Type'].unique()

array([' Gravelled', ' Blacktopped', ' Paved', ' Soil Stabilized',
       ' Alley', ' Concrete'], dtype=object)

In [27]:
# NOTE(review): this value is not displayed because it is not the cell's last expression.
data['City_Place'].unique()
# Rename the 'City_Place' column to 'Address'.
data = data.rename({"City_Place": "Address"}, axis=1)

In [28]:
data['Property_Type'].unique()

array(['House', 'Land', 'Flat', 'Office Space', 'Shop', 'Business'],
      dtype=object)

In [29]:
data['Listing_Type'].unique()

array(['Sell', 'Lease', 'Rent'], dtype=object)

In [30]:
data['Garden'].unique()

array(['No', 'Yes'], dtype=object)

In [31]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   City             300 non-null    object 
 1   Price            300 non-null    int64  
 2   Bedroom          300 non-null    int64  
 3   Bathroom         300 non-null    int64  
 4   Floors           300 non-null    float64
 5   Parking          300 non-null    int64  
 6   Face             300 non-null    object 
 7   Year             300 non-null    int64  
 8   Area             300 non-null    object 
 9   Road Width       300 non-null    object 
 10  Road Type        300 non-null    object 
 11  Address          300 non-null    object 
 12  Property_Type    300 non-null    object 
 13  Listing_Type     300 non-null    object 
 14  Garden           300 non-null    object 
 15  Area_in_Aana     300 non-null    float64
 16  Road_width_in_m  300 non-null    float64
dtypes: float64(3), i

In [32]:
# The textual 'Area' and 'Road Width' columns are removed from the frame here.
data = data.drop(columns=['Area', 'Road Width'])

In [33]:
# index=False: the RangeIndex carries no information, and writing it would add an
# unnamed first column that shows up as 'Unnamed: 0' when the file is read back.
data.to_csv("Cleaned_Dataset.csv", index=False)

## Feature and Target Selection

In [34]:
# Model inputs. An explicit .copy() makes `feature` an independent frame, so the column
# inserts and assignments performed on it later do not act on a selection of `data`
# (the situation pandas reports with SettingWithCopyWarning).
feature = data[['Property_Type', 'Listing_Type', 'City', 'Address', 'Area_in_Aana', 'Bedroom', 'Bathroom', 'Floors', 'Parking', 'Road Type', 'Garden']].copy()
# Regression target: the listing price.
target = data['Price']

In [35]:
feature.head(2)

Unnamed: 0,Property_Type,Listing_Type,City,Address,Area_in_Aana,Bedroom,Bathroom,Floors,Parking,Road Type,Garden
0,House,Sell,Kathmandu,Budhanilkantha,5.0,2,2,1.0,1,Gravelled,No
1,House,Sell,Kathmandu,Budhanilkantha,8.0,5,5,3.0,3,Blacktopped,Yes


In [36]:
from sklearn.preprocessing import LabelEncoder

# Source column -> name of the integer-coded column added next to it.
encoded_column_names = {
    'Property_Type': "Encoded_Property_Type",
    'Listing_Type': "Encoded_Listing_Type",
    'City': "Encoded_City",
    'Address': "Encoded_Address",
    'Road Type': "Encoded_Road_Type",
    'Garden': "Encoded_Garden",
}

# One fitted encoder is kept per column. A single encoder refit on every column only
# remembers the classes of the last one, which makes it impossible to translate raw
# category values of a new listing into the codes the models are trained on.
# NOTE(review): LabelEncoder assigns integer codes in sorted-label order; a linear model
# treats those codes as magnitudes. Consider one-hot encoding for nominal columns.
encoders = {}
for source_column, encoded_name in encoded_column_names.items():
    encoder = LabelEncoder()
    codes = encoder.fit_transform(feature[source_column])
    if encoded_name in feature.columns:
        # Cell re-run: overwrite instead of failing with "cannot insert ..., already exists".
        feature[encoded_name] = codes
    else:
        # Place the encoded column immediately after its source column.
        feature.insert(feature.columns.get_loc(source_column) + 1, encoded_name, codes)
    encoders[source_column] = encoder

In [37]:
# Store the three count columns with an integer dtype.
feature = feature.astype({'Bedroom': int, 'Bathroom': int, 'Parking': int})

In [38]:
feature.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Property_Type          300 non-null    object 
 1   Encoded_Property_Type  300 non-null    int64  
 2   Listing_Type           300 non-null    object 
 3   Encoded_Listing_Type   300 non-null    int64  
 4   City                   300 non-null    object 
 5   Encoded_City           300 non-null    int64  
 6   Address                300 non-null    object 
 7   Encoded_Address        300 non-null    int64  
 8   Area_in_Aana           300 non-null    float64
 9   Bedroom                300 non-null    int64  
 10  Bathroom               300 non-null    int64  
 11  Floors                 300 non-null    float64
 12  Parking                300 non-null    int64  
 13  Road Type              300 non-null    object 
 14  Encoded_Road_Type      300 non-null    int64  
 15  Garden

In [39]:
feature.sample(20)

Unnamed: 0,Property_Type,Encoded_Property_Type,Listing_Type,Encoded_Listing_Type,City,Encoded_City,Address,Encoded_Address,Area_in_Aana,Bedroom,Bathroom,Floors,Parking,Road Type,Encoded_Road_Type,Garden,Encoded_Garden
206,House,2,Sell,2,Kathmandu,1,Pepsicola,38,6.0,5,4,3.0,1,Blacktopped,1,No,0
129,House,2,Rent,1,Lalitpur,2,Kupondole,30,9.0,0,5,5.0,3,Gravelled,3,Yes,1
99,House,2,Sell,2,Lalitpur,2,Imadol,24,3.0,3,3,3.0,1,Blacktopped,1,No,0
86,Land,3,Sell,2,Kathmandu,1,Kapan,28,7.2,0,0,3.0,0,Blacktopped,1,No,0
31,Land,3,Lease,0,Kathmandu,1,Dhapasi,16,10.0,0,0,3.0,0,Blacktopped,1,No,0
180,Land,3,Sell,2,Lalitpur,2,Dhapakhel,15,3.3,0,0,3.0,0,Soil Stabilized,5,No,0
1,House,2,Sell,2,Kathmandu,1,Budhanilkantha,9,8.0,5,5,3.0,3,Blacktopped,1,Yes,1
297,Land,3,Sell,2,Kathmandu,1,Boudha,8,4.0,0,0,3.0,0,Blacktopped,1,No,0
280,House,2,Sell,2,Kathmandu,1,Bhangal,7,4.5,6,6,3.0,2,Blacktopped,1,No,0
57,House,2,Rent,1,Lalitpur,2,Pulchowk,39,12.0,8,5,3.0,7,Blacktopped,1,No,0


In [40]:
# Keep only numeric inputs: remove the text columns that now have encoded counterparts.
feature = feature.drop(
    columns=['Property_Type', 'Listing_Type', 'City', 'Address', 'Road Type', 'Garden']
)

In [41]:
feature.corr()

Unnamed: 0,Encoded_Property_Type,Encoded_Listing_Type,Encoded_City,Encoded_Address,Area_in_Aana,Bedroom,Bathroom,Floors,Parking,Encoded_Road_Type,Encoded_Garden
Encoded_Property_Type,1.0,0.20444,-0.036293,0.001996,0.052625,-0.223296,-0.358558,0.005807,-0.254642,0.059769,-0.195048
Encoded_Listing_Type,0.20444,1.0,-0.259012,-0.004699,-0.12491,0.009381,0.09244,-0.072965,-0.096975,0.155281,-0.149619
Encoded_City,-0.036293,-0.259012,1.0,0.050021,-0.13767,-0.024804,-0.057914,0.067882,-0.039594,-0.019699,0.119255
Encoded_Address,0.001996,-0.004699,0.050021,1.0,-0.13699,-0.039875,-0.05036,-0.009662,-0.103249,0.031144,-0.221145
Area_in_Aana,0.052625,-0.12491,-0.13767,-0.13699,1.0,0.134201,0.099136,-0.007698,0.541606,-0.085632,0.273528
Bedroom,-0.223296,0.009381,-0.024804,-0.039875,0.134201,1.0,0.824653,0.24811,0.449236,-0.057464,0.04744
Bathroom,-0.358558,0.09244,-0.057914,-0.05036,0.099136,0.824653,1.0,0.169312,0.504606,-0.020332,0.160174
Floors,0.005807,-0.072965,0.067882,-0.009662,-0.007698,0.24811,0.169312,1.0,0.119373,0.0358,-0.0318
Parking,-0.254642,-0.096975,-0.039594,-0.103249,0.541606,0.449236,0.504606,0.119373,1.0,-0.083668,0.402248
Encoded_Road_Type,0.059769,0.155281,-0.019699,0.031144,-0.085632,-0.057464,-0.020332,0.0358,-0.083668,1.0,-0.041334


In [42]:
feature.describe()

Unnamed: 0,Encoded_Property_Type,Encoded_Listing_Type,Encoded_City,Encoded_Address,Area_in_Aana,Bedroom,Bathroom,Floors,Parking,Encoded_Road_Type,Encoded_Garden
count,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0
mean,2.193333,1.816667,1.11,22.806667,8.018067,4.18,3.423333,3.023333,1.633333,1.846667,0.243333
std,0.563038,0.396121,0.389537,15.513161,7.570854,6.209896,3.405306,0.661499,1.929932,1.281391,0.429812
min,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,2.0,1.0,9.0,4.0,0.0,0.0,3.0,0.0,1.0,0.0
50%,2.0,2.0,1.0,20.0,5.0,4.0,4.0,3.0,1.0,1.0,0.0
75%,2.0,2.0,1.0,33.0,9.0,6.0,5.0,3.0,2.0,3.0,0.0
max,5.0,2.0,2.0,53.0,80.0,95.0,40.0,10.0,15.0,5.0,1.0


In [43]:
px.box(feature['Area_in_Aana'])

In [44]:
def clip_outliers_iqr(values, k=1.5):
    """Clamp values that fall outside the fences [Q1 - k*IQR, Q3 + k*IQR].

    Parameters
    ----------
    values : pandas.Series
        Numeric column to clamp.
    k : float, default 1.5
        Multiple of the inter-quartile range that defines each fence.

    Returns
    -------
    pandas.Series
        Float series with the same index as `values`: entries below the lower
        fence are replaced by that fence, entries above the upper fence by
        that fence, and all other entries are left unchanged.
    """
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - k * iqr
    upper_bound = q3 + k * iqr
    # Cast to float so the result dtype does not depend on whether anything was clipped.
    return values.clip(lower=lower_bound, upper=upper_bound).astype(float)


# Replace outliers in each numeric column with the nearest fence value.
# NOTE(review): the fences are computed on the full table, i.e. before the train/test
# split, so statistics from the eventual test rows influence the training inputs.
for outlier_column in ['Area_in_Aana', 'Bedroom', 'Bathroom', 'Parking']:
    feature[outlier_column] = clip_outliers_iqr(feature[outlier_column])

In [45]:
px.box(feature['Area_in_Aana'])

In [46]:
feature.shape

(300, 11)

In [47]:
px.box(target)

In [48]:
# Quartiles of the price. np.quantile accepts both a Series and an ndarray, so this
# cell still works when re-run after `target` has been replaced by the clipped array.
Q1 = np.quantile(target, 0.25)
Q3 = np.quantile(target, 0.75)

IQR = Q3 - Q1

# Both fences use the same 1.5 * IQR multiplier as the feature columns.
lower_bond = Q1 - 1.5 * IQR
upper_bond = Q3 + 1.5 * IQR

# Clamp prices to the fences; the result is a float ndarray.
# NOTE(review): this runs before the train/test split, so test-set prices are clamped
# as well and the reported metrics describe the clamped target, not the raw prices.
target = np.clip(np.asarray(target, dtype=float), lower_bond, upper_bond)

In [49]:
px.box(target)

In [50]:
# Keep the actual shape tuples; print() returns None, so its result is not worth storing.
feature_shape = feature.shape
target_shape = target.shape
print("Feature Shape:", feature_shape)
print("Target Shape:", target_shape)

Feature Shape: (300, 11)
Taeget Shape: (300,)


## Train Test Split

In [51]:
X_train, X_test, y_train, y_test = train_test_split(feature, target, test_size=0.2, random_state=42)

In [52]:
X_train.shape

(240, 11)

In [53]:
y_train.shape

(240,)

In [54]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [55]:
X_train_scaled

array([[-0.34554737,  0.47435286, -0.2604308 , ..., -0.28644743,
         0.878169  , -0.54524976],
       [-0.34554737,  0.47435286,  2.23970485, ..., -0.28644743,
         1.65302399,  1.83402191],
       [-0.34554737,  0.47435286, -0.2604308 , ...,  2.54849623,
        -0.671541  , -0.54524976],
       ...,
       [-0.34554737, -2.00053162,  2.23970485, ...,  0.42228848,
        -0.671541  ,  1.83402191],
       [-0.34554737,  0.47435286, -0.2604308 , ...,  2.54849623,
        -0.671541  , -0.54524976],
       [-0.34554737,  0.47435286, -0.2604308 , ...,  1.1310244 ,
        -0.671541  , -0.54524976]])

In [56]:
# Fit ordinary least squares on the scaled training features, then predict the test rows.
linear_model = LinearRegression().fit(X_train_scaled, y_train)
y_pred1 = linear_model.predict(X_test_scaled)

# Error metrics on the held-out split.
mse, r2 = mean_squared_error(y_test, y_pred1), r2_score(y_test, y_pred1)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

Mean Squared Error: 165884913406251.6
R-squared: 0.7626263817641031


In [57]:
px.scatter(x=y_test, y=y_pred1, labels={"x": "Actual Prices", "y": "Predicted Prices"})

In [58]:
# Fix the seed (same value as the train/test split) so the bootstrap samples and
# feature sub-sampling — and therefore the metrics below — are identical on every run.
rfr_model = RandomForestRegressor(random_state=42)

rfr_model.fit(X_train_scaled, y_train)

y_pred2 = rfr_model.predict(X_test_scaled)

# Error metrics on the held-out split.
mse = mean_squared_error(y_test, y_pred2)
r2 = r2_score(y_test, y_pred2)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

Mean Squared Error: 63326272197603.7
R-squared: 0.9093830412165138


In [59]:
px.scatter(x=y_test, y=y_pred2, labels={"x": "Actual Prices", "y": "Predicted Prices"})

In [60]:
# Gradient-boosted trees with the library's default hyperparameters.
xgb_model = XGBRegressor()

# Train on the scaled training features.
xgb_model.fit(X_train_scaled, y_train)

# Predict prices for the held-out rows.
y_pred3 = xgb_model.predict(X_test_scaled)

# Error metrics on the held-out split (these overwrite `mse` / `r2` from earlier models).
mse = mean_squared_error(y_test, y_pred3)
r2 = r2_score(y_test, y_pred3)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

Mean Squared Error: 56430794540489.38
R-squared: 0.9192501499687454


In [61]:
px.scatter(x=y_test, y=y_pred3, labels={"x": "Actual Prices", "y": "Predicted Prices"})

## New Data Prediction

In [62]:
# Take the row at position 5 of the processed feature table as a sample input.
# NOTE(review): this rebinds `data` (until now the cleaned DataFrame) to a single-row
# Series, and the row comes from the same table that was split into the train and test
# sets, so it is not genuinely unseen data.
data = feature.iloc[5]
data

Unnamed: 0,5
Encoded_Property_Type,2.0
Encoded_Listing_Type,2.0
Encoded_City,1.0
Encoded_Address,27.0
Area_in_Aana,10.0
Bedroom,7.0
Bathroom,4.0
Floors,3.0
Parking,3.0
Encoded_Road_Type,1.0


In [63]:
data_reshaped = data.values.reshape(1, -1) # Reshaping the input values
data_reshaped

array([[ 2.,  2.,  1., 27., 10.,  7.,  4.,  3.,  3.,  1.,  0.]])

In [64]:
scaler.transform(data_reshaped) # Scale the input values with the already-fitted scaler

array([[-0.34554737,  0.47435286, -0.2604308 ,  0.2830837 ,  0.73468813,
         1.01561221,  0.29849673, -0.04642383,  1.1310244 , -0.671541  ,
        -0.54524976]])

In [65]:
predicted_output = xgb_model.predict(scaler.transform(data_reshaped))[0] # Predict from the scaled input; [0] takes the single prediction
print("The predicted price is:", predicted_output)

The predicted price is: 33932196.0


## Saving the model to a .pkl file

In [66]:
# import pickle

# pickle.dump(scaler, open("scaler.pkl", 'wb'))
# pickle.dump(xgb_model, open('xgb_model.pkl', 'wb'))