In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv('House_Rent_Dataset.csv')
df.head()

Unnamed: 0,Posted On,BHK,Rent,Size,Floor,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact
0,2022-05-18,2,10000,1100,Ground out of 2,Super Area,Bandel,Kolkata,Unfurnished,Bachelors/Family,2,Contact Owner
1,2022-05-13,2,20000,800,1 out of 3,Super Area,"Phool Bagan, Kankurgachi",Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
2,2022-05-16,2,17000,1000,1 out of 3,Super Area,Salt Lake City Sector 2,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
3,2022-07-04,2,10000,800,1 out of 2,Super Area,Dumdum Park,Kolkata,Unfurnished,Bachelors/Family,1,Contact Owner
4,2022-05-09,2,7500,850,1 out of 2,Carpet Area,South Dum Dum,Kolkata,Unfurnished,Bachelors,1,Contact Owner


In [3]:
df.isnull().sum()

Posted On            0
BHK                  0
Rent                 0
Size                 0
Floor                0
Area Type            0
Area Locality        0
City                 0
Furnishing Status    0
Tenant Preferred     0
Bathroom             0
Point of Contact     0
dtype: int64

In [4]:
print(f"dataset: {df.shape[0]}")
print(f"features: {df.shape[1]}")

dataset: 4746
features: 12


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4746 entries, 0 to 4745
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Posted On          4746 non-null   object
 1   BHK                4746 non-null   int64 
 2   Rent               4746 non-null   int64 
 3   Size               4746 non-null   int64 
 4   Floor              4746 non-null   object
 5   Area Type          4746 non-null   object
 6   Area Locality      4746 non-null   object
 7   City               4746 non-null   object
 8   Furnishing Status  4746 non-null   object
 9   Tenant Preferred   4746 non-null   object
 10  Bathroom           4746 non-null   int64 
 11  Point of Contact   4746 non-null   object
dtypes: int64(4), object(8)
memory usage: 445.1+ KB


In [6]:
print(df.duplicated().sum())


0


Feature Engineering

In [7]:
# Cleaning Floor Feature
# First drop existing Floor Level and Total Floors columns if they exist
if 'Floor Level' in df.columns:
    df = df.drop('Floor Level', axis=1)
if 'Total Floors' in df.columns:
    df = df.drop('Total Floors', axis=1)

df = df.join(df['Floor'].str.split(' out of ', expand=True).rename(columns={0:'Floor Level', 1:'Total Floors'}))

# Handle all basement and ground floor cases
df['Floor Level'] = df.apply(lambda x: 0 if x['Floor Level'] == 'Ground' \
                                 else (-1 if x['Floor Level'] == 'Lower Basement' \
                                       else (-2 if x['Floor Level'] == 'Upper Basement' \
                                             else int(x['Floor Level']))), axis=1)

df.drop('Floor', axis=1, inplace=True)
df = df.dropna()
df['Floor Level'] = df['Floor Level'].astype(int)
df['Total Floors'] = df['Total Floors'].astype(int)

In [8]:
df.head()

Unnamed: 0,Posted On,BHK,Rent,Size,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact,Floor Level,Total Floors
0,2022-05-18,2,10000,1100,Super Area,Bandel,Kolkata,Unfurnished,Bachelors/Family,2,Contact Owner,0,2
1,2022-05-13,2,20000,800,Super Area,"Phool Bagan, Kankurgachi",Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner,1,3
2,2022-05-16,2,17000,1000,Super Area,Salt Lake City Sector 2,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner,1,3
3,2022-07-04,2,10000,800,Super Area,Dumdum Park,Kolkata,Unfurnished,Bachelors/Family,1,Contact Owner,1,2
4,2022-05-09,2,7500,850,Carpet Area,South Dum Dum,Kolkata,Unfurnished,Bachelors,1,Contact Owner,1,2


In [9]:
print(df['Point of Contact'].value_counts())
print()
print(df['Area Type'].value_counts())

Point of Contact
Contact Owner      3212
Contact Agent      1529
Contact Builder       1
Name: count, dtype: int64

Area Type
Super Area     2444
Carpet Area    2296
Built Area        2
Name: count, dtype: int64


In [10]:
df = df[~df['Point of Contact'].str.contains("Contact Builder")]



In [11]:
df['Posted On'] = pd.to_datetime(df['Posted On'])

df['month posted'] = df['Posted On'].dt.month
df['day posted'] = df['Posted On'].dt.day
df['day of week posted'] = df['Posted On'].dt.day_of_week
df['quarter poster'] = df['Posted On'].dt.quarter

df.drop('Posted On', axis = 1, inplace= True)

In [12]:
df.head()

Unnamed: 0,BHK,Rent,Size,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact,Floor Level,Total Floors,month posted,day posted,day of week posted,quarter poster
0,2,10000,1100,Super Area,Bandel,Kolkata,Unfurnished,Bachelors/Family,2,Contact Owner,0,2,5,18,2,2
1,2,20000,800,Super Area,"Phool Bagan, Kankurgachi",Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner,1,3,5,13,4,2
2,2,17000,1000,Super Area,Salt Lake City Sector 2,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner,1,3,5,16,0,2
3,2,10000,800,Super Area,Dumdum Park,Kolkata,Unfurnished,Bachelors/Family,1,Contact Owner,1,2,7,4,0,3
4,2,7500,850,Carpet Area,South Dum Dum,Kolkata,Unfurnished,Bachelors,1,Contact Owner,1,2,5,9,0,2


Encoding

In [13]:
df.head()

Unnamed: 0,BHK,Rent,Size,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact,Floor Level,Total Floors,month posted,day posted,day of week posted,quarter poster
0,2,10000,1100,Super Area,Bandel,Kolkata,Unfurnished,Bachelors/Family,2,Contact Owner,0,2,5,18,2,2
1,2,20000,800,Super Area,"Phool Bagan, Kankurgachi",Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner,1,3,5,13,4,2
2,2,17000,1000,Super Area,Salt Lake City Sector 2,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner,1,3,5,16,0,2
3,2,10000,800,Super Area,Dumdum Park,Kolkata,Unfurnished,Bachelors/Family,1,Contact Owner,1,2,7,4,0,3
4,2,7500,850,Carpet Area,South Dum Dum,Kolkata,Unfurnished,Bachelors,1,Contact Owner,1,2,5,9,0,2


In [14]:
def one_hot_encode(data, column):
    encoded = pd.get_dummies(data[column], drop_first= True)
    data = data.drop(column, axis = 1)
    data = data.join(encoded)
    return data

In [15]:
df = df.drop('Area Locality', axis= 1)
df.head()


Unnamed: 0,BHK,Rent,Size,Area Type,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact,Floor Level,Total Floors,month posted,day posted,day of week posted,quarter poster
0,2,10000,1100,Super Area,Kolkata,Unfurnished,Bachelors/Family,2,Contact Owner,0,2,5,18,2,2
1,2,20000,800,Super Area,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner,1,3,5,13,4,2
2,2,17000,1000,Super Area,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner,1,3,5,16,0,2
3,2,10000,800,Super Area,Kolkata,Unfurnished,Bachelors/Family,1,Contact Owner,1,2,7,4,0,3
4,2,7500,850,Carpet Area,Kolkata,Unfurnished,Bachelors,1,Contact Owner,1,2,5,9,0,2


In [16]:
columns = ['Area Type', 'City', 'Furnishing Status', 'Tenant Preferred', 'Point of Contact']
for col in columns:
    df = one_hot_encode(df, col)

In [17]:
df.head()

Unnamed: 0,BHK,Rent,Size,Bathroom,Floor Level,Total Floors,month posted,day posted,day of week posted,quarter poster,...,Chennai,Delhi,Hyderabad,Kolkata,Mumbai,Semi-Furnished,Unfurnished,Bachelors/Family,Family,Contact Owner
0,2,10000,1100,2,0,2,5,18,2,2,...,False,False,False,True,False,False,True,True,False,True
1,2,20000,800,1,1,3,5,13,4,2,...,False,False,False,True,False,True,False,True,False,True
2,2,17000,1000,1,1,3,5,16,0,2,...,False,False,False,True,False,True,False,True,False,True
3,2,10000,800,1,1,2,7,4,0,3,...,False,False,False,True,False,False,True,True,False,True
4,2,7500,850,1,1,2,5,9,0,2,...,False,False,False,True,False,False,True,False,False,True


In [18]:
from scipy.stats import probplot, boxcox
from scipy.special import inv_boxcox

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [19]:
X = df.drop('Rent', axis= 1)
y = df['Rent']

bc_result = boxcox(y)
y_boxcox = bc_result[0]
lambda_ = bc_result[1]

X_train, X_test, y_train, y_test = train_test_split(X, y_boxcox, test_size= 0.2, random_state = 42)


In [25]:
X_test

array([[-0.10162026,  0.20910153,  0.03863107, ...,  0.61351979,
        -0.33274699,  0.69413892],
       [-0.10162026,  0.56937911,  0.03863107, ...,  0.61351979,
        -0.33274699,  0.69413892],
       [ 2.30677999,  2.55873794,  2.30977852, ...,  0.61351979,
        -0.33274699, -1.44063382],
       ...,
       [-0.10162026,  0.83567124,  0.03863107, ...,  0.61351979,
        -0.33274699,  0.69413892],
       [-0.10162026,  0.33441547,  0.03863107, ..., -1.62993927,
         3.00528635,  0.69413892],
       [ 1.10257986,  2.71538037,  0.03863107, ..., -1.62993927,
        -0.33274699, -1.44063382]], shape=(949, 21))

In [20]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

Modelling

In [21]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators= 100, random_state= 42)
model.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [22]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


#Evaluation
y_pred = model.predict(X_test)
y_pred_inv = inv_boxcox(y_pred, lambda_)
y_test_inv = inv_boxcox(y_test, lambda_)

mae = mean_absolute_error(y_test_inv, y_pred_inv)
mse = mean_squared_error(y_test_inv, y_pred_inv)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_inv, y_pred_inv)

print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R^2 Score: {r2}")


Mean Absolute Error: 10768.67053563631
Mean Squared Error: 1270209477.1489596
Root Mean Squared Error: 35639.99827650051
R^2 Score: 0.6907522089862759


In [23]:
# Model file saving
import joblib
joblib.dump(model, 'rent_prediction_model.pkl')



['rent_prediction_model.pkl']