In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('Datasets/House_Rent_Dataset.csv')
df

Unnamed: 0,Posted On,BHK,Rent,Size,Floor,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact
0,2022-05-18,2,10000,1100,Ground out of 2,Super Area,Bandel,Kolkata,Unfurnished,Bachelors/Family,2,Contact Owner
1,2022-05-13,2,20000,800,1 out of 3,Super Area,"Phool Bagan, Kankurgachi",Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
2,2022-05-16,2,17000,1000,1 out of 3,Super Area,Salt Lake City Sector 2,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
3,2022-07-04,2,10000,800,1 out of 2,Super Area,Dumdum Park,Kolkata,Unfurnished,Bachelors/Family,1,Contact Owner
4,2022-05-09,2,7500,850,1 out of 2,Carpet Area,South Dum Dum,Kolkata,Unfurnished,Bachelors,1,Contact Owner
...,...,...,...,...,...,...,...,...,...,...,...,...
4741,2022-05-18,2,15000,1000,3 out of 5,Carpet Area,Bandam Kommu,Hyderabad,Semi-Furnished,Bachelors/Family,2,Contact Owner
4742,2022-05-15,3,29000,2000,1 out of 4,Super Area,"Manikonda, Hyderabad",Hyderabad,Semi-Furnished,Bachelors/Family,3,Contact Owner
4743,2022-07-10,3,35000,1750,3 out of 5,Carpet Area,"Himayath Nagar, NH 7",Hyderabad,Semi-Furnished,Bachelors/Family,3,Contact Agent
4744,2022-07-06,3,45000,1500,23 out of 34,Carpet Area,Gachibowli,Hyderabad,Semi-Furnished,Family,2,Contact Agent


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4746 entries, 0 to 4745
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Posted On          4746 non-null   object
 1   BHK                4746 non-null   int64 
 2   Rent               4746 non-null   int64 
 3   Size               4746 non-null   int64 
 4   Floor              4746 non-null   object
 5   Area Type          4746 non-null   object
 6   Area Locality      4746 non-null   object
 7   City               4746 non-null   object
 8   Furnishing Status  4746 non-null   object
 9   Tenant Preferred   4746 non-null   object
 10  Bathroom           4746 non-null   int64 
 11  Point of Contact   4746 non-null   object
dtypes: int64(4), object(8)
memory usage: 445.1+ KB


In [4]:
df.nunique()

Posted On              81
BHK                     6
Rent                  243
Size                  615
Floor                 480
Area Type               3
Area Locality        2235
City                    6
Furnishing Status       3
Tenant Preferred        3
Bathroom                8
Point of Contact        3
dtype: int64

In [5]:
df.isnull().sum()

Posted On            0
BHK                  0
Rent                 0
Size                 0
Floor                0
Area Type            0
Area Locality        0
City                 0
Furnishing Status    0
Tenant Preferred     0
Bathroom             0
Point of Contact     0
dtype: int64

In [6]:
df.duplicated().sum()

0

In [7]:
df = df.drop_duplicates()

In [8]:
df['Posted On'] = pd.to_datetime(df["Posted On"])

In [9]:
def one_hot_encoding(dataframe, col):
    # Apply one-hot encoding to the specified column
    ohe = pd.get_dummies(dataframe[col], prefix=col)
    
    # Drop the original column
    dataframe = dataframe.drop(col, axis=1)
    
    # Join the one-hot encoded columns back to the dataframe
    dataframe = dataframe.join(ohe)
    return dataframe

In [10]:
columns = ['Area Type', 'City', 'Furnishing Status', 'Tenant Preferred', 'Point of Contact']

# Apply one-hot encoding to all specified columns
for col in columns:
    df = one_hot_encoding(df, col)

In [11]:
def extract_floor(floor_str):
    try:
        if 'Ground' in floor_str:
            return 0
        elif 'Lower Basement' in floor_str:
            return -1
        elif 'Upper Basement' in floor_str:
            return -2
        elif 'out of' in floor_str:
            # Extract the current floor when 'out of' is present
            return int(floor_str.split(' out of ')[0])
        else:
            # If no 'out of', assume it's a single number floor
            return int(floor_str)
    except:
        return None  # Return None for unexpected formats

def extract_total_floors(floor_str):
    try:
        if 'out of' in floor_str:
            # Extract total floors when 'out of' is present
            return int(floor_str.split(' out of ')[1])
        else:
            # If no 'out of', assume total floors equals current floor
            current_floor = extract_floor(floor_str)
            return current_floor  # Set total floors equal to current floor
    except:
        return None  # Return None for unexpected formats

In [12]:
# Ensure 'Current Floor' and 'Total Floors' columns are created first
df['Current Floor'] = df['Floor'].apply(extract_floor).astype(int)
df['Total Floors'] = df['Floor'].apply(extract_total_floors).astype(int)

In [13]:
df = df.drop('Floor', axis=1)

In [14]:
# DATA VALUES 
df['month posted'] = df['Posted On'].dt.month
df['day posted'] = df['Posted On'].dt.day
df['day of week posted'] = df['Posted On'].dt.day_of_week
df['quarter poster'] = df['Posted On'].dt.quarter

In [15]:
df = df.drop('Posted On', axis=1)
df = df.drop(columns=['Area Locality'])

In [16]:
df = df.dropna()

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4746 entries, 0 to 4745
Data columns (total 28 columns):
 #   Column                             Non-Null Count  Dtype
---  ------                             --------------  -----
 0   BHK                                4746 non-null   int64
 1   Rent                               4746 non-null   int64
 2   Size                               4746 non-null   int64
 3   Bathroom                           4746 non-null   int64
 4   Area Type_Built Area               4746 non-null   bool 
 5   Area Type_Carpet Area              4746 non-null   bool 
 6   Area Type_Super Area               4746 non-null   bool 
 7   City_Bangalore                     4746 non-null   bool 
 8   City_Chennai                       4746 non-null   bool 
 9   City_Delhi                         4746 non-null   bool 
 10  City_Hyderabad                     4746 non-null   bool 
 11  City_Kolkata                       4746 non-null   bool 
 12  City_Mumbai         

In [18]:
df

Unnamed: 0,BHK,Rent,Size,Bathroom,Area Type_Built Area,Area Type_Carpet Area,Area Type_Super Area,City_Bangalore,City_Chennai,City_Delhi,...,Tenant Preferred_Family,Point of Contact_Contact Agent,Point of Contact_Contact Builder,Point of Contact_Contact Owner,Current Floor,Total Floors,month posted,day posted,day of week posted,quarter poster
0,2,10000,1100,2,False,False,True,False,False,False,...,False,False,False,True,0,2,5,18,2,2
1,2,20000,800,1,False,False,True,False,False,False,...,False,False,False,True,1,3,5,13,4,2
2,2,17000,1000,1,False,False,True,False,False,False,...,False,False,False,True,1,3,5,16,0,2
3,2,10000,800,1,False,False,True,False,False,False,...,False,False,False,True,1,2,7,4,0,3
4,2,7500,850,1,False,True,False,False,False,False,...,False,False,False,True,1,2,5,9,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4741,2,15000,1000,2,False,True,False,False,False,False,...,False,False,False,True,3,5,5,18,2,2
4742,3,29000,2000,3,False,False,True,False,False,False,...,False,False,False,True,1,4,5,15,6,2
4743,3,35000,1750,3,False,True,False,False,False,False,...,False,True,False,False,3,5,7,10,6,3
4744,3,45000,1500,2,False,True,False,False,False,False,...,True,True,False,False,23,34,7,6,2,3


In [19]:
df['Rent'] = np.log1p(df['Rent']) #Scaling target feature using log function

X = df['Rent']
y = df.drop('Rent', axis=1)

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(y, X, test_size=0.2, random_state=42)

In [21]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [22]:
from sklearn.linear_model import LinearRegression
lrmodel = LinearRegression()
lrmodel.fit(X_train, y_train)

In [23]:
lr_y_pred = lrmodel.predict(X_test)

In [24]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
print('Mean Squared Error:', mean_squared_error(y_test, lr_y_pred))
print('Mean Absolute Error:', mean_absolute_error(y_test, lr_y_pred))
print('R2 Score:', r2_score(y_test, lr_y_pred))

Mean Squared Error: 0.14574512488519112
Mean Absolute Error: 0.2925058694203407
R2 Score: 0.8310014942113209


In [25]:
#polynimial regression
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

In [26]:
polymodel = LinearRegression()
polymodel.fit(X_train_poly, y_train)

In [27]:
poly_y_pred = polymodel.predict(X_test_poly)

In [28]:
print('Mean Squared Error:', mean_squared_error(y_test, poly_y_pred))
print('Mean Absolute Error:', mean_absolute_error(y_test, poly_y_pred))
print('R2 Score:', r2_score(y_test, poly_y_pred))

Mean Squared Error: 0.14849646634347574
Mean Absolute Error: 0.29190011518753217
R2 Score: 0.8278111810140127


In [29]:
from sklearn.neighbors import KNeighborsRegressor
knnmodel = KNeighborsRegressor(n_neighbors=5)
knnmodel.fit(X_train, y_train)

In [30]:
knn_y_pred = knnmodel.predict(X_test)

In [31]:
print('Mean Squared Error:', mean_squared_error(y_test, knn_y_pred))
print('Mean Absolute Error:', mean_absolute_error(y_test, knn_y_pred))
print('R2 Score:', r2_score(y_test, knn_y_pred))

Mean Squared Error: 0.20772026334647323
Mean Absolute Error: 0.3510532949613764
R2 Score: 0.7591383303198789


In [32]:
from sklearn.svm import SVR
svrmodel = SVR()
svrmodel.fit(X_train, y_train)

In [33]:
svr_y_pred = svrmodel.predict(X_test)

In [34]:
print('Mean Squared Error:', mean_squared_error(y_test, svr_y_pred))
print('Mean Absolute Error:', mean_absolute_error(y_test, svr_y_pred))
print('R2 Score:', r2_score(y_test, svr_y_pred))

Mean Squared Error: 0.14559630539641868
Mean Absolute Error: 0.2862096289331278
R2 Score: 0.8311740575904019


In [35]:
from sklearn.tree import DecisionTreeRegressor
dtmodel = DecisionTreeRegressor()
dtmodel.fit(X_train, y_train)

In [36]:
dt_y_pred = dtmodel.predict(X_test)

In [37]:
print('Mean Squared Error:', mean_squared_error(y_test, dt_y_pred))
print('Mean Absolute Error:', mean_absolute_error(y_test, dt_y_pred))
print('R2 Score:', r2_score(y_test, dt_y_pred))

Mean Squared Error: 0.27075219151873964
Mean Absolute Error: 0.382322287398759
R2 Score: 0.686049767759151


In [38]:
from sklearn.ensemble import RandomForestRegressor
rfmodel = RandomForestRegressor(n_estimators=250)
rfmodel.fit(X_train, y_train)

In [39]:
rf_y_pred = rfmodel.predict(X_test)

In [40]:
print('Mean Squared Error:', mean_squared_error(y_test, rf_y_pred))
print('Mean Absolute Error:', mean_absolute_error(y_test, rf_y_pred))
print('R2 Score:', r2_score(y_test, rf_y_pred))

Mean Squared Error: 0.13715885650397264
Mean Absolute Error: 0.28257256881340886
R2 Score: 0.8409576867623209


In [42]:
print(lrmodel.score(X_test, y_test),"-Linear Regression")
print(polymodel.score(X_test_poly, y_test),"-Polynomial Regression")
print(knnmodel.score(X_test, y_test),"-KNN")
print(svrmodel.score(X_test, y_test),"-SVR")
print(dtmodel.score(X_test, y_test),"-Decision Tree")
print(rfmodel.score(X_test, y_test),"-Random Forest")

0.8310014942113209 -Linear Regression
0.8278111810140127 -Polynomial Regression
0.7591383303198789 -KNN
0.8311740575904019 -SVR
0.686049767759151 -Decision Tree
0.8409576867623209 -Random Forest
