# Project ML Model

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('Mumbai House Prices.csv')

In [3]:
data.head()

Unnamed: 0,bhk,type,locality,area,price,price_unit,region,status,age
0,3,Apartment,Lak And Hanware The Residency Tower,685,2.5,Cr,Andheri West,Ready to move,New
1,2,Apartment,Radheya Sai Enclave Building No 2,640,52.51,L,Naigaon East,Under Construction,New
2,2,Apartment,Romell Serene,610,1.73,Cr,Borivali West,Under Construction,New
3,2,Apartment,Soundlines Codename Urban Rainforest,876,59.98,L,Panvel,Under Construction,New
4,2,Apartment,Origin Oriana,659,94.11,L,Mira Road East,Under Construction,New


## Exploratory Data Analysis Steps

In [4]:
data.shape

(76038, 9)

In [5]:
data.columns

Index(['bhk', 'type', 'locality', 'area', 'price', 'price_unit', 'region',
       'status', 'age'],
      dtype='object')

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76038 entries, 0 to 76037
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   bhk         76038 non-null  int64  
 1   type        76038 non-null  object 
 2   locality    76038 non-null  object 
 3   area        76038 non-null  int64  
 4   price       76038 non-null  float64
 5   price_unit  76038 non-null  object 
 6   region      76038 non-null  object 
 7   status      76038 non-null  object 
 8   age         76038 non-null  object 
dtypes: float64(1), int64(2), object(6)
memory usage: 5.2+ MB


In [7]:
data.describe()

Unnamed: 0,bhk,area,price
count,76038.0,76038.0,76038.0
mean,2.015111,1024.53685,29.38227
std,0.922754,670.276165,32.90345
min,1.0,127.0,1.0
25%,1.0,640.0,1.75
50%,2.0,872.0,5.5
75%,3.0,1179.0,59.0
max,10.0,16000.0,99.99


In [8]:
data.describe(include = 'object')

Unnamed: 0,type,locality,price_unit,region,status,age
count,76038,76038,76038,76038,76038,76038
unique,5,9782,2,228,2,3
top,Apartment,Hiranandani Meadows,Cr,Thane West,Ready to move,New
freq,74854,861,40981,14868,44982,38072


## Pre-Processing Steps

#### Column = type

In [9]:
data['type'].value_counts()

type
Apartment            74854
Studio Apartment       882
Villa                  226
Independent House       73
Penthouse                3
Name: count, dtype: int64

In [10]:
data.loc[:, 'type'] = data['type'].replace(
    to_replace=['Apartment', 'Studio Apartment', 'Villa', 'Independent House', 'Penthouse'],
    value=[0, 1, 2, 3, 4]
)

data.head()

  data.loc[:, 'type'] = data['type'].replace(


Unnamed: 0,bhk,type,locality,area,price,price_unit,region,status,age
0,3,0,Lak And Hanware The Residency Tower,685,2.5,Cr,Andheri West,Ready to move,New
1,2,0,Radheya Sai Enclave Building No 2,640,52.51,L,Naigaon East,Under Construction,New
2,2,0,Romell Serene,610,1.73,Cr,Borivali West,Under Construction,New
3,2,0,Soundlines Codename Urban Rainforest,876,59.98,L,Panvel,Under Construction,New
4,2,0,Origin Oriana,659,94.11,L,Mira Road East,Under Construction,New


#### Column = region (228 regions -> May create complications in Frontend and bankend Predictions.)

In [11]:
# data['region'].value_counts()

In [12]:
# data.loc[:, 'region'] = data['region'].replace(
#     to_replace=[---],
#     value=[0, 1, 2, 3, 4]
# )

# data.head()

#### Column = Status

In [13]:
data['status'].value_counts()

status
Ready to move         44982
Under Construction    31056
Name: count, dtype: int64

In [14]:
data.loc[:, 'status'] = data['status'].replace(
    to_replace=['Ready to move', 'Under Construction'],
    value=[0, 1]
)

data.head()

  data.loc[:, 'status'] = data['status'].replace(


Unnamed: 0,bhk,type,locality,area,price,price_unit,region,status,age
0,3,0,Lak And Hanware The Residency Tower,685,2.5,Cr,Andheri West,0,New
1,2,0,Radheya Sai Enclave Building No 2,640,52.51,L,Naigaon East,1,New
2,2,0,Romell Serene,610,1.73,Cr,Borivali West,1,New
3,2,0,Soundlines Codename Urban Rainforest,876,59.98,L,Panvel,1,New
4,2,0,Origin Oriana,659,94.11,L,Mira Road East,1,New


#### Column = age

In [15]:
data['age'].value_counts()

age
New        38072
Resale     23357
Unknown    14609
Name: count, dtype: int64

In [16]:
data.loc[:, 'age'] = data['age'].replace(
    to_replace=['New', 'Resale', 'Unknown'],
    value=[0, 1, 2]
)

data.head()

  data.loc[:, 'age'] = data['age'].replace(


Unnamed: 0,bhk,type,locality,area,price,price_unit,region,status,age
0,3,0,Lak And Hanware The Residency Tower,685,2.5,Cr,Andheri West,0,0
1,2,0,Radheya Sai Enclave Building No 2,640,52.51,L,Naigaon East,1,0
2,2,0,Romell Serene,610,1.73,Cr,Borivali West,1,0
3,2,0,Soundlines Codename Urban Rainforest,876,59.98,L,Panvel,1,0
4,2,0,Origin Oriana,659,94.11,L,Mira Road East,1,0


In [17]:
# Function to convert price to lakhs
def convert_price_to_lakhs(price, unit):
    if unit == 'Cr':
        return price * 100
    elif unit == 'L':
        return price 
    else:
        return None

# Apply the conversion function to each row
data['price_in_lakhs'] = data.apply(lambda row: convert_price_to_lakhs(row['price'], row['price_unit']), axis=1)

# Drop the old price and price_unit columns
data.drop(['price', 'price_unit'], axis=1, inplace=True)

In [18]:
data.head()

Unnamed: 0,bhk,type,locality,area,region,status,age,price_in_lakhs
0,3,0,Lak And Hanware The Residency Tower,685,Andheri West,0,0,250.0
1,2,0,Radheya Sai Enclave Building No 2,640,Naigaon East,1,0,52.51
2,2,0,Romell Serene,610,Borivali West,1,0,173.0
3,2,0,Soundlines Codename Urban Rainforest,876,Panvel,1,0,59.98
4,2,0,Origin Oriana,659,Mira Road East,1,0,94.11


## Selecting Features

In [19]:
# Feature = data[['bhk','type','status','age']]

Feature = data[['bhk','type','area','status','age']]

In [20]:
Feature.head()

Unnamed: 0,bhk,type,area,status,age
0,3,0,685,0,0
1,2,0,640,1,0
2,2,0,610,1,0
3,2,0,876,1,0
4,2,0,659,1,0


In [21]:
Feature.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76038 entries, 0 to 76037
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   bhk     76038 non-null  int64 
 1   type    76038 non-null  object
 2   area    76038 non-null  int64 
 3   status  76038 non-null  object
 4   age     76038 non-null  object
dtypes: int64(2), object(3)
memory usage: 2.9+ MB


## Lets Define Feature sets, X:

In [22]:
X = Feature
X[0:4]

Unnamed: 0,bhk,type,area,status,age
0,3,0,685,0,0
1,2,0,640,1,0
2,2,0,610,1,0
3,2,0,876,1,0


## What are our lables? Create Output Variable

In [23]:
y = data['price_in_lakhs']
y[0:5]

0    250.00
1     52.51
2    173.00
3     59.98
4     94.11
Name: price_in_lakhs, dtype: float64

In [24]:
from sklearn import preprocessing

X = preprocessing.StandardScaler().fit(X).transform(X)
X[0:5]

array([[ 1.06734333, -0.11445594, -0.50656595, -0.83090884, -0.89441655],
       [-0.01637594, -0.11445594, -0.5737029 ,  1.20350146, -0.89441655],
       [-0.01637594, -0.11445594, -0.61846086,  1.20350146, -0.89441655],
       [-0.01637594, -0.11445594, -0.22160691,  1.20350146, -0.89441655],
       [-0.01637594, -0.11445594, -0.54535619,  1.20350146, -0.89441655]])

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X, 
                                                    y, 
                                                    test_size = 0.2,
                                                    random_state = 3)
print ('Train set:', X_train.shape, y_train.shape)
print ('Test set:', X_test.shape, y_test.shape)

Train set: (60830, 5) (60830,)
Test set: (15208, 5) (15208,)


In [26]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import joblib

# # Create a list of regression algorithms to compare
# regressors = [
#     LinearRegression(),
#     DecisionTreeRegressor(),
#     RandomForestRegressor(),
#     SVR(),
#     KNeighborsRegressor()
# ]

# # Perform regression with each algorithm and calculate MSE
# mse_scores = []
# r2_scores = []
# for regressor in regressors:
#     regressor.fit(X_train, y_train)
#     y_pred = regressor.predict(X_test)
#     mse = mean_squared_error(y_test, y_pred)
#     # Calculate the R-squared score
#     r2 = r2_score(y_test, y_pred)
#     mse_scores.append(mse)
#     r2_scores.append(r2)

# # Find the best algorithm based on MSE
# best_algorithm_index = mse_scores.index(min(mse_scores))
# best_algorithm = regressors[best_algorithm_index]

# # Print the MSE scores and the best algorithm
# for i, mse in enumerate(mse_scores):
#     print(f"Algorithm {i+1} - MSE: {mse}")
# print(f"The best algorithm is Algorithm {best_algorithm_index+1}: {type(best_algorithm).__name__}")


# # r2_scores

# print("\n\n\n\nr2_Scores are")
# for i, r in enumerate(r2_scores):
#     print(f"Algorithm {i+1} - R2: {r}")


# # Fit the best algorithm on the entire dataset
# best_algorithm.fit(X, y)


In [27]:
# Create and fit the LinearRegression()
regressor1 = LinearRegression()
regressor1.fit(X_train, y_train)

# Make predictions on the test set
y_pred1 = regressor1.predict(X_test)

# Calculate the R-squared score
mse = mean_squared_error(y_test, y_pred1)
print(f"Mean Squared Error: {mse}")

# Calculate the R-squared score
r2 = r2_score(y_test, y_pred1)
print(f"R-squared: {r2}")


Mean Squared Error: 19891.48813464706
R-squared: 0.5892472676917011


In [28]:
# Create and fit the DecisionTreeRegressor() 
regressor2 = DecisionTreeRegressor()
regressor2.fit(X_train, y_train)

# Make predictions on the test set
y_pred2 = regressor2.predict(X_test)

# Calculate the R-squared score
mse = mean_squared_error(y_test, y_pred2)
print(f"Mean Squared Error: {mse}")

# Calculate the R-squared score
r2 = r2_score(y_test, y_pred2)
print(f"R-squared: {r2}")


Mean Squared Error: 17359.96447844526
R-squared: 0.641522404255104


In [29]:
# Create and fit the Random Forest regressor
regressor3 =  RandomForestRegressor()
regressor3.fit(X_train, y_train)

# Make predictions on the test set
y_pred2 = regressor3.predict(X_test)

# Calculate the R-squared score
mse = mean_squared_error(y_test, y_pred2)
print(f"Mean Squared Error: {r2}")

# Calculate the R-squared score
r2 = r2_score(y_test, y_pred2)
print(f"R-squared: {r2}")


Mean Squared Error: 0.641522404255104
R-squared: 0.7014280792155221


In [30]:
# # Create and fit the SVR()
# regressor4 =  SVR()
# regressor4.fit(X_train, y_train)

# # Make predictions on the test set
# y_pred4 = regressor4.predict(X_test)

# # Calculate the R-squared score
# mse = mean_squared_error(y_test, y_pred4)
# print(f"Mean Squared Error: {mse}")

# # Calculate the R-squared score
# r2 = r2_score(y_test, y_pred4)
# print(f"R-squared: {r2}")


In [31]:
# Create and fit the KNeighborsRegressor()
regressor5 =  KNeighborsRegressor()
regressor5.fit(X_train, y_train)

# Make predictions on the test set
y_pred5 = regressor5.predict(X_test)

# Calculate the R-squared score
mse = mean_squared_error(y_test, y_pred5)
print(f"Mean Squared Error: {mse}")

# Calculate the R-squared score
r2 = r2_score(y_test, y_pred5)
print(f"R-squared: {r2}")


Mean Squared Error: 15435.778793800895
R-squared: 0.6812562100963839


#### Hence Random Forest Regressor is the best Model.

In [32]:
import joblib 

filename = 'random.joblib'
joblib.dump(regressor3,filename)

['random.joblib']