In [349]:
import pandas as pd

# reading the CSV file into pandas dataframe
df = pd.read_csv("C:/Users/pruth/OneDrive/Desktop/Projects/Housing-Price-Prediction/FeatureEngineered_Data.csv")
df.head()

Unnamed: 0,total_sqft,bath,price,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Stage Nagarbhavi,5th Phase JP Nagar,6th Phase JP Nagar,7th Phase JP Nagar,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,2850.0,4.0,428.0,4,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1630.0,3.0,194.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1200.0,6.0,125.0,6,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1875.0,2.0,235.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,930.0,4.0,85.0,7,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [357]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Define the model to test
model = LinearRegression()

# Split the data into training and testing sets
X = df.drop(['price'],axis='columns')
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

# Define the number of folds for cross-validation
n_folds = 5

# Define lists to store results
mse_scores = []
mae_scores = []
r2_scores = []

# Train the model using k-fold cross-validation
kf = KFold(n_splits=n_folds, shuffle=True, random_state=10)
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse_scores.append(mean_squared_error(y_test, y_pred))
    mae_scores.append(mean_absolute_error(y_test, y_pred))
    r2_scores.append(r2_score(y_test, y_pred))

# Calculate the mean MSE, MAE, and R^2 scores across the folds
mean_mse = sum(mse_scores) / len(mse_scores)
mean_mae = sum(mae_scores) / len(mae_scores)
mean_r2 = sum(r2_scores) / len(r2_scores)

# Print the results
print('Linear Regression:\n')
print(f'MSE = {mean_mse}\nMAE = {mean_mae}\nR^2 = {mean_r2}\n')

Linear Regression:

MSE = 1206.02634718744
MAE = 19.567613537103053
R^2 = 0.7876229906081539



In [359]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Split the data into training and testing sets
X = df.drop(['price'], axis='columns')
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

# Create a Decision Tree Regressor object
dt_reg = DecisionTreeRegressor(random_state=10)

# Fit the model on the training data
dt_reg.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = dt_reg.predict(X_test)

# Calculate the evaluation metrics
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the results
print('Decision Tree:')
print(f'\nMSE = {mse}\nMAE = {mae}\nR^2 = {r2}\n')

Decision Tree:

MSE = 1563.5188954835978
MAE = 20.7948899357158
R^2 = 0.6469949653443785



In [356]:
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Define the model to test
model = RandomForestRegressor()

# Split the data into training and testing sets
X = df.drop(['price'],axis='columns')
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

# Define the number of folds for cross-validation
n_folds = 5

# Define lists to store results
mse_scores = []
mae_scores = []
r2_scores = []

# Train the model using k-fold cross-validation
kf = KFold(n_splits=n_folds, shuffle=True, random_state=10)
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse_scores.append(mean_squared_error(y_test, y_pred))
    mae_scores.append(mean_absolute_error(y_test, y_pred))
    r2_scores.append(r2_score(y_test, y_pred))

# Calculate the mean MSE, MAE, and R^2 scores across the folds
mean_mse = sum(mse_scores) / len(mse_scores)
mean_mae = sum(mae_scores) / len(mae_scores)
mean_r2 = sum(r2_scores) / len(r2_scores)

# Print the results
print(f'Random Forest:\n\nMSE = {mean_mse}\nMAE = {mean_mae}\nR^2 = {mean_r2}\n')


Random Forest:

MSE = 1524.1270995896944
MAE = 18.93966141848457
R^2 = 0.7313052131342219



In [361]:
# Selecting the Linear Regression algorithm as it has the highest R2 Score

import joblib
import numpy as np

# Define the linear regression model
lr_model = LinearRegression()

# Train the model
lr_model.fit(X_train, y_train)

# Save the model
joblib.dump(lr_model, 'linear_regression_model.pkl')

# Load the model for prediction
lr_model = joblib.load('linear_regression_model.pkl')


In [362]:
# Define the predict_price function using the loaded model
def predict_price(location,sqft,bath,bhk):    
    loc_index = np.where(X.columns==location)[0][0]

    x = np.zeros(len(X.columns))
    x[0] = sqft
    x[1] = bath
    x[2] = bhk
    if loc_index >= 0:
        x[loc_index] = 1

    return lr_model.predict([x])[0]

# Test the function
predict_price('Indira Nagar',1000, 2, 2)

147.84373911227198

In [373]:
#Pickle the model to deploy it

import pickle
pickle.dump(lr_model, open("./linear_regression_model.pkl","wb"))

In [374]:
import json
columns = {
    'data_columns' : [col.lower() for col in X.columns]
}
with open("columns.json","w") as f:
    f.write(json.dumps(columns))