In [4]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import joblib


In [5]:

# Read the housing dataset from a CSV file located in the working directory.
data = pd.read_csv(filepath_or_buffer='housing.csv')

# Show the first five rows as a quick sanity check of columns and values.
data.head(n=5)


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [6]:
# Step 3: Data Preprocessing
#
# Reduces `data` to the target plus the modelling features, encodes `mainroad`
# as 0/1 and one-hot encodes `furnishingstatus`.
# NOTE: this cell rebinds `data`; after it runs, `furnishingstatus` is gone, so
# re-running it requires re-running the cell that loads the CSV first.

# Keep only the target and the features used by the model. The explicit copy
# makes the result an independent frame, so the column assignment below does
# not write into a slice of the loaded frame (avoids SettingWithCopyWarning).
data = data[['price', 'area', 'bedrooms', 'bathrooms', 'furnishingstatus', 'mainroad']].copy()

# Encode mainroad as 1 for 'yes' and 0 for every other value (vectorised
# equivalent of a row-wise "1 if x == 'yes' else 0").
data['mainroad'] = (data['mainroad'] == 'yes').astype('int64')

# One-hot encode furnishingstatus; drop_first=True omits the first category so
# it acts as the baseline level for the regression.
furnishing_dummies = pd.get_dummies(data['furnishingstatus'], drop_first=True)

# Replace the raw categorical column with its dummy columns without mutating
# the frame in place.
data = pd.concat([data.drop(columns='furnishingstatus'), furnishing_dummies], axis=1)

data.head()


Unnamed: 0,price,area,bedrooms,bathrooms,mainroad,semi-furnished,unfurnished
0,13300000,7420,4,2,1,False,False
1,12250000,8960,4,4,1,False,False
2,12250000,9960,3,2,1,True,False
3,12215000,7500,4,2,1,False,False
4,11410000,7420,4,1,1,False,False


In [7]:

# Predictor columns, in the order the model will be trained on.
feature_columns = ['area', 'bedrooms', 'bathrooms', 'mainroad', 'semi-furnished', 'unfurnished']

# Feature matrix and regression target.
X = data.loc[:, feature_columns]
y = data.loc[:, 'price']


In [8]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [9]:
# Fit an ordinary least-squares linear regression on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
lr_model = LinearRegression().fit(X_train, y_train)

# Display the fitted estimator as the cell output.
lr_model


In [10]:
# Predict prices for the held-out test rows.
y_pred = lr_model.predict(X=X_test)


In [11]:
# Score the test-set predictions: mean squared error and the coefficient of
# determination (R^2).
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report both metrics, one per line.
print(f'Mean Squared Error: {mse}', f'R-squared: {r2}', sep='\n')


Mean Squared Error: 2584250426805.035
R-squared: 0.48872994125144575


In [12]:

# Score a single hypothetical listing with the trained model.
#
# The model was fitted on a DataFrame, so it remembers the training column
# names. Passing a named one-row DataFrame (instead of a bare NumPy array)
# lets scikit-learn validate the feature names/order and avoids the
# "X does not have valid feature names" warning.
# NOTE(review): area=300000 is far above the areas visible in the data preview
# (7420-9960), so this prediction is a large extrapolation -- confirm the
# value is intended and not a typo.
new_data = pd.DataFrame(
    [[300000, 3, 2, 1, 1, 0]],
    columns=['area', 'bedrooms', 'bathrooms', 'mainroad', 'semi-furnished', 'unfurnished'],
)
predicted_price = lr_model.predict(new_data)
print(f'Predicted Price: {predicted_price}')


Predicted Price: [93004046.3706611]




In [13]:
# Persist the fitted model to disk; joblib.dump returns the list of files it
# wrote, which is shown as the cell output.
joblib.dump(value=lr_model, filename='house_price_model.pkl')



['house_price_model.pkl']