# Real Estate Price Prediction - Data Preprocessing and Model Training

### Imports

In [3]:
import pandas as pd
import json
import pickle
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error, r2_score

In [4]:
# Read the housing data from the CSV next to the notebook and preview the first rows.
df1 = pd.read_csv("housing.csv")
df1.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [5]:
# (number of rows, number of columns) of the raw frame.
df1.shape

(545, 13)

In [6]:
# Per-column dtype and non-null count summary.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [7]:
# Distinct values present in the "guestroom" column.
df1["guestroom"].unique()

array(['no', 'yes'], dtype=object)

In [8]:
# Distinct values present in the "furnishingstatus" column.
df1["furnishingstatus"].unique()

array(['furnished', 'semi-furnished', 'unfurnished'], dtype=object)

In [9]:
# Snapshot the raw column names (this list still contains the target "price"),
# then count the missing values in each column.
columns = list(df1.columns)
df1.isna().sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

### Train/test split

In [11]:
# Separate the target from the predictors.
y = df1["price"]
X = df1.drop("price", axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train.shape, y_train.shape

((436, 12), (436,))

### One-hot encoding

In [13]:
# Columns to expand into indicator variables. "parking" is listed here even
# though it is stored as an integer column, so each parking count gets its own
# indicator.
categorical_columns = [
    "mainroad",
    "guestroom",
    "basement",
    "hotwaterheating",
    "airconditioning",
    "furnishingstatus",
    "parking",
    "prefarea",
]
X_train_encoded = pd.get_dummies(data=X_train, columns=categorical_columns)

### Normalization - MinMaxScaler

In [15]:
# Learn the per-feature minimum and maximum from the training data only,
# then rescale the training features with those statistics.
scaler = MinMaxScaler()
scaler.fit(X_train_encoded)
X_train_scaled = scaler.transform(X_train_encoded)

# Persist the fitted scaler so the same scaling can be reloaded later.
with open("scaler.pickle", mode="wb") as f:
    pickle.dump(scaler, f)

### Model Training

In [17]:
# Fit a linear regression on the scaled training features.
# `model` and `lrr` refer to the same fitted estimator, since fit() returns self.
lrr = LinearRegression()
lrr.fit(X_train_scaled, y_train)
model = lrr

### Test the model

In [19]:
# One-hot encode the held-out features, then align them to the training layout.
# Encoding the test split on its own can produce missing, extra, or re-ordered
# dummy columns (e.g. when a category level does not occur in the test rows),
# which can make scaler.transform raise on mismatched feature names.
# Reindexing against X_train_encoded adds absent dummies as 0 and drops levels
# that were never seen during training.
X_test_encoded = pd.get_dummies(X_test, columns=categorical_columns).reindex(
    columns=X_train_encoded.columns, fill_value=0
)
X_test_scaled = scaler.transform(X_test_encoded)

# Predict and evaluate on the held-out split.
predictions = model.predict(X_test_scaled)
# root_mean_squared_error returns the RMSE (same units as price), not the MSE,
# so the variable and the printed label are named accordingly.
rmse = root_mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared: {r2}")

Mean Squared Error: 1318921.3853459028
R-squared: 0.6558453961589377


In [20]:
# Serialize the fitted regression model to disk.
with open("housing_price_model.pickle", mode="wb") as f:
    pickle.dump(model, f)

In [21]:
# Write the column list captured from the raw frame as pretty-printed JSON.
# NOTE(review): `columns` was taken from df1 before encoding, so it contains the
# target "price" and the original column names rather than the one-hot feature
# names the model was fitted on — confirm this is what consumers of the file expect.
with open("columns.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(columns, indent=4))