In [242]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Load dataset
df = pd.read_csv("Housing.csv")

# Display info
df.head(10)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished
5,10850000,7500,3,3,1,yes,no,yes,no,yes,2,yes,semi-furnished
6,10150000,8580,4,3,4,yes,no,no,no,yes,2,yes,semi-furnished
7,10150000,16200,5,3,2,yes,no,no,no,no,0,no,unfurnished
8,9870000,8100,4,1,2,yes,yes,yes,no,yes,2,yes,furnished
9,9800000,5750,3,2,4,yes,yes,no,no,yes,1,yes,unfurnished


 # Data Cleaning & Preprocessing

In [243]:
#checking missing value
df.isnull().sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

In [244]:
df.describe()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking
count,545.0,545.0,545.0,545.0,545.0,545.0
mean,4766729.0,5150.541284,2.965138,1.286239,1.805505,0.693578
std,1870440.0,2170.141023,0.738064,0.50247,0.867492,0.861586
min,1750000.0,1650.0,1.0,1.0,1.0,0.0
25%,3430000.0,3600.0,2.0,1.0,1.0,0.0
50%,4340000.0,4600.0,3.0,1.0,2.0,0.0
75%,5740000.0,6360.0,3.0,2.0,2.0,1.0
max,13300000.0,16200.0,6.0,4.0,4.0,3.0


# Handling Categorical Variables
Convert yes/no columns into binary (0/1).

Convert furnishingstatus into dummy variables.

In [245]:
# Convert categorical binary columns to numerical (0/1)
binary_cols = ["mainroad", "guestroom", "basement", "hotwaterheating", "airconditioning", "prefarea"]
for col in binary_cols:
    df[col] = df[col].map({"yes": 1, "no": 0})

# One-hot encode "furnishingstatus"
data = pd.get_dummies(df, columns=["furnishingstatus"], drop_first=True)

# Display the processed dataset
data.head(10)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,0,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,1,0
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,0,0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,0,0
5,10850000,7500,3,3,1,1,0,1,0,1,2,1,1,0
6,10150000,8580,4,3,4,1,0,0,0,1,2,1,1,0
7,10150000,16200,5,3,2,1,0,0,0,0,0,0,0,1
8,9870000,8100,4,1,2,1,1,1,0,1,2,1,0,0
9,9800000,5750,3,2,4,1,1,0,0,1,1,1,0,1


In [246]:
print(data.corr()["price"].sort_values(ascending=False))

price                              1.000000
area                               0.535997
bathrooms                          0.517545
airconditioning                    0.452954
stories                            0.420712
parking                            0.384394
bedrooms                           0.366494
prefarea                           0.329777
mainroad                           0.296898
guestroom                          0.255517
basement                           0.187057
hotwaterheating                    0.093073
furnishingstatus_semi-furnished    0.063656
furnishingstatus_unfurnished      -0.280587
Name: price, dtype: float64


In [247]:
# Feature Engineering: Create new feature
data["rooms_per_area"] = data["bedrooms"] / data["area"]

# Drop low-correlation features
data.drop(columns=["hotwaterheating", "furnishingstatus_semi-furnished"], inplace=True)


In [248]:
data.head(5)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,airconditioning,parking,prefarea,furnishingstatus_unfurnished,rooms_per_area
0,13300000,7420,4,2,3,1,0,0,1,2,1,0,0.000539
1,12250000,8960,4,4,4,1,0,0,1,3,0,0,0.000446
2,12250000,9960,3,2,2,1,0,1,0,2,1,0,0.000301
3,12215000,7500,4,2,2,1,0,1,1,3,1,0,0.000533
4,11410000,7420,4,1,2,1,1,1,1,2,0,0,0.000539


### Apply Log Transformation on Price

Using log(price) helps reduce skewness and makes the model more robust.

In [249]:
# Apply Log Transformation to Price (Target Variable)
data["price"] = np.log(data["price"])  # Reduce skewness

### Applying Feature Scaling

I have used Standardization (Z-score normalization) for better model performance.

In [250]:
from sklearn.preprocessing import StandardScaler

# Defining numerical columns for scaling
numerical_cols = ["area", "bedrooms", "bathrooms", "stories", "parking"]

# Initialize the scaler
scaler = StandardScaler()

# Applying scaling
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

# Display processed data
data.head(10)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,airconditioning,parking,prefarea,furnishingstatus_unfurnished,rooms_per_area
0,16.403275,1.046726,1.403419,1.421812,1.378217,1,0,0,1,1.517692,1,0,0.000539
1,16.321036,1.75701,1.403419,5.405809,2.532024,1,0,0,1,2.679409,0,0,0.000446
2,16.321036,2.218232,0.047278,1.421812,0.22441,1,0,1,0,1.517692,1,0,0.000301
3,16.318175,1.083624,1.403419,1.421812,0.22441,1,0,1,1,2.679409,1,0,0.000533
4,16.250001,1.046726,1.403419,-0.570187,0.22441,1,1,1,1,1.517692,0,0,0.000539
5,16.199676,1.083624,0.047278,3.41381,-0.929397,1,0,1,1,1.517692,1,0,0.0004
6,16.132984,1.581745,1.403419,3.41381,2.532024,1,0,0,1,1.517692,1,0,0.000466
7,16.132984,5.096263,2.75956,3.41381,0.22441,1,0,0,0,-0.805741,0,1,0.000309
8,16.10501,1.360358,1.403419,-0.570187,0.22441,1,1,1,1,1.517692,1,0,0.000494
9,16.097893,0.276484,0.047278,1.421812,2.532024,1,1,0,1,0.355976,1,1,0.000522


In [251]:
from scipy import stats
import numpy as np

# Select numerical columns
numerical_cols = ['area', 'bathrooms', 'stories', 'parking']

# Calculate Z-score
z_scores = np.abs(stats.zscore(data[numerical_cols]))

# Count outliers (Z-score > 3)
outliers = (z_scores > 3).sum()
outlier_percentage = (outliers / len(data)) * 100

print("Number of Outliers in Each Feature:\n", outliers)
print("Percentage of Outliers in Data:\n", outlier_percentage)

Number of Outliers in Each Feature:
 area          7
bathrooms    11
stories       0
parking       0
dtype: int64
Percentage of Outliers in Data:
 area         1.284404
bathrooms    2.018349
stories      0.000000
parking      0.000000
dtype: float64


In [269]:
# Remove outliers
data_cleaned = data[(z_scores < 3).all(axis=1)]

print("Original dataset size:", data.shape)
print("New dataset size after outlier removal:", data_cleaned.shape)


Original dataset size: (545, 13)
New dataset size after outlier removal: (528, 13)


## Features & Target Variable

In [270]:
# Define Features & Target Variable
X = data.drop(columns=["price"])  # Independent variables
y = data["price"]  # Target variable


In [271]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [286]:
#Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


In [288]:
# Train the Linear Regression Model
model = LinearRegression()
model.fit(X_train, y_train)



In [289]:
#Make Predictions
y_pred = model.predict(X_test)   #Evaluate the Model
r2 = r2_score(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print(f"R² Score: {r2:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")

R² Score: 0.67
Root Mean Squared Error: 0.25


In [290]:
#Convert Predictions Back to Original Price Scale
y_pred_original = np.exp(y_pred)  # Reverse log transformation
y_test_original = np.exp(y_test)

# Display some actual vs predicted prices
results = pd.DataFrame({"Actual Price": y_test_original, "Predicted Price": y_pred_original})
print(results.head())

     Actual Price  Predicted Price
316     4060000.0     4.968967e+06
77      6650000.0     7.243129e+06
360     3710000.0     3.389936e+06
90      6440000.0     4.528052e+06
493     2800000.0     3.426830e+06
