In [1]:
pip install statsmodels 

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Import Libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
import joblib

In [3]:
# 1. Load data
df = pd.read_csv("Housing.csv")
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [4]:
# 2. Clean column names (remove spaces)
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [5]:
df.shape

(545, 13)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [7]:
# 3. Encode binary categorical features using LabelEncoder
binary_cols = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']
le = LabelEncoder()
for col in binary_cols:
    df[col] = le.fit_transform(df[col])

In [8]:
# 4. Apply One-Hot Encoding for multiclass features (furnishingstatus)
df = pd.get_dummies(df, columns=['furnishingstatus'])

In [9]:
df=df.astype('int')

In [10]:
# 5. Multicollinearity check using VIF
# Encode all categorical values for VIF calc
vif_df = df.drop('price', axis=1).copy()
vif_data = pd.DataFrame()
vif_data['Feature'] = vif_df.columns
vif_data['VIF'] = [variance_inflation_factor(vif_df.values, i) for i in range(vif_df.shape[1])]
print("\nVariance Inflation Factors:\n", vif_data)


Variance Inflation Factors:
                             Feature        VIF
0                              area   1.325250
1                          bedrooms   1.369477
2                         bathrooms   1.286621
3                           stories   1.478055
4                          mainroad   1.172728
5                         guestroom   1.212838
6                          basement   1.323050
7                   hotwaterheating   1.041506
8                   airconditioning   1.211840
9                           parking   1.212837
10                         prefarea   1.149196
11       furnishingstatus_furnished   8.575840
12  furnishingstatus_semi-furnished  12.404306
13     furnishingstatus_unfurnished   8.826219


In [11]:
df.drop('furnishingstatus_furnished',axis=1,inplace=True)

No more VIF > 5 ✅
No multicollinearity issue ✅
Model becomes stable ✅

ML model training

In [12]:
# 6. Train/Test split
from sklearn.preprocessing import StandardScaler
X = df.drop('price', axis=1)  # Only input features
y = df['price']               # Target

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [13]:
# 7. Train model
model = LinearRegression()
model.fit(X_train, y_train)

# 8. Predict
y_pred = model.predict(X_test)

In [14]:
# 9. Evaluation
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"\nModel Evaluation:")
print(f"R² Score: {r2:.4f}")
print(f"MSE: {mse:.2f}")
print(f"RMSE: {rmse:.2f}")


Model Evaluation:
R² Score: 0.6529
MSE: 1754318687330.67
RMSE: 1324506.96


In [15]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
# Initialize Random Forest
rf_model = RandomForestRegressor(random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Predict
y_pred_rf = rf_model.predict(X_test)


In [16]:
# Mean Squared Error
mse_rf = mean_squared_error(y_test, y_pred_rf)

# Root Mean Squared Error
rmse_rf = np.sqrt(mse_rf)

# R2 Score
r2_rf = r2_score(y_test, y_pred_rf)


print(f"\nRandom Forest Model Evaluation:")
print(f"R² Score: {r2_rf:.4f}")
print(f"MSE: {mse_rf:.2f}")
print(f"RMSE: {rmse_rf:.2f}")



Random Forest Model Evaluation:
R² Score: 0.6118
MSE: 1962366397823.41
RMSE: 1400844.89


Saving the Linear Regression model

In [17]:
import pickle

# Save the model to a file
with open('model.pkl', 'wb') as file:
    pickle.dump(model, file)

print("Model saved to 'model.pkl'")


Model saved to 'model.pkl'


In [18]:
pickle.dump(le, open('label_encoder.pkl', 'wb'))
pickle.dump(scaler, open('scaler.pkl', 'wb'))