2. Machine Learning Model Development: The objective of this project is to build and compare multiple regression models to predict house prices using the House Price Prediction Dataset.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Load the raw house-price table from the project-relative CSV file.
df = pd.read_csv(
    filepath_or_buffer="datasets/House_Price.csv",
)

# Print a summary of the columns: names, non-null counts, and dtypes.
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   Id         2000 non-null   int64
 1   Area       2000 non-null   int64
 2   Bedrooms   2000 non-null   int64
 3   Bathrooms  2000 non-null   int64
 4   Floors     2000 non-null   int64
 5   YearBuilt  2000 non-null   int64
 6   Location   2000 non-null   str  
 7   Condition  2000 non-null   str  
 8   Garage     2000 non-null   str  
 9   Price      2000 non-null   int64
dtypes: int64(7), str(3)
memory usage: 156.4 KB


In [7]:
# Preview the first five rows (head's default row count) to sanity-check the values.
df.head(n=5)

Unnamed: 0,Id,Area,Bedrooms,Bathrooms,Floors,YearBuilt,Location,Condition,Garage,Price
0,1,1360,5,4,3,1970,Downtown,Excellent,No,149919
1,2,4272,5,4,3,1958,Downtown,Excellent,No,424998
2,3,3592,2,2,3,1938,Downtown,Good,No,266746
3,4,966,4,2,2,1902,Suburban,Fair,Yes,244020
4,5,4926,1,4,2,1975,Downtown,Fair,Yes,636056


In [9]:
# Count missing values per column; isna() is the canonical spelling of isnull().
df.isna().sum()

Id           0
Area         0
Bedrooms     0
Bathrooms    0
Floors       0
YearBuilt    0
Location     0
Condition    0
Garage       0
Price        0
dtype: int64

In [16]:
# Remove the 'Id' column (presumably a row identifier) before modelling.
# errors='ignore' keeps this cell idempotent: `df` is reassigned in place, so
# re-running the cell after 'Id' has already been dropped would otherwise
# raise a KeyError.
df = df.drop(columns=['Id'], errors='ignore')

In [None]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each encoded column, so k levels become k-1 indicator columns.
df = pd.get_dummies(data=df, drop_first=True)

In [None]:
# Separate the target (Price) from the predictors (every other column).
y = df['Price']
X = df.drop(columns=['Price'])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Baseline model: linear regression.
# fit() returns the estimator itself, so construction and training are chained.
lr = LinearRegression().fit(X_train, y_train)

# Evaluate on the held-out rows only.
lr_pred = lr.predict(X_test)
lr_mse, lr_r2 = (
    mean_squared_error(y_true=y_test, y_pred=lr_pred),
    r2_score(y_true=y_test, y_pred=lr_pred),
)

print("Linear Regression MSE:", lr_mse)
print("Linear Regression R2:", lr_r2)

Linear Regression MSE: 78321466146.0328
Linear Regression R2: -0.006717808430749761


In [21]:
# Single decision tree with default settings; the seed fixes its tie-breaking.
dt = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Evaluate on the held-out rows only.
dt_pred = dt.predict(X_test)
dt_mse, dt_r2 = (
    mean_squared_error(y_true=y_test, y_pred=dt_pred),
    r2_score(y_true=y_test, y_pred=dt_pred),
)

print("Decision Tree MSE:", dt_mse)
print("Decision Tree R2:", dt_r2)

Decision Tree MSE: 161085335369.78
Decision Tree R2: -1.0705367732957267


In [22]:
# Random forest with default hyper-parameters; the seed makes the ensemble repeatable.
rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Evaluate on the held-out rows only.
rf_pred = rf.predict(X_test)
rf_mse, rf_r2 = (
    mean_squared_error(y_true=y_test, y_pred=rf_pred),
    r2_score(y_true=y_test, y_pred=rf_pred),
)

print("Random Forest MSE:", rf_mse)
print("Random Forest R2:", rf_r2)

Random Forest MSE: 85298979559.80693
Random Forest R2: -0.09640442128237403


In [23]:
# Side-by-side comparison of the three models' test-set metrics, one row per model.
# NOTE(review): the recorded output shows a negative R2 for every model, i.e.
# each one scores worse than always predicting the mean of y_test. Confirm
# whether the features carry any signal before drawing conclusions.
results = pd.DataFrame(
    [
        ('Linear Regression', lr_mse, lr_r2),
        ('Decision Tree', dt_mse, dt_r2),
        ('Random Forest', rf_mse, rf_r2),
    ],
    columns=['Model', 'MSE', 'R2 Score'],
)

print(results)

               Model           MSE  R2 Score
0  Linear Regression  7.832147e+10 -0.006718
1      Decision Tree  1.610853e+11 -1.070537
2      Random Forest  8.529898e+10 -0.096404


In [24]:
# Rank the random forest's impurity-based feature importances, largest first.
# The labels are read from the fitted model (feature_names_in_) rather than
# from the global `X`: if `X` is rebuilt by re-running an earlier cell, the
# names still match the columns `rf` was actually trained on.
feature_importance = pd.DataFrame({
    'Feature': rf.feature_names_in_,
    'Importance': rf.feature_importances_
}).sort_values(by='Importance', ascending=False)

print(feature_importance)

              Feature  Importance
0                Area    0.323898
4           YearBuilt    0.262667
1            Bedrooms    0.094241
2           Bathrooms    0.071979
3              Floors    0.047900
11         Garage_Yes    0.033980
5      Location_Rural    0.030484
7      Location_Urban    0.028876
8      Condition_Fair    0.026714
6   Location_Suburban    0.026661
10     Condition_Poor    0.026602
9      Condition_Good    0.025998


In [25]:
# Candidate hyper-parameters: 2 * 3 * 2 = 12 combinations.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
}

# Exhaustive search with 5-fold cross-validation, fitted on the training rows only.
grid = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=5,
)
grid.fit(X_train, y_train)

print("Best Parameters:", grid.best_params_)

# best_estimator_ is the winning configuration refitted on all of X_train
# (GridSearchCV's refit option defaults to True).
best_rf = grid.best_estimator_
tuned_pred = best_rf.predict(X_test)

# Score the tuned forest on the same held-out rows as the untuned models.
tuned_mse, tuned_r2 = (
    mean_squared_error(y_true=y_test, y_pred=tuned_pred),
    r2_score(y_true=y_test, y_pred=tuned_pred),
)

print("Tuned Random Forest MSE:", tuned_mse)
print("Tuned Random Forest R2:", tuned_r2)

Best Parameters: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 200}
Tuned Random Forest MSE: 81492813969.9062
Tuned Random Forest R2: -0.04748124772935336
