# Multiple Linear Regression: House Price Prediction

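This notebook demonstrates how to implement **multiple linear regression** on a house-price dataset. We'll encode the categorical features as **dummy variables**, expand the features with degree-2 polynomial terms, fit the model, and evaluate it using R² and RMSE on both the train and test data. We then compare regularized variants (Ridge, Lasso, and ElasticNet) and tune the Ridge penalty with a grid search.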

---

## Step 1: Import Libraries and Load the Data


In [1]:
# Load required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt


# Read the house listings from the CSV in the working directory and preview
# the first five rows.
df = pd.read_csv(filepath_or_buffer="house.csv")
df.head(n=5)

Unnamed: 0,Area,Bedrooms,Bathrooms,Material,Locality,Price
0,1790,2,2,Concrete,Riverside,114300
1,2030,4,2,Concrete,Riverside,114200
2,1740,3,2,Concrete,Riverside,114800
3,1980,3,2,Concrete,Riverside,94700
4,2130,3,3,Concrete,Riverside,119800


In [2]:
# Encode the categorical columns as integer 0/1 dummy variables, dropping the
# first level of each so the dummies are not perfectly collinear.
# NOTE: this rebinds `df`, so the cell cannot be re-run on its own -- the
# "Material" and "Locality" columns no longer exist after the first run.
df = pd.get_dummies(
    df,
    columns=["Material", "Locality"],
    drop_first=True,
    dtype=int,
)

# Separate the target ("Price") from the predictor columns
y = df["Price"]
X = df.drop("Price", axis=1)

In [4]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Expand the predictors with every degree-2 term (squares and pairwise
# products); the constant bias column is left out.
poly = PolynomialFeatures(degree=2, include_bias=False)

# Fit the expansion on the training split only, then reuse it on the test split
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Show how many columns the expansion added
print("Before:", X_train.shape)
print("After:", X_train_poly.shape)

Before: (102, 6)
After: (102, 27)


In [7]:
# Fit ordinary least squares on the polynomial training features
model = LinearRegression()
model.fit(X_train_poly, y_train)

# Predictions for both splits
y_train_pred = model.predict(X_train_poly)
y_test_pred = model.predict(X_test_poly)

# RMSE is the square root of the mean squared error; R2 comes from r2_score
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
train_r2 = r2_score(y_train, y_train_pred)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_r2 = r2_score(y_test, y_test_pred)

# One-row summary table: each metric becomes a single-element column
results = {
    metric_name: [metric_value]
    for metric_name, metric_value in (
        ("Train RMSE", train_rmse),
        ("Test RMSE", test_rmse),
        ("Train R2", train_r2),
        ("Test R2", test_r2),
    )
}

pd.DataFrame(results)

Unnamed: 0,Train RMSE,Test RMSE,Train R2,Test R2
0,10875.497899,13298.988211,0.841904,0.699926


# Regularization

In [9]:
from sklearn.linear_model import Ridge

# Ridge regression on the polynomial features with a weak penalty (alpha=0.1)
rModel = Ridge(alpha=0.1)
rModel.fit(X_train_poly, y_train)
print("Train R2: ", rModel.score(X_train_poly, y_train))
print("Test R2: ", rModel.score(X_test_poly, y_test))

# Label the listing with the actual number of fitted coefficients instead of a
# hard-coded 27, so the header stays correct if the feature expansion changes.
n_coefs = rModel.coef_.shape[-1]
print(f"Model Coefficients (m1,m2-----m{n_coefs})")
print("_____________________________________")
# Cast to str so each coefficient is shown as a full-precision string rather
# than in numpy's abbreviated float formatting.
print(np.array(rModel.coef_).astype(str))

Train R2:  0.8384039521312241
Test R2:  0.723391322402235
Model Coeficients (m1,m2-----m27)
_____________________________________
['23.041729514723833' '-2928.4073753305893' '-28355.893685006227'
 '11305.544124091395' '5229.605103626944' '9764.58531457125'
 '-0.017252491680094236' '5.326369253743194' '27.721225646938734'
 '9.107979488826857' '-7.425561164397852' '-10.02786509002988'
 '-48.22942466166066' '-3120.3958411719477' '2027.241524123205'
 '2600.3970437805056' '8624.260508744846' '-2152.4260792476457'
 '-15597.709104540174' '1924.6303330178569' '1723.2643989183773'
 '11305.544124090065' '7198.827590296689' '22068.633983445154'
 '5229.60510361901' '0.0' '9764.585314562282']


In [38]:
from sklearn.linear_model import Ridge

# Ridge regression on the polynomial features with a strong penalty (alpha=600)
# NOTE(review): the features are not standardized before the penalty is
# applied, and the saved output of this cell ends with a scipy linalg.solve
# warning line -- consider scaling the features first; confirm against a
# fresh run.
rModel = Ridge(alpha=600)
rModel.fit(X_train_poly, y_train)
print("Train R2: ", rModel.score(X_train_poly, y_train))
print("Test R2: ", rModel.score(X_test_poly, y_test))

# Label the listing with the actual number of fitted coefficients instead of a
# hard-coded 27, so the header stays correct if the feature expansion changes.
n_coefs = rModel.coef_.shape[-1]
print(f"Model Coefficients (m1,m2-----m{n_coefs})")
print("_____________________________________")
# Cast to str so each coefficient is shown as a full-precision string rather
# than in numpy's abbreviated float formatting.
print(np.array(rModel.coef_).astype(str))

Train R2:  0.8415288425938345
Test R2:  0.7099529393822066
Model Coeficients (m1,m2-----m27)
_____________________________________
['-32.881599697929744' '-0.9233098394748758' '-0.5927776098678653'
 '1.2630036431654053' '-0.38181267953119474' '-0.190391203670357'
 '0.05426333426029218' '-1.5773624204805088' '-45.19915413148296'
 '11.150133458326632' '8.960283793039979' '20.840826229959163'
 '5.909697982735006' '-1.051673491545251' '2.830687919240228'
 '1.0789131727413788' '4.477223945170011' '-2.950072179341604'
 '2.9523014633212976' '-2.8761595863241904' '2.6081471090100625'
 '1.2630036431656055' '-0.6446000192830295' '0.32412897491634096'
 '-0.38181267953070824' '0.0' '-0.19039120367098641'
 '-2.4052124405316116e-05' '0.005760097069418087' '0.020819693423606987'
 '0.0009478011883888018' '-0.01229939962458515' '-0.021003421128412088'
 '-0.4922993492337068' '-2.7670860536611808' '1.1030162708437548'
 '1.7555490363509794' '4.267835695935616' '2.932169791851664'
 '-9.73658970886481' '2.2

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [18]:
from sklearn.linear_model import Lasso

# Lasso regression (alpha=600) on the polynomial features; the iteration cap
# is raised to 1,000,000 for the solver.
lModel = Lasso(alpha=600, max_iter=1000000)
lModel.fit(X_train_poly, y_train)
print("Train R2: ", lModel.score(X_train_poly, y_train))
print("Test R2: ", lModel.score(X_test_poly, y_test))

# Label the listing with the actual number of fitted coefficients instead of a
# hard-coded 27, so the header stays correct if the feature expansion changes.
n_coefs = lModel.coef_.shape[-1]
print(f"Model Coefficients (m1,m2-----m{n_coefs})")
print("_____________________________________")
# Cast to str so each coefficient is shown as a full-precision string rather
# than in numpy's abbreviated float formatting.
print(np.array(lModel.coef_).astype(str))

Train R2:  0.8139258610206626
Test R2:  0.7473237961253145
Model Coeficients (m1,m2-----m27)
_____________________________________
['34.35615753797049' '0.0' '-0.0' '-0.0' '0.0' '0.0'
 '-0.0037631467154465316' '0.817884705025381' '3.114380349384627'
 '8.570146312786065' '3.7176449064899324' '17.730175125841917' '0.0'
 '-0.0' '0.0' '-0.0' '0.0' '-0.0' '-0.0' '-0.0' '0.0' '-0.0' '-0.0' '0.0'
 '0.0' '0.0' '0.0']


In [24]:
from sklearn.linear_model import ElasticNet

# ElasticNet regression (alpha=30) on the polynomial features; the iteration
# cap is raised to 1,000,000 for the solver.
eModel = ElasticNet(alpha=30, max_iter=1000000)
eModel.fit(X_train_poly, y_train)
print("Train R2: ", eModel.score(X_train_poly, y_train))
print("Test R2: ", eModel.score(X_test_poly, y_test))

# Label the listing with the actual number of fitted coefficients instead of a
# hard-coded 27, so the header stays correct if the feature expansion changes.
n_coefs = eModel.coef_.shape[-1]
print(f"Model Coefficients (m1,m2-----m{n_coefs})")
print("_____________________________________")
# Cast to str so each coefficient is shown as a full-precision string rather
# than in numpy's abbreviated float formatting.
print(np.array(eModel.coef_).astype(str))

Train R2:  0.8140299735514237
Test R2:  0.7475074764018105
Model Coeficients (m1,m2-----m27)
_____________________________________
['34.554632397606085' '0.1377251736962484' '-2.2440978053864393' '-0.0'
 '0.0' '1.0808958250754668' '-0.0037622551044693953' '0.7270621268976554'
 '3.1457168406010894' '8.582117820839597' '3.7327819829592976'
 '17.692078128867262' '30.658983621652876' '-3.5897193468829034'
 '3.655037599211439' '-2.4557480727124514' '17.11838249236251'
 '-8.91438687693756' '-15.731617566868259' '-8.070536857737451'
 '3.2103459506298573' '-0.0' '-10.553030760903024' '25.61899308981631'
 '0.0' '0.0' '1.0808962445010597']


In [39]:
# Rebuild the degree-2 polynomial features from the current train/test split
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Baseline: unregularized linear regression on the expanded features
Model = LinearRegression()
Model.fit(X_train_poly, y_train)
print("Train R2: ", Model.score(X_train_poly, y_train))
print("Test R2: ", Model.score(X_test_poly, y_test))
print("##################################################")

# Feature counts before and after the expansion
print("Before:", X_train.shape)
print("After:", X_train_poly.shape)

# Comparison: ElasticNet with alpha=3 on the same features
eModel = ElasticNet(alpha=3, max_iter=1000000)
eModel.fit(X_train_poly, y_train)
print("Train R2: ", eModel.score(X_train_poly, y_train))
print("Test R2: ", eModel.score(X_test_poly, y_test))

Train R2:  0.8419044007462411
Test R2:  0.6999263312482396
##################################################
Before: (102, 6)
After: (102, 27)
Train R2:  0.8148990702353289
Test R2:  0.7489008116976695


In [44]:
from sklearn.model_selection import GridSearchCV

# Candidate regularization strengths to compare
alpha_values = [1, 3, 10]

# 10-fold cross-validated search over the Ridge alpha; n_jobs=-1 runs the
# fits in parallel and verbose=True prints the fit summary.
gridModel = GridSearchCV(
    estimator=Ridge(max_iter=100000),
    param_grid={"alpha": alpha_values},
    cv=10,
    n_jobs=-1,
    verbose=True,
)
gridModel.fit(X_train_poly, y_train)

# Display the estimator refit with the best-scoring alpha
gridModel.best_estimator_

Fitting 10 folds for each of 3 candidates, totalling 30 fits
