In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer

In [3]:
data = pd.read_csv("HousingData.csv")

In [4]:
data

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0


In [5]:
data.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,486.0,486.0,486.0,486.0,506.0,506.0,486.0,506.0,506.0,506.0,506.0,506.0,486.0,506.0
mean,3.611874,11.211934,11.083992,0.069959,0.554695,6.284634,68.518519,3.795043,9.549407,408.237154,18.455534,356.674032,12.715432,22.532806
std,8.720192,23.388876,6.835896,0.25534,0.115878,0.702617,27.999513,2.10571,8.707259,168.537116,2.164946,91.294864,7.155871,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.0819,0.0,5.19,0.0,0.449,5.8855,45.175,2.100175,4.0,279.0,17.4,375.3775,7.125,17.025
50%,0.253715,0.0,9.69,0.0,0.538,6.2085,76.8,3.20745,5.0,330.0,19.05,391.44,11.43,21.2
75%,3.560263,12.5,18.1,0.0,0.624,6.6235,93.975,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [7]:
X = data.drop('MEDV', axis=1)
y = data['MEDV']

In [8]:
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Create and train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
from sklearn import metrics
import numpy as np

mse = metrics.mean_squared_error(y_test, y_pred)
mae = metrics.mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"Root Mean Squared Error: {rmse}")


Mean Squared Error: 21.047043957270205
Mean Absolute Error: 3.1289564965046575
Root Mean Squared Error: 4.587705740048091


In [13]:
#There are 14 attributes in each case of the dataset. They are:
#CRIM - per capita crime rate by town
#ZN - proportion of residential land zoned for lots over 25,000 sq.ft.
#INDUS - proportion of non-retail business acres per town.
#CHAS - Charles River dummy variable (1 if tract bounds river; 0 otherwise)
#NOX - nitric oxides concentration (parts per 10 million)
#RM - average number of rooms per dwelling
#AGE - proportion of owner-occupied units built prior to 1940
#DIS - weighted distances to five Boston employment centres
#RAD - index of accessibility to radial highways
#TAX - full-value property-tax rate per $10,000
#PTRATIO - pupil-teacher ratio by town
#B - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
#LSTAT - % lower status of the population
#MEDV - Median value of owner-occupied homes in $1000's

In [14]:
print(y_test)

173    23.6
274    32.4
491    13.6
72     22.8
452    16.1
       ... 
110    21.7
321    23.1
265    22.8
29     21.0
262    48.8
Name: MEDV, Length: 167, dtype: float64


In [15]:
print(model.coef_)

[-1.25703469e-01  3.36974207e-02 -4.08181858e-03  3.82442914e+00
 -1.47650681e+01  4.26602500e+00 -2.87130075e-02 -1.47637284e+00
  1.88056821e-01 -6.70432353e-03 -9.03277655e-01  1.17093298e-02
 -4.70491857e-01]


In [16]:
y_pred = model.predict(X_test)
print(y_pred)

[ 2.89327828e+01  3.72264420e+01  1.53900739e+01  2.57185341e+01
  1.83596001e+01  2.29554589e+01  1.79744238e+01  1.44658802e+01
  2.20164734e+01  2.07826529e+01  2.52375783e+01  1.86049092e+01
 -5.97686216e+00  2.19128236e+01  1.89542357e+01  2.52634479e+01
  1.93898010e+01  6.10283345e+00  4.07365828e+01  1.70418841e+01
  2.50043963e+01  3.04300577e+01  1.12948841e+01  2.26459624e+01
  1.74059694e+01  1.51957538e+01  2.15647727e+01  1.42789088e+01
  2.32314685e+01  1.95702105e+01  2.22135154e+01  2.51339260e+01
  2.51848660e+01  1.72517881e+01  1.62269758e+01  1.72020945e+01
  3.06696868e+01  2.02682921e+01  2.48033988e+01  2.29730392e+01
  1.46046930e+01  3.16248324e+01  4.27196491e+01  1.80945478e+01
  2.72906158e+01  1.65123231e+01  1.38565860e+01  2.66063921e+01
  1.97032124e+01  3.01632377e+01  2.09122025e+01  3.32819532e+01
  1.58148322e+01  2.61315288e+01  3.94118172e+01  2.24587904e+01
  1.85684362e+01  3.30481638e+01  2.51699384e+01  1.31334565e+01
  2.25871523e+01  3.08454

In [17]:
print(np.mean((y_pred - y_test)**2))

21.047043957270205


In [18]:
print(mean_squared_error(y_test, y_pred))

21.047043957270205
