# 4) Data Analytics I
Create a Linear Regression Model using Python/R to predict home prices using Boston Housing Dataset (https://www.kaggle.com/c/boston-housing). The Boston Housing dataset contains 
information about various houses in Boston through different parameters. There are 506 samples and 13 feature variables in this dataset.

The objective is to predict the value of prices of the house using the given features.

In [1]:
#import dependencies
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn import linear_model
from sklearn.model_selection import train_test_split

In [2]:
boston = pd.read_csv(r'ds4_housing_data.csv')
boston

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0


In [3]:
# Transform the data set into a data frame
# data = the data we want or independent variables also known as the x values
# feature_names = the column names of the data 

# RM - average number of rooms
# TAX - full-value property-tax rate per dollar 10,000
# PTRATIO - pupil-teacher ratio by town

df_x = boston[['RM','TAX','PTRATIO','MEDV']] 
df_x.head()
     

Unnamed: 0,RM,TAX,PTRATIO,MEDV
0,6.575,296,15.3,24.0
1,6.421,242,17.8,21.6
2,7.185,242,17.8,34.7
3,6.998,222,18.7,33.4
4,7.147,222,18.7,36.2


In [4]:
df_x.isnull().sum()

RM         0
TAX        0
PTRATIO    0
MEDV       0
dtype: int64

In [5]:
# target = the target of variable or the price of the houses or dependent variables also known as the y value 
df_y = pd.DataFrame(boston.MEDV)
df_y.head()

Unnamed: 0,MEDV
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2


In [6]:
# Get some Statistics from the data set, count, mean 
df_x.describe()

Unnamed: 0,RM,TAX,PTRATIO,MEDV
count,506.0,506.0,506.0,506.0
mean,6.284634,408.237154,18.455534,22.532806
std,0.702617,168.537116,2.164946,9.197104
min,3.561,187.0,12.6,5.0
25%,5.8855,279.0,17.4,17.025
50%,6.2085,330.0,19.05,21.2
75%,6.6235,666.0,20.2,25.0
max,8.78,711.0,22.0,50.0


In [7]:
# Intialise the linear regression model
from sklearn.linear_model import LinearRegression
reg = LinearRegression()

In [8]:
# Split the data into 67% training and 33% testing data
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size= 0.33, random_state= 42)
     

In [9]:
# Train the model with our training data
reg.fit(x_train, y_train)

In [10]:
# Print the co-efficients/weights for each feature/coloum of our model
print(reg.coef_) #f(x) = mx + da + b = y

[[ 9.40554071e-18  5.55111512e-17 -1.21430643e-15  1.00000000e+00]]


In [11]:
# print the predictions on our test data
y_pred = reg.predict(x_test)
print(y_pred)

[[23.6]
 [32.4]
 [13.6]
 [22.8]
 [16.1]
 [20. ]
 [17.8]
 [14. ]
 [19.6]
 [16.8]
 [21.5]
 [18.9]
 [ 7. ]
 [21.2]
 [18.5]
 [29.8]
 [18.8]
 [10.2]
 [50. ]
 [14.1]
 [25.2]
 [29.1]
 [12.7]
 [22.4]
 [14.2]
 [13.8]
 [20.3]
 [14.9]
 [21.7]
 [18.3]
 [23.1]
 [23.8]
 [15. ]
 [20.8]
 [19.1]
 [19.4]
 [34.7]
 [19.5]
 [24.4]
 [23.4]
 [19.7]
 [28.2]
 [50. ]
 [17.4]
 [22.6]
 [15.1]
 [13.1]
 [24.2]
 [19.9]
 [24. ]
 [18.9]
 [35.4]
 [15.2]
 [26.5]
 [43.5]
 [21.2]
 [18.4]
 [28.5]
 [23.9]
 [18.5]
 [25. ]
 [35.4]
 [31.5]
 [20.2]
 [24.1]
 [20. ]
 [13.1]
 [24.8]
 [30.8]
 [12.7]
 [20. ]
 [23.7]
 [10.8]
 [20.6]
 [20.8]
 [ 5. ]
 [20.1]
 [48.5]
 [10.9]
 [ 7. ]
 [20.9]
 [17.2]
 [20.9]
 [ 9.7]
 [19.4]
 [29. ]
 [16.4]
 [25. ]
 [25. ]
 [17.1]
 [23.2]
 [10.4]
 [19.6]
 [17.2]
 [27.5]
 [23. ]
 [50. ]
 [17.9]
 [ 9.6]
 [17.2]
 [22.5]
 [21.4]
 [12. ]
 [19.9]
 [19.4]
 [13.4]
 [18.2]
 [24.6]
 [21.1]
 [24.7]
 [ 8.7]
 [27.5]
 [20.7]
 [36.2]
 [31.6]
 [11.7]
 [39.8]
 [13.9]
 [21.8]
 [23.7]
 [17.6]
 [24.4]
 [ 8.8]
 [19.2]
 [25.3]


In [12]:
# Print the actual values
print(y_test)

     MEDV
173  23.6
274  32.4
491  13.6
72   22.8
452  16.1
..    ...
110  21.7
321  23.1
265  22.8
29   21.0
262  48.8

[167 rows x 1 columns]


In [13]:
# Check the model performance/accuracy using Mean Squared error(MSE)
print( np.mean((y_pred - y_test)**2 ))
     

9.911777469612269e-29


In [14]:
# Check the model performance/accuracy using Mean Squared error (MSE) and sklearn.metrics
from sklearn.metrics import mean_squared_error
print( mean_squared_error(y_test, y_pred))

9.911777469612269e-29


In [15]:
from sklearn import metrics
mse = metrics.mean_squared_error(y_test,y_pred)
mae = metrics.mean_absolute_error(y_test,y_pred)
rmse = np.sqrt(metrics.mean_squared_error(y_test,y_pred))
print(f'Mean Squared Error: {mse}',f'Mean Absolute Error: {mae}',f'Root Mean Squared Error: {rmse}',sep='\n')

Mean Squared Error: 9.911777469612269e-29
Mean Absolute Error: 8.674365284616194e-15
Root Mean Squared Error: 9.95579101307991e-15


From above *Evaluation Metrices*, we can notice that Root Mean Squared Error is low for our  Regression Model and that is good thing for us.