In [1]:
# Import the library for generating random variables
import numpy as np

# Import the library for handling data
import pandas as pd

# Visualisation library
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

# Import libraries for plotting
import matplotlib.pyplot as plt

# Import the libraries for T-test and ANOVA
import scipy.stats as stats

# Import statsmodels.
import statsmodels.api as sm

from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn import metrics 
from sklearn.metrics import mean_squared_error

# To plot the graph embedded in the notebook
%matplotlib inline

In [2]:
# Load the Boston Housing dataset that ships with scikit-learn.
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell needs an older scikit-learn release to run.
from sklearn.datasets import load_boston
boston = load_boston()

# Printing the raw dataset object is left disabled because the output is
# difficult to read and interpret.
# print(boston)

# Adapted from: https://github.com/Tsmith5151/Boston-Housing-Prices/blob/master/boston_housing.ipynb

# Pull the raw pieces out of the dataset object.
price = boston.target                 # target values (MEDV)
feature = boston.data                 # attribute values
attributes = boston.feature_names     # feature names, used as column labels

# Assemble one DataFrame: the feature columns first, then the MEDV target.
data = pd.DataFrame(data=feature, columns=attributes)
target = pd.DataFrame({'MEDV': price})
df = pd.concat((data, target), axis="columns")

# PREDICT

There are 13 features. Here we will remove the columns that will not be used to predict the target variable.

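* 16 data points have an 'MEDV' value of 50.0. These data points likely contain missing or censored values and are candidates for removal.
* 1 data point has an 'RM' value of 8.78. This data point can be considered an outlier and is also a candidate for removal.

Note: the code below does not yet drop these rows; the train/test split still reports all 506 data points.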

In [3]:
# Split the frame into the independent variables (X) and the target variable (y).
# Adapted from: https://books.google.ie/books?id=7zhDDwAAQBAJ&pg=PT358&lpg=PT358&dq=X+%3D+tips%5B%27total_bill%27%5D.values.reshape(-1,1)+Y+%3D+tips%5B%27tip%27%5D.values.reshape(-1,1)&source=bl&ots=vHyyJ7sVqy&sig=ACfU3U1WQO1y3kwv3o_xi-PomAdcojqz9g&hl=en&sa=X&ved=2ahUKEwjtuMrfmovmAhV1o3EKHaAfBpgQ6AEwAHoECAkQAQ#v=onepage&q=X%20%3D%20tips%5B'total_bill'%5D.values.reshape(-1%2C1)%20Y%20%3D%20tips%5B'tip'%5D.values.reshape(-1%2C1)&f=false

# The predictor columns, in the order they appear in the dataset.
feature_columns = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
]

# Independent variables as a 2-D array, one column per feature.
X = df.loc[:, feature_columns].to_numpy()

# Selecting with a list keeps the target 2-D (a single MEDV column).
y = df.loc[:, ["MEDV"]].to_numpy()

In [None]:

# Rescale the features and the target to the [0, 1] range.
from sklearn.preprocessing import MinMaxScaler

# One scaler per array: re-fitting a single scaler on the prices would discard
# the ranges learned from the features, so the features could no longer be
# mapped back with inverse_transform.
feature_scaler = MinMaxScaler()
price_scaler = MinMaxScaler()

# X and y are the arrays built in the cell above. y was selected with a list
# of columns, so it is already 2-D and needs no reshape before scaling.
features = feature_scaler.fit_transform(X)
prices = price_scaler.fit_transform(y)

# `scaler` is kept as a name for the scaler fitted on the prices.
scaler = price_scaler

# NOTE(review): both scalers are fitted on the full dataset. If the scaled
# arrays are used for modelling, fit on the training split only so that
# test-set ranges do not leak into training.

In [5]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# A fixed random_state seeds the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Report how many rows landed in each split, and their total.
n_train = len(X_train)
n_test = len(X_test)
print("Training Size                 :", n_train)
print("Test Size                     :", n_test)
print("Total Dataset Size            :", n_train + n_test)

Training Size                 : 354
Test Size                     : 152
Total Dataset Size            : 506


In [71]:
# Fit a linear regression on the training split and evaluate it on the
# held-out test split.

# LinearRegression is reached through the imported `linear_model` module;
# the bare class name is not imported in this notebook.
lr = linear_model.LinearRegression()

# Specify the predictor X and the response y
lr.fit(X_train, y_train)

# Predict once and reuse the same array for every metric and for the table.
predict = lr.predict(X_test)
y_predict = predict  # second name for the same predictions

mse = mean_squared_error(y_test, predict)
print("Mean Square Error       : {:.2f}".format(mse))
print("Mean Absolute Error     : {:.2f}".format(metrics.mean_absolute_error(y_test, predict)))
print("Root Mean Square Error  : {:.2f}".format(np.sqrt(mse)))
# "R-Squared" is the model's score on the training data; "Score" is the same
# statistic on the test data. Both come from the fitted estimator `lr`.
print("R-Squared               : {:.2f}".format(lr.score(X_train, y_train)))
print("Score                   : {:.2f}".format(lr.score(X_test, y_test)))

# Determine how accurate this algorithm is at predicting values by placing
# actual and predicted prices side by side. A separate name is used so the
# housing DataFrame `df` is not overwritten and earlier cells stay re-runnable.
predictions = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': y_predict.flatten()})
predictions.head(10)

Mean Square Error       : 21.52
Mean Absolute Error     : 3.16
Root Mean Square Error  : 4.64
R-Squared               : 0.74
Score                   : 0.71


Unnamed: 0,Actual,Predicted
0,23.6,28.648960
1,32.4,36.495014
2,13.6,15.411193
3,22.8,25.403213
4,16.1,18.855280
5,20.0,23.146689
6,17.8,17.392124
7,14.0,14.078599
8,19.6,23.036927
9,16.8,20.599433


### Build Model

In [None]:
# Adapted from: https://github.com/ianmcloughlin/jupyter-teaching-notebooks/blob/master/keras-and-iris.ipynb
# NOTE(review): `kr` is not imported anywhere in the visible notebook. It is
# presumably Keras (e.g. `import keras as kr`); confirm and add it to the
# import cell.

# Start a neural network, building it by layers.
model = kr.models.Sequential()

# Hidden layer with 30 neurons. The input width is taken from the number of
# feature columns in X_train rather than hard-coded.
model.add(kr.layers.Dense(units=30, activation='relu', input_dim=X_train.shape[1]))
# Single linear output neuron: the target is one continuous value per row,
# not a class label.
model.add(kr.layers.Dense(units=1, activation='linear'))

# Build the graph with a regression loss and track mean absolute error.
# NOTE(review): 'adam' is used on the assumption that plain SGD is unstable
# on unscaled feature ranges; revisit if the inputs are scaled first.
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae'])

In [None]:

# NOTE(review): this cell also assigns `model`, replacing any model built in
# the cell above; only one of the two cells is needed.
# NOTE(review): `kr` is not imported anywhere in the visible notebook. It is
# presumably Keras (e.g. `import keras as kr`); confirm and add it to the
# import cell.

# Start a neural network, building it by layers.
model = kr.models.Sequential()

# Hidden layer with 30 neurons. The input width is taken from the number of
# feature columns in X_train rather than hard-coded.
model.add(kr.layers.Dense(units=30, activation='relu', input_dim=X_train.shape[1]))
# Single linear output neuron: the target is one continuous value per row,
# not a class label.
model.add(kr.layers.Dense(units=1, activation='linear'))

# Build the graph with a regression loss and track mean absolute error.
# NOTE(review): 'adam' is used on the assumption that plain SGD is unstable
# on unscaled feature ranges; revisit if the inputs are scaled first.
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae'])

In [None]:
# Train the neural network on the training split produced by train_test_split.
# NOTE(review): the network's input and output sizes must match these arrays
# (one input per feature column, one output value per row); confirm against
# the model definition.
model.fit(X_train, y_train, epochs=15, batch_size=10)