In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn as sk
from sklearn.model_selection import train_test_split

# Data Formatting

This dataset contains information about homes in the Boston area. With a number of feature variables available, we aim to predict either median home value or an estimate of air quality from the remaining variables.

In [2]:
# Modify the csv to have all columns on the same row
data = []
with open('boston.csv', 'r') as file, open('boston_mod.csv', 'a') as file2:
#with open("boston.csv") as file:
    for i,line in enumerate(file):
        if i > 21:
            if i % 2 == 0:
                new_line = line.rstrip()
            else:
                new_line = new_line + line.rstrip()
                #print(new_line.split())
                line_float = [float(i) for i in new_line.split()]
                data.append(np.array(line_float))
                file2.write(new_line + '\n')
                
data = np.array(data)

In [3]:
column_names = (['CRIM', #per capita crime rate by town
                 'ZN', #proportion of residential land zoned for lots over 25,000 sq.ft.
                 'INDUS', #proportion of non-retail business acres per town
                 'CHAS', #Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
                 'NOX', #nitric oxides concentration (parts per 10 million)
                 'RM', #average number of rooms per dwelling
                 'AGE', #proportion of owner-occupied units built prior to 1940
                 'DIS', #weighted distances to five Boston employment centres
                 'RAD', #index of accessibility to radial highways
                 'TAX', #full-value property-tax rate per $10,000
                 'PTRATIO', #pupil-teacher ratio by town
                 'B', #1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
                 'LSTAT', #% lower status of the population
                 'MEDV']) #Median value of owner-occupied homes in $1000's
feature_names = column_names # Extract the features names

In [4]:
# Let MEDV (Median home value) or NOX (Nitric oxides concentraion in air) be the target variable
boston = pd.DataFrame(data=data, columns=column_names)
target_col = 'MEDV' #'NOX'

X = boston.loc[:, boston.columns != target_col] # Features
Y = boston.loc[:, boston.columns == target_col] # Target

# 70% train, 20% test, 10% validation
X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, test_size = 0.3, random_state=1)
X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size = 0.67, random_state=1)

# Try a series of pre-built models to find best performing

In [5]:
### KNN Regression ###

from sklearn.neighbors import KNeighborsRegressor
KNNReg = KNeighborsRegressor(n_neighbors=3)
KNNReg.fit(X_train, Y_train)
# Use the trained KNN classifier to predict the targets for the testing feature sample
Y_predict = KNNReg.predict(X_test)

# Measure the accuracy of our classifier by comparing our predictions to our known target values
print("KNN regression accuracy:", KNNReg.score(X_test, Y_test))

KNN regression accuracy: 0.6047362193880585


In [6]:
### Ridge Regression ###

from sklearn import linear_model
# Now lets try with a ridge classifier where we're doing some crossvalidation
RidgeReg = linear_model.RidgeCV(alphas=0.5)
RidgeReg.fit(X_train, Y_train)
predict = RidgeReg.predict(X_test)

print("Ridge regression accuracy:",RidgeReg.score(X_test, Y_test))


BayReg = linear_model.BayesianRidge()
BayReg.fit(X_train, np.ravel(Y_train))
predict = BayReg.predict(X_test)

print("Bayesian Ridge regression accuracy:",BayReg.score(X_test, Y_test))

Ridge regression accuracy: 0.8036563298060967
Bayesian Ridge regression accuracy: 0.7893805796381106


In [7]:
### Lasso models ###

LassoReg = linear_model.Lasso(alpha=0.1)
LassoReg.fit(X_train, Y_train)
predict = LassoReg.predict(X_test)

print("Lasso regression accuracy:",LassoReg.score(X_test, Y_test))


# Lasso Lars?
LarsReg = linear_model.LassoLars(alpha=0.01)
LarsReg.fit(X_train, Y_train)
predict = LarsReg.predict(X_test)

print("Lars Lasso regression accuracy:",LarsReg.score(X_test, Y_test))

Lasso regression accuracy: 0.7974774427057956
Lars Lasso regression accuracy: 0.8014240751546191


In [8]:
### Elastic net model ###

ElasticReg = linear_model.ElasticNet(alpha=0.01)
ElasticReg.fit(X_train, Y_train)
predict = ElasticReg.predict(X_test)

print("Elastic regression accuracy:",ElasticReg.score(X_test, Y_test))

Elastic regression accuracy: 0.8044337929853431


For median home value, the simple best performing models are the Ridge, Lars Lasso, and Elastic regressions, each giving just over 80% accuracy for this training set.

# Try a custom built model

In [19]:
import tensorflow as tf
from tensorflow.keras import backend as K
from sklearn.metrics import r2_score

# Build a machine learning model by building up layers
model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(256, activation='swish'),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(units=1)
])

In [20]:
# Define the optimizer, loss function, and metric to evaluate quality of fit
def R2_score(y_true, y_pred):
    mse = K.square(y_pred - y_true)
    mean = K.mean(y_true)
    tss = K.square(y_true - mean)
    return abs(1 - K.sqrt(mse/K.square(y_true)))

# Define loss function
#loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-2), 
              loss='mean_absolute_error', metrics=R2_score)

In [21]:
# Train the model
model.fit(X_train, Y_train, epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.src.callbacks.History at 0x7f9a7dfa1300>

In [22]:
# Use model to generate predictions for x_val
predictions = model(X_val.values).numpy()
#print(predictions)
acc_NOX = model.evaluate(X_val, Y_val, verbose=0)
print("Our model predicts air quality with accuracy:", acc_NOX[1])

Our model predicts air quality with accuracy: 0.8156891465187073


In [23]:
# Change target variable from 'NOX' to 'MEDV'
target_col = 'MEDV'

# Need to redefine and resplit the data with new target variable
X = boston.loc[:, boston.columns != target_col]
Y = boston.loc[:, boston.columns == target_col]

# 70% train, 20% test, 10% validation
X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, test_size = 0.3, random_state=1)
X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size = 0.67, random_state=1)

In [24]:
model2 = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units=1),
    tf.keras.layers.Dense(256, activation='swish'),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(units=1)
])

model2.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-2), 
              loss='mean_absolute_error', metrics=R2_score)
model2.fit(X_train, Y_train, epochs=25)

model2.evaluate(X_val, Y_val, verbose=1)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


[4.1644816398620605, 0.8109233379364014]

In [25]:
# Use model to generate predictions for x_val
predictions = model2(X_val.values).numpy()
#print(predictions)
acc_MED = model2.evaluate(X_val, Y_val, verbose=0)
print("Our model predicts home value with accuracy:", acc_MED[1])

Our model predicts home value with accuracy: 0.8109233379364014


The custom built model ends up with a bit over 80% accuracy for both target variables. This is marginally more effective than the best pre-existing models.