In [1]:
from sklearn.datasets import load_boston
import numpy as np
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import KFold
from sklearn.preprocessing import PolynomialFeatures



# 1 & 2. Exploration of the dataset & predicting price of a house
a. Explore the dataset by using the Scikit Learn library and Numpy.

In [2]:
# Avoid printing out warnings
with warnings.catch_warnings():
     warnings.filterwarnings("ignore")
     X, y = load_boston(return_X_y=True)

In [3]:
X.shape, y.shape

((506, 13), (506,))

In [4]:
X[0] , y[0]

(array([6.320e-03, 1.800e+01, 2.310e+00, 0.000e+00, 5.380e-01, 6.575e+00,
        6.520e+01, 4.090e+00, 1.000e+00, 2.960e+02, 1.530e+01, 3.969e+02,
        4.980e+00]),
 24.0)

# 3. Linear Regression closed form with kfold validation 

In [5]:
# Append a column of ones to the feature matrix to accommodate the bias
X_b = np.c_[np.ones((X.shape[0], 1)), X] 
y = np.array([y]).T  

# mean squared errors for training and test sets
mse_train = 0
mse_test = 0

# 10 splits 
kf = KFold(n_splits=10, shuffle=True)

for train_index, test_index in kf.split(X):
    
    # Split the data into training and test sets
    X_train_b, X_test_b = X_b[train_index], X_b[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Fit the model using the closed-form solution
    theta = np.linalg.inv(X_train_b.T.dot(X_train_b)).dot(X_train_b.T).dot(y_train)
    
    # Calculate mean squared error on training set & test set 
    mse_train += np.mean((y_train - X_train_b.dot(theta))**2)
    mse_test += np.mean((y_test - X_test_b.dot(theta))**2)


print(f"The average mean square error on the training set: {mse_train/10}")
print(f"The average mean square error on the test set: {mse_test/10}")

The average mean square error on the training set: 21.8021551962688
The average mean square error on the test set: 23.735710217214514


# 4. & 5. Ridge Regression with KFold Validation 

In [6]:
A = np.eye(X_b.shape[1])
A[0][0] = 0

alpha = np.logspace(1, 7, num=13)

mse_train_alphas = []
mse_test_alphas = []

kf = KFold(n_splits=10, shuffle=True)

for a in alpha:
    mse_train = 0
    mse_test = 0
    for train_index, test_index in kf.split(X_b):
        # Split the data into training and test sets
        X_train_b, X_test_b = X_b[train_index], X_b[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Fit the model using the closed-form solution
        theta = np.linalg.inv(X_train_b.T.dot(X_train_b) + a * A).dot(X_train_b.T).dot(y_train)

        # Calculate mean squared error on training set & test set
        mse_train += np.mean((y_train - X_train_b.dot(theta))**2)
        mse_test += np.mean((y_test - X_test_b.dot(theta))**2)

    #keeing track of the mse errors 
    mse_train_alphas.append(mse_train / 10) 
    mse_test_alphas.append(mse_test / 10)

#finding the best alpha based on the error
index = np.argmin(mse_test_alphas)

print(f"The best model with the least test error is with alpha value {alpha[index]}\n") 
print(f"The training error of the best model is {mse_train_alphas[index]}")
print(f"The testing error of the best model is {mse_test_alphas[index]}")

The best model with the least test error is with alpha value 10.0

The training error of the best model is 22.61241006057968
The testing error of the best model is 24.063665406398414


# 6. Polynomial Transformation & ridge regression

In [21]:
# perform a polynomial features transform of the dataset
trans = PolynomialFeatures(degree=2)

# applying polynomial transformation only on the X and not X with bias because the function already adds it
X_b_transformed = trans.fit_transform(X) 

A = np.eye(X_b_transformed.shape[1])
A[0][0] = 0

alpha = np.logspace(1, 7, num=13)

mse_train_alphas = []
mse_test_alphas = []

kf = KFold(n_splits=10, shuffle=True)

for a in alpha:
    mse_train = 0
    mse_test = 0
    for train_index, test_index in kf.split(X_b_transformed):
        
        # Split the data into training and test sets
        X_train_b, X_test_b = X_b_transformed[train_index], X_b_transformed[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Fit the model using the closed-form solution 
        theta = np.linalg.inv(X_train_b.T.dot(X_train_b) + a * A).dot(X_train_b.T).dot(y_train)

        # Calculate mean squared error on training set & test set
        mse_train += np.mean((y_train - X_train_b.dot(theta))**2)
        mse_test += np.mean((y_test - X_test_b.dot(theta))**2)

    mse_train_alphas.append(mse_train / 10)
    mse_test_alphas.append(mse_test / 10)

index = np.argmin(mse_test_alphas)

print(f"The best model with the least test error is with alpha value {alpha[index]}\n") 
print(f"The training error of the best model is {mse_train_alphas[index]}")
print(f"The testing error of the best model is {mse_test_alphas[index]}")

The best model with the least test error is with alpha value 316.22776601683796

The training error of the best model is 8.110304254637338
The testing error of the best model is 13.453423742307717


# 7. Gradient Descent method

In [22]:
lr = 0.0000035 # learning rate 
epochs = 500000
n = 506 # no of data points

# mean squared errors for training and test sets
mse_train = 0
mse_test = 0

# 10 splits 
kf = KFold(n_splits=10, shuffle=True)
ctr = 1
for train_index, test_index in kf.split(X):
    
    # Split the data into training and test sets
    X_train_b, X_test_b = X_b[train_index], X_b[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # gradient descent 
    theta = np.random.randn(14,1) # random initialization
    print("---------------------------------------------------------------------------------->")
    print(f"Gradient descent training {ctr} ...")
   
    for eps in range(epochs):
        gradient = (2/n) * (X_train_b.T.dot(X_train_b.dot(theta) - y_train))
        theta = theta - (lr * gradient) #training the theta
   
    # Calculate mean squared error on training set & test set 
    mse_train += np.mean((y_train - X_train_b.dot(theta))**2)
    mse_test += np.mean((y_test - X_test_b.dot(theta))**2)
    
    print(f"Kfold split {ctr}: Training Loss: {mse_train/ctr}, Test Loss: {mse_test/ctr}")
    ctr += 1

print("---------------------------------------------------------------------------------->\n")
print(f"The average mean square error on the training set: {mse_train/10}")
print(f"The average mean square error on the test set: {mse_test/10}")

---------------------------------------------------------------------------------->
Gradient descent training 1 ...
Kfold split 1: Training Loss: 25.731080395790936, Test Loss: 18.80952909763336
---------------------------------------------------------------------------------->
Gradient descent training 2 ...
Kfold split 2: Training Loss: 25.478865993486803, Test Loss: 18.809288111996835
---------------------------------------------------------------------------------->
Gradient descent training 3 ...
Kfold split 3: Training Loss: 24.916301067256153, Test Loss: 24.046520126562616
---------------------------------------------------------------------------------->
Gradient descent training 4 ...
Kfold split 4: Training Loss: 25.27687550367338, Test Loss: 22.835023399552906
---------------------------------------------------------------------------------->
Gradient descent training 5 ...
Kfold split 5: Training Loss: 24.92925333371076, Test Loss: 25.35940772017966
------------------------

# 8. Lasso Regression 
a. Cost function of Lasso Regression:
    𝐽(𝜃) = 𝑀𝑆𝐸(𝜃) + 𝛼∑|𝜃𝑖|

In [23]:
lr = 0.0000035 # learning rate 
epochs = 500000
n = 506 # no of data points

# mean squared errors for training and test sets
mse_train = 0
mse_test = 0

# 10 splits 
kf = KFold(n_splits=10, shuffle=True)
ctr = 1

for train_index, test_index in kf.split(X):    
    # Split the data into training and test sets
    X_train_b, X_test_b = X_b[train_index], X_b[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # gradient descent 
    theta = np.random.randn(14,1) # random initialization
    print("---------------------------------------------------------------------------------->")
    print(f"Gradient descent training with cost function of lasso regression {ctr} ...")
   
    for eps in range(epochs): 
        # updating the gradient for lasso 
        gradient = ((2/n) * (X_train_b.T.dot(X_train_b.dot(theta) - y_train))) + np.sign(theta)
        theta = theta - (lr * gradient)# training the theta
   
    # Calculate mean squared error on training set & test set 
    mse_train += np.mean((y_train - X_train_b.dot(theta))**2)
    mse_test += np.mean((y_test - X_test_b.dot(theta))**2)
    print(f"Kfold split {ctr}: Training Loss: {mse_train/ctr}, Test Loss: {mse_test/ctr}")
    ctr += 1

print("---------------------------------------------------------------------------------->\n")
print(f"The average mean square error on the training set: {mse_train/10}")
print(f"The average mean square error on the test set: {mse_test/10}")

---------------------------------------------------------------------------------->
Gradient descent training with cost function of lasso regression 1 ...
Kfold split 1: Training Loss: 26.51371939987463, Test Loss: 31.778551018684812
---------------------------------------------------------------------------------->
Gradient descent training with cost function of lasso regression 2 ...
Kfold split 2: Training Loss: 26.316368715940243, Test Loss: 27.258857455000317
---------------------------------------------------------------------------------->
Gradient descent training with cost function of lasso regression 3 ...
Kfold split 3: Training Loss: 26.375664001803898, Test Loss: 25.91459289265441
---------------------------------------------------------------------------------->
Gradient descent training with cost function of lasso regression 4 ...
Kfold split 4: Training Loss: 26.571940963288696, Test Loss: 24.318012784769866
--------------------------------------------------------------

# 9. Elastic Net 
a. Cost Function of Elastic Net

In [25]:
lr = 0.0000035 # learning rate 
epochs = 500000
n = 506 # no of data points
a1 = 0.4

# mean squared errors for training and test sets
mse_train = 0
mse_test = 0

# 10 splits 
kf = KFold(n_splits=10, shuffle=True)
ctr = 1

for train_index, test_index in kf.split(X):
    # Split the data into training and test sets
    X_train_b, X_test_b = X_b[train_index], X_b[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # gradient descent 
    theta = np.random.randn(14,1) # random initialization
    print("---------------------------------------------------------------------------------->")
    print(f"Gradient descent training with cost function of elastic regression {ctr} ...")
   
    for eps in range(epochs):
        gradient = ( 
            ((2/n) * (X_train_b.T.dot(X_train_b.dot(theta) - y_train))) #gradient of mse
            + (2 * (1-a1) * theta) #gradient of l2 - penality 
            + (a1 * np.sign(theta))) #gradient of l1 - penality 
        theta = theta - (lr * gradient)# training the theta
   
    # Calculate mean squared error on training set & test set 
    mse_train += np.mean((y_train - X_train_b.dot(theta))**2)
    mse_test += np.mean((y_test - X_test_b.dot(theta))**2)
    print(f"Kfold split {ctr}: Training Loss: {mse_train/ctr}, Test Loss: {mse_test/ctr}")
    ctr += 1

print("---------------------------------------------------------------------------------->\n")
print(f"The average mean square error on the training set: {mse_train/10}")
print(f"The average mean square error on the test set: {mse_test/10}")

---------------------------------------------------------------------------------->
Gradient descent training with cost function of elastic regression 1 ...
Kfold split 1: Training Loss: 30.78236934148858, Test Loss: 30.55225729675368
---------------------------------------------------------------------------------->
Gradient descent training with cost function of elastic regression 2 ...
Kfold split 2: Training Loss: 30.670918839496032, Test Loss: 32.75151287766248
---------------------------------------------------------------------------------->
Gradient descent training with cost function of elastic regression 3 ...
Kfold split 3: Training Loss: 30.96818131206629, Test Loss: 32.543029542639424
---------------------------------------------------------------------------------->
Gradient descent training with cost function of elastic regression 4 ...
Kfold split 4: Training Loss: 30.77448095431799, Test Loss: 34.219945734556696
---------------------------------------------------------

# 10. If you are given a choice of predicting future housing prices using one of the models you have learned above (those optimized with gradient descent), which one would you choose and why? State the parameters of that model.

### From the above training that has been done above, I would use gradient descent to train this dataset as the test loss or the mse for this much less then that of others. The parameters of the model is the wieght and the bias. The learning rate and the number of epochs. 