# HOUSE PRICE PREDICTION USING DIFFERENT REGRESSION TECHNIQUES

_**Predicts California home prices using different regression techniques such as closed-form linear regression and gradient descent.**_

This experiment uses already preprocessed California housing datasets stored in .npy format. Both the datasets and the relevant notebook _*\_data_preprocessing.ipynb_ can be found in the same repository location.

In [None]:
# Imports required packages

import numpy as np

from sklearn.linear_model import LinearRegression, SGDRegressor

from sklearn.metrics import root_mean_squared_error

## Retrieving Data

In [None]:
# Loads datasets from previously processed .npy files

with open("./housing_train_transformed.npy", "rb") as f:
    X_train_transformed = np.load(f)

with open("./housing_test_transformed.npy", "rb") as f:
    X_test_transformed = np.load(f)

In [None]:
# Seperates target from both the datasets

y_train = X_train_transformed[:, -1]
y_test = X_test_transformed[:, -1]

X_train_transformed = X_train_transformed[:, :-1]
X_test_transformed = X_test_transformed[:, :-1]

## Closed-Form Approach

### Normal Equation [$\hat{\theta} = (X^TX)^{-1}X^Ty$]

In [None]:
# Estimates the model coefficients/parameters/weights (often denoted by θ) from the
# normal equation  θ = (XᵀX)⁻¹ Xᵀy.
#
# Instead of forming the inverse explicitly, the equivalent linear system
# (XᵀX) θ = Xᵀy is solved directly, which is cheaper and numerically more stable.
# The textbook form of the same expression is
#
#     theta_ne = np.linalg.inv(
#         X_train_transformed.T @ X_train_transformed) @ X_train_transformed.T @ y_train
#
# (`A @ B` is the operator spelling of `A.dot(B)`.)
#
# NOTE(review): no bias column is added to X here. The printed θ agrees with the
# LinearRegression fit further down except that its intercept appears folded into
# the last five coefficients - presumably those columns form a complete one-hot
# group; confirm against the preprocessing notebook.

theta_ne = np.linalg.solve(
    X_train_transformed.T @ X_train_transformed,
    X_train_transformed.T @ y_train)

# Shows θ values for normal equation (ne)
print(theta_ne)

[-53381.26179264 -54311.55719382  13122.77377485  -8357.34824316
  26159.33889213 -50242.06829496  37563.32620216  73068.48598075
 218989.05888959 179302.88278282 457124.98025354 215307.83111698
 222688.15596118]


In [None]:
# Predicts on both training and testing dataset as a linear combination of the
# features (matrix-vector product of the feature matrix with θ)

predictions_train_ne = X_train_transformed @ theta_ne
predictions_test_ne = X_test_transformed @ theta_ne

In [None]:
# Scores the Normal Equation model with RMSE on both training and testing dataset

rmse_train_ne = root_mean_squared_error(y_true=y_train, y_pred=predictions_train_ne)
rmse_test_ne = root_mean_squared_error(y_true=y_test, y_pred=predictions_test_ne)

print(
    "Normal Equation-based Linear Regression Performance [RMSE]:\n{:.1f} [Train]\n{:.1f} [Test]"
    .format(rmse_train_ne, rmse_test_ne)
)

Normal Equation-based Linear Regression Performance [RMSE]:
68232.8 [Train]
71002.8 [Test]


### Singular Value Decomposition (SVD)

In [None]:
# Creates the linear regression estimator that implements the SVD approach
lin_reg = LinearRegression()

# Trains it on the training features and target
lin_reg.fit(X=X_train_transformed, y=y_train)

In [None]:
# Prints linear regression model parameters

print("Linear Regression Model Parameters:\n")
for label, value in (("Intercept:", lin_reg.intercept_), ("Coefficients:", lin_reg.coef_)):
    print(label, value)

Linear Regression Model Parameters:

Intercept: 258682.5818008228
Coefficients: [-53381.26179264 -54311.55719382  13122.77377485  -8357.34824316
  26159.33889213 -50242.06829496  37563.32620216  73068.48598075
 -39693.52291123 -79379.699018   198442.39845272 -43374.75068384
 -35994.42583964]


In [None]:
# Predicts on both training and testing dataset with the fitted linear regression model

predictions_train_lr, predictions_test_lr = map(
    lin_reg.predict, (X_train_transformed, X_test_transformed))

In [None]:
# Scores the SVD-based linear regression model with RMSE on both training and testing dataset

rmse_train_lr = root_mean_squared_error(y_true=y_train, y_pred=predictions_train_lr)
rmse_test_lr = root_mean_squared_error(y_true=y_test, y_pred=predictions_test_lr)

print(
    "SVD-based Linear Regression Performance [RMSE]:\n{:.1f} [Train]\n{:.1f} [Test]"
    .format(rmse_train_lr, rmse_test_lr)
)

SVD-based Linear Regression Performance [RMSE]:
68232.8 [Train]
71002.8 [Test]


## Gradient Descent-based Iterative Optimization Approach

In [None]:
# Configures the SGD regressor
sgd_reg = SGDRegressor(
    eta0=0.01,              # Initial learning rate
    max_iter=1000,          # Maximum number of training iterations (epochs)
    tol=1e-5,               # Tolerance for loss drop during last 'n_iter_no_change' number of epochs
    n_iter_no_change=100,   # Exits if training loss doesn't improve by 'tol' for this many epochs
    penalty=None,           # No regularization
    random_state=42)        # Fixed seed so the run is reproducible

# Trains it on the training features and target
sgd_reg.fit(X=X_train_transformed, y=y_train)

In [None]:
# Shows the intercept and coefficients learned by the SGD model

print("SGD Model Parameters:\n")
for label, value in (("Intercept:", sgd_reg.intercept_), ("Coefficients:", sgd_reg.coef_)):
    print(label, value)

SGD Model Parameters:

Intercept: [178748.92103879]
Coefficients: [-53080.59288081 -55064.34555508  13006.44744967  -8439.05055623
  26055.39869498 -50501.96794798  37126.91852779  72906.65683759
  40868.78471157    941.18285415  56058.96402653  36711.0864693
  44168.90297721]


In [None]:
# Predicts on both training and testing dataset with the fitted SGD model

predictions_train_sgd, predictions_test_sgd = map(
    sgd_reg.predict, (X_train_transformed, X_test_transformed))

In [None]:
# Scores the SGD regression model with RMSE on both training and testing dataset

rmse_train_sgd = root_mean_squared_error(y_true=y_train, y_pred=predictions_train_sgd)
rmse_test_sgd = root_mean_squared_error(y_true=y_test, y_pred=predictions_test_sgd)

print(
    "SGD Regression Performance [RMSE]:\n{:.1f} [Train]\n{:.1f} [Test]"
    .format(rmse_train_sgd, rmse_test_sgd)
)

SGD Regression Performance [RMSE]:
68290.7 [Train]
70987.7 [Test]
