# HOME PRICE PREDICTION USING LINEAR REGRESSION

_**Predicting home prices using Linear Regression.**_

In [2]:
# Imports required packages

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

## Data Collection

In [4]:
# Reads the housing dataset into a DataFrame (the path is relative to the notebook)
housing_csv_path = "../Data/housing/housing.csv"
housing = pd.read_csv(housing_csv_path)

# Renders the first five rows as a quick sanity check of the loaded data
display(housing.head())

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


## Exploratory Data Analysis (EDA)

In [6]:
# Prints a concise summary of the dataset: index range, column names,
# non-null counts per column, dtypes and memory usage

housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [7]:
# Computes descriptive statistics (count, mean, std, min, quartiles, max)
# for the numeric columns of the dataset

housing.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


The basic information and descriptive statistics show that the dataset has
- a little more than 20,000 instances in it
- missing values in 'total_bedrooms'
- columns have different scales

In [9]:
# "ocean_proximity" is the only column of dtype 'object', so its distinct
# categories and the number of instances in each of them are listed here

housing["ocean_proximity"].value_counts()

ocean_proximity
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: count, dtype: int64

## Preparing Data

### Checking for Duplicates and Single-valued Columns

**First, remove duplicate observations, if any.**

In [13]:
# Counts fully duplicated observations (rows identical to an earlier row in every column).
# The vectorized .sum() avoids iterating over the boolean Series element by element.
duplicate_count = int(housing.duplicated().sum())
print("There are", duplicate_count, "duplicates in the datsset")

# Deletes the duplicate observations, if any were found (the first occurrence is kept)
if duplicate_count > 0:
    # Rebinds the name instead of mutating in place, so re-running the cell is harmless
    housing = housing.drop_duplicates()
    print("\n\tDuplicate observations were deleted.")
    # Prints the shape of the data after removal of the duplicates
    print("\n\tData shape after duplicate removal:", housing.shape)

There are 0 duplicates in the datsset


**Then, remove single-valued columns, if any.**

In [15]:
# Gets the number of distinct (non-null) values for each column
unique_values_per_attrib = housing.nunique()

# Records the *labels* of the columns that hold a single distinct value.
# DataFrame.drop() looks columns up by label, so the positional integers produced by
# enumerate() would raise a KeyError here, because the columns are named by strings.
single_value_columns = unique_values_per_attrib[unique_values_per_attrib == 1].index.tolist()
print("There are", len(single_value_columns), "single_valued columns in the datsset")

# Deletes single-value columns, if any exist
if len(single_value_columns) > 0:
    # Rebinds the name instead of mutating in place, so re-running the cell is harmless
    housing = housing.drop(columns=single_value_columns)
    print("\n\tSingle-valued columns were removed.")
    # Prints the shape of the data after removal of the single-value columns
    print("\n\tData shape after single-value column removal:", housing.shape)

There are 0 single_valued columns in the datsset


### Separating Test Set

**To ensure the same distribution in both the training and test datasets, and to make the test dataset representative of the population, stratified sampling over the column "median_income" was considered.**

In [18]:
# Adds a helper column that assigns every instance to an income bracket; the
# stratified train/test split is based on this column.
# The six bin edges below delimit five right-closed intervals:
# (0, 1.5], (1.5, 3], (3, 4.5], (4.5, 6] and (6, inf], labelled 1 to 5.

income_bin_edges = [0., 1.5, 3.0, 4.5, 6., np.inf]
income_bin_labels = [1, 2, 3, 4, 5]

housing["median_income_bin"] = pd.cut(
    housing["median_income"],    # Values to be binned
    bins=income_bin_edges,       # Bracket boundaries
    labels=income_bin_labels,    # One label per bracket
)

In [19]:
# Holds out 20% of the instances as the test set, stratified on the income bracket.
# The fixed random_state makes the split reproducible.

income_strata = housing["median_income_bin"]

train_set, test_set = train_test_split(
    housing,
    test_size=0.2,
    stratify=income_strata,
    random_state=42,
)

In [20]:
# Removes the intermediate attribute 'median_income_bin' from both splits, as it was
# only needed for stratification.
# The frames are rebound rather than mutated in place, and errors="ignore" makes the
# cell safe to re-run: a second run finds the column already gone instead of raising
# a KeyError.

train_set = train_set.drop(columns="median_income_bin", errors="ignore")
test_set = test_set.drop(columns="median_income_bin", errors="ignore")

### Separating Target Column

In [22]:
# Splits each dataset into its feature matrix (every column except the target)
# and an independent copy of the target column

target_column = "median_house_value"

X_train, target_train = train_set.drop(columns=[target_column]), train_set[target_column].copy()
X_test, target_test = test_set.drop(columns=[target_column]), test_set[target_column].copy()

### Transforming Data

#### Transforming Training Data

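**Creating a transformation pipeline to impute missing values in the numeric attributes and to encode the categorical attribute of the training dataset (feature scaling is intentionally skipped, see the note in the code below)**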

In [26]:
# Sets the lists of categorical and numerical attributes.
# Every feature that is not listed as categorical is treated as numerical, so the
# two lists stay consistent even if more categorical attributes are added later
# (removing only cat_attribs[0] would leave the others among the numerical ones).

cat_attribs = ["ocean_proximity"]
num_attribs = [column for column in X_train.columns if column not in cat_attribs]

In [27]:
# Data transformation pipeline for the numerical attributes: missing values are
# replaced by the median of the respective column.
# Note that feature scaling (e.g. a StandardScaler step) is deliberately left out,
# as it is NOT required for the algorithms used in this notebook.

median_imputer = SimpleImputer(strategy="median")
num_pipeline = Pipeline(steps=[("imputer", median_imputer)])

In [28]:
# Transforms both numerical and categorical attributes by using ColumnTransformer,
# which applies a dedicated transformer to each group of columns and concatenates
# the results column-wise.

full_pipeline = ColumnTransformer(
    [
        # Uses the sub-pipeline already defined above
        ("num", num_pipeline, num_attribs),
        # One-hot encoding is fine for a handful of categories. handle_unknown="ignore"
        # encodes a category that was not seen during fit as all zeros instead of
        # raising, which matters for rare categories such as 'ISLAND' (5 instances).
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_attribs),
    ],
    # Always return a dense array: the NumPy linear algebra used in the modeling
    # section (np.linalg, .dot) expects a regular ndarray, not a sparse matrix.
    sparse_threshold=0,
)

X_train_transformed = full_pipeline.fit_transform(X_train)

In [29]:
# Checks for the shape of the transformed training dataset

X_train_transformed.shape

(16512, 13)

#### Transforming Testing Data

In [31]:
X_test_transformed = full_pipeline.transform(X_test)

In [32]:
# Checks for the shape of the transformed testing dataset

X_test_transformed.shape

(4128, 13)

## Modeling

### Modeling Using Closed Form Approach

**Using Normal Equation**
$\hat{\theta} = (X^TX)^{-1}X^Ty$

In [44]:
# Estimates the model coefficients/parameters/weights (often denoted by θ) from the
# normal equation (X^T X) θ = X^T y.
#
# The linear system is solved directly with np.linalg.solve rather than by forming
# the explicit inverse of X^T X: the solution is mathematically the same, but
# solving avoids the extra rounding error and cost of the inversion.
#
# No separate intercept column is added: each row has exactly one 1 among the
# one-hot encoded columns, so their coefficients absorb the intercept.

gram_matrix = X_train_transformed.T @ X_train_transformed           # X^T X
moment_vector = X_train_transformed.T @ target_train.to_numpy()     # X^T y

theta_ne = np.linalg.solve(gram_matrix, moment_vector)

# The textbook form of the same estimate, with the explicit inverse, would be:
#
#   theta = np.linalg.inv(
#       X_train_transformed.T.dot(X_train_transformed)).dot(
#           X_train_transformed.T).dot(target_train)

# Shows θ values for normal equation (ne)
print(theta_ne)

[-2.66831122e+04 -2.54597874e+04  1.04270174e+03 -3.82448600e+00
  6.20155695e+01 -4.58931291e+01  9.81139214e+01  3.86221869e+04
 -2.25044381e+06 -2.29012999e+06 -2.01230789e+06 -2.25412504e+06
 -2.24674472e+06]


In [46]:
# Predicts on both the training and the testing dataset: each prediction is the
# linear combination of an instance's features weighted by θ

predictions_train_ne = X_train_transformed @ theta_ne
predictions_test_ne = X_test_transformed @ theta_ne

In [48]:
# Evaluates the Normal Equation model on both datasets: the mean squared error is
# computed first and its square root (RMSE) is taken afterwards

mse_train_ne = mean_squared_error(target_train, predictions_train_ne)
mse_test_ne = mean_squared_error(target_test, predictions_test_ne)

rmse_train_ne, rmse_test_ne = np.sqrt(mse_train_ne), np.sqrt(mse_test_ne)

**Using the Singular Value Decomposition (SVD) Approach via the LinearRegression (LR) Algorithm**

In [54]:
# Creates a LinearRegression estimator with default settings and fits it
# on the transformed training features and the training target

lr_model = LinearRegression()
lr_model.fit(X=X_train_transformed, y=target_train)

In [56]:
# Shows the parameters learned by the fitted LinearRegression model
print("Linear Regression Model Parameters:\n")

for label, value in (
    ("Intercept:", lr_model.intercept_),
    ("Coefficients:", lr_model.coef_),
):
    print(label, value)

Linear Regression Model Parameters:

Intercept: -2210750.2918442874
Coefficients: [-2.66831122e+04 -2.54597874e+04  1.04270174e+03 -3.82448600e+00
  6.20155695e+01 -4.58931291e+01  9.81139214e+01  3.86221869e+04
 -3.96935229e+04 -7.93796990e+04  1.98442398e+05 -4.33747507e+04
 -3.59944258e+04]


In [58]:
# Predicts with the fitted LinearRegression model on the training features first
# and on the testing features second

predictions_train_lr, predictions_test_lr = (
    lr_model.predict(features)
    for features in (X_train_transformed, X_test_transformed)
)

In [62]:
# Evaluates the LinearRegression model (RMSE) on the training dataset first
# and on the testing dataset second
rmse_train_lr, rmse_test_lr = (
    np.sqrt(mean_squared_error(actual, predicted))
    for actual, predicted in (
        (target_train, predictions_train_lr),
        (target_test, predictions_test_lr),
    )
)

### Analyzing Model Performance
_Note that cross-validation was not used, as it was not considered useful for closed-form modeling approaches._

In [65]:
# Reports the RMSE of the Normal Equation model on the training and testing datasets

print("Prediction Performance (in RMSE) using Normal Equation:\n")

for label, rmse in (("Train Error:", rmse_train_ne), ("Test Error:", rmse_test_ne)):
    print(label, rmse)

Prediction Performance (in RMSE) using Normal Equation:

Train Error: 68232.83515124217
Test Error: 71002.83776916377


In [67]:
# Reports the RMSE of the LinearRegression model on the training and testing datasets

print("Linear Regression Model Peroformance (in RMSE):\n")

for label, rmse in (("Train Error:", rmse_train_lr), ("Test Error:", rmse_test_lr)):
    print(label, rmse)

Linear Regression Model Peroformance (in RMSE):

Train Error: 68232.83515124217
Test Error: 71002.83776920402


In [69]:
# Puts the actual target, both models' predictions and their absolute prediction
# errors side by side for the first ten instances of the test dataset

abs_error_ne = (target_test - predictions_test_ne).abs()
abs_error_lr = (target_test - predictions_test_lr).abs()

comparison_columns = {
    "Actual Target": target_test,
    "Normal Eq. Prediction": predictions_test_ne,
    "LR Prediction": predictions_test_lr,
    "Normal Eq. Prediction Error": abs_error_ne,
    "LR Prediction Error:": abs_error_lr,
}

pd.DataFrame(comparison_columns).head(10)

Unnamed: 0,Actual Target,Normal Eq. Prediction,LR Prediction,Normal Eq. Prediction Error,LR Prediction Error:
3905,397700.0,383993.915928,383993.915926,13706.084072,13706.084074
16821,202900.0,252310.540685,252310.540684,49410.540685,49410.540684
2900,310000.0,372503.598094,372503.598092,62503.598094,62503.598092
7193,314300.0,279372.702016,279372.702014,34927.297984,34927.297986
13928,187500.0,225899.332765,225899.332762,38399.332765,38399.332762
6523,141700.0,212940.693079,212940.693078,71240.693079,71240.693078
33,104900.0,163517.459664,163517.459662,58617.459664,58617.459662
15996,275100.0,346138.250216,346138.250214,71038.250216,71038.250214
18212,167000.0,257846.468207,257846.468205,90846.468207,90846.468205
10083,457700.0,258190.496189,258190.496187,199509.503811,199509.503813
