## Mumbai House Price Prediction using Linear Regression

In [91]:
# Import all the required libraries
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt

In [92]:
# Obtain the modified csv 
data = "../assets/data/modified_mumbai_house_prices.csv"
house_price = pd.read_csv(data)

# Refer assets/scripts/house-price-dataset.py for preprocessing steps

In [93]:
# printing the info for dataset
print(house_price.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76038 entries, 0 to 76037
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   bhk        76038 non-null  int64  
 1   type       76038 non-null  float64
 2   area       76038 non-null  int64  
 3   price      76038 non-null  float64
 4   status     76038 non-null  int64  
 5   age        76038 non-null  float64
 6   latitude   76038 non-null  float64
 7   longitude  76038 non-null  float64
dtypes: float64(5), int64(3)
memory usage: 4.6 MB
None


In [94]:
house_price.head()

Unnamed: 0,bhk,type,area,price,status,age,latitude,longitude
0,3,0.25,685,2.5,1,1.0,19.112122,72.867676
1,2,0.25,640,0.5251,0,1.0,18.969048,72.821182
2,2,0.25,610,1.73,0,1.0,18.563005,73.906578
3,2,0.25,876,0.5998,0,1.0,18.999653,73.126328
4,2,0.25,659,0.9411,0,1.0,18.969048,72.821182


In [95]:
house_price.shape

(76038, 8)

In [96]:
# Define the ratios for train, test, and validation sets
test_ratio = 0.05
val_ratio = 0.05

indices = list(house_price.index)

# Calculate the number of samples for the testing and validation sets
test_size = int(test_ratio * len(house_price))
val_size = int(val_ratio * len(house_price))

# Split the indices into training, testing, and validation sets
test_indices = indices[:test_size]
val_indices = indices[test_size:test_size+val_size]
train_indices = indices[test_size+val_size:]

# Create training, validation, and testing sets
X_train = house_price.loc[train_indices]
X_val = house_price.loc[val_indices]
X_test = house_price.loc[test_indices]

# Extract target variable and convert to NumPy arrays
y_train = X_train.pop("price").to_numpy()
y_val = X_val.pop("price").to_numpy()
y_test = X_test.pop("price").to_numpy()

# Convert feature sets to NumPy arrays
X_train = X_train.to_numpy()
X_val = X_val.to_numpy()
X_test = X_test.to_numpy()

print("The size of X_train is: ", X_train.shape)
print("The size of X_val is: ", X_val.shape)
print("The size of X_test is: ", X_test.shape)
print("The size of y_train is: ", len(y_train))
print("The size of y_val is: ", len(y_val))
print("The size of y_test is: ", len(y_test))

The size of X_train is:  (68436, 7)
The size of X_val is:  (3801, 7)
The size of X_test is:  (3801, 7)
The size of y_train is:  68436
The size of y_val is:  3801
The size of y_test is:  3801


In [97]:
X_mean = np.mean(X_train, axis=0)
X_std = np.std(X_train, axis=0)

# Normalizing the data
X_train = (X_train - X_mean)/X_std
X_val = (X_val - X_mean)/X_std
X_test = (X_test - X_mean)/X_std

In [126]:
# Computes the sigmoid function
def relu(z):
    return np.maximum(0, z)

# Computes derivative of activation with respect to z
def relu_prime(z):
    return np.where(z > 0, 1, 0)

In [127]:
# Computes the MSE Loss

# This loss is for a point only
def loss(y, a):
    return 0.5*np.square(y-a)

# Computes derivative of loss with respect to activations
def loss_prime(y, a):
    return (y-a)

In [128]:
# Defining architecture with name vector as input and single output
architecture = [7, 100, 1]

In [129]:
# Initializing randomly
weights = [np.random.randn(x,y) for x,y in zip(architecture[1:], architecture[:-1])]
biases = [np.random.randn(y,1) for y in architecture[1:]]

for i in range(len(weights)):
    print(f'weight matrix {i} shape: {weights[i].shape}')

for i in range(len(biases)):
    print(f'bias matrix {i} shape: {biases[i].shape}')

weight matrix 0 shape: (100, 7)
weight matrix 1 shape: (1, 100)
bias matrix 0 shape: (100, 1)
bias matrix 1 shape: (1, 1)


In [130]:
# Feedforward and Backpropogation phase
def feedforward_and_backprop(X, y, weights, biases):
    X = X.reshape(-1, 1)
    a = X
    z_cache = []
    activation_cache = [X]

    for i in range(len(weights)):
        z = np.dot(weights[i], a) + biases[i]
        z_cache.append(z)
        a = relu(z)
        activation_cache.append(a)

    point_loss = loss(y, a[0][0])    
    
    # Define delta_nabla_w and delta_nabla_b
    delta_nabla_w = []
    delta_nabla_b = []

    # For output layer
    delta = loss_prime(y, a)*relu_prime(z_cache[-1])
    delta_nabla_w.append(np.dot(delta, activation_cache[-2].transpose()))
    delta_nabla_b.append(delta)

    # For hidden layers
    for i in range(len(weights)-1):
        delta = np.dot(weights[-i-1].transpose(), delta)*relu_prime(z_cache[-i-2])
        delta_nabla_w.append(np.dot(delta, activation_cache[-i-3].transpose()))
        delta_nabla_b.append(delta)

    delta_nabla_w.reverse()
    delta_nabla_b.reverse()
    
    return delta_nabla_w, delta_nabla_b, point_loss

In [131]:
# Updating one data point
def update_point(X, y, weights, biases, learning_rate=0.005):
    delta_w, delta_b, point_loss = feedforward_and_backprop(X, y, weights, biases)

    for i in range(len(weights)):
        weights[i] = weights[i] - learning_rate*delta_w[i]
        biases[i] = biases[i] - learning_rate*delta_b[i]

    return weights, biases, point_loss

In [137]:
def evaluate(X, y, weights, biases):
    a = np.squeeze(X).transpose()
    for i in range(len(weights)):
        z = np.dot(weights[i], a) + biases[i]
        a = relu(z)
    
    error = loss(y, a)
    print(error)
    return error

In [138]:
# Performing stochastic gradient descent
def sgd(X_train, y_train, weights, biases, num_epochs=10, X_val=None, y_val=None):
    loss_list = []
    
    for epoch in range(num_epochs):
        epoch_loss = 0
        # First we randomly shuffle data
        temp = list(zip(X_train, y_train))
        random.shuffle(temp)
        temp_X_train, temp_y_train = zip(*temp)
        temp_X_train = list(temp_X_train)
        temp_y_train = list(temp_y_train)

        # Now we iterate over each point in data
        for i in range(len(temp_X_train)):
            weights, biases, point_loss = update_point(temp_X_train[i], temp_y_train[i], weights, biases)
            epoch_loss += point_loss
            loss_list.append(point_loss)

        epoch_loss = epoch_loss / len(temp_X_train)
        # loss_list.append(epoch_loss)
        if (epoch % 20 == 0):
            # Test accuracy over validation data 
            print(f'Training loss for epoch {epoch}: {epoch_loss:.4f}')
            if (X_val is not None):
                val_accuracy = evaluate(X_val, y_val, weights, biases)
                print(f'Validation Accuracy for epoch {epoch}: {val_accuracy:.4f}')
    
    # Visualize loss
    plt.figure(figsize=(10, 6))
    plt.plot(loss_list)
    plt.title('Training Loss Over Epochs')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.show()

In [139]:
sgd(X_train, y_train, weights, biases, 10, X_val, y_val)

Training loss for epoch 0: nan
[[nan nan nan ... nan nan nan]]


TypeError: unsupported format string passed to numpy.ndarray.__format__