## Mumbai House Price Prediction using Linear Regression

In [32]:
# import all the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from geopy.geocoders import Nominatim

In [33]:
# obtain the raw csv 
data = "../../assets/data/mumbai_house_prices.csv"
house_price = pd.read_csv(data)

In [34]:
# We select only a few regions to keep the dataset small
selected_regions = ["Ghatkopar West", "Ghatkopar East"]
selected_regions_df = house_price[house_price["region"].isin(selected_regions)].copy()
selected_regions_df.reset_index(drop=True, inplace=True)

In [35]:
# printing the info for dataset
print(selected_regions_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1087 entries, 0 to 1086
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   bhk         1087 non-null   int64  
 1   type        1087 non-null   object 
 2   locality    1087 non-null   object 
 3   area        1087 non-null   int64  
 4   price       1087 non-null   float64
 5   price_unit  1087 non-null   object 
 6   region      1087 non-null   object 
 7   status      1087 non-null   object 
 8   age         1087 non-null   object 
dtypes: float64(1), int64(2), object(6)
memory usage: 76.6+ KB
None


In [36]:
selected_regions_df.head()

Unnamed: 0,bhk,type,locality,area,price,price_unit,region,status,age
0,1,Apartment,Rishabraj Vicinia,448,1.34,Cr,Ghatkopar East,Under Construction,New
1,2,Apartment,Rishabraj Vicinia,639,1.91,Cr,Ghatkopar East,Under Construction,New
2,2,Apartment,Alag Olive,581,1.25,Cr,Ghatkopar East,Under Construction,Resale
3,1,Apartment,Swagat Builders Damodar Park Apartment,580,80.0,L,Ghatkopar West,Ready to move,Resale
4,3,Apartment,Runwal The Orchard Residency,1360,2.7,Cr,Ghatkopar West,Ready to move,Resale


In [37]:
# create a dictionary to map region to their latitude and longitude
geo = Nominatim(user_agent="Geopy Library", timeout=10)  # Adjust timeout value if error is raised
unique_regions = selected_regions_df["region"].unique()
print("Total number of unique values: ", len(unique_regions))
lat_long_dict = {}
unknown_regions = []

for r in unique_regions:
    loc = geo.geocode(r + ", Mumbai")
    if (loc == None):
        unknown_regions.append(r)
        continue
    else:
        latitude = loc.latitude
        longitude = loc.longitude
        lat_long_dict[r] = [latitude, longitude]

print("Geopy could not find the following regions: ", unknown_regions, len(unknown_regions))

Total number of unique values:  2
Geopy could not find the following regions:  [] 0


In [38]:
# we remove all rows which contain unidentified regions and for all rows with identified regions, we add their latitiude and longitude to the data set.
del_idx = []
for i in range(len(selected_regions_df)):
    region = selected_regions_df.loc[i, "region"]
    if region in unknown_regions:
        del_idx.append(i)
    else:
        lat_long = lat_long_dict[region]
        selected_regions_df.loc[i, "latitude"] = lat_long[0] 
        selected_regions_df.loc[i, "longitude"] = lat_long[1]


In [39]:
selected_regions_df.drop(del_idx, inplace=True)
selected_regions_df = selected_regions_df.reset_index(drop=True)

In [40]:
selected_regions_df.head()

Unnamed: 0,bhk,type,locality,area,price,price_unit,region,status,age,latitude,longitude
0,1,Apartment,Rishabraj Vicinia,448,1.34,Cr,Ghatkopar East,Under Construction,New,19.08349,72.912025
1,2,Apartment,Rishabraj Vicinia,639,1.91,Cr,Ghatkopar East,Under Construction,New,19.08349,72.912025
2,2,Apartment,Alag Olive,581,1.25,Cr,Ghatkopar East,Under Construction,Resale,19.08349,72.912025
3,1,Apartment,Swagat Builders Damodar Park Apartment,580,80.0,L,Ghatkopar West,Ready to move,Resale,19.089719,72.904597
4,3,Apartment,Runwal The Orchard Residency,1360,2.7,Cr,Ghatkopar West,Ready to move,Resale,19.089719,72.904597


In [41]:
selected_regions_df.shape

(1087, 11)

In [42]:
# remove columns locality and region since they do not hold with the idea of linear regression
selected_regions_df.drop(['locality', 'region'], axis=1, inplace=True)

In [43]:
# print all unique values of categorical columns
print(selected_regions_df.type.unique())
print(selected_regions_df.age.unique())
print(selected_regions_df.status.unique())

['Apartment' 'Studio Apartment']
['New' 'Resale' 'Unknown']
['Under Construction' 'Ready to move']


In [44]:
# dealing with categorical data
selected_regions_df["type"].replace({"Studio Apartment":0, "Apartment":1}, inplace=True)
selected_regions_df["age"].replace({"New":0, "Resale":1, "Unknown": 0.5}, inplace=True)
selected_regions_df["status"].replace({"Ready to move":0, "Under Construction":1}, inplace=True)

In [45]:
# calculating price in Crores for each using price and price_unit
for i in range(len(selected_regions_df)):  
    price_unit = selected_regions_df.loc[i, "price_unit"]

    if price_unit == "L":
        selected_regions_df.at[i, "price"] = (selected_regions_df.at[i, "price"] / 100)
  
selected_regions_df.drop(["price_unit"], axis=1, inplace=True)

In [46]:
selected_regions_df.head()

Unnamed: 0,bhk,type,area,price,status,age,latitude,longitude
0,1,1,448,1.34,1,0.0,19.08349,72.912025
1,2,1,639,1.91,1,0.0,19.08349,72.912025
2,2,1,581,1.25,1,1.0,19.08349,72.912025
3,1,1,580,0.8,0,1.0,19.089719,72.904597
4,3,1,1360,2.7,0,1.0,19.089719,72.904597


In [47]:
# Define the ratios for train, test, and validation sets
test_ratio = 0.1
val_ratio = 0.1

indices = list(selected_regions_df.index)

# Calculate the number of samples for the testing and validation sets
test_size = int(test_ratio * len(selected_regions_df))
val_size = int(val_ratio * len(selected_regions_df))

# Split the indices into training, testing, and validation sets
test_indices = indices[:test_size]
val_indices = indices[test_size:test_size+val_size]
train_indices = indices[test_size+val_size:]

# Create training, validation, and testing sets
X_train = selected_regions_df.loc[train_indices]
X_val = selected_regions_df.loc[val_indices]
X_test = selected_regions_df.loc[test_indices]

# Reset indices of each DataFrame
X_train.reset_index(drop=True, inplace=True)
X_val.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)

# Extract target variable
y_train = X_train.pop("price").tolist()
y_val = X_val.pop("price").tolist()
y_test = X_test.pop("price").tolist()

print("The size of X_train is: ", X_train.shape)
print("The size of X_val is: ", X_val.shape)
print("The size of X_test is: ", X_test.shape)
print("The size of y_train is: ", len(y_train))
print("The size of y_val is: ", len(y_val))
print("The size of y_test is: ", len(y_test))

The size of X_train is:  (871, 7)
The size of X_val is:  (108, 7)
The size of X_test is:  (108, 7)
The size of y_train is:  871
The size of y_val is:  108
The size of y_test is:  108


In [48]:
X_mean = np.mean(X_train, axis=0)
X_std = np.std(X_train, axis=0)

# Normalizing the data
X_train = (X_train - X_mean)/X_std
X_val = (X_val - X_mean)/X_std
X_test = (X_test - X_mean)/X_std

print(X_train.head())

        bhk      type      area    status       age  latitude  longitude
0  0.140216  0.096281  0.250361  0.620272 -0.662982 -0.382168   0.382168
1 -1.172991  0.096281 -0.715977  0.620272 -0.662982 -0.382168   0.382168
2  0.140216  0.096281  0.079328  0.620272 -0.662982 -0.382168   0.382168
3  0.140216  0.096281 -0.108809  0.620272 -0.662982 -0.382168   0.382168
4  1.453423  0.096281  2.251452  0.620272 -0.662982  2.616648  -2.616648


In [49]:
y_train = np.array(y_train)
y_train = y_train.reshape(y_train.shape[0], 1)
y_test = np.array(y_test)
y_test = y_test.reshape(y_test.shape[0], 1)
y_val = np.array(y_val)
y_val = y_val.reshape(y_val.shape[0], 1)

In [50]:
# Defining linear kernel function
def linear_kernel(x1, x2):
    return np.dot(x1, x2)

In [51]:
# Defining polynomial kernel function
def polynomial_kernel(x, y, p=3):
    return (1 + np.dot(x, y)) ** p

In [52]:
# Defining radial basis function kernel function
def rbf_kernel(x, y, sigma=1.0):
    return np.exp(-(np.linalg.norm(x-y)**2) / (2 * (sigma**2)))

In [53]:
# Defining the kernel matrix
num_samples = len(X_train)
K = np.zeros((num_samples,num_samples))

for i in range(num_samples):
    for j in range(num_samples):
        # Ensure the same kernel in get_accuracy and predict
        K[i][j] = linear_kernel(X_train.loc[i], X_train.loc[j])

In [129]:
alpha = np.zeros((X_train.shape[0], 1))

In [130]:
def fit(X, y, alpha, K, learning_rate=0.1, num_iterations=2, l2_lambda=0.01):
    m = X.shape[0]
    total_losses = []
    print(X.shape)
    print(y.shape)
    print(K.shape)
    print(alpha.shape)

    for i in range (num_iterations):

        Z = np.dot(K, alpha)
        diff = Z - y

        d_alpha = diff
        alpha = alpha - learning_rate*diff
        
        loss = (1/m) * np.sum(np.square(diff))

        total_losses.append(loss)

        if i % 1 == 0:
            print("Loss after iteration %i: %f" % (i, loss))
        
    return alpha, total_losses


In [131]:
alpha, total_losses = fit(X_train, y_train, alpha, K, learning_rate=0.1, num_iterations=10)

(871, 7)
(871, 1)
(871, 871)
(871, 1)
Loss after iteration 0: 3.809238
Loss after iteration 1: 17016.267322
Loss after iteration 2: 589294811.246095
Loss after iteration 3: 23505854779872.707031
Loss after iteration 4: 1022838127202212480.000000
Loss after iteration 5: 46668491760142817165312.000000
Loss after iteration 6: 2180524219267648880419799040.000000
Loss after iteration 7: 103048257665704334862564781457408.000000
Loss after iteration 8: 4895916341064724031823404161191378944.000000
Loss after iteration 9: 233183897275576477694613239550576512991232.000000


In [24]:
def fittos(X, y, alpha, learning_rate=0.1, num_iterations=10):
    m = X.shape[0]
    total_losses = []

    for i in range(num_iterations):
        loss = 0

        for j in range(m):
            Z = 0
            ksum = 0
            for k in range(m):
                Z += alpha[k]*K[j][k]
            
            for k in range(m):
                ksum += K[j][k]
        
            diff = Z - y[j]
            gradient = 2*diff*ksum
            alpha[j] = alpha[j] - learning_rate*gradient

            loss += diff ** 2

        loss /= m
        total_losses.append(loss)
        
        if i % 1 == 0:
            print("Loss after iteration %i: %f" % (i, loss))
        
    return alpha, total_losses

In [25]:
def predict(X, X_train, alpha):
    m = X.shape[0]
    z = 0

    for i in range(m):
        z += alpha[i]*linear_kernel(X, X_train[i])

    # For all negative values, replace them with 0
    z[z < 0] = 0
    return z

In [26]:
alpha, total_losses = fit(X_train, y_train, alpha, learning_rate=0.1, num_iterations=10)


Loss after iteration 0: 3.809237
Loss after iteration 1: 3.809235
Loss after iteration 2: 3.809234
Loss after iteration 3: 3.809232
Loss after iteration 4: 3.809231
Loss after iteration 5: 3.809229
Loss after iteration 6: 3.809228
Loss after iteration 7: 3.809226
Loss after iteration 8: 3.809225
Loss after iteration 9: 3.809224
