In [3]:

from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import copy
import math
import pandas as pd
%matplotlib inline

In [4]:
data = pd.read_csv("housing_price_dataset.csv") 
print(data)

       SquareFeet  Bedrooms  Bathrooms Neighborhood  YearBuilt          Price
0            2126         4          1        Rural       1969  215355.283618
1            2459         3          2        Rural       1980  195014.221626
2            1860         2          1       Suburb       1970  306891.012076
3            2294         2          1        Urban       1996  206786.787153
4            2130         5          2       Suburb       2001  272436.239065
...           ...       ...        ...          ...        ...            ...
49995        1282         5          3        Rural       1975  100080.865895
49996        2854         2          2       Suburb       1988  374507.656727
49997        2979         5          3       Suburb       1962  384110.555590
49998        2596         5          2        Rural       1984  380512.685957
49999        1572         5          3        Rural       2011  221618.583218

[50000 rows x 6 columns]


In [5]:
null_values = data.isnull()
null_count_per_column = null_values.sum()
print(null_count_per_column) # no null values 

SquareFeet      0
Bedrooms        0
Bathrooms       0
Neighborhood    0
YearBuilt       0
Price           0
dtype: int64


In [6]:
print(data.dtypes) # no null values

SquareFeet        int64
Bedrooms          int64
Bathrooms         int64
Neighborhood     object
YearBuilt         int64
Price           float64
dtype: object


In [7]:
df = data.copy()

In [8]:
X = df.drop('Price', axis=1) 
X = df.drop('Neighborhood', axis=1) 
y = df['Price']  

# Split the data into training and testing sets with a 20% test split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
x_train = x_train.values # converting to numpy array 
y_train = y_train.values # converting to numpy array

In [9]:
print ('The shape of x_train is:', x_train.shape)
print ('The shape of y_train is: ', y_train.shape)
print ('Number of training examples (m):', len(x_train))

The shape of x_train is: (40000, 5)
The shape of y_train is:  (40000,)
Number of training examples (m): 40000


In [10]:
def compute_cost(x, y, w, b): 
    
    m = x.shape[0] 
    
    
    total_cost = 0

    
    cost=0
    for i in range(m):
        f_wb = w*x[i]+b
        cost += (f_wb - y[i])**2
    
    total_cost = cost/(2*m)
    
    

    return total_cost

In [11]:
def compute_gradient(x, y, w, b): 
    m = x.shape[0]
    dj_dw = 0
    dj_db = 0
    for i in range(m):
        f_wb = w * x[i] + b
        dj_db += f_wb - y[i]
        dj_dw += (f_wb - y[i]) * x[i]
    dj_dw /= m
    dj_db /= m
    return dj_dw, dj_db

In [12]:
def gradient_descent(x, y, w_in, b_in, gradient_function, alpha, num_iters): 
    m = len(x)
    w = copy.deepcopy(w_in)  #avoid modifying global w within function
    b = b_in
    for i in range(num_iters):
        dj_dw, dj_db = gradient_function(x, y, w, b)  
        w = w - alpha * dj_dw               
        b = b - alpha * dj_db               
    return w, b

In [15]:
initial_w = 0
initial_b = 0
iterations = 5
alpha = 0.01
w,b= gradient_descent(x_train ,y_train, initial_w, initial_b, compute_gradient, alpha, iterations)

In [16]:
print(w)

[1.73970780e+25 2.97242469e+04 2.01342112e+04 1.07777668e+25
 5.67903148e+43]


In [17]:
print(b)

[8.01372081e+21 8.63694614e+03 1.01748973e+04 5.42794918e+21
 2.26572589e+38]
