In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("housing.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [3]:
data.dropna(inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20433 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20433 non-null  float64
 1   latitude            20433 non-null  float64
 2   housing_median_age  20433 non-null  float64
 3   total_rooms         20433 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20433 non-null  float64
 6   households          20433 non-null  float64
 7   median_income       20433 non-null  float64
 8   median_house_value  20433 non-null  float64
 9   ocean_proximity     20433 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.7+ MB


In [4]:

Y = data["median_house_value"]

In [5]:
X = data.drop(["ocean_proximity", "median_house_value" ],axis=1)
X

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462
...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672


In [6]:
X_np = X.to_numpy()
y_np = Y.to_numpy()

In [7]:
from sklearn.model_selection import train_test_split
X_train , X_test, y_train, y_test = train_test_split(X_np, y_np, test_size=0.2, random_state = 100)
X_test.shape

(4087, 8)

Linear Regression Model

In [8]:
#x is input data
#Y is target value
def compute_cost(x,w,b,Y):
    cost = 0
    m = x.shape[0]
    for i in range(m):
        f_wb = np.dot(x[i],w) + b
        cost += (f_wb - Y[i]) ** 2
    cost /= 2*m
    return cost 

In [9]:
def compute_descent(x, w,b , Y):
    dj_dw = np.zeros(w.shape)
    dj_db = 0
    m,n = x.shape
    for i in range(m):
        fwb = np.dot(x[i],w) + b
        err = fwb - Y[i]
        for j in range(n):
            element = x[i,j].item()
            dj_dw[j] += err * element 
        dj_db += err
    dj_dw /= m
    dj_db /= m

    return dj_dw, dj_db

In [10]:
def gradient_descent(x, Y, num_of_iter, alpha):
    n = x.shape[1]
    w = np.zeros(n)
    b = 0

    for i in range(num_of_iter):
        dj_dw, dj_db = compute_descent(x, w, b, Y)
        for j in range(n):
            w[j] -= alpha * dj_dw[j]
        b = b - alpha * dj_db

        if i % 1000 == 0:
            print(f"The cost of {i}th iterations is {compute_cost(x, w, b, Y)}")

    return w,b 

Predicition 

In [11]:
from sklearn.preprocessing import MinMaxScaler
x_scaler = MinMaxScaler()
y_scaler = MinMaxScaler()

X_train_norm = x_scaler.fit_transform(X_train)
y_train_norm = y_scaler.fit_transform(y_train.reshape(-1,1))

In [12]:
w, b = gradient_descent(X_train_norm, y_train_norm, 10000, 1)
print(w,b)

The cost of 0th iterations is [0.06761589]
The cost of 1000th iterations is [0.01054864]
The cost of 2000th iterations is [0.01026005]
The cost of 3000th iterations is [0.01017277]
The cost of 4000th iterations is [0.01014103]
The cost of 5000th iterations is [0.01012767]
The cost of 6000th iterations is [0.01012105]
The cost of 7000th iterations is [0.01011727]
The cost of 8000th iterations is [0.01011488]
The cost of 9000th iterations is [0.01011326]
[-0.882946   -0.83332641  0.12054337 -0.44410776  1.20014439 -1.50065008
  0.76251011  1.18152177] [0.73280649]


In [17]:
from sklearn.preprocessing import MinMaxScaler
x_scaler = MinMaxScaler()
y_scaler = MinMaxScaler()

X_test_norm = x_scaler.fit_transform(X_test)
y_test_norm = y_scaler.fit_transform(y_test.reshape(-1,1))

In [18]:
yhat_norm = np.dot(X_test_norm,w) + b
yhat = y_scaler.inverse_transform(yhat_norm.reshape(-1,1))

In [19]:
from sklearn.metrics import r2_score, mean_squared_error

ytest_r2 = r2_score(y_test_norm, yhat_norm) 
ytest_mse = mean_squared_error(y_test_norm, yhat_norm)

In [21]:
print(f"r2 score for y_test data {ytest_r2}")
print(f"mean squared error for y_test data {ytest_mse}")

r2 score for y_test data 0.5712878756215957
mean squared error for y_test data 0.024395955505849453


In [26]:
print(y_test[:10])
print(yhat[:10].reshape(1,-1).astype('int64'))

[292000.  30000. 334100. 213800. 357300. 360900. 464600. 173400. 107400.
 164100.]
[[313725  49221 231832 250460 411419 370635 385602  39109 148380 189060]]
