In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('datasets/House_Price.csv')

In [3]:
data

Unnamed: 0,price,crime_rate,resid_area,air_qual,room_num,age,dist1,dist2,dist3,dist4,teachers,poor_prop,airport,n_hos_beds,n_hot_rooms,waterbody,rainfall,bus_ter,parks
0,24.0,0.00632,32.31,0.538,6.575,65.2,4.35,3.81,4.18,4.01,24.7,4.98,YES,5.480,11.1920,River,23,YES,0.049347
1,21.6,0.02731,37.07,0.469,6.421,78.9,4.99,4.70,5.12,5.06,22.2,9.14,NO,7.332,12.1728,Lake,42,YES,0.046146
2,34.7,0.02729,37.07,0.469,7.185,61.1,5.03,4.86,5.01,4.97,22.2,4.03,NO,7.394,101.1200,,38,YES,0.045764
3,33.4,0.03237,32.18,0.458,6.998,45.8,6.21,5.93,6.16,5.96,21.3,2.94,YES,9.268,11.2672,Lake,45,YES,0.047151
4,36.2,0.06905,32.18,0.458,7.147,54.2,6.16,5.86,6.37,5.86,21.3,5.33,NO,8.824,11.2896,Lake,55,YES,0.039474
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,22.4,0.06263,41.93,0.573,6.593,69.1,2.64,2.45,2.76,2.06,19.0,9.67,NO,9.348,12.1792,Lake and River,27,YES,0.056006
502,20.6,0.04527,41.93,0.573,6.120,76.7,2.44,2.11,2.46,2.14,19.0,9.08,YES,6.612,13.1648,Lake and River,20,YES,0.059903
503,23.9,0.06076,41.93,0.573,6.976,91.0,2.34,2.06,2.29,1.98,19.0,5.64,NO,5.478,12.1912,,31,YES,0.057572
504,22.0,0.10959,41.93,0.573,6.794,89.3,2.54,2.31,2.40,2.31,19.0,6.48,YES,7.940,15.1760,,47,YES,0.060694


In [4]:
data.dropna(inplace=True)
data.shape

(498, 19)

In [5]:
data.columns

Index(['price', 'crime_rate', 'resid_area', 'air_qual', 'room_num', 'age',
       'dist1', 'dist2', 'dist3', 'dist4', 'teachers', 'poor_prop', 'airport',
       'n_hos_beds', 'n_hot_rooms', 'waterbody', 'rainfall', 'bus_ter',
       'parks'],
      dtype='object')

In [6]:
data.dtypes

price          float64
crime_rate     float64
resid_area     float64
air_qual       float64
room_num       float64
age            float64
dist1          float64
dist2          float64
dist3          float64
dist4          float64
teachers       float64
poor_prop      float64
airport         object
n_hos_beds     float64
n_hot_rooms    float64
waterbody       object
rainfall         int64
bus_ter         object
parks          float64
dtype: object

In [7]:
nonNumericColumns = data.select_dtypes(exclude = ['int64', 'float64'])
nonNumericColumns

Unnamed: 0,airport,waterbody,bus_ter
0,YES,River,YES
1,NO,Lake,YES
2,NO,,YES
3,YES,Lake,YES
4,NO,Lake,YES
...,...,...,...
501,NO,Lake and River,YES
502,YES,Lake and River,YES
503,NO,,YES
504,YES,,YES


In [8]:
cols = {}
for c in nonNumericColumns.columns:
    cols[c] = {val : index for index, val in enumerate(data[c].unique())}
    data[c] = [cols[c][val] for val in data[c]]
cols

{'airport': {'YES': 0, 'NO': 1},
 'waterbody': {'River': 0, 'Lake': 1, 'None': 2, 'Lake and River': 3},
 'bus_ter': {'YES': 0}}

In [9]:
trainData = data.sample(frac=0.8, random_state=0)

In [10]:
testData = data.drop(trainData.index)

In [11]:
def costFunction(x,y,w):
    m = len(x)
    mat = np.matmul(x,w)
    mat = np.subtract(mat,y)
    J = np.matmul(mat.transpose(),mat)/2*m
    return J

In [12]:
def gradient(x,y,w,b,learning_rate,iterations=10000,stoping_threshold=0.00001):
    x = x.to_numpy()
    y = y.to_numpy()
    y = y.reshape(len(y), 1)
    prev_cost = None
    bol = True
    for j in range(iterations):
        crnt_cost = costFunction(x,y,w)
        if prev_cost is not None and abs(prev_cost[0][0] - crnt_cost[0][0]) <= stoping_threshold:
            break
        m = len(x)
        prev_cost = crnt_cost
        derivative = np.subtract(np.matmul(x,w), y)/m
        n = len(w) 
        for i in range(n):
            derv = np.multiply(derivative,x[:,i].reshape(m,1))
            w[i,0] = w[i,0] - learning_rate*sum(derv)
        b = b - learning_rate*sum(derivative)
    print(j)
    return w, b

In [13]:
y = trainData['price']
y

91     22.0
257    50.0
287    23.2
450    13.4
340    18.7
       ... 
283    50.0
230    24.3
150    21.5
145    13.8
338    20.6
Name: price, Length: 398, dtype: float64

In [14]:
x = trainData.drop(columns = ['price'])
x

Unnamed: 0,crime_rate,resid_area,air_qual,room_num,age,dist1,dist2,dist3,dist4,teachers,poor_prop,airport,n_hos_beds,n_hot_rooms,waterbody,rainfall,bus_ter,parks
91,0.03932,33.41,0.489,6.405,73.9,3.16,2.95,3.41,2.85,22.2,8.20,1,8.240,13.1760,0,23,0,0.045096
257,0.61154,33.97,0.647,8.704,86.9,2.09,1.53,1.83,1.76,27.0,5.12,0,8.600,11.4000,0,54,0,0.055193
287,0.03871,35.32,0.405,6.209,31.3,7.45,7.20,7.58,7.05,23.4,7.14,0,9.964,14.1856,1,60,0,0.043226
450,6.71772,48.10,0.713,6.749,92.6,2.40,2.11,2.37,2.40,19.8,17.44,0,9.068,13.1072,1,59,0,0.061322
340,0.06151,35.19,0.515,5.968,58.5,5.06,4.76,4.97,4.46,19.8,9.29,0,6.474,13.1496,2,20,0,0.053312
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283,0.01501,31.21,0.401,7.923,24.8,6.15,5.84,5.97,5.58,26.4,3.16,0,8.700,13.4000,1,20,0,0.048895
230,0.53700,36.20,0.504,5.981,68.1,3.97,3.38,3.88,3.47,22.6,11.65,1,8.286,10.1944,2,59,0,0.046602
150,1.65660,49.58,0.871,6.122,97.3,1.65,1.52,1.93,1.37,25.3,14.10,1,5.530,12.1720,0,34,0,0.083033
145,2.37934,49.58,0.871,6.130,100.0,1.69,1.26,1.72,1.01,25.3,27.80,0,7.776,10.1104,0,59,0,0.084516


In [15]:
w = np.ones(len(x.columns))
w = w.reshape(len(x.columns),1)

In [16]:
b, b0 = gradient(x,y,w,1,0.00001)

9999


In [17]:
print(b, b0)

[[-0.04249428]
 [-0.06620791]
 [ 0.9874151 ]
 [ 0.98252746]
 [ 0.09444628]
 [ 0.226723  ]
 [ 0.23771775]
 [ 0.21814313]
 [ 0.23263989]
 [ 0.55109174]
 [-0.81608425]
 [ 0.89471381]
 [ 0.71666315]
 [ 0.04371211]
 [ 0.80259766]
 [-0.00675134]
 [ 1.        ]
 [ 0.99880743]] [0.96258445]


In [18]:
yTest = testData['price']

In [19]:
xTest = testData.drop(columns = ['price'])

In [20]:
yOutput = []
b = b.ravel()
for d in xTest.iterrows():
    out = sum(np.multiply(d[1].to_numpy(), b)) + b0[0]
    yOutput.append((out, yTest[d[0]]))
yOutput

[(25.8743837040394, 18.9),
 (20.706003064992938, 13.9),
 (23.087735319561272, 18.4),
 (26.654920647461427, 14.5),
 (11.682569952960304, 13.2),
 (20.70994031715605, 24.2),
 (26.318618911524712, 30.8),
 (25.36918399376976, 25.3),
 (19.100725277924315, 16.6),
 (22.58045845817851, 18.9),
 (29.26179677455351, 23.3),
 (25.205852930064847, 21.7),
 (20.878299775833728, 23.4),
 (26.399317733150482, 22.9),
 (29.588639270570695, 26.6),
 (31.33282185777573, 23.6),
 (30.385189754027095, 28.7),
 (27.057201370534965, 22.9),
 (26.718527431258646, 27.5),
 (19.353160739587196, 19.5),
 (29.53700667824178, 19.2),
 (21.987964269005943, 19.3),
 (23.170484321036582, 20.3),
 (12.825879991994107, 17.3),
 (16.543969892722618, 14.3),
 (24.3406196688866, 19.2),
 (23.09474838204897, 18.4),
 (20.86988493084222, 15.4),
 (22.982739037031394, 19.4),
 (28.556487619422356, 25.0),
 (26.080585512220946, 23.8),
 (26.418694171420235, 22.6),
 (24.799857016232234, 23.2),
 (31.256813231702047, 37.2),
 (32.763960640251796, 37.9