In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

In [41]:
class LinearRegressionClassifier:
    def __init__(self,lr=0.01,epochs=1000):
        self.lr = lr
        self.epochs = epochs

    def fit(self,x,y):
        self.th,self.errorGD = self.gradientDescent(x,y,self.lr,self.epochs)

    def predict(self,t):
        return self.hypothesis(t,self.th)
        
    def hypothesis(self,x,th):
        return th@x.T       # it returns y_pred
    
    def error(self,x,th,y):
        y_pred = self.hypothesis(x,th)
        return np.mean((y_pred-y)**2)
        
    def gradientDescent(self,x,y,lr=0.01,epochs = 1000):
        th = np.random.randn(x.shape[1])

        errorGD = []
        for i in range(epochs+1):
            y_pred = self.hypothesis(x,th)
            gradients = ((y_pred-y)@x)/x.shape[0]
            th = th - lr*gradients
    
            errorGD.append(self.error(x,th,y))
    
        return th,errorGD
    
    def r2_score(self,y_true,y_pred):
        rss = np.sum((y_true-y_pred)**2)
        tss = np.sum((y_true-np.mean(y_true))**2)
        return 1 - rss/tss

In [30]:
data = pd.read_csv("BostonHousing.csv")
data.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [38]:
x = data.drop(['medv'],axis = 1).values
y = data['medv'].values
x = (x - x.mean())/x.std()
x = np.column_stack((x,np.ones(shape = (x.shape[0],1))))

In [40]:
clf = LinearRegressionClassifier(0.1, 100000)
clf.fit(x, y)
th,errorGD = clf.gradientDescent(x,y,lr = 0.01,epochs=100000)

Epoch: 1, Error: 176.08947414183547, r2 score-8.450936058183768
Epoch: 1001, Error: 52.82970303074364, r2 score0.3741390977089002
Epoch: 2001, Error: 48.40125109596853, r2 score0.4266129424016384
Epoch: 3001, Error: 45.01229982485439, r2 score0.46676699876473937
Epoch: 4001, Error: 42.347612602855534, r2 score0.4983392660098176
Epoch: 5001, Error: 40.23788078171758, r2 score0.5233360709706318
Epoch: 6001, Error: 38.558959611173734, r2 score0.5432283841574524
Epoch: 7001, Error: 37.21548413937484, r2 score0.5591461525013928
Epoch: 8001, Error: 36.13369508467659, r2 score0.5719633221518112
Epoch: 9001, Error: 35.256433300544245, r2 score0.5823571494909949
Epoch: 10001, Error: 34.53934645320895, r2 score0.5908531473486294
Epoch: 11001, Error: 33.947979247363136, r2 score0.597859562088489
Epoch: 12001, Error: 33.45553457377739, r2 score0.6036939010769629
Epoch: 13001, Error: 33.04114780379846, r2 score0.6086033800007464
Epoch: 14001, Error: 32.68855466644225, r2 score0.6127807067321007
Epo

Calculating r2 Score for train data and test data

In [42]:
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size = 0.8)
clf2 = LinearRegressionClassifier(0.1, 100000)
clf2.fit(x_train, y_train)

r2Score_train = clf2.r2_score(y_train, clf2.predict(x_train))
print(f"r2 Score for train data: {r2Score_train}")

r2Score_test = clf2.r2_score(y_test, clf2.predict(x_test))
print(f"r2 Score for test data: {r2Score_test}")

r2 Score for train data: 0.6786539621273838
r2 Score for test data: 0.6980559521156645


Using linear regression model from sklearn

In [43]:
clf3 = LinearRegression()
clf3.fit(x_train, y_train)

r2score_train = r2_score(y_train, clf3.predict(x_train))     
print(f"r2 Score for train data: {r2score_train}")

r2score_test = r2_score(y_test, clf3.predict(x_test))
print(f"r2 Score for test data: {r2score_test}")

r2 Score for train data: 0.7463551618392019
r2 Score for test data: 0.7074134682851592
