### === Task ===

1. Implement early stopping: if the absolute difference between the old loss and the new loss does not exceed a given threshold, abort training.

2. Implement an option for stochastic gradient descent, in which only one sample is used per training step.  Make sure that a sample is not repeated until all samples have been used at least once.

3. Add options for mini-batch gradient descent.

4. Put everything into a class.

In [1]:
from sklearn.datasets import load_boston
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')


# Load the Boston housing data and build standardized train/test design
# matrices with a leading intercept column.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed
# in 1.2 -- confirm the installed version still provides it.
boston = load_boston()

X = boston.data
y = boston.target

m, n = X.shape  # number of samples, number of features
assert m == y.shape[0]

# Split BEFORE scaling so that the held-out rows never influence the
# mean/variance used for standardization (avoids train/test leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

assert len(X_train) == len(y_train)
assert len(X_test) == len(y_test)

# Fit the scaler on the training rows only, then apply that same transform
# to the test rows.  X is transformed as well so the module-level name keeps
# referring to standardized features, as it did before.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X = scaler.transform(X)

# Prepend a column of ones so that theta[0] acts as the intercept term.
intercept = np.ones((X_train.shape[0], 1))
X_train = np.concatenate((intercept, X_train), axis=1)
intercept = np.ones((X_test.shape[0], 1))
X_test = np.concatenate((intercept, X_test), axis=1)

In [5]:
from time import time
import random
# Sanity check: the training features and targets must have the same number of rows.
assert X_train.shape[0] == y_train.shape[0]
# Record the wall-clock start time.
# NOTE(review): `start` is not read anywhere in the visible code -- confirm it is still needed.
start = time()
class LinearRegression:
    """Linear regression fitted by gradient descent with early stopping.

    Three update schemes are supported through ``method``:

    * ``"sto"``        -- stochastic: one sample per update,
    * ``"mini-batch"`` -- ``batch_size`` samples per update,
    * anything else (including the default ``"batch"``) -- the full
      training set per update.

    For the two sampled schemes the rows are visited in a freshly shuffled
    order every epoch, so no sample is reused before every sample has been
    seen once.

    Parameters
    ----------
    alpha : float
        Learning rate applied to the (unnormalized) gradient ``X.T @ error``.
    max_iter : int
        Upper bound on the number of parameter updates.
    loss_old : float
        Reference loss that the first epoch's loss is compared against.
    tol : float
        Training stops as soon as the absolute change of the epoch loss
        falls below this threshold.
    method : str
        Update scheme, see above.
    batch_size : int
        Number of rows per update when ``method == "mini-batch"``.
    """

    def __init__(self, alpha=0.0001, max_iter=10000000000,
                 loss_old=10000, tol=0.0001, method="batch", batch_size=10):
        self.alpha = alpha
        self.max_iter = max_iter
        self.loss_old = loss_old
        self.tol = tol
        self.method = method
        self.batch_size = batch_size
        # State of the without-replacement sampler (see _next_indices).
        self._order = None
        self._cursor = 0

    def mse(self, yhat, y):
        """Return the mean squared error between predictions and targets."""
        return ((yhat - y) ** 2).sum() / yhat.shape[0]

    def h_theta(self, X):
        """Return the linear predictions ``X @ theta``."""
        return X @ self.theta

    def gradient(self, X, error):
        """Return ``X.T @ error`` (sum over rows, not divided by the row count)."""
        return X.T @ error

    def _next_indices(self, n_samples, count):
        """Return the next ``count`` row indices of the current epoch.

        A new random permutation is drawn whenever the previous one is used
        up (or was built for a different number of rows).  The last slice of
        an epoch may therefore hold fewer than ``count`` indices.
        """
        order = getattr(self, "_order", None)
        if order is None or len(order) != n_samples or self._cursor >= n_samples:
            self._order = np.random.permutation(n_samples)
            self._cursor = 0
        begin = self._cursor
        self._cursor = min(begin + count, n_samples)
        return self._order[begin:self._cursor]

    def mini_batch(self, x, y):
        """Draw the next mini-batch of at most ``batch_size`` rows from ``x``/``y``.

        Rows are sampled without replacement within an epoch.  The batch is
        also stored on ``self.X_train`` / ``self.y_train``.

        Raises
        ------
        ValueError
            If ``batch_size`` is smaller than 1.
        """
        if self.batch_size < 1:
            raise ValueError("batch_size must be at least 1")
        picked = self._next_indices(x.shape[0], int(self.batch_size))
        self.X_train = x[picked, :]
        self.y_train = y[picked]
        return self.X_train, self.y_train

    def sto(self, x, y):
        """Draw the next single training sample from ``x``/``y``.

        A row is not returned again until every other row has been returned
        once.  The features come back with shape ``(1, n_features)`` and the
        target as a one-element array; both are also stored on
        ``self.X_train`` / ``self.y_train``.
        """
        picked = self._next_indices(x.shape[0], 1)
        self.X_train = x[picked, :].reshape(1, -1)
        self.y_train = y[picked]
        return self.X_train, self.y_train

    def fit(self, X, y):
        """Fit ``theta`` on ``X`` (rows = samples) and the 1-D targets ``y``.

        One epoch is one pass over all rows (a single update for the batch
        method).  At the end of every epoch the mean squared error collected
        over that epoch is compared with the previous epoch's value; when the
        absolute difference is below ``tol`` training stops before applying
        the pending update.  After the call ``n_iter_`` holds the number of
        updates performed and ``loss_`` the last completed epoch loss.

        Raises
        ------
        ValueError
            If ``X`` is empty or ``X`` and ``y`` disagree on the row count.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        n_samples = X.shape[0]
        if n_samples == 0:
            raise ValueError("X must contain at least one sample")
        if n_samples != y.shape[0]:
            raise ValueError("X and y must have the same number of rows")

        self.theta = np.zeros(X.shape[1])
        # Start every fit with a fresh epoch.
        self._order = None
        self._cursor = 0

        if self.method == "mini-batch":
            draw = self.mini_batch
        elif self.method == "sto":
            draw = self.sto
        else:
            # Unknown names fall back to full-batch descent, as before.
            draw = None

        # Work on a local copy so the hyper-parameter is not overwritten and
        # the estimator can be fitted again.
        loss_old = self.loss_old
        loss = 0
        iter_stop = 0
        squared_error_sum = 0.0
        rows_seen = 0

        for _ in range(self.max_iter):
            if draw is None:
                X_batch, y_batch = X, y
                epoch_done = True
            else:
                X_batch, y_batch = draw(X, y)
                epoch_done = self._cursor >= n_samples

            yhat = self.h_theta(X_batch)
            error = yhat - y_batch
            squared_error_sum += (error ** 2).sum()
            rows_seen += X_batch.shape[0]

            if epoch_done:
                # 1. early stopping, judged on the loss of a complete epoch
                loss_new = squared_error_sum / rows_seen
                loss = loss_new
                if np.abs(loss_old - loss_new) < self.tol:
                    print("iter_stop : ",iter_stop)
                    print("Mse train ",self.method,loss)
                    break
                loss_old = loss_new
                squared_error_sum = 0.0
                rows_seen = 0

            grad = self.gradient(X_batch, error)
            self.theta = self.theta - self.alpha * grad
            iter_stop += 1

        self.n_iter_ = iter_stop
        self.loss_ = loss
                


In [8]:
# fit() only special-cases method == "mini-batch" and method == "sto"; any
# other value (here the empty string) trains on the full training set.
model = LinearRegression(method = "")
model.fit(X_train,y_train)
# h_theta computes X @ theta, i.e. the model's predictions for the test rows.
yhat = model.h_theta(X_test)
# Mean squared error of those predictions against the held-out targets.
mse = model.mse(yhat, y_test)
print("Mse test :",mse)

586.2789265536722
537.7739983987132
495.9451286647459
459.1386902391497
426.2528248015551
396.5375585005529
369.4686728738915
344.66800603725017
321.8529918364682
300.80461910730594
281.34700341957193
263.3342866298275
246.64216688370277
231.16236042763987
216.79892502085744
203.46577021599612
191.0849286960462
179.5853195656826
168.90183318827346
158.9746293454603
149.74857970658582
141.1728103440539
133.20031567447862
125.78762511459526
118.89451003625686
112.48372262120094
106.52076079386708
100.9736550784194
95.81277431752211
91.01064791537024
86.54180275846733
82.3826133083357
78.51116360366332
74.9071200888879
71.55161432324195
68.42713473247152
65.51742665357584
62.8073999966883
60.28304391137582
57.93134789967021
55.74022886676212
53.698463643675524
51.7956265552513
50.0220316420352
48.36867917667882
46.82720614460119
45.38984038524662
44.04935811456592
42.799044571576495
41.63265755221313
40.54439361233792
39.52885673889308
38.581029303883945
37.69624513030092
36.8701645123367