In [None]:
## Linear Regression

In [None]:
import pandas as pd
import numpy as np
import math
from tqdm import tqdm 
from sklearn.model_selection import train_test_split

In [None]:
# The CSV has no header row. Without header=None pandas consumes the first
# record as column labels and silently drops it from the data, so every
# downstream statistic and fit would be computed on one observation too few.
# With header=None the columns are labelled 0..N-1; the last column is the target.
df = pd.read_csv('~/Desktop/housing.csv', header=None)
df.head()

Unnamed: 0,0.00632,18,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24
0,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
1,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
2,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
3,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2
4,0.02985,0.0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21,28.7


In [None]:
# Understand the data: summary statistics (count, mean, std, quartiles) per column.
df.describe()
# The columns span very different ranges (compare the min/max rows), so the
# features need to be normalized before fitting.

Unnamed: 0,0.00632,18,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24
count,505.0,505.0,505.0,505.0,505.0,505.0,505.0,505.0,505.0,505.0,505.0,505.0,505.0,505.0
mean,3.620663,11.350495,11.154257,0.069307,0.554728,6.284059,68.581584,3.79446,9.566337,408.459406,18.461782,356.594376,12.668257,22.529901
std,8.608569,23.343704,6.855868,0.254227,0.11599,0.703195,28.176371,2.107761,8.707553,168.629992,2.16252,91.367787,7.13995,9.205991
min,0.00906,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.08221,0.0,5.19,0.0,0.449,5.885,45.0,2.1,4.0,279.0,17.4,375.33,7.01,17.0
50%,0.25915,0.0,9.69,0.0,0.538,6.208,77.7,3.1992,5.0,330.0,19.1,391.43,11.38,21.2
75%,3.6782,12.5,18.1,0.0,0.624,6.625,94.1,5.2119,24.0,666.0,20.2,396.21,16.96,25.0
max,88.976,100.0,27.74,1.0,0.871,8.78,100.0,12.127,24.0,711.0,22.0,396.9,37.97,50.0


In [None]:
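# Functions vs. classes
# A function takes inputs and returns (or prints) a result.
# A class bundles shared characteristics (attributes) with the methods that use them,
# so every instance gets them automatically.
# ML models are usually written as classes.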

In [None]:
class LinearRegression:
    """Least-squares linear regression with a closed-form and a gradient-descent solver.

    ``fit`` splits the data into train/test sets, standardizes the features
    with the training statistics, prepends a bias column of ones to both
    sets, and solves for the weight vector ``w``.

    Parameters:
        X: 2-D feature array, one row per sample.
        y: 1-D target array aligned with the rows of ``X``.
        learningRate: step size used by gradient descent.
        tolerance: gradient descent stops once the SSE improves by less than this.
        maxIteration: upper bound on gradient-descent iterations.
        gd: when True, force gradient descent even if the normal equation is usable.

    Attributes populated by ``fit``:
        X_train, X_test: standardized design matrices (bias column first).
        y_train, y_test: matching targets.
        fullRank, lowRank: outcomes of ``checkMatrix`` / ``checkInvertibility``.
        w: fitted weights; ``w[0]`` multiplies the bias column.
        errors: SSE after each gradient-descent iteration (empty for closed form).
    """

    def __init__(self, X, y, learningRate, tolerance, maxIteration, gd=False) -> None:
        self.X = X
        self.y = y
        self.tolerance = tolerance
        self.learningRate = learningRate
        self.maxIteration = maxIteration
        self.gd = gd
        # Defined up front so the attributes exist even before fit() is called.
        self.w = None
        self.errors = []

    def splitTrainTest(self):
        """Return a fixed-seed 70/30 split as (X_train, X_test, y_train, y_test)."""
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y,
                                                            test_size=0.3,
                                                            random_state=1)
        return X_train, X_test, y_train, y_test

    def add_X0(self, X):
        """Prepend a column of ones (the bias/intercept term) to ``X``."""
        return np.column_stack([np.ones([X.shape[0], 1]), X])

    def normalize(self, X):
        """Standardize the training features and prepend the bias column.

        Returns:
            (X_norm, mean, std): the standardized design matrix and the
            per-column statistics to reuse on the test data. Columns with
            zero spread report ``std == 1`` so they map to zeros instead of
            NaN/inf.
        """
        mean = np.mean(X, 0)
        std = np.std(X, 0)
        # A constant column has std 0; dividing by it would poison the whole
        # fit with NaN/inf, so leave such columns centred but unscaled.
        std = np.where(std == 0, 1.0, std)
        X_norm = self.add_X0((X - mean) / std)
        return X_norm, mean, std

    def normalizetestdata(self, X, meanTrain, stdTrain):
        """Standardize ``X`` with the training mean/std (no bias column is added)."""
        return (X - meanTrain) / stdTrain

    def checkMatrix(self, X):
        """Set ``self.fullRank`` according to whether ``X`` has full rank."""
        X_rank = np.linalg.matrix_rank(X)

        if X_rank == min(X.shape[0], X.shape[1]):
            self.fullRank = True
            print("data is full rank")
        else:
            self.fullRank = False
            print("data is not full rank")

    def checkInvertibility(self, X):
        """Set ``self.lowRank`` to True when ``X`` has fewer rows than columns.

        In that case X^T X cannot be inverted and the normal equation is unusable.
        """
        self.lowRank = X.shape[0] < X.shape[1]

    def closeFormSolution(self, X, y):
        """Solve the normal equation (X^T X) w = X^T y and return ``w``."""
        # Solving the linear system avoids forming an explicit inverse, which
        # is both slower and numerically less accurate.
        return np.linalg.solve(X.T.dot(X), X.T.dot(y))

    def gradientDescent(self, X, y):
        """Run batch gradient descent on ``self.w`` in place.

        Stops after ``maxIteration`` steps, when the SSE improves by less
        than ``tolerance``, or when the SSE becomes non-finite. The SSE
        after every step is kept in ``self.errors``.
        """
        errors = []
        lastError = float('inf')

        for _ in tqdm(range(self.maxIteration)):
            self.w = self.w - self.learningRate * self.costDeriviation(X, y)

            cur = self.sse(X, y)
            errors.append(cur)

            # A NaN error never satisfies the tolerance test below, so without
            # this guard a diverged run would spin until maxIteration.
            if not np.isfinite(cur):
                print("The model stopped - error diverged; try a smaller learning rate")
                break

            diff = lastError - cur
            lastError = cur

            if diff < self.tolerance:
                print("The model stopped - no further improvement")
                break

        self.errors = errors

    def predict(self, X):
        """Return predictions ``X @ w``; ``X`` must already include the bias column."""
        return X.dot(self.w)

    def sse(self, X, y):
        """Sum of squared errors of the current weights on ``(X, y)``."""
        y_hat = self.predict(X)
        return ((y_hat - y) ** 2).sum()

    def costFunction(self, X, y):
        """Half the sum of squared errors."""
        return self.sse(X, y) / 2

    def costDeriviation(self, X, y):
        """Gradient of ``costFunction`` with respect to ``w``: X^T (X w - y)."""
        y_hat = self.predict(X)
        return (y_hat - y).dot(X)

    def fit(self):
        """Split, standardize and fit the model, then print the weights."""
        self.X_train, self.X_test, self.y_train, self.y_test = self.splitTrainTest()

        # Standardize with the training statistics only, so nothing about the
        # test set leaks into the fit.
        self.X_train, meanTrain, stdTrain = self.normalize(self.X_train)
        # The test matrix needs the same bias column as the training matrix,
        # otherwise it has one column fewer than w and predict() fails on it.
        self.X_test = self.add_X0(
            self.normalizetestdata(self.X_test, meanTrain, stdTrain)
        )

        self.checkInvertibility(self.X_train)
        self.checkMatrix(self.X_train)

        if self.fullRank and not self.lowRank and not self.gd:
            print('Solving using Normal equation - closed form solution')
            self.w = self.closeFormSolution(self.X_train, self.y_train)
        else:
            print('solving using gradient descent')
            self.w = np.zeros(self.X_train.shape[1], dtype=np.float64)  # start from all-zero weights
            self.gradientDescent(self.X_train, self.y_train)

        print(self.w)

In [None]:
# Every column except the last is a feature; the last column is the target.
# gd=False lets fit() use the closed-form solver when the data allows it.
regression = LinearRegression(
    X=df.values[:, :-1],
    y=df.values[:, -1],
    learningRate=1e-6,
    tolerance=5e-7,
    maxIteration=10_000,
    gd=False,
)

In [None]:
# Split, standardize and fit; prints the rank check, the solver used and the fitted weights.
regression.fit()

data is full rank
Solving using Normal equation - closed form solution
[ 2.23932011e+01 -8.74751473e-01  1.65120068e+00  1.63217681e-01
  8.39763200e-01 -2.82239646e+00  1.98885621e+00 -8.18033722e-03
 -3.76453682e+00  2.97801750e+00 -2.38452744e+00 -1.75115149e+00
  3.54441396e-01 -4.15293078e+00]


In [None]:
# Scratch calculation: square root of 40.
# NOTE(review): nothing else visible in this notebook uses this value — confirm whether the cell is still needed.
40**0.5

6.324555320336759