## Group 6:
### AU1741001: Akash Tike
### AU1741011: Smit Mandavia
### AU1741068: Parth Maniyar
### AU1741095: Shaunak Vyas

# Multiple Linear Regression using Normal Equation Method : 

<img src="image1.png">

In [14]:
import numpy as np
from sklearn import datasets, metrics
from numpy.linalg import inv, pinv, LinAlgError

X, y = datasets.load_boston(return_X_y=True) # Loading Data from boston data set with X and y separate


X  = np.append( np.ones([len(X),1]) ,X,axis=1) # Adding dummy column with all values one

Split_ratio = 0.8  # Split ratio to split data into train and test data
Split_index = int(len(X)*Split_ratio) # Index from which it will split
 
#Splitiing data    
X_train = X[0:Split_index] 
y_train = y[0:Split_index]

print("X_train's shape :" , X_train.shape)
print("y_train's shape :" , y_train.shape)

X_test = X[Split_index:len(X)]
y_test = y[Split_index:len(y)]

print("X_test's shape :" , X_test.shape)
print("y_test's shape :" , y_test.shape)

# Finding theta using given equation
try: 
    theta = inv( np.dot(X_train.T, X_train ))
    theta = np.dot( theta, X_train.T )
    theta = np.dot( theta, y_train)    
except LinAlgError: # if inverse do not exist use psuedo inverse
    theta = pinv( np.dot(X_train.T, X_train ))
    theta = np.dot( theta, X_train.T )
    theta = np.dot( theta, y_train)    
    
print("Theta : ", theta)
print("Shape of Theta", theta.shape)   

predictions=np.dot(theta,X_test.T)  # Finding predictions of test data using theta

print("MAE:", metrics.mean_absolute_error(y_true=y_test,y_pred=predictions)) # Finding mean absolute error
print("MSE:", metrics.mean_squared_error(y_true=y_test,y_pred=predictions))  # Finding mean squared error 

X_train's shape : (404, 14)
y_train's shape : (404,)
X_test's shape : (102, 14)
y_test's shape : (102,)
Theta :  [ 3.00771669e+01 -2.02135297e-01  4.41276341e-02  5.26739364e-02
  1.88474315e+00 -1.49281487e+01  4.76038673e+00  2.88734527e-03
 -1.30025278e+00  4.61661953e-01 -1.55434673e-02 -8.11632369e-01
 -1.97174433e-03 -5.32273431e-01]
Shape of Theta (14,)
MAE: 4.7300172509620095
MSE: 32.79986268021572


### Multiple Regression for Streaming Data (using Stochastic Gradient Descent)
works as a linear regression when number of features is one in dataset


In [1]:
# Libraries section

import numpy as np
from sklearn import datasets,metrics
from sklearn.preprocessing import StandardScaler

In [2]:
# reading the dataset

# train test splitting ratio
split_ratio = 0.8

x,y = datasets.load_boston(return_X_y = True)

# adding dummy feature to the dependent variable matrix
x = np.append(np.ones([len(x),1]),x,axis=1)

# x is now feature matrix with dummy variable
print("Shape of x,y: ",x.shape,y.shape)

# splitting into training and testing
x_train = x[:int(split_ratio*x.shape[0])]
y_train = y[:int(split_ratio*x.shape[0])]

x_test  = x[int(split_ratio*x.shape[0]):]
y_test  = y[int(split_ratio*x.shape[0]):]

print("Shape of x_train, y_train: ",x_train.shape,y_train.shape)
print("Shape of x_test, y_test: ",x_test.shape,y_test.shape)

Shape of x,y:  (506, 14) (506,)
Shape of x_train, y_train:  (404, 14) (404,)
Shape of x_test, y_test:  (102, 14) (102,)


In [3]:
# normalization of data
# as different ranges results into nan error in gradient descent in some cases

# cretes an object of StandardScaler
standard_scaler=StandardScaler()
standard_scaler.fit(x_train[:,1:])

# we should not normalize the first column, as it is just dummy variable
x_train[:,1:]= standard_scaler.transform(x_train[:,1:])
x_test[:,1:] = standard_scaler.transform(x_test[:,1:])

In [4]:
# function to generate stream data from training set
def StreamGenerator():

# the function iterates through the training set and yields one entry each time
    for i in range(x_train.shape[0]):
        yield(x_train[i],y_train[i])

In [5]:
# training the model through stream

stream = StreamGenerator()

# number of thetas = number of features
theta  = np.random.uniform(0,1,size=(x_train.shape[1]))

#learning rate
alpha = 0.01

# here x,y is an entry of training set, received through stream
for (x,y) in stream:
    
    # calculating h(x,theta) = x0*theta0 + x1*theta1 + ... + xn*thetan
    y_pred = np.dot(x, theta)
    
    # error in predicted and actual value
    err = y_pred - y

    # gradient of error with respect to x
    grad = err*x.T
    
    # updating the value of theta
    theta = theta - alpha*grad   

In [6]:
# testing the model with test dataset

# predicting on test dataset
y_pred = np.dot(x_test, theta)

# calculating error metrics
mae = metrics.mean_absolute_error(y_true = y_test, y_pred = y_pred)
mse = metrics.mean_squared_error(y_true = y_test, y_pred = y_pred)

print("Mean Absolute Error: ", mae)
print("Mean Squared Error: ", mse)

Mean Absolute Error:  6.4208864504499505
Mean Squared Error:  54.4115788973508


# Simple and Multiple Linear Regression using Gradient Descent (batch data)

Note: This is the generalized algorithm for multiple linear regression if number of attributes is equal to 1 then it is converted simple linear regression

In [7]:
import numpy as np
from sklearn import datasets, metrics
from sklearn.preprocessing import StandardScaler

# reading the datase and splitting into input and output variable

# train test splitting ratio
split_ratio = 0.8

x,y = datasets.load_boston(return_X_y = True)

# adding dummy feature to the dependent variable matrix
x_temp = np.zeros((x.shape[0],x.shape[1]+1))
x_temp[:,0] = np.ones((x_temp.shape[0]))
x_temp[:,1:] = x
x = x_temp

# x is now feature matrix with dummy variable
print("Shape of x,y: ",x.shape,y.shape)

# splitting into training and testing
x_train = x[:int(split_ratio*x.shape[0])]
x_test  = x[int(split_ratio*x.shape[0]):]
y_train = y[:int(split_ratio*x.shape[0])]
y_test  = y[int(split_ratio*x.shape[0]):]

print("Shape of x_train, y_train: ",x_train.shape,y_train.shape)
print("Shape of x_test, y_test: ",x_test.shape,y_test.shape)

Shape of x,y:  (506, 14) (506,)
Shape of x_train, y_train:  (404, 14) (404,)
Shape of x_test, y_test:  (102, 14) (102,)


In [8]:
#scaling of input variable 
scaler=StandardScaler()
scaler.fit(x_train[:,1:])
x_train[:,1:]=scaler.transform(x_train[:,1:])
x_test[:,1:]=scaler.transform(x_test[:,1:])

image for understanding and explanation purpose

<img src="image2.png">

<img src="image3.png">

In [9]:
m=x_train.shape[0]
n=x_train.shape[1]

#number of iterations as stoping criteraia 
niterations=1000
alpha=0.01#learning rate


#intializing theta randomly between 0 and 1
theta=np.random.uniform(0,1,n)
print("Type of theta:", type(theta), "Shape of Theta:", theta.shape)

#algorithm as shown above
for i in range(niterations):
    update=np.zeros(n)
    error=np.dot(x_train,theta) - y_train #h(x(i))-y(i) 
    for j in range(n):
        update[j]=np.sum(error*(x_train.T)[j])
    theta = theta - ((1/m)*(alpha)*update)

    
print("Theta:",theta)

predictions=np.dot(x_test, theta) #The Equation implemented as shown above

print("Mean Absolute Error:", metrics.mean_absolute_error(y_test,predictions))
print("Mean Square Error:", metrics.mean_squared_error(y_test,predictions))

Type of theta: <class 'numpy.ndarray'> Shape of Theta: (14,)
Theta: [ 2.41747309e+01 -1.09931928e+00  8.74198106e-01 -2.40871635e-02
  6.12968681e-01 -1.23949660e+00  3.63987990e+00 -6.17872405e-04
 -2.53808433e+00  1.86958870e+00 -9.96710499e-01 -1.67564644e+00
  2.97907673e-02 -3.59762191e+00]
Mean Absolute Error: 4.335812804045984
Mean Square Error: 28.03930801527921


# Incremental Mathematical Stream Regression

In [10]:
# Libraries section

import pandas as pd
import numpy as np
from sklearn import datasets
from numpy.linalg import inv, pinv, LinAlgError
from sklearn import metrics
import math    

In [11]:
# reading the dataset

# train test splitting ratio
split_ratio = 0.8

x,y = datasets.load_boston(return_X_y = True)

# adding dummy feature to the dependent variable matrix
x = np.append(np.ones([len(x),1]),x,axis=1)

# x is now feature matrix with dummy variable
print("Shape of x,y: ",x.shape,y.shape)

# splitting into training and testing
x_train = x[:int(split_ratio*x.shape[0])]
y_train = y[:int(split_ratio*x.shape[0])]

x_test  = x[int(split_ratio*x.shape[0]):]
y_test  = y[int(split_ratio*x.shape[0]):]

print("Shape of x_train, y_train: ",x_train.shape,y_train.shape)
print("Shape of x_test, y_test: ",x_test.shape,y_test.shape)

Shape of x,y:  (506, 14) (506,)
Shape of x_train, y_train:  (404, 14) (404,)
Shape of x_test, y_test:  (102, 14) (102,)


In [12]:
# Incremental Mathematical Stream Regression

# Initializing Synopsis
M = np.zeros((x_test.shape[1],x_test.shape[1])) # p*p

V = np.zeros(x_test.shape[1]) # p*1

# Initializing Beta
beta = np.zeros(x_test.shape[1]) # p*1

# Taking Batch Size
length = 101

if(x_train.shape[0] % length != 0):
    
    print("INVALID LENGTH")
    print("PLEASE CHOOSE EXACT DIVISOR")
else:
    times = int(x_train.shape[0]/length)

    for i in range(times):

        # Fetching Group Data
        x = x_train[i*length:(i+1)*length,:]
        y = y_train[i*length:(i+1)*length]

        try:
            xtx_m_inv = inv(M + np.dot(x.T,x)) #inverse of (M+xtx)
        except LinAlgError:
            xtx_m_inv = pinv(M + np.dot(x.T,x)) #pseudo inverse of (M+xtx)

        #xty
        xty = np.dot(x.T,y)

        #(M+xtx)^-1(V+xty)
        beta = np.dot(xtx_m_inv,(V + xty)) 

        #storing synopsis
        M = M + np.dot(x.T,x) 
        V = V + xty

    # testing the model with test dataset

    # predicting on test dataset
    y_pred = np.dot(x_test, beta)

    # calculating error metrics
    mae = metrics.mean_absolute_error(y_true = y_test, y_pred = y_pred)
    mse = metrics.mean_squared_error(y_true = y_test, y_pred = y_pred)

    print("Beta: ", beta)
    print("")
    print("Mean Absolute Error: ", mae)
    print("Mean Squared Error: ", mse)

Beta:  [ 3.00771669e+01 -2.02135297e-01  4.41276341e-02  5.26739364e-02
  1.88474315e+00 -1.49281487e+01  4.76038673e+00  2.88734527e-03
 -1.30025278e+00  4.61661953e-01 -1.55434673e-02 -8.11632369e-01
 -1.97174433e-03 -5.32273431e-01]

Mean Absolute Error:  4.730017250961159
Mean Squared Error:  32.79986268020979


# Approximate Stream Regression

In [13]:
# Approximate Stream Regression

# Initializing Beta
beta = np.zeros(x_test.shape[1]) # p*1

# Taking Beta Previous as 0
beta_previous = 0

# Initializing Alpha
alpha = 0.5

# Taking Batch Size
length = 101

if(x_train.shape[0] % length != 0):
    
    print("INVALID LENGTH")
    print("PLEASE CHOOSE EXACT DIVISOR")
else:
    times = int(x_train.shape[0]/length)

    for i in range(times):

        # Fetching Group Data
        x = x_train[i*length:(i+1)*length,:]
        y = y_train[i*length:(i+1)*length]

        try:
            xtx_inv = inv(np.dot(x.T,x)) #inverse of (xtx)
        except LinAlgError:
            xtx_inv = pinv(np.dot(x.T,x)) #pseudo inverse of (xtx)

        #xty
        xty = np.dot(x.T,y)

        #(xtx)^-1(xty)
        beta_current = np.dot(xtx_inv,xty)

        # Calculating Beta
        beta = (1-alpha)*beta_current + alpha*(beta_previous)

        # Saving this Beta as Previous Beta
        beta_previous = beta

    # testing the model with test dataset

    # predicting on test dataset
    y_pred = np.dot(x_test, beta)

    # calculating error metrics
    mae = metrics.mean_absolute_error(y_true = y_test, y_pred = y_pred)
    mse = metrics.mean_squared_error(y_true = y_test, y_pred = y_pred)

    print("Beta: ", beta)
    print("")
    print("Mean Absolute Error: ", mae)
    print("Mean Squared Error: ", mse)

Beta:  [ 4.93733262e+01  5.16559721e-01  1.91759478e-02  9.12150224e-02
  4.73439451e+00 -5.03142784e+01  2.98961900e+00  4.81040425e-03
 -1.02804698e+00  6.79530282e-01 -2.09793804e-02 -7.35755924e-01
  9.27697627e-03 -4.93837863e-01]

Mean Absolute Error:  6.3859733088487705
Mean Squared Error:  89.28080048157986
