In [1]:
%config IPCompleter.use_jedi = False
%config Completer.evaluation = 'limited'
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np 
import pandas as pd 
import plotly.express as px 
import plotly.graph_objects as go 

In [3]:
# Import the Diabetes dataset from sklearn
from sklearn.datasets import  load_diabetes

In [4]:
X , y = load_diabetes(return_X_y = True)

In [5]:
X

array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
         0.01990749, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
        -0.06833155, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
         0.00286131, -0.02593034],
       ...,
       [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
        -0.04688253,  0.01549073],
       [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
         0.04452873, -0.02593034],
       [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
        -0.00422151,  0.00306441]], shape=(442, 10))

In [6]:
y 

array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
        69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
        68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
        87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
       259.,  53., 190., 142.,  75., 142., 155., 225.,  59., 104., 182.,
       128.,  52.,  37., 170., 170.,  61., 144.,  52., 128.,  71., 163.,
       150.,  97., 160., 178.,  48., 270., 202., 111.,  85.,  42., 170.,
       200., 252., 113., 143.,  51.,  52., 210.,  65., 141.,  55., 134.,
        42., 111.,  98., 164.,  48.,  96.,  90., 162., 150., 279.,  92.,
        83., 128., 102., 302., 198.,  95.,  53., 134., 144., 232.,  81.,
       104.,  59., 246., 297., 258., 229., 275., 281., 179., 200., 200.,
       173., 180.,  84., 121., 161.,  99., 109., 115., 268., 274., 158.,
       107.,  83., 103., 272.,  85., 280., 336., 281., 118., 317., 235.,
        60., 174., 259., 178., 128.,  96., 126., 28

In [7]:
X.shape

(442, 10)

In [8]:
y.shape

(442,)

<h2> Make the Train and test set

In [9]:
from sklearn.model_selection import train_test_split

X_train , X_test , y_train , y_test = train_test_split(X , y , test_size = 0.2 , random_state = 2) 

In [10]:
print(X_train.shape)
print(y_train.shape)

(353, 10)
(353,)


In [11]:
print(X_test.shape)
print(y_test.shape)

(89, 10)
(89,)


<h2> Using sklearn Linear Regression Model

In [12]:
from sklearn.linear_model import LinearRegression

In [13]:
# make the model object 
sk_lr = LinearRegression() 

In [14]:
# Let's train the model
sk_lr.fit(X_train , y_train)

In [15]:
# Let's check the theta's. There will be total 11 theta 0 to 10
sk_lr.coef_

array([  -9.15865318, -205.45432163,  516.69374454,  340.61999905,
       -895.5520019 ,  561.22067904,  153.89310954,  126.73139688,
        861.12700152,   52.42112238])

In [16]:
# Let's check the offset(theta 0)
sk_lr.intercept_

np.float64(151.88331005254167)

In [17]:
# Let's do the prediction
y_pred = sk_lr.predict(X_test) 

In [18]:
# Let's check the metrices
from sklearn.metrics import  r2_score , mean_absolute_error , mean_squared_error

print(f"mean_absolute_error: {mean_absolute_error(y_test , y_pred)}")
print(f"mean_squared_error: {mean_squared_error(y_test , y_pred)}") 
print(f"r2_score: {r2_score(y_test , y_pred)}")

mean_absolute_error: 45.21303419046903
mean_squared_error: 3094.4566715660626
r2_score: 0.4399338661568968


<h2> Let's Make our own Multi-variable Linear Regression class using OLS(Ordinary Least Squares)

In [29]:
class MyLinearRegression1:
    # Constructor
    def __init__(self):
        self.coef_ = None  # Coefficients for the features
        self.intercept_ = None # Intercept term (theta_0)

    # Make the fit method
    def fit(self , X_train , y_train): 
        # X_train is a matrix(ndarray) of n x m and y_train is ndarray of shape (n,) or (n,1)
        # n is the number of rows and m is number of features(dimensions)

        # In augmented X matrix we have first column as coef of theta0. So insert a new col with value 1
        # Add a new col at X's 0-th position with value 1
        # X_train = np.insert(X_train , 0 , 1 , axis = 1) # Adding the intercept term

        # No need to add intercept mannually - we will keep intercept separate
        
        # our task is to find the theta values using the euqation
        # theta = ((transpose(X) * X)^-1) * transpose(X) * y
        
        # find the transpose of X
        X_transpose = np.transpose(X_train)
        
        # Do inverse(X_transpose * X)
        A = np.linalg.pinv(np.dot(X_transpose , X_train))
        
        # Find B = transpose(X) * y 
        B = np.dot(X_transpose , y_train)
        
        theta = np.dot(A , B) 

        # Final coefficient vector (excluding intercept)
        self.coef_ = theta

        # Calculate the intercept 
        # inercept = mean(y) - mean(X * coef_)
        self.intercept_ = np.mean(y_train) - np.mean(np.dot(X_train , self.coef_))

    # Make the predict method 
    def predict(self , X_test):
        # y = X * coef + intercept
        y_pred = np.dot(X_test , self.coef_) + self.intercept_
        return y_pred

<h2> Version 2: Using Augmented Feature Matrix (Intercept Added in X)

In [30]:
class MyLinearRegression2:
    # Constructor
    def __init__(self):
        self.coef_ = None       # Coefficients for the features (excluding intercept)
        self.intercept_ = None  # Intercept term (theta_0)

    # Make the fit method
    def fit(self , X_train , y_train): 
        # X_train is a matrix (ndarray) of n x m and y_train is ndarray of shape (n,) or (n,1)
        # n is the number of rows and m is the number of features (dimension)

        # In augmented X matrix, we have the first column as coef of theta_0.
        # So insert a new column with value 1 at the 0-th index to represent the intercept term
        X_train_augmented = np.insert(X_train , 0 , 1 , axis = 1)  # Adding the intercept term
        
        # Our task is to find the theta values using the equation:
        # theta = ((transpose(X) * X)^-1) * transpose(X) * y

        # Find the transpose of X
        X_transpose = np.transpose(X_train_augmented)

        # Compute the pseudo-inverse for numerical stability: inverse(X^T * X)
        A = np.linalg.pinv(np.dot(X_transpose , X_train_augmented))

        # Compute (X^T * y)
        B = np.dot(X_transpose , y_train)

        # Final theta vector (intercept + coefficients)
        theta = np.dot(A , B) 

        # Now the first value of theta is theta_0 (intercept)
        self.intercept_ = theta[0]

        # And the rest of the values are coefficients for the features
        self.coef_ = theta[1:]

    # Make the predict method 
    def predict(self , X_test):
        # Add the intercept part to X_test
        X_test_augmented = np.insert(X_test , 0 , 1 , axis = 1)

        # Create the full theta vector [intercept, coef_]
        full_theta = np.concatenate(([self.intercept_], self.coef_))

        # y = X * theta
        y_pred = np.dot(X_test_augmented, full_theta)

        return y_pred

In [19]:
X_train.shape

(353, 10)

In [20]:
type(X_train)

numpy.ndarray

In [21]:
y_train.shape

(353,)

In [22]:
type(y_train)

numpy.ndarray

In [31]:
# Create an object of version-1 model
my_linear_reg_v1 = MyLinearRegression1() 

In [32]:
# Train the version-1 model
my_linear_reg_v1.fit(X_train , y_train)

In [33]:
# predict using version-1
y_pred_v1 = my_linear_reg_v1.predict(X_test)

In [34]:
# Create an object of version-2 model
my_linear_reg_v2 = MyLinearRegression2() 

In [35]:
# Train the version-2 model
my_linear_reg_v2.fit(X_train , y_train)

In [36]:
# predict using version-2
y_pred_v2 = my_linear_reg_v2.predict(X_test)

<h2> R2 Score of sklearn model and my model v1 , v2

In [37]:
print(f'sklean model R2 score: {r2_score(y_test , y_pred)}')
print(f"my_linear_reg_v1 r2_score: {r2_score(y_test , y_pred_v1)}")
print(f"my_linear_reg_v2 r2_score: {r2_score(y_test , y_pred_v2)}")

sklean model R2 score: 0.4399338661568968
my_linear_reg_v1 r2_score: 0.3641990212219125
my_linear_reg_v2 r2_score: 0.43993386615689667
