# Import the libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_diabetes
from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_error, r2_score

import warnings
warnings.filterwarnings('ignore')

# Load the dataset

In [2]:
# return_X_y=True makes load_diabetes hand back the (features, target) pair
# directly instead of a Bunch container.
X, y = load_diabetes(return_X_y=True)

In [3]:
X.shape, y.shape

((442, 10), (442,))

In [4]:
X

array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
         0.01990749, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
        -0.06833155, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
         0.00286131, -0.02593034],
       ...,
       [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
        -0.04688253,  0.01549073],
       [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
         0.04452873, -0.02593034],
       [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
        -0.00422151,  0.00306441]])

In [5]:
y

array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
        69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
        68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
        87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
       259.,  53., 190., 142.,  75., 142., 155., 225.,  59., 104., 182.,
       128.,  52.,  37., 170., 170.,  61., 144.,  52., 128.,  71., 163.,
       150.,  97., 160., 178.,  48., 270., 202., 111.,  85.,  42., 170.,
       200., 252., 113., 143.,  51.,  52., 210.,  65., 141.,  55., 134.,
        42., 111.,  98., 164.,  48.,  96.,  90., 162., 150., 279.,  92.,
        83., 128., 102., 302., 198.,  95.,  53., 134., 144., 232.,  81.,
       104.,  59., 246., 297., 258., 229., 275., 281., 179., 200., 200.,
       173., 180.,  84., 121., 161.,  99., 109., 115., 268., 274., 158.,
       107.,  83., 103., 272.,  85., 280., 336., 281., 118., 317., 235.,
        60., 174., 259., 178., 128.,  96., 126., 28

# Train Test Split

In [6]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split (and therefore every metric below) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((353, 10), (89, 10), (353,), (89,))

# Linear Regression

In [8]:
lr = LinearRegression()

## Fit the model

In [9]:
lr.fit(X_train, y_train)

## Prediction on test data

In [10]:
y_pred = lr.predict(X_test)

## Metrics

### MAE

In [11]:
print('MAE: ', mean_absolute_error(y_test, y_pred))

MAE:  42.79409467959993


### MSE

In [12]:
print('MSE: ', mean_squared_error(y_test, y_pred))

MSE:  2900.1936284934804


### RMSE

In [13]:
print('RMSE: ', root_mean_squared_error(y_test, y_pred))

RMSE:  53.85344583676592


### R2 Score

In [14]:
r2 = r2_score(y_test, y_pred)
print('R2 score: ', r2)

R2 score:  0.4526027629719196


### Adjusted R2 Score

In [15]:
# Adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - 1 - p), with n = number of test
# samples and p = number of features. Both are read from X_test so the value
# stays correct if test_size or the dataset changes.
n_samples, n_features = X_test.shape
print('Adjusted R2 score: ', (1 - ((1 - r2) * (n_samples - 1)) / (n_samples - 1 - n_features)))

Adjusted R2 score:  0.38242363001960167


## Slopes and Intercept

In [16]:
lr.coef_

array([  37.90402135, -241.96436231,  542.42875852,  347.70384391,
       -931.48884588,  518.06227698,  163.41998299,  275.31790158,
        736.1988589 ,   48.67065743])

In [17]:
lr.intercept_

151.34560453985995

# Custom Linear Regression

In [18]:
class MyLR:
    """Ordinary least-squares linear regression with an intercept.

    Attributes
    ----------
    coef_ : ndarray or None
        One slope per feature column; ``None`` until ``fit`` is called.
    intercept_ : float or None
        Bias term; ``None`` until ``fit`` is called.
    """

    def __init__(self):
        self.coef_ = None
        self.intercept_ = None

    def fit(self, X_train, y_train):
        """Estimate the intercept and slopes by least squares.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features)
        y_train : array-like of shape (n_samples,)

        Returns
        -------
        self : MyLR
            The fitted estimator, so calls can be chained.
        """
        X_train = np.asarray(X_train, dtype=float)
        y_train = np.asarray(y_train, dtype=float)

        # Prepend a column of ones so the first fitted parameter is the intercept.
        design = np.insert(X_train, 0, 1, axis=1)

        # Solve the least-squares problem directly instead of forming and
        # inverting X^T X: explicit inversion squares the condition number and
        # breaks down on collinear features, whereas lstsq returns the
        # minimum-norm solution even when the design matrix is rank deficient.
        betas, *_ = np.linalg.lstsq(design, y_train, rcond=None)
        self.intercept_ = betas[0]
        self.coef_ = betas[1:]
        return self

    def predict(self, X_test):
        """Return predictions ``X_test @ coef_ + intercept_``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.coef_ is None or self.intercept_ is None:
            raise RuntimeError("MyLR is not fitted yet; call fit() before predict().")
        return np.dot(X_test, self.coef_) + self.intercept_

## Fit the model

In [19]:
# NOTE: this rebinds `lr`, which previously held the fitted scikit-learn
# LinearRegression; from here on `lr` refers to the custom MyLR model only.
lr = MyLR()

In [20]:
lr.fit(X_train, y_train)

## Prediction on test data

In [21]:
y_pred = lr.predict(X_test)

## Metrics

### MAE

In [22]:
print('MAE: ', mean_absolute_error(y_test, y_pred))

MAE:  42.79409467959996


### MSE

In [23]:
print('MSE: ', mean_squared_error(y_test, y_pred))

MSE:  2900.193628493485


### RMSE

In [24]:
print('RMSE: ', root_mean_squared_error(y_test, y_pred))

RMSE:  53.853445836765964


### R2 Score

In [25]:
r2 = r2_score(y_test, y_pred)
print('R2 score: ', r2)

R2 score:  0.4526027629719187


### Adjusted R2 Score

In [26]:
# Adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - 1 - p), with n = number of test
# samples and p = number of features. Both are read from X_test so the value
# stays correct if test_size or the dataset changes.
n_samples, n_features = X_test.shape
print('Adjusted R2 score: ', (1 - ((1 - r2) * (n_samples - 1)) / (n_samples - 1 - n_features)))

Adjusted R2 score:  0.38242363001960056


## Slopes and Intercept

In [27]:
lr.coef_

array([  37.90402135, -241.96436231,  542.42875852,  347.70384391,
       -931.48884588,  518.06227698,  163.41998299,  275.31790158,
        736.1988589 ,   48.67065743])

In [28]:
lr.intercept_

151.34560453985995