# Multiple Linear Regression

## Multiple Regression Class - OLS
### OLS = Ordinary Least Squares

We will be using the built-in diabetes dataset that ships with scikit-learn.

In [1]:
import numpy as np
from sklearn.datasets import load_diabetes

In [2]:
# return_X_y=True makes load_diabetes hand back (features, target) directly
# instead of a Bunch object: x is the feature matrix, y the target vector.
x, y = load_diabetes(return_X_y=True)

In [3]:
# peek at the feature matrix (numpy abbreviates the printout with '...')
x

array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
         0.01990749, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
        -0.06833155, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
         0.00286131, -0.02593034],
       ...,
       [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
        -0.04688253,  0.01549073],
       [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
         0.04452873, -0.02593034],
       [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
        -0.00422151,  0.00306441]])

In [4]:
# (number of rows/samples, number of feature columns)
x.shape

(442, 10)

### Linear Regression using SKlearn

In [5]:
# train/test split: hold out 20% of the rows for testing (test_size=0.2);
# random_state is fixed so that re-running the cell gives the same split
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2)

In [6]:
from sklearn.linear_model import LinearRegression

# reference model: scikit-learn's linear regression, fitted on the training split only
reg = LinearRegression()
reg.fit(x_train, y_train)

In [7]:
# predictions of the sklearn model for the held-out test rows
y_pred = reg.predict(x_test)

In [8]:
from sklearn.metrics import r2_score

# R^2 of the sklearn model on the test split (true values first, predictions second)
r2_score(y_test, y_pred)

0.4399338661568969

In [9]:
# the fitted slope coefficients (beta1 ... betan), one per feature column;
# the intercept beta0 is stored separately in reg.intercept_
reg.coef_

array([  -9.15865318, -205.45432163,  516.69374454,  340.61999905,
       -895.5520019 ,  561.22067904,  153.89310954,  126.73139688,
        861.12700152,   52.42112238])

In [20]:
# beta0 value, i.e. the intercept of the fitted sklearn model
reg.intercept_

151.88331005254167

### Making our Own Regression Class

In [25]:
# the raw training matrix does not have a column of '1's as its first column,
# so it has exactly one column per feature
x_train.shape

(353, 10)

In [26]:
# np.insert(arr, 0, 1, axis=1) returns a copy with the value 1 inserted as a new
# first column (x_train itself is not modified), hence one extra column
np.insert(x_train, 0, 1, axis=1).shape

(353, 11)

In [27]:
class My_LR:
    """Ordinary least squares (OLS) linear regression with an intercept.

    The model is ``y = beta0 + beta1*x1 + ... + betan*xn``. The betas are
    chosen to minimise the squared error on the training data. For a
    full-rank design matrix X (training features with a leading column of
    ones) this is the normal-equation solution
    ``beta = (X^T X)^-1 X^T y`` (the formula from the Day 50_2 notes).

    Attributes (both ``None`` until ``fit`` has been called):
        coef_: the slopes beta1..betan, one per column of ``x_train``.
        intercept_: the bias term beta0.
    """

    def __init__(self) -> None:
        self.coef_ = None
        self.intercept_ = None

    def fit(self, x_train, y_train):
        """Estimate the intercept and coefficients from the training data.

        Args:
            x_train: 2-D array-like of shape (n_samples, n_features).
            y_train: array-like of targets with n_samples rows.

        Returns:
            The fitted estimator itself, so calls can be chained
            (``My_LR().fit(x, y).predict(x)``).
        """
        # add the first column with '1' so that beta0 is learnt as the intercept
        x_design = np.insert(np.asarray(x_train, dtype=float), 0, 1, axis=1)
        targets = np.asarray(y_train, dtype=float)

        # Solve the least-squares problem directly instead of forming and
        # inverting X^T X: the result is identical for well-conditioned data,
        # but collinear features (singular X^T X) no longer raise LinAlgError
        # or produce garbage -- lstsq returns the minimum-norm solution.
        betas = np.linalg.lstsq(x_design, targets, rcond=None)[0]

        # assign values: first entry is the intercept, the rest are the slopes
        self.intercept_ = betas[0]
        self.coef_ = betas[1:]
        return self

    def predict(self, x_test):
        """Return the model's predictions for the rows of ``x_test``.

        Args:
            x_test: array-like with the same feature columns used in ``fit``
                (without the leading column of ones).

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if self.coef_ is None or self.intercept_ is None:
            raise RuntimeError("My_LR is not fitted yet; call fit() before predict().")
        y_pred = np.dot(x_test, self.coef_) + self.intercept_
        return y_pred

In [33]:
# fit our own class on the same training split and predict the same test rows
# NOTE: this overwrites the y_pred produced earlier by the sklearn model
lr = My_LR()
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)
y_pred.shape

(89,)

In [30]:
# R^2 of our own class on the test split, to compare with the sklearn model's score
r2_score(y_test, y_pred)

0.439933866156897

In [34]:
# the beta values (slopes beta1 ... betan) found by our own class,
# to compare with reg.coef_ from sklearn
lr.coef_

array([  -9.15865318, -205.45432163,  516.69374454,  340.61999905,
       -895.5520019 ,  561.22067904,  153.89310954,  126.73139688,
        861.12700152,   52.42112238])

In [35]:
# the beta0 value (intercept) found by our own class, to compare with reg.intercept_
lr.intercept_

151.88331005254167