# Linear Regression Practice

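The goal of this notebook is to re-create scikit-learn's `LinearRegression` model from scratch for the single-feature case (simple linear regression, $y = a + bx$) and to check that both implementations produce the same predictions.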

## Linear Regression with scikit-learn

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


In [None]:
# Fixed seed so the train/test split -- and therefore every metric and plot
# below -- is identical on each Restart & Run All.
SEED = 42

# Load the diabetes dataset
diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True)

# Use only one feature (column 2); np.newaxis keeps the result 2-D with a
# single column, which is the layout LinearRegression.fit expects
diabetes_X = diabetes_X[:, np.newaxis, 2]

# Split the data into training/testing sets (10% held out for testing)
x_train, x_test, y_train, y_test = train_test_split(
    diabetes_X, diabetes_y, test_size=0.1, random_state=SEED
)

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(x_train, y_train)

# Make predictions using the testing set
y_pred = regr.predict(x_test)


In [None]:
# Fitted slope(s) of the scikit-learn model
sk_coefficients = regr.coef_
print("Coefficients: \n{}".format(sk_coefficients))

# Mean squared error on the held-out test set
sk_mse = mean_squared_error(y_test, y_pred)
print("Mean squared error: {:.2f}".format(sk_mse))

# Coefficient of determination (R^2) on the test set: 1 is perfect prediction
sk_r2 = r2_score(y_test, y_pred)
print("Coefficient of determination: {:.2f}".format(sk_r2))


In [None]:
# Plot outputs: held-out observations against the fitted regression line.
# The explicit Axes interface is used so that the title, axis labels and
# legend are attached to this figure and it can be read on its own.
fig, ax = plt.subplots()
ax.scatter(x_test, y_test, color="red", marker="+", label="Test data")
ax.plot(x_test, y_pred, color="blue", linewidth=0.75, label="scikit-learn fit")
ax.set(
    title="Linear regression on one diabetes feature (test set)",
    xlabel="Feature value (column 2 of the diabetes data)",
    ylabel="Target",
)
ax.legend()
plt.show()


## Linear Regression from scratch

In [None]:
# Define functions corresponding to calculation of Linear Regression by hand
# Wrap functions in a class


class MyLinearRegression:
    """A simple (single-feature) least-squares Linear Regression model.

    The fitted line is y = a + b*x. After ``fit`` the instance exposes:

    r: Pearson's correlation coefficient between x and y
    b: Slope of the regression line
    a: y-intercept of the regression line

    All three are ``None`` until ``fit`` has been called.
    """
    def __init__(self):
        self.r = None
        self.b = None
        self.a = None


    @staticmethod
    def _as_paired_vectors(x, y):
        """Returns x and y as 1-D float arrays of equal length.

        A single-column 2-D array (the layout scikit-learn uses for one
        feature) is flattened to 1-D. Raises ValueError when x and y do not
        contain the same number of values, which also rejects multi-column x.
        """
        x_vec = np.asarray(x, dtype=float).ravel()
        y_vec = np.asarray(y, dtype=float).ravel()
        if x_vec.size != y_vec.size:
            raise ValueError(
                "x and y must contain the same number of values "
                "(got {} and {}); only a single feature is supported".format(
                    x_vec.size, y_vec.size
                )
            )
        return x_vec, y_vec


    def _calculate_r(self, x, y):
        """Calculates Pearson Correlation Coefficient (r).

        Returns 0.0 when x or y has zero variance: r is mathematically
        undefined there, and 0.0 yields the correct zero slope for constant y.
        Raises ValueError when x and y differ in length.
        """
        x_vec, y_vec = self._as_paired_vectors(x, y)
        x_dev = x_vec - x_vec.mean()
        y_dev = y_vec - y_vec.mean()
        denominator = np.sqrt(np.sum(x_dev ** 2) * np.sum(y_dev ** 2))
        if denominator == 0:
            # Avoid 0/0 -> NaN, which would poison the slope and intercept.
            return 0.0
        return np.sum(x_dev * y_dev) / denominator


    def _calculate_b(self, x, y, r):
        """Calculates slope (b) of regression line.

        r: Pearson's correlation coefficient

        Uses sample standard deviations (ddof=1) for both x and y.
        Raises ValueError when x has zero spread (the slope is undefined).
        """
        x_stdev = np.std(x, ddof=1)
        y_stdev = np.std(y, ddof=1)
        if x_stdev == 0:
            raise ValueError("x has zero variance; the slope is undefined")
        return r * (y_stdev / x_stdev)


    def _calculate_a(self, x, y, b):
        """Calculates y-intercept (a) of regression line.

        b: Slope
        """
        return np.mean(y) - (b * np.mean(x))


    def fit(self, x, y):
        """Calculates components of the linear regression formula: y = a + bx.

        x: feature values, 1-D or a single-column 2-D array
        y: target values, one per x value

        Stores r, b and a on the instance and returns the instance itself so
        calls can be chained. Raises ValueError when x and y differ in length,
        when fewer than two samples are given, or when all x values are equal.
        """
        x_vec, y_vec = self._as_paired_vectors(x, y)
        if x_vec.size < 2:
            raise ValueError("at least two samples are required to fit a line")
        # max == min is an exact test for a constant x, unlike a float std.
        if x_vec.min() == x_vec.max():
            raise ValueError("x has zero variance; the slope is undefined")
        self.r = self._calculate_r(x_vec, y_vec)
        self.b = self._calculate_b(x_vec, y_vec, r=self.r)
        self.a = self._calculate_a(x_vec, y_vec, b=self.b)
        return self


    def predict(self, x):
        """Calculates predicted y using: y = a + bx.

        Returns a 1-D array with one prediction per value in x.
        Raises AttributeError when called before ``fit``.
        """
        if getattr(self, "a", None) is None or getattr(self, "b", None) is None:
            raise AttributeError(
                "MyLinearRegression is not fitted yet; call fit() before predict()"
            )
        return (self.a + self.b * np.asarray(x, dtype=float)).ravel()



In [None]:
# Test on dummy data
x_dummy = np.array([17, 13, 12, 15, 16, 14, 16, 16, 18, 19])
y_dummy = np.array([94, 73, 59, 80, 93, 85, 66, 79, 77, 91])

my_regr = MyLinearRegression()
my_regr.fit(x=x_dummy, y=y_dummy)
my_dummy_pred = my_regr.predict(x=x_dummy)

# Cross-check against NumPy's own degree-1 least-squares fit, so this cell
# fails loudly if the hand-written formulas are wrong instead of only
# printing numbers. np.polyfit returns coefficients highest power first.
dummy_slope, dummy_intercept = np.polyfit(x_dummy, y_dummy, deg=1)
np.testing.assert_allclose(
    my_dummy_pred, dummy_intercept + dummy_slope * x_dummy, rtol=1e-6
)
my_dummy_pred


In [None]:
# Fit the from-scratch model on the diabetes training split and predict on
# the test split
my_regr = MyLinearRegression()
my_regr.fit(x_train, y_train)
my_y_pred = my_regr.predict(x_test)


In [None]:
# The from-scratch predictions must match scikit-learn's within a relative
# tolerance of 1e-5 (no absolute slack); once that holds, show both arrays.
np.testing.assert_allclose(actual=my_y_pred, desired=y_pred, rtol=1e-5, atol=0)
display(y_pred, my_y_pred)
