In [2]:
!pip install -q pandas
import math
import pandas as pd
import numpy as np

$$\text{Mean} (\mu) = \frac{\sum_{i=1}^{n} x_i}{n}$$<br><br>
$$\text{MSE} = \frac{1}{n} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2$$<br><br>
$$\text{RMSE} = \sqrt{\frac{1}{n} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2}$$<br><br>
$$\text{Variance}(\sigma^2) = \frac{1}{n-1} \sum_{i=1}^{n} (x_i - \mu)^2$$<br><br>
$$\text{Cov}(X, Y) = \frac{1}{n-1} \sum_{i=1}^{n} (x_i - \mu(X))(y_i - \mu(Y))$$<br><br>
$$\text{Intercept}(b_0) = \mu(Y) - b_1 \mu(X)$$<br><br>
$$\text{Slope}(b_1) = \frac{\sum_{i=1}^{n} (x_i - \mu(X))(y_i - \mu(Y))}{\sum_{i=1}^{n} (x_i - \mu(X))^2} = \frac{\text{Cov}(X, Y)}{\text{Var}(X)}$$<br><br>
$$\text{Simple Linear Regression}(y) = \beta_0 + \beta_1 x + \epsilon$$<br><br>

In [110]:
import random
from math import sqrt

import numpy as np
import pandas as pd

def n_(x):
    """Return the number of elements in ``x`` (anything supporting ``len``)."""
    return len(x)

def power_(x, n):
    """Return ``x`` raised to the power ``n`` using the ``**`` operator."""
    return x ** n

def Σ_(x):
    """Add up every element of the iterable ``x``.

    Accumulation starts from the integer 0 and proceeds left to right, so an
    empty iterable yields 0.
    """
    running_total = 0
    for term in x:
        running_total += term
    return running_total

def μ_(x):
    """Return the arithmetic mean of ``x``: the sum of its elements over their count.

    Relies on ``n_`` for the count and ``Σ_`` for the sum; an empty ``x``
    therefore ends in a division by zero.
    """
    count = n_(x)
    total = Σ_(list(x))
    return total / count

def σ2_(x):
    """Return the variance of ``x`` with an ``n - 1`` denominator.

    The squared deviations from the mean (``μ_``) are summed with ``Σ_`` and
    divided by ``n - 1``, so a single-element input divides by zero.
    """
    mean = μ_(x)
    count = n_(x)
    squared_deviations = [power_(value - mean, 2) for value in x]
    return Σ_(squared_deviations) / (count - 1)

def sqrt_(x):
    """Return the square root of ``x`` using the Babylonian/Heron's method.

    Parameters:
        x: a non-negative real number.

    Returns:
        A float approximation of the square root, accurate to roughly machine
        precision relative to the result.

    Raises:
        ValueError: if ``x`` is negative.
    """
    if x < 0:
        raise ValueError("Cannot compute square root of a negative number")
    # 0, +inf and NaN cannot be iterated on (division by zero / no convergence).
    # Halving leaves each of them unchanged while still coercing an int 0 to 0.0.
    if x == 0 or x != x or x == float("inf"):
        return x / 2.0
    # Starting at max(x, 1) keeps the first guess at or above the true root and
    # can never underflow to zero, even for subnormal x.
    guess = x if x > 1 else 1.0
    # Stop on the *relative* size of the update. An absolute test on
    # guess*guess - x cannot be met for large x (float spacing exceeds any
    # fixed tolerance) and is met immediately for tiny x, returning garbage.
    # The loop bound is a safety net; convergence from the extremes of the
    # float range takes on the order of a few hundred halvings.
    for _ in range(2000):
        improved = (guess + x / guess) / 2.0
        if abs(improved - guess) <= 1e-15 * improved:
            return improved
        guess = improved
    return guess

def CovXY_(x, y):
    """Return the covariance of ``x`` and ``y`` with an ``n - 1`` denominator.

    Observations are paired with ``zip``, so ``n`` is the number of pairs and
    any surplus elements of the longer input do not contribute products. The
    means, however, are taken over each input in full.
    """
    pairs = list(zip(x, y))
    count = n_(pairs)
    mean_x, mean_y = μ_(x), μ_(y)
    cross_products = [(xi - mean_x) * (yi - mean_y) for xi, yi in pairs]
    return Σ_(cross_products) / (count - 1)

def coefficients_(x, y):
    """Return ``(b0, b1)``, the intercept and slope of the least-squares line.

    The slope is Cov(x, y) / Var(x); the intercept is mean(y) - slope * mean(x).
    """
    slope = CovXY_(x, y) / σ2_(x)
    intercept = μ_(y) - slope * μ_(x)
    return intercept, slope

def mse_(y, yhat):
    """Return the mean squared error between targets ``y`` and predictions ``yhat``.

    Both arguments are indexed positionally over ``range(len(y))``, so ``yhat``
    must provide at least as many elements as ``y``.
    """
    count = n_(y)
    squared_errors = []
    for i in range(count):
        squared_errors.append(power_(yhat[i] - y[i], 2))
    return Σ_(squared_errors) / count

def rmse_(y, yhat):
    """Return the root mean squared error: ``sqrt_`` applied to ``mse_(y, yhat)``."""
    return sqrt_(mse_(y, yhat))

def train_test_split_(x, y, split, seed=1):
    """Shuffle paired observations and split them into train and test parts.

    Parameters:
        x, y: iterables of equal-position observations; they are paired with
            ``zip`` before shuffling so each x stays with its y.
        split: fraction of the pairs assigned to the training part; the cut
            index is ``round(split * number_of_pairs)``.
        seed: seed for the shuffle. The default of 1 reproduces the permutation
            obtained by calling ``random.seed(1)`` before ``random.shuffle``.

    Returns:
        ``((xtrain, ytrain), (xtest, ytest))`` where every member is a tuple.
        Empty input yields four empty tuples.
    """
    import random

    pairs = list(zip(x, y))
    if not pairs:
        # zip(*[]) produces nothing to unpack into two names, so answer directly.
        return ((), ()), ((), ())

    # A private generator shuffles exactly like the seeded module-level one but
    # leaves the global random state untouched for the rest of the notebook.
    rng = random.Random(seed)
    rng.shuffle(pairs)

    xs, ys = zip(*pairs)
    split_index = round(split * len(xs))
    train_part = (xs[:split_index], ys[:split_index])
    test_part = (xs[split_index:], ys[split_index:])
    return train_part, test_part

def simple_linear_regression_(x, y):
    """Fit a line to ``(x, y)`` and return its predictions for the same ``x``.

    The coefficients come from ``coefficients_``; the result is a list holding
    ``b0 + b1 * xi`` for each ``xi`` in ``x``, in order.
    """
    intercept, slope = coefficients_(x, y)
    return [intercept + slope * xi for xi in x]

def train(x, y):
    """Fit the regression on ``(x, y)`` and return the RMSE on that same data."""
    predictions = simple_linear_regression_(x, y)
    return rmse_(y, predictions)


In [111]:
# Read the tab-separated data file; its 'X' and 'Y' columns are used below.
data = pd.read_csv('AutoInsurSweden.tsv', sep='\t')
# Seed through the module: a bare `seed(1)` is a NameError on a fresh kernel
# because no cell imports that name.
random.seed(1)
split = 0.8
x, y = data['X'].to_numpy(), data['Y'].to_numpy()

# Summary statistics and fitted coefficients over the full data set.
print('μ(x) = %.3f σ2(x) = %.3f' % (μ_(x), σ2_(x)))
print('μ(y) = %.3f σ2(y) = %.3f' % (μ_(y), σ2_(y)))
print('CovXY(X, Y) = %.3f' % (CovXY_(x, y)))
print('Coefficients: Intercept(b0) = %.3f, Slope(b1) = %.3f' % (coefficients_(x, y)))

# NOTE: train() fits and scores on the same data, so this RMSE is a training
# error; the held-out test_x / test_y are not evaluated in this cell.
(train_x, train_y), (test_x, test_y) = train_test_split_(x, y, split)
rmse = train(train_x, train_y)
print("RMSE:", rmse)

μ(x) = 22.905 σ2(x) = 545.313
μ(y) = 98.187 σ2(y) = 7626.101
CovXY(X, Y) = 1861.604
Coefficients: Intercept(b0) = 19.994, Slope(b1) = 3.414
RMSE: 37.1005471622345
