# Linear regression

This notebook was adapted from a course at Duke University.



We will show how to estimate regression parameters using a simple linear model

$$
y \sim ax + b
$$

We can restate the linear model $y = ax + b + \epsilon$ as sampling from a probability distribution:

$$
y \sim \mathcal{N}(ax + b, \sigma^2)
$$

Now we can use `PyMC3` to estimate the parameters $a$, $b$ and $\sigma$. We will assume the following priors

$$
a \sim \mathcal{N}(0, 100^2) \\
b \sim \mathcal{N}(0, 100^2) \\
\sigma \sim | \mathcal{N}(0, 1) |
$$

Note: It may be useful to scale observed values to have zero mean and unit standard deviation to simplify choice of priors. However, you may need to back-transform the parameters to interpret the estimated values.

## Imports

In [None]:
%matplotlib inline

import numpy as np
import numpy.random as rng
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('notebook')
sns.set_style('darkgrid')
import pandas as pd
import pymc3 as pm
import scipy.stats as stats
#from sklearn.preprocessing import StandardScaler
import theano
import theano.tensor as tt
theano.config.warn.round=False

import warnings
warnings.simplefilter('ignore', UserWarning)

#### Setting up and fitting linear model

In [None]:
# Simulate a small data set from a known line, so the fitted parameters
# can later be compared with the true slope and intercept.
np.random.seed(123)           # fixed seed: the noise below is reproducible
n, _a, _b = 11, 6, 2          # sample size, true slope, true intercept
x = np.linspace(0, 1, num=n)
noise = np.random.randn(n)    # standard-normal errors
y = _a*x + _b + noise

In [None]:
niter = 1000  # number of draws requested from pm.sample below
# Linear regression: y is modelled as Normal(a*x + b, sigma) on the observed x, y.
with pm.Model() as linreg:
    a = pm.Normal('a', mu=0, sd=100)  # prior on the slope (it multiplies x below)
    b = pm.Normal('b', mu=0, sd=100)  # prior on the intercept
    sigma = pm.HalfNormal('sigma', sd=1)  # prior on the noise scale
    
    y_est = a*x + b     # mean of y at each x
    likelihood = pm.Normal('y', mu=y_est, sd=sigma, observed=y)  # conditions the model on y

    # A fixed seed is passed so repeated runs give the same sample.
    trace = pm.sample(niter, random_seed=123)

In [None]:
# Trace plots restricted to the slope 'a' and intercept 'b'.
pm.traceplot(trace, varnames=['a', 'b'])
pass  # end the cell on a statement so the notebook does not echo a return value

In [None]:
# Show the data, the regression lines implied by the last 100 draws in the
# trace, and the line the data were generated from.
plt.scatter(x, y, s=30, label='data')
last_slopes = trace['a'][-100:]
last_intercepts = trace['b'][-100:]
for slope, intercept in zip(last_slopes, last_intercepts):
    fitted = slope*x + intercept
    plt.plot(x, fitted, c='gray', alpha=0.1)
true_line = _a*x + _b
plt.plot(x, true_line, label='true regression line', lw=3., c='red')
plt.legend(loc='best')
pass

#### Posterior predictive checks

In [None]:
# Simulate 500 replicate data sets from the fitted model. Each replicate
# already has one value per observation in y, so `size` is not passed:
# it requests additional draws per posterior sample, not the data length.
ppc = pm.sample_posterior_predictive(trace, samples=500, model=linreg)

In [None]:
# Posterior predictive check: distribution of the mean of each simulated
# data set, with the mean of the observed data marked in red.
replicate_means = [np.mean(draw) for draw in ppc['y']]
sns.distplot(replicate_means, kde=True)
plt.axvline(np.mean(y), color='red')
pass

## Using the GLM module

In [None]:
# Collect the observations in a data frame with columns 'x' and 'y'.
df = pd.DataFrame(data={'x': x, 'y': y}, columns=['x', 'y'])
df.head()

In [None]:
# Same regression, specified with a formula through the GLM module.
with pm.Model() as model:
    pm.glm.GLM.from_formula(formula='y ~ x', data=df)
    trace = pm.sample(draws=2000)

In [None]:
# Trace plots for the variables named 'Intercept' and 'x' in the GLM trace.
pm.traceplot(trace, varnames=['Intercept', 'x'])
pass  # end the cell on a statement so the notebook does not echo a return value

In [None]:
# Data, 200 regression lines drawn from the GLM trace, and the line the
# data were generated from.
plt.scatter(x, y, label='data')
pm.plot_posterior_predictive_glm(trace, samples=200)
plt.plot(x, _a*x + _b, label='true regression line', lw=3., c='red')
# Without a legend the labels set above are never displayed.
plt.legend(loc='best')
pass

## Robust linear regression

If our data has outliers, we can perform a robust regression by modeling the errors with a distribution that has fatter tails than the normal distribution.

In [None]:
# Regenerate the same synthetic data (same seed), then corrupt one point.
np.random.seed(123)
n, _a, _b = 11, 6, 2          # sample size, true slope, true intercept
x = np.linspace(0, 1, num=n)
y = _a*x + _b + np.random.randn(n)
y[5] = y[5] * 10              # inflate the middle observation into an outlier
df = pd.DataFrame(data={'x': x, 'y': y}, columns=['x', 'y'])
df.head()

#### Effect of outlier on linear regression

In [None]:
niter = 1000  # number of draws requested from pm.sample below
# Ordinary (Normal-error) regression refit on the data containing the outlier.
with pm.Model() as linreg:
    a = pm.Normal('a', mu=0, sd=100)  # prior on the slope (it multiplies x below)
    b = pm.Normal('b', mu=0, sd=100)  # prior on the intercept
    sigma = pm.HalfNormal('sigma', sd=1)  # prior on the noise scale
    
    # The regression mean is registered as a named deterministic, 'mu'.
    y_est = pm.Deterministic('mu', a*x + b)
    y_obs = pm.Normal('y_obs', mu=y_est, sd=sigma, observed=y)  # conditions the model on y

    # A fixed seed is passed so repeated runs give the same sample.
    trace = pm.sample(niter, random_seed=123)

In [None]:
# Take 100 samples of the slope and intercept from the fitted model;
# the model is passed explicitly instead of through a `with` block.
pp = pm.sample_posterior_predictive(
    trace,
    samples=100,
    model=linreg,
    vars=[a, b],
)

In [None]:
# Data with the outlier, the sampled regression lines, and the line the
# data were generated from.
plt.scatter(x, y, s=30, label='data')
for slope, intercept in zip(pp['a'], pp['b']):
    fitted = slope*x + intercept
    plt.plot(x, fitted, c='gray', alpha=0.1)
true_line = _a*x + _b
plt.plot(x, true_line, label='true regression line', lw=3., c='red')
plt.legend(loc='upper left')
pass

#### Use a T-distribution for the errors for a more robust fit

Note how we sample the intercept and slope together as a vector β = [b, a] (so `beta[0]` is the intercept and `beta[1]` is the slope) using the `shape` argument.

In [None]:
niter = 1000  # number of draws requested from pm.sample below
# Robust regression: the likelihood is a Student-T instead of a Normal.
with pm.Model() as robust_linreg:
    # Two coefficients in one variable: beta[0] is used as the intercept
    # and beta[1] as the slope in y_est below.
    beta = pm.Normal('beta', 0, 10, shape=2)
    nu = pm.Exponential('nu', 1/len(x))  # prior on the Student-T nu; parameter is 1/len(x)
    sigma = pm.HalfCauchy('sigma', beta=1)  # prior on the error scale

    y_est = beta[0] + beta[1]*x  # mean of y at each x
    y_obs = pm.StudentT('y_obs', mu=y_est, sd=sigma, nu=nu, observed=y)  # conditions the model on y

    # A fixed seed is passed so repeated runs give the same sample.
    trace = pm.sample(niter, random_seed=123)

In [None]:
# Take 100 samples of the coefficient vector from the robust model;
# the model is passed explicitly instead of through a `with` block.
pp = pm.sample_posterior_predictive(
    trace,
    samples=100,
    model=robust_linreg,
    vars=[beta],
)

In [None]:
# Data with the outlier, the regression lines sampled from the robust
# model, and the line the data were generated from.
plt.scatter(x, y, s=30, label='data')
# Each row of pp['beta'] holds (intercept, slope): column 0, then column 1.
for intercept, slope in pp['beta']:
    fitted = slope*x + intercept
    plt.plot(x, fitted, c='gray', alpha=0.1)
true_line = _a*x + _b
plt.plot(x, true_line, label='true regression line', lw=3., c='red')
plt.legend(loc='upper left')
pass

### Using the GLM module

In [None]:
# Robust regression through the GLM module, using a Student-T family.
with pm.Model() as model:
    pm.glm.GLM.from_formula(
        formula='y ~ x',
        data=df,
        family=pm.glm.families.StudentT(),
    )
    trace = pm.sample(draws=2000)

In [None]:
# Data with the outlier, 200 regression lines drawn from the robust GLM
# trace, and the line the data were generated from.
plt.scatter(x, y, label='data')
pm.plot_posterior_predictive_glm(trace, samples=200)
plt.plot(x, _a*x + _b, label='true regression line', lw=3., c='red')
# Without a legend the labels set above are never displayed.
plt.legend(loc='upper left')
pass