In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

# Select seaborn's "ticks" axes style for the session.
sns.set_style(style="ticks")

# Linear Regression

## Example:

In [2]:
# Seven paired observations; later labelled "price" (x) and "rating" (y).
x = np.array([7, 20, 13, 59, 40, 51, 37])
y = np.array([7, 6, 8, 1, 3, 2, 3])

In [3]:
# Display the x observations.
x

array([ 7, 20, 13, 59, 40, 51, 37])

In [4]:
# Display the y observations.
y

array([7, 6, 8, 1, 3, 2, 3])

## 1. Mean / Average

In [5]:
# Arithmetic mean of x.
np.mean(x)

32.42857142857143

In [6]:
# Arithmetic mean of y.
np.mean(y)

4.285714285714286

## 2. Variance

In [7]:
# Delta degrees of freedom: variance/std divide by (N - ddof).
# NumPy's var/std default to ddof=0 (divide by N); ddof=1 divides by N - 1,
# which gives the unbiased sample variance.
ddof = 1

In [8]:
# Sample variance of x (divides by N - ddof).
np.var(x, ddof=ddof)

384.61904761904754

In [9]:
# For comparison: NumPy's default normalisation (ddof=0, divide by N)
# yields a smaller value than the ddof=1 variance above.
np.var(x, ddof=0)

329.67346938775506

In [10]:
# Sample variance of y (divides by N - ddof).
np.var(y, ddof=ddof)

7.238095238095238

## 3. Standard Deviation

In [11]:
# Sample standard deviation of x, using the same ddof as the variance.
np.std(x, ddof=ddof)

19.611706902231827

In [12]:
# Same value obtained as the square root of the variance.
np.sqrt(np.var(x, ddof=ddof))

19.611706902231827

In [13]:
# Sample standard deviation of y, using the same ddof as the variance.
np.std(y, ddof=ddof)

2.690370836538197

## 4. Covariance

In [14]:
# 2x2 matrix: variances on the diagonal, cov(x, y) off the diagonal.
# Unlike var/std, np.cov divides by N - 1 by default.
np.cov(x, y)

array([[384.61904762, -51.30952381],
       [-51.30952381,   7.23809524]])

In [15]:
# Same matrix with the normalisation spelled out via ddof.
np.cov(x, y, ddof=ddof)

array([[384.61904762, -51.30952381],
       [-51.30952381,   7.23809524]])

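The result is the variance–covariance matrix: the diagonal holds the variances of `x` and `y`, and the off-diagonal entries hold their covariance.

To get the covariance, access an element that is not on the diagonal, e.g. `[0, 1]` or `[1, 0]` (the matrix is symmetric).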

In [19]:
# Off-diagonal entry of the 2x2 variance-covariance matrix, i.e. cov(x, y).
# ddof is passed explicitly so the covariance uses the same normalisation as
# the variances it is divided by below (correlation and beta_1); relying on
# np.cov's default would silently break those ratios if `ddof` were changed.
cov_xy = np.cov(x, y, ddof=ddof)[1, 0]

## 5. Correlation (Pearson)

In [20]:
# Pearson correlation by hand: covariance divided by the square root of the
# product of the two variances.
cov_xy / np.sqrt(np.var(x, ddof=ddof) * np.var(y, ddof=ddof))

-0.9724570819188173

In [22]:
# Cross-check with NumPy: off-diagonal entry of the correlation matrix.
np.corrcoef(x, y)[0, 1]

-0.9724570819188174

## 6. Regression coefficients

In [24]:
# Slope by hand: beta_1 = cov(x, y) / var(x).
beta_1 = cov_xy / np.var(x, ddof=ddof)
beta_1

-0.13340349139532007

In [26]:
# Intercept by hand: beta_0 = mean(y) - beta_1 * mean(x).
beta_0 = np.mean(y) - beta_1 * np.mean(x)
beta_0

8.611798935248236

### Using statsmodels

In [27]:
# Collect both samples in a labelled table: x -> "price", y -> "rating".
df = pd.DataFrame(dict(price=x, rating=y))
df

Unnamed: 0,price,rating
0,7,7
1,20,6
2,13,8
3,59,1
4,40,3
5,51,2
6,37,3


In [29]:
# add a column of ones to act as the intercept term; statsmodels' OLS does not
# add an intercept on its own, it only fits the columns it is handed
df = df.assign(const=1)

In [33]:
# Ordinary least squares: regress "rating" on the intercept column and "price".
lin_reg = sm.OLS(endog=df["rating"], exog=df[["const", "price"]]).fit()

In [34]:
# Full regression report; the `const` and `price` coefficients should match
# the manually computed beta_0 and beta_1.
lin_reg.summary()



0,1,2,3
Dep. Variable:,rating,R-squared:,0.946
Model:,OLS,Adj. R-squared:,0.935
Method:,Least Squares,F-statistic:,87.03
Date:,"Tue, 04 Feb 2020",Prob (F-statistic):,0.000238
Time:,14:11:30,Log-Likelihood:,-6.1262
No. Observations:,7,AIC:,16.25
Df Residuals:,5,BIC:,16.14
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,8.6118,0.531,16.204,0.000,7.246,9.978
price,-0.1334,0.014,-9.329,0.000,-0.170,-0.097

0,1,2,3
Omnibus:,,Durbin-Watson:,1.559
Prob(Omnibus):,,Jarque-Bera (JB):,0.457
Skew:,0.582,Prob(JB):,0.796
Kurtosis:,2.539,Cond. No.,76.1
