In [70]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd

In [71]:
df = pd.DataFrame()

In [72]:
N = 10000
np.random.seed(42)
df['X1'] = np.random.normal(0, 1, N)
df['X2'] = df['X1'] + np.random.normal(0, 1, N)
df['X3'] = df['X1']**2
df['X4'] = df['X1']**3
df['X5'] = np.abs(df['X1'])
df['X6'] = df['X1']**2 + np.random.normal(0, 0.5, N)

# Con sklearn

In [73]:
from sklearn.linear_model import LinearRegression

In [74]:
model = LinearRegression()

In [75]:
model.fit(df['X1'].values.reshape(-1, 1), df['X2'].values.reshape(-1, 1))

LinearRegression()

In [76]:
model.coef_, model.intercept_

(array([[0.99146746]]), array([0.01351583]))

# Con statsmodel

In [77]:
import statsmodels.formula.api as smf

In [97]:
formula = 'X2 ~ X1'
# endogenas vs exogenas
# Ordinary Least Squares
lm = smf.ols(formula=formula, data=df).fit()
print(lm.params)

lm.rsquared

Intercept    0.013516
X1           0.991467
dtype: float64


0.4969570836013304

In [98]:
lm.summary()

0,1,2,3
Dep. Variable:,X2,R-squared:,0.497
Model:,OLS,Adj. R-squared:,0.497
Method:,Least Squares,F-statistic:,9877.0
Date:,"Mon, 14 Sep 2020",Prob (F-statistic):,0.0
Time:,00:16:36,Log-Likelihood:,-14199.0
No. Observations:,10000,AIC:,28400.0
Df Residuals:,9998,BIC:,28420.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0135,0.010,1.350,0.177,-0.006,0.033
X1,0.9915,0.010,99.383,0.000,0.972,1.011

0,1,2,3
Omnibus:,2.425,Durbin-Watson:,2.009
Prob(Omnibus):,0.297,Jarque-Bera (JB):,2.478
Skew:,-0.008,Prob(JB):,0.29
Kurtosis:,3.075,Cond. No.,1.0


# Calculo de R-squared

In [80]:
error = lm.params['X1'] * df['X1'] + lm.params['Intercept'] - df['X2']

In [103]:
error.mean(), error.std(ddof=2)

(-6.075140390748857e-17, 1.0010236907114534)

In [219]:
R_square = (df['X2'].var()-error.var())/df['X2'].var()
R_square

0.4969570836013304

# Intervalos de confianza - https://www2.isye.gatech.edu/~yxie77/isye2028/lecture12.pdf

In [159]:
from scipy.stats import norm

In [162]:
Sxx = ((df['X1'] - df['X1'].mean())**2).sum()

In [164]:
Sxx

10068.360554106486

### Para pendiente

In [165]:
coef_std = (error.var(ddof=2)/Sxx)**0.5

In [166]:
# std_err en la tabla
coef_std

0.009976196069731437

In [182]:
# [0.025	0.975] en tabla
alpha = 0.025
norm.ppf(alpha, lm.params['X1'], coef_std), norm.ppf(1-alpha, lm.params['X1'], coef_std)

(0.9719144769051349, 1.0110204469039021)

### Para intercept

In [190]:
# para intercept

intercept_std = (error.var(ddof=2) * (1/len(df) + ((df['X1'].mean())**2)/Sxx))**0.5
lm.params['Intercept'], intercept_std

(0.013515827705424775, 0.010010259587521457)

In [191]:
norm.ppf(alpha, lm.params['Intercept'], intercept_std), norm.ppf(1-alpha, lm.params['Intercept'], intercept_std)

(-0.0061039205620140626, 0.03313557597286361)

# P-values

In [206]:
# t
t_coef = lm.params['X1'] - 0/coef_std
t_coef

0.9914674619045185

In [210]:
# t
t_intercept = (lm.params['Intercept'] - 0)/intercept_std
t_intercept

1.3501975235760393

In [214]:
(1-norm.cdf(t_intercept))*2

0.17695263234530945

In [217]:
norm.cdf(0, lm.params['Intercept'], intercept_std)*2

0.17695263234530956

In [157]:
norm.cdf(6.084-2.999, 6.084, 2.029)

0.06969478725194628

In [None]:
lm.params['X1']

In [202]:
norm.cdf(lm.params['X1'] - 1.350, lm.params['X1'], intercept_std)

0.0

In [203]:
lm.params['X1']

0.9914674619045185

# Feature engineering

In [10]:
formula = 'X3 ~ X1'
lm = smf.ols(formula=formula, data=df).fit()
print(lm.params)

lm.rsquared

Intercept    1.006836
X1          -0.002302
dtype: float64


2.596250281539092e-06

In [34]:
formula = 'X3 ~ np.square(X1)'
lm = smf.ols(formula=formula, data=df).fit()
print(lm.params)

lm.rsquared

Intercept       -4.510281e-16
np.square(X1)    1.000000e+00
dtype: float64


1.0